Compare commits


32 commits

Author SHA1 Message Date
Tomas M.
acd91a35d7
Merge fe7e1a2565 into 45c9ae312c 2025-06-02 12:37:29 +07:00
oobabooga
45c9ae312c Use the flash-attention wheels in https://github.com/kingbri1/flash-attention 2025-06-01 22:17:22 -07:00
oobabooga
2db7745cbd Show llama.cpp prompt processing on one line instead of many lines 2025-06-01 22:12:24 -07:00
oobabooga
ad6d0218ae Fix after 219f0a7731 2025-06-01 19:27:14 -07:00
oobabooga
92adceb7b5 UI: Fix the model downloader progress bar 2025-06-01 19:22:21 -07:00
oobabooga
7a81beb0c1 Turn long pasted text into an attachment automatically 2025-06-01 18:26:14 -07:00
Tomas M.
fe7e1a2565
Update README.md
I placed the "Pointing to an existing AI model library" section first because I believe it is more relevant to the majority of users.
2025-05-19 22:25:59 +00:00
oobabooga
e8595730b4
Merge pull request #6992 from oobabooga/dev
Merge dev branch
2025-05-17 11:58:46 -03:00
oobabooga
17c29fa0a2
Merge pull request #6987 from oobabooga/dev
Merge dev branch
2025-05-16 22:23:59 -03:00
oobabooga
dc3094549e
Merge pull request #6984 from oobabooga/dev
Merge dev branch
2025-05-16 17:13:26 -03:00
oobabooga
ace8afb825
Merge dev branch 2025-05-01 12:25:04 -03:00
oobabooga
a41da1ec95
Merge pull request #6939 from oobabooga/dev
Merge dev branch
2025-05-01 00:15:11 -03:00
oobabooga
6e6f9971a2
Merge pull request #6919 from oobabooga/dev
Merge dev branch
2025-04-27 11:35:19 -03:00
oobabooga
1180bb0d80
Merge pull request #6913 from oobabooga/dev
Merge dev branch
2025-04-27 00:12:16 -03:00
oobabooga
9bb9ce079e
Merge pull request #6912 from oobabooga/dev
Merge dev branch
2025-04-27 00:03:16 -03:00
oobabooga
1aa76b3beb
Merge pull request #6885 from oobabooga/dev
Merge dev branch
2025-04-22 22:38:24 -03:00
oobabooga
1df2b0d3ae
Merge pull request #6884 from oobabooga/dev
Merge dev branch
2025-04-22 22:02:30 -03:00
oobabooga
62455b415c
Merge pull request #6883 from oobabooga/dev
Merge dev branch
2025-04-22 21:54:34 -03:00
oobabooga
022664f2bd
Merge pull request #6881 from oobabooga/dev
Merge dev branch
2025-04-22 12:15:34 -03:00
oobabooga
a778270536
Merge pull request #6869 from oobabooga/dev
Merge dev branch
2025-04-22 12:09:20 -03:00
oobabooga
c19b995b8e
Merge pull request #6857 from oobabooga/dev
Merge dev branch
2025-04-19 21:45:55 -03:00
oobabooga
b1495d52e5
Merge pull request #6855 from oobabooga/dev
Merge dev branch
2025-04-19 01:53:11 -03:00
oobabooga
44a6d8a761
Merge pull request #6854 from oobabooga/dev
Merge dev branch
2025-04-18 23:41:56 -03:00
oobabooga
4fa52a1302
Merge pull request #6852 from oobabooga/dev
Merge dev branch
2025-04-18 22:15:40 -03:00
oobabooga
4eecb6611f
Merge pull request #6850 from oobabooga/dev
Merge dev branch
2025-04-18 15:33:32 -03:00
oobabooga
c5e54c0b37
Merge pull request #6848 from oobabooga/dev
Merge dev branch
2025-04-18 13:36:06 -03:00
oobabooga
14e6baeb48
Merge pull request #6838 from oobabooga/dev
Merge dev branch
2025-04-09 14:48:37 -03:00
oobabooga
bb1905ebc5 Fix the colab notebook 2025-03-29 19:17:36 -07:00
oobabooga
9b80d1d6c2 Remove the stalebot 2025-03-29 13:44:37 -07:00
oobabooga
80cdbe4e09
Merge pull request #6797 from oobabooga/dev
Merge dev branch
2025-03-15 00:11:25 -03:00
Kelvie Wong
769eee1ff3 Fix OpenAI API with new param (show_after), closes #6747 (#6749)
---------

Co-authored-by: oobabooga <oobabooga4@gmail.com>
2025-02-18 07:02:19 -08:00
oobabooga
7c883ef2f0
Merge pull request #6746 from oobabooga/dev
Merge dev branch
2025-02-14 23:25:31 -03:00
8 changed files with 189 additions and 57 deletions

View file

@@ -325,6 +325,18 @@ https://github.com/oobabooga/text-generation-webui/wiki
## Downloading models
### Pointing to an existing AI model library
Edit the file `text-generation-webui\user_data\CMD_FLAGS.txt` to include this line:
```
--model-dir 'D:\MyAIModels\'
```
Replace `D:\MyAIModels\` with the path to your model library folder. Sub-folders will be automatically parsed to enumerate all existing models.
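For instance, a model library laid out like this would be picked up in full (a hypothetical layout; the folder and file names are placeholders):
```
D:\MyAIModels\
├── Qwen3-8B-GGUF\
│   └── Qwen3-8B-Q4_K_M.gguf
└── Llama-3.1-8B-Instruct\
    ├── config.json
    └── model.safetensors
```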
### Manual model download
Models should be placed in the folder `text-generation-webui/user_data/models`. They are usually downloaded from [Hugging Face](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads).
* GGUF models are a single file and should be placed directly into `user_data/models`. Example:

View file

```diff
@@ -32,6 +32,7 @@ class ModelDownloader:
         self.max_retries = max_retries
         self.session = self.get_session()
         self._progress_bar_slots = None
+        self.progress_queue = None
 
     def get_session(self):
         session = requests.Session()
@@ -218,33 +219,45 @@ class ModelDownloader:
         max_retries = self.max_retries
         attempt = 0
+        file_downloaded_count_for_progress = 0
+
         try:
             while attempt < max_retries:
                 attempt += 1
                 session = self.session
                 headers = {}
                 mode = 'wb'
+                current_file_size_on_disk = 0
 
                 try:
                     if output_path.exists() and not start_from_scratch:
                         # Resume download
-                        r = session.get(url, stream=True, timeout=20)
-                        total_size = int(r.headers.get('content-length', 0))
-                        if output_path.stat().st_size >= total_size:
+                        current_file_size_on_disk = output_path.stat().st_size
+                        r_head = session.head(url, timeout=20)
+                        r_head.raise_for_status()
+                        total_size = int(r_head.headers.get('content-length', 0))
+
+                        if current_file_size_on_disk >= total_size and total_size > 0:
+                            if self.progress_queue is not None and total_size > 0:
+                                self.progress_queue.put((1.0, str(filename)))
                             return
 
-                        headers = {'Range': f'bytes={output_path.stat().st_size}-'}
+                        headers = {'Range': f'bytes={current_file_size_on_disk}-'}
                         mode = 'ab'
 
                     with session.get(url, stream=True, headers=headers, timeout=30) as r:
-                        r.raise_for_status()  # If status is not 2xx, raise an error
-                        total_size = int(r.headers.get('content-length', 0))
-                        block_size = 1024 * 1024  # 1MB
+                        r.raise_for_status()
+                        total_size_from_stream = int(r.headers.get('content-length', 0))
+                        if mode == 'ab':
+                            effective_total_size = current_file_size_on_disk + total_size_from_stream
+                        else:
+                            effective_total_size = total_size_from_stream
 
-                        filename_str = str(filename)  # Convert PosixPath to string if necessary
+                        block_size = 1024 * 1024
+                        filename_str = str(filename)
 
                         tqdm_kwargs = {
-                            'total': total_size,
+                            'total': effective_total_size,
+                            'initial': current_file_size_on_disk if mode == 'ab' else 0,
                             'unit': 'B',
                             'unit_scale': True,
                             'unit_divisor': 1024,
@@ -261,16 +274,20 @@ class ModelDownloader:
                         })
 
                         with open(output_path, mode) as f:
+                            if mode == 'ab':
+                                f.seek(current_file_size_on_disk)
+
                             with tqdm.tqdm(**tqdm_kwargs) as t:
-                                count = 0
+                                file_downloaded_count_for_progress = current_file_size_on_disk
                                 for data in r.iter_content(block_size):
                                     f.write(data)
                                     t.update(len(data))
-                                    if total_size != 0 and self.progress_bar is not None:
-                                        count += len(data)
-                                        self.progress_bar(float(count) / float(total_size), f"{filename_str}")
+                                    if effective_total_size != 0 and self.progress_queue is not None:
+                                        file_downloaded_count_for_progress += len(data)
+                                        progress_fraction = float(file_downloaded_count_for_progress) / float(effective_total_size)
+                                        self.progress_queue.put((progress_fraction, filename_str))
 
-                        break
+                        break  # Exit loop if successful
 
                 except (RequestException, ConnectionError, Timeout) as e:
                     print(f"Error downloading {filename}: {e}.")
                     print(f"That was attempt {attempt}/{max_retries}.", end=' ')
@@ -295,10 +312,9 @@ class ModelDownloader:
         finally:
             print(f"\nDownload of {len(file_list)} files to {output_folder} completed.")
 
-    def download_model_files(self, model, branch, links, sha256, output_folder, progress_bar=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False):
-        self.progress_bar = progress_bar
+    def download_model_files(self, model, branch, links, sha256, output_folder, progress_queue=None, start_from_scratch=False, threads=4, specific_file=None, is_llamacpp=False):
+        self.progress_queue = progress_queue
 
         # Create the folder and writing the metadata
         output_folder.mkdir(parents=True, exist_ok=True)
         if not is_llamacpp:
```
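The hunks above boil down to a standard HTTP resume pattern: measure what is already on disk, ask the server for the full size with a HEAD request, and fetch only the missing byte range while reporting progress against the combined total. A minimal standalone sketch of that pattern (the function name and arguments are illustrative, not the project's API):

```python
import os

import requests


def download_with_resume(url, output_path, progress_queue=None, block_size=1024 * 1024):
    headers = {}
    mode = 'wb'
    already_have = 0

    if os.path.exists(output_path):
        already_have = os.path.getsize(output_path)
        # Ask the server for the full size without downloading the body
        head = requests.head(url, timeout=20, allow_redirects=True)
        head.raise_for_status()
        total = int(head.headers.get('content-length', 0))
        if total > 0 and already_have >= total:
            if progress_queue is not None:
                progress_queue.put((1.0, output_path))
            return  # File is already complete

        headers = {'Range': f'bytes={already_have}-'}  # Request only the missing tail
        mode = 'ab'

    with requests.get(url, stream=True, headers=headers, timeout=30) as r:
        r.raise_for_status()
        # For a Range request, content-length covers only the remaining bytes,
        # so the effective total is what's on disk plus the stream length.
        effective_total = already_have + int(r.headers.get('content-length', 0))
        done = already_have
        with open(output_path, mode) as f:
            for chunk in r.iter_content(block_size):
                f.write(chunk)
                done += len(chunk)
                if progress_queue is not None and effective_total > 0:
                    progress_queue.put((done / effective_total, output_path))
```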

View file

```diff
@@ -865,6 +865,46 @@ function navigateLastAssistantMessage(direction) {
     return false;
 }
 
+//------------------------------------------------
+// Paste Handler for Long Text
+//------------------------------------------------
+const MAX_PLAIN_TEXT_LENGTH = 2500;
+
+function setupPasteHandler() {
+    const textbox = document.querySelector("#chat-input textarea[data-testid=\"textbox\"]");
+    const fileInput = document.querySelector("#chat-input input[data-testid=\"file-upload\"]");
+
+    if (!textbox || !fileInput) {
+        setTimeout(setupPasteHandler, 500);
+        return;
+    }
+
+    textbox.addEventListener("paste", async (event) => {
+        const text = event.clipboardData?.getData("text");
+
+        if (text && text.length > MAX_PLAIN_TEXT_LENGTH) {
+            event.preventDefault();
+
+            const file = new File([text], "pasted_text.txt", {
+                type: "text/plain",
+                lastModified: Date.now()
+            });
+
+            const dataTransfer = new DataTransfer();
+            dataTransfer.items.add(file);
+            fileInput.files = dataTransfer.files;
+            fileInput.dispatchEvent(new Event("change", { bubbles: true }));
+        }
+    });
+}
+
+if (document.readyState === "loading") {
+    document.addEventListener("DOMContentLoaded", setupPasteHandler);
+} else {
+    setupPasteHandler();
+}
+
 //------------------------------------------------
 // Tooltips
 //------------------------------------------------
```

View file

```diff
@@ -409,14 +409,31 @@ class LlamaServer:
 
 def filter_stderr_with_progress(process_stderr):
     progress_pattern = re.compile(r'slot update_slots: id.*progress = (\d+\.\d+)')
+    last_was_progress = False
+
     try:
         for line in iter(process_stderr.readline, ''):
+            line = line.rstrip('\n\r')  # Remove existing newlines
             progress_match = progress_pattern.search(line)
+
             if progress_match:
-                sys.stderr.write(line)
+                if last_was_progress:
+                    # Overwrite the previous progress line using carriage return
+                    sys.stderr.write(f'\r{line}')
+                else:
+                    # First progress line - print normally
+                    sys.stderr.write(line)
+
                 sys.stderr.flush()
+                last_was_progress = True
             elif not line.startswith(('srv ', 'slot ')) and 'log_server_r: request: GET /health' not in line:
-                sys.stderr.write(line)
+                if last_was_progress:
+                    # Finish the progress line with a newline, then print the new line
+                    sys.stderr.write(f'\n{line}\n')
+                else:
+                    # Normal line - print with newline
+                    sys.stderr.write(f'{line}\n')
+
                 sys.stderr.flush()
+                last_was_progress = False
+
+            # For filtered lines, don't change last_was_progress state
     except (ValueError, IOError):
         pass
```
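The change relies on plain terminal behavior rather than any library: a carriage return moves the cursor back to column 0, so consecutive progress lines overwrite each other in place, and a newline is emitted once before the next non-progress line. A tiny self-contained demonstration of the same idea (the sample log lines are made up for illustration):

```python
import sys
import time

# Simulated stderr stream: progress lines should collapse onto one line,
# ordinary lines should print normally afterwards.
fake_lines = [
    "slot update_slots: id 0 | progress = 0.25",
    "slot update_slots: id 0 | progress = 0.50",
    "slot update_slots: id 0 | progress = 1.00",
    "prompt processing done",
]

last_was_progress = False
for line in fake_lines:
    if "progress =" in line:
        # '\r' returns to column 0 so this line overwrites the previous one
        sys.stderr.write(f"\r{line}" if last_was_progress else line)
        last_was_progress = True
    else:
        # Close the in-place progress line before printing normally
        prefix = "\n" if last_was_progress else ""
        sys.stderr.write(f"{prefix}{line}\n")
        last_was_progress = False
    sys.stderr.flush()
    time.sleep(0.2)
```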

View file

```diff
@@ -116,7 +116,7 @@ def unload_model(keep_model_name=False):
         return
 
     is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer')
-    if shared.args.loader == 'ExLlamav3_HF':
+    if shared.model.__class__.__name__ == 'Exllamav3HF':
         shared.model.unload()
 
     shared.model = shared.tokenizer = None
```

View file

```diff
@@ -1,4 +1,6 @@
 import importlib
+import queue
+import threading
 import traceback
 from functools import partial
 from pathlib import Path
@@ -205,48 +207,51 @@ def load_lora_wrapper(selected_loras):
 
 def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
+    downloader_module = importlib.import_module("download-model")
+    downloader = downloader_module.ModelDownloader()
+    update_queue = queue.Queue()
+
     try:
         # Handle direct GGUF URLs
         if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")):
             try:
                 path = repo_id.split("huggingface.co/")[1]
 
                 # Extract the repository ID (first two parts of the path)
                 parts = path.split("/")
                 if len(parts) >= 2:
                     extracted_repo_id = f"{parts[0]}/{parts[1]}"
 
                     # Extract the filename (last part of the path)
-                    filename = repo_id.split("/")[-1]
-                    if "?download=true" in filename:
-                        filename = filename.replace("?download=true", "")
+                    filename = repo_id.split("/")[-1].replace("?download=true", "")
 
                     repo_id = extracted_repo_id
                     specific_file = filename
-            except:
-                pass
+            except Exception as e:
+                yield f"Error parsing GGUF URL: {e}"
+                progress(0.0)
+                return
 
-        if repo_id == "":
-            yield ("Please enter a model path")
+        if not repo_id:
+            yield "Please enter a model path."
+            progress(0.0)
             return
 
         repo_id = repo_id.strip()
         specific_file = specific_file.strip()
-        downloader = importlib.import_module("download-model").ModelDownloader()
 
-        progress(0.0)
+        progress(0.0, "Preparing download...")
 
         model, branch = downloader.sanitize_model_and_branch_names(repo_id, None)
-        yield ("Getting the download links from Hugging Face")
+        yield "Getting download links from Hugging Face..."
         links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file)
+        if not links:
+            yield "No files found to download for the given model/criteria."
+            progress(0.0)
+            return
 
         # Check for multiple GGUF files
         gguf_files = [link for link in links if link.lower().endswith('.gguf')]
         if len(gguf_files) > 1 and not specific_file:
             output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n"
             for link in gguf_files:
                 output += f"{Path(link).name}\n"
 
             output += "```"
             yield output
             return
@@ -255,17 +260,13 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
             output = "```\n"
             for link in links:
                 output += f"{Path(link).name}" + "\n"
 
             output += "```"
             yield output
             return
 
-        yield ("Getting the output folder")
+        yield "Determining output folder..."
         output_folder = downloader.get_output_folder(
-            model,
-            branch,
-            is_lora,
-            is_llamacpp=is_llamacpp,
+            model, branch, is_lora, is_llamacpp=is_llamacpp,
             model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None
         )
@@ -275,19 +276,65 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False):
             output_folder = Path(shared.args.lora_dir)
 
         if check:
-            progress(0.5)
-            yield ("Checking previously downloaded files")
+            yield "Checking previously downloaded files..."
+            progress(0.5, "Verifying files...")
             downloader.check_model_files(model, branch, links, sha256, output_folder)
-            progress(1.0)
-        else:
-            yield (f"Downloading file{'s' if len(links) > 1 else ''} to `{output_folder}/`")
-            downloader.download_model_files(model, branch, links, sha256, output_folder, progress_bar=progress, threads=4, is_llamacpp=is_llamacpp)
+            progress(1.0, "Verification complete.")
+            yield "File check complete."
+            return
 
-            yield (f"Model successfully saved to `{output_folder}/`.")
-    except:
-        progress(1.0)
-        yield traceback.format_exc().replace('\n', '\n\n')
+        yield ""
+        progress(0.0, "Download starting...")
+
+        def downloader_thread_target():
+            try:
+                downloader.download_model_files(
+                    model, branch, links, sha256, output_folder,
+                    progress_queue=update_queue,
+                    threads=4,
+                    is_llamacpp=is_llamacpp,
+                    specific_file=specific_file
+                )
+                update_queue.put(("COMPLETED", f"Model successfully saved to `{output_folder}/`."))
+            except Exception as e:
+                tb_str = traceback.format_exc().replace('\n', '\n\n')
+                update_queue.put(("ERROR", tb_str))
+
+        download_thread = threading.Thread(target=downloader_thread_target)
+        download_thread.start()
+
+        while True:
+            try:
+                message = update_queue.get(timeout=0.2)
+                if not isinstance(message, tuple) or len(message) != 2:
+                    continue
+
+                msg_identifier, data = message
+                if msg_identifier == "COMPLETED":
+                    progress(1.0, "Download complete!")
+                    yield data
+                    break
+                elif msg_identifier == "ERROR":
+                    progress(0.0, "Error occurred")
+                    yield data
+                    break
+                elif isinstance(msg_identifier, float):
+                    progress_value = msg_identifier
+                    description_str = data
+                    progress(progress_value, f"Downloading: {description_str}")
+            except queue.Empty:
+                if not download_thread.is_alive():
+                    yield "Download process finished."
+                    break
+
+        download_thread.join()
+
+    except Exception as e:
+        progress(0.0)
+        tb_str = traceback.format_exc().replace('\n', '\n\n')
+        yield tb_str
 
 
 def update_truncation_length(current_length, state):
```
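Structurally, the rewritten wrapper is a worker-thread-plus-queue pattern: the blocking download runs off the main thread, pushes `(fraction, filename)` tuples and `"COMPLETED"`/`"ERROR"` sentinels into a `queue.Queue`, and the generator polls the queue so it can keep yielding UI updates. A stripped-down sketch of that pattern without the Gradio specifics (`slow_job` is a stand-in for the real download call):

```python
import queue
import threading
import time


def run_with_progress(job, poll_interval=0.2):
    """Run job(q) in a worker thread and yield its messages as they arrive."""
    q = queue.Queue()
    worker = threading.Thread(target=job, args=(q,))
    worker.start()

    while True:
        try:
            kind, data = q.get(timeout=poll_interval)
            if kind in ("COMPLETED", "ERROR"):
                yield data
                break
            # Anything else is a (progress_fraction, description) tuple
            yield f"progress {kind:.0%}: {data}"
        except queue.Empty:
            if not worker.is_alive():
                # Worker exited without sending a sentinel
                yield "worker finished unexpectedly"
                break

    worker.join()


def slow_job(q):
    # Stand-in for download_model_files(): emit progress, then a sentinel
    for i in range(1, 4):
        time.sleep(0.1)
        q.put((i / 3, "model.bin"))
    q.put(("COMPLETED", "done"))


for msg in run_with_progress(slow_job):
    print(msg)
```

Polling with a short timeout, rather than blocking on the queue, is what lets the loop notice a worker that died without reporting back.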

View file

```diff
@@ -41,5 +41,5 @@ https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
```

View file

```diff
@@ -41,5 +41,5 @@ https://github.com/oobabooga/exllamav3/releases/download/v0.0.3/exllamav3-0.0.3+
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
 https://github.com/turboderp-org/exllamav2/releases/download/v0.3.1/exllamav2-0.3.1-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
```