diff --git a/README.md b/README.md index 0833f9b0..7105ce23 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,8 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). - - [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile). - - Additional quantization libraries like [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually. -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for llama.cpp GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. +- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. - UI that resembles the original ChatGPT style. - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. - Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. @@ -146,14 +144,14 @@ The `requirements*.txt` above contain various wheels precompiled through GitHub For NVIDIA GPU: ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} . For AMD GPU: -ln -s docker/{amd/Dockerfile,intel/docker-compose.yml,.dockerignore} . +ln -s docker/{amd/Dockerfile,amd/docker-compose.yml,.dockerignore} . For Intel GPU: ln -s docker/{intel/Dockerfile,amd/docker-compose.yml,.dockerignore} . For CPU only ln -s docker/{cpu/Dockerfile,cpu/docker-compose.yml,.dockerignore} . 
cp docker/.env.example .env #Create logs/cache dir : -mkdir -p logs cache +mkdir -p user_data/logs user_data/cache # Edit .env and set: # TORCH_CUDA_ARCH_LIST based on your GPU model # APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal) diff --git a/css/main.css b/css/main.css index 20b7869d..888d50c0 100644 --- a/css/main.css +++ b/css/main.css @@ -131,7 +131,7 @@ gradio-app > :first-child { } .header_bar { - box-shadow: 0 0 3px rgba(22 22 22 / 35%); + border-right: var(--input-border-width) solid var(--input-border-color); margin-bottom: 0; overflow-x: scroll; text-wrap: nowrap; @@ -419,6 +419,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-right: 1rem; } +.chat .message .timestamp { + font-size: 0.7em; + display: inline-block; + font-weight: normal; + opacity: 0.7; + margin-left: 5px; +} + .chat-parent.bigchat { flex: 1; } @@ -584,6 +592,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 0.65rem 2.5rem; border: 0; box-shadow: 0; + border-radius: 8px; } #chat-input textarea::placeholder { @@ -603,6 +612,16 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +#chat-input .submit-button { + display: none; +} + +#chat-input .upload-button { + margin-right: 16px; + margin-bottom: 7px; + background: transparent; +} + .chat-input-positioned { max-width: 54rem; left: 50%; @@ -827,7 +846,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-col.bigchat { - padding-bottom: 80px !important; + padding-bottom: 15px !important; } .message-body ol, .message-body ul { @@ -1171,11 +1190,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background-color: var(--light-theme-gray); } -#chat-controls { +.dark #chat-controls { border-left: 1px solid #d9d9d0; } -#past-chats-row { +.dark #past-chats-row { border-right: 1px solid #d9d9d0; } @@ -1236,42 +1255,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: relative; } -.footer-button { +/* New container for the buttons */ +.message-actions { position: absolute; + bottom: -23px; + left: 0; + display: flex; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.footer-button { padding: 0; margin: 0; border: none; border-radius: 3px; cursor: pointer; - opacity: 0; display: flex; align-items: center; - transition: opacity 0.2s; + justify-content: center; } -.footer-button.footer-copy-button { - bottom: -23px; - left: 0; -} - -.footer-button.footer-refresh-button { - bottom: -23px; - left: 25px; -} - -.footer-button.footer-continue-button { - bottom: -23px; - left: 50px; -} - -.footer-button.footer-remove-button { - bottom: -23px; - left: 75px; -} - -.message:hover .footer-button, -.user-message:hover .footer-button, -.assistant-message:hover .footer-button { +.message:hover .message-actions, +.user-message:hover .message-actions, +.assistant-message:hover .message-actions { opacity: 1; } @@ -1362,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { contain: layout; } +.chat .message-body .thinking-content p, +.chat .message-body .thinking-content li { + font-size: 15px !important; +} + /* Animation for opening thinking blocks */ @keyframes fadeIn { from { opacity: 0; } @@ -1399,6 +1412,53 @@ strong { color: #07ff07; } + +.message-attachments { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; +} + +.attachment-box { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 8px; + background: rgb(0 0 0 / 5%); + border-radius: 6px; + border: 1px solid rgb(0 0 0 / 10%); + min-width: 80px; + max-width: 
120px; +} + +.attachment-icon { + margin-bottom: 4px; + color: #555; +} + +.attachment-name { + font-size: 0.8em; + text-align: center; + word-break: break-word; + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +.dark .attachment-box { + background: rgb(255 255 255 / 5%); + border: 1px solid rgb(255 255 255 / 10%); +} + +.dark .attachment-icon { + color: #ccc; +} + + /* --- Message Versioning Styles --- */ .message-versioning-container { @@ -1490,4 +1550,3 @@ strong { .message-versioning-container[hidden] { display: none; -} diff --git a/docker/amd/Dockerfile b/docker/amd/Dockerfile index 66e5863c..c23083f7 100644 --- a/docker/amd/Dockerfile +++ b/docker/amd/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=B LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/amd/docker-compose.yml b/docker/amd/docker-compose.yml index 8866e9ed..a727ca3e 100644 --- a/docker/amd/docker-compose.yml +++ b/docker/amd/docker-compose.yml @@ -41,14 +41,4 @@ services: security_opt: - seccomp=unconfined volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile index cab62442..4a709803 100644 --- a/docker/intel/Dockerfile +++ b/docker/intel/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=D LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime WORKDIR /home/app/text-generation-webui diff --git a/docker/intel/docker-compose.yml b/docker/intel/docker-compose.yml index 78e06698..bb48dd22 100644 --- a/docker/intel/docker-compose.yml +++ b/docker/intel/docker-compose.yml @@ -41,12 +41,4 @@ services: security_opt: - seccomp=unconfined volumes: - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - 
./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data diff --git a/extensions/openai/script.py b/extensions/openai/script.py index 2c98ee78..b6abae20 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -115,13 +115,17 @@ async def openai_completions(request: Request, request_data: CompletionRequest): if request_data.stream: async def generator(): async with streaming_semaphore: - response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) - async for resp in iterate_in_threadpool(response): - disconnected = await request.is_disconnected() - if disconnected: - break + try: + response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) + async for resp in iterate_in_threadpool(response): + disconnected = await request.is_disconnected() + if disconnected: + break - yield {"data": json.dumps(resp)} + yield {"data": json.dumps(resp)} + finally: + stop_everything_event() + return return EventSourceResponse(generator()) # SSE streaming @@ -143,13 +147,17 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion if request_data.stream: async def generator(): async with streaming_semaphore: - response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) - async for resp in iterate_in_threadpool(response): - disconnected = await request.is_disconnected() - if disconnected: - break + try: + response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) + async for resp in iterate_in_threadpool(response): + disconnected = await request.is_disconnected() + if disconnected: + break - yield {"data": json.dumps(resp)} + yield {"data": json.dumps(resp)} + finally: + stop_everything_event() + return return EventSourceResponse(generator()) # SSE streaming diff --git a/js/global_scope_js.js b/js/global_scope_js.js index cff8d3e8..78e83492 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -18,6 +18,37 @@ function copyToClipboard(element) { }); } +function branchHere(element) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + const branchIndexInput = document.getElementById("Branch-index").querySelector("input"); + if (!branchIndexInput) { + console.error("Element with ID 'Branch-index' not found."); + return; + } + const branchButton = document.getElementById("Branch"); + + if (!branchButton) { + console.error("Required element 'Branch' not found."); + return; + } + + branchIndexInput.value = index; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); // 'change' might also work + branchIndexInput.dispatchEvent(event); + + branchButton.click(); // Gradio will now pick up the 'index' + +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/js/main.js b/js/main.js index 68575568..9db116a3 100644 --- a/js/main.js +++ b/js/main.js @@ -132,8 +132,6 @@ targetElement.addEventListener("scroll", function() { // Create a MutationObserver instance const observer = new MutationObserver(function(mutations) { - updateCssProperties(); - if (targetElement.classList.contains("_generating")) { 
typing.parentNode.classList.add("visible-dots"); document.getElementById("stop").style.display = "flex"; @@ -446,32 +444,6 @@ const chatInput = document.querySelector("#chat-input textarea"); // Variables to store current dimensions let currentChatInputHeight = chatInput.clientHeight; -// Update chat layout based on chat and input dimensions -function updateCssProperties() { - const chatInputHeight = chatInput.clientHeight; - - // Check if the chat container is visible - if (chatContainer.clientHeight > 0) { - // Adjust scrollTop based on input height change - if (chatInputHeight !== currentChatInputHeight) { - const deltaHeight = chatInputHeight - currentChatInputHeight; - if (!isScrolled && deltaHeight < 0) { - chatContainer.scrollTop = chatContainer.scrollHeight; - } else { - chatContainer.scrollTop += deltaHeight; - } - - currentChatInputHeight = chatInputHeight; - } - } -} - -// Observe textarea size changes and call update function -new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-input textarea")); - -// Handle changes in window size -window.addEventListener("resize", updateCssProperties); - //------------------------------------------------ // Focus on the rename text area when it becomes visible //------------------------------------------------ diff --git a/modules/chat.py b/modules/chat.py index 30c2c29a..17b75d90 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -37,6 +37,30 @@ def strftime_now(format): return datetime.now().strftime(format) +def get_current_timestamp(): + """Returns the current time in 24-hour format""" + return datetime.now().strftime('%b %d, %Y %H:%M') + + +def update_message_metadata(metadata_dict, role, index, **fields): + """ + Updates or adds metadata fields for a specific message. 
+ + Args: + metadata_dict: The metadata dictionary + role: The role (user, assistant, etc) + index: The message index + **fields: Arbitrary metadata fields to update/add + """ + key = f"{role}_{index}" + if key not in metadata_dict: + metadata_dict[key] = {} + + # Update with provided fields + for field_name, field_value in fields.items(): + metadata_dict[key][field_name] = field_value + + jinja_env = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, @@ -133,7 +157,9 @@ def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) also_return_rows = kwargs.get('also_return_rows', False) - history = kwargs.get('history', state['history'])['internal'] + history_data = kwargs.get('history', state['history']) + history = history_data['internal'] + metadata = history_data.get('metadata', {}) # Templates chat_template_str = state['chat_template_str'] @@ -172,11 +198,13 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for entry in reversed(history): + for i, entry in enumerate(reversed(history)): user_msg = entry[0].strip() assistant_msg = entry[1].strip() tool_msg = entry[2].strip() if len(entry) > 2 else '' + row_idx = len(history) - i - 1 + if tool_msg: messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) @@ -184,10 +212,40 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: - messages.insert(insert_pos, {"role": "user", "content": user_msg}) + # Check for user message attachments in metadata + user_key = f"user_{row_idx}" + enhanced_user_msg = user_msg + + # Add attachment content if present + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" + + messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = user_input.strip() if user_input and not impersonate and not _continue: + # For the current user input being processed, check if we need to add attachments + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" + messages.append({"role": "user", "content": user_input}) def make_prompt(messages): @@ -256,7 +314,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Resort to truncating the user input else: - user_message = messages[-1]['content'] # Bisect the truncation point @@ -341,12 +398,111 @@ def get_stopping_strings(state): return result +def add_message_version(history, row_idx, is_current=True): + """Add the current message as a version in the 
history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + if row_idx >= len(history['internal']) or not history['internal'][row_idx][1].strip(): + return # Skip if row doesn't exist or message is empty + + key = f"assistant_{row_idx}" + + # Initialize metadata structures if needed + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "versions" not in history['metadata'][key]: + history['metadata'][key]["versions"] = [] + + # Add current message as a version + history['metadata'][key]["versions"].append({ + "content": history['internal'][row_idx][1], + "visible_content": history['visible'][row_idx][1], + "timestamp": get_current_timestamp() + }) + + # Update index if this is the current version + if is_current: + history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 + + +def add_message_attachment(history, row_idx, file_path, is_user=True): + """Add a file attachment to a message in history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{'user' if is_user else 'assistant'}_{row_idx}" + + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + # Get file info using pathlib + path = Path(file_path) + filename = path.name + file_extension = path.suffix.lower() + + try: + # Handle different file types + if file_extension == '.pdf': + # Process PDF file + content = extract_pdf_text(path) + file_type = "application/pdf" + else: + # Default handling for text files + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + file_type = "text/plain" + + # Add attachment + attachment = { + "name": filename, + "type": file_type, + "content": content, + } + + history['metadata'][key]["attachments"].append(attachment) + return content # Return the content for reuse + except Exception as e: + logger.error(f"Error processing attachment {filename}: {e}") + return None + + +def extract_pdf_text(pdf_path): + """Extract text from a PDF file""" + import PyPDF2 + + text = "" + try: + with open(pdf_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text += page.extract_text() + "\n\n" + + return text.strip() + except Exception as e: + logger.error(f"Error extracting text from PDF: {e}") + return f"[Error extracting PDF text: {str(e)}]" + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): + # Handle dict format with text and files + files = [] + if isinstance(text, dict): + files = text.get('files', []) + text = text.get('text', '') + history = state['history'] output = copy.deepcopy(history) output = apply_extensions('history', output) state = apply_extensions('state', state) + # Initialize metadata if not present + if 'metadata' not in output: + output['metadata'] = {} + visible_text = None stopping_strings = get_stopping_strings(state) is_stream = state['stream'] @@ -355,44 +511,70 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if not (regenerate or _continue): visible_text = html.escape(text) + # Process file attachments and store in metadata + row_idx = len(output['internal']) + + # Add attachments to metadata only, not modifying the message text + for file_path in files: + add_message_attachment(output, 
row_idx, file_path, is_user=True) + # Apply extensions text, visible_text = apply_extensions('chat_input', text, visible_text, state) text = apply_extensions('input', text, state, is_chat=True) + # Current row index output['internal'].append([text, '']) output['visible'].append([visible_text, '']) + # Add metadata with timestamp + update_message_metadata(output['metadata'], "user", row_idx, timestamp=get_current_timestamp()) # *Is typing...* if loading_message: yield { 'visible': output['visible'][:-1] + [[output['visible'][-1][0], shared.processing_message]], - 'internal': output['internal'] + 'internal': output['internal'], + 'metadata': output['metadata'] } else: text, visible_text = output['internal'][-1][0], output['visible'][-1][0] if regenerate: + row_idx = len(output['internal']) - 1 + + # Store the existing response as a version before regenerating + add_message_version(output, row_idx, is_current=False) + if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, shared.processing_message]], - 'internal': output['internal'][:-1] + [[text, '']] + 'internal': output['internal'][:-1] + [[text, '']], + 'metadata': output['metadata'] } elif _continue: last_reply = [output['internal'][-1][1], output['visible'][-1][1]] if loading_message: yield { 'visible': output['visible'][:-1] + [[visible_text, last_reply[1] + '...']], - 'internal': output['internal'] + 'internal': output['internal'], + 'metadata': output['metadata'] } # Generate the prompt kwargs = { '_continue': _continue, - 'history': output if _continue else {k: v[:-1] for k, v in output.items()} + 'history': output if _continue else { + k: (v[:-1] if k in ['internal', 'visible'] else v) + for k, v in output.items() + } } + prompt = apply_extensions('custom_generate_chat_prompt', text, state, **kwargs) if prompt is None: prompt = generate_chat_prompt(text, state, **kwargs) + # Add timestamp for assistant's response at the start of generation + row_idx = len(output['internal']) - 1 + update_message_metadata(output['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) + # Generate reply = None for j, reply in enumerate(generate_reply(prompt, state, stopping_strings=stopping_strings, is_chat=True, for_ui=for_ui)): @@ -421,6 +603,11 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess if is_stream: yield output + # Add the newly generated response as a version (only for regeneration) + if regenerate: + row_idx = len(output['internal']) - 1 + add_message_version(output, row_idx, is_current=True) + output['visible'][-1][1] = apply_extensions('output', output['visible'][-1][1], state, is_chat=True) yield output @@ -508,9 +695,19 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): def remove_last_message(history): + if 'metadata' not in history: + history['metadata'] = {} + if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>': + row_idx = len(history['internal']) - 1 last = history['visible'].pop() history['internal'].pop() + + # Remove metadata directly by known keys + if f"user_{row_idx}" in history['metadata']: + del history['metadata'][f"user_{row_idx}"] + if f"assistant_{row_idx}" in history['metadata']: + del history['metadata'][f"assistant_{row_idx}"] else: last = ['', ''] @@ -527,30 +724,54 @@ def send_last_reply_to_input(history): def replace_last_reply(text, state): history = state['history'] + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + if 
len(text.strip()) == 0: return history elif len(history['visible']) > 0: + row_idx = len(history['internal']) - 1 history['visible'][-1][1] = html.escape(text) history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) + update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) return history def send_dummy_message(text, state): history = state['history'] + + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + + row_idx = len(history['internal']) history['visible'].append([html.escape(text), '']) history['internal'].append([apply_extensions('input', text, state, is_chat=True), '']) + update_message_metadata(history['metadata'], "user", row_idx, timestamp=get_current_timestamp()) + return history def send_dummy_reply(text, state): history = state['history'] + + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + if len(history['visible']) > 0 and not history['visible'][-1][1] == '': + row_idx = len(history['internal']) history['visible'].append(['', '']) history['internal'].append(['', '']) + # We don't need to add system metadata + row_idx = len(history['internal']) - 1 history['visible'][-1][1] = html.escape(text) history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) + update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) + return history @@ -560,7 +781,8 @@ def redraw_html(history, name1, name2, mode, style, character, reset_cache=False def start_new_chat(state): mode = state['mode'] - history = {'internal': [], 'visible': []} + # Initialize with empty metadata dictionary + history = {'internal': [], 'visible': [], 'metadata': {}} if mode != 'instruct': greeting = replace_character_names(state['greeting'], state['name1'], state['name2']) @@ -568,6 +790,9 @@ def start_new_chat(state): history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]] + # Add timestamp for assistant's greeting + update_message_metadata(history['metadata'], "assistant", 0, timestamp=get_current_timestamp()) + unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, unique_id, state['character_menu'], state['mode']) @@ -749,6 +974,16 @@ def load_history(unique_id, character, mode): 'visible': f['data_visible'] } + # Add metadata if it doesn't exist + if 'metadata' not in history: + history['metadata'] = {} + # Add placeholder timestamps for existing messages + for i, (user_msg, asst_msg) in enumerate(history['internal']): + if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>': + update_message_metadata(history['metadata'], "user", i, timestamp="") + if asst_msg: + update_message_metadata(history['metadata'], "assistant", i, timestamp="") + return history @@ -764,6 +999,16 @@ def load_history_json(file, history): 'visible': f['data_visible'] } + # Add metadata if it doesn't exist + if 'metadata' not in history: + history['metadata'] = {} + # Add placeholder timestamps + for i, (user_msg, asst_msg) in enumerate(history['internal']): + if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>': + update_message_metadata(history['metadata'], "user", i, timestamp="") + if asst_msg: + update_message_metadata(history['metadata'], "assistant", i, timestamp="") + return history except: return history @@ -1093,7 +1338,7 @@ def handle_replace_last_reply_click(text, state): 
message_versioning.append_message_version(history, state, is_bot=True) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_message_click(text, state): @@ -1102,7 +1347,7 @@ def handle_send_dummy_message_click(text, state): message_versioning.append_message_version(history, state, is_bot=False) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_reply_click(text, state): @@ -1111,7 +1356,7 @@ def handle_send_dummy_reply_click(text, state): message_versioning.append_message_version(history, state, is_bot=True) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_remove_last_click(state): @@ -1119,7 +1364,7 @@ def handle_remove_last_click(state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, last_input] + return [history, html, {"text": last_input, "files": []}] def handle_unique_id_select(state): @@ -1175,7 +1420,13 @@ def handle_delete_chat_confirm_click(state): def handle_branch_chat_click(state): - history = state['history'] + branch_from_index = state['branch_index'] + if branch_from_index == -1: + history = state['history'] + else: + history = state['history'] + history['visible'] = history['visible'][:branch_from_index + 1] + history['internal'] = history['internal'][:branch_from_index + 1] new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, new_unique_id, state['character_menu'], state['mode']) @@ -1186,7 +1437,7 @@ def handle_branch_chat_click(state): past_chats_update = gr.update(choices=histories, value=new_unique_id) - return [history, html, past_chats_update] + return [history, html, past_chats_update, -1] def handle_rename_chat_click(): @@ -1328,7 +1579,7 @@ def handle_your_picture_change(picture, state): def handle_send_instruction_click(state): state['mode'] = 'instruct' - state['history'] = {'internal': [], 'visible': []} + state['history'] = {'internal': [], 'visible': [], 'metadata': {}} output = generate_chat_prompt("Input", state) diff --git a/modules/html_generator.py b/modules/html_generator.py index 689ab58a..66dc4827 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -169,11 +169,7 @@ def convert_to_markdown(string, message_id=None): thinking_block = f'''
- - - - - + {info_svg_small} {title_text}
{thinking_html}
@@ -339,11 +335,59 @@ copy_svg = '''''' continue_svg = '''''' remove_svg = '''''' +branch_svg = '''''' +info_svg = '''''' +info_svg_small = '''''' +attachment_svg = '''''' copy_button = f'' +branch_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' +info_button = f'' + + +def format_message_timestamp(history, role, index): + """Get a formatted timestamp HTML span for a message if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'): + timestamp = history['metadata'][key]['timestamp'] + return f"{timestamp}" + + return "" + + +def format_message_attachments(history, role, index): + """Get formatted HTML for message attachments if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]: + attachments = history['metadata'][key]['attachments'] + if not attachments: + return "" + + attachments_html = '
<div class="message-attachments">'
+        for attachment in attachments:
+            attachments_html += (
+                f'<div class="attachment-box">'
+                f'<div class="attachment-icon">{attachment_svg}</div>'
+                f'<div class="attachment-name">{html.escape(attachment["name"])}</div>'
+                f'</div>'
+            )
+        attachments_html += '</div>'
+        return attachments_html
+
+    return ""
+
+
+def actions_html(history, i, info_message=""):
+    return (f'<div class="message-actions">'
+            f'{copy_button}'
+            f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
+            f'{continue_button if i == len(history["visible"]) - 1 else ""}'
+            f'{remove_button if i == len(history["visible"]) - 1 else ""}'
+            f'{branch_button}'
+            f'{info_message}'
+            f'</div>
') def generate_instruct_html(history): @@ -356,6 +400,27 @@ def generate_instruct_html(history): versioning_nav_user = message_versioning.get_message_version_nav_elements(i, 0) versioning_nav_bot = message_versioning.get_message_version_nav_elements(i, 1) + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages selected_class = " selected-message" if message_versioning.is_message_selected(i, 0) else "" output += ( @@ -364,8 +429,8 @@ def generate_instruct_html(history): f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
<div class="text">'
                f'<div class="message-body">{converted_visible[0]}</div>'
-               f'{copy_button}'
-               f'{versioning_nav_user}'
+               f'{user_attachments}'
+               f'<div class="message-actions">{copy_button}{info_message_user}</div>'
                f'</div>'
                f'</div>'
            )
@@ -373,15 +438,12 @@
        selected_class = " selected-message" if message_versioning.is_message_selected(i, 1) else ""
        output += (
            f'<div class="assistant-message{selected_class}" '
+           f'data-raw="{html.escape(row_internal[1], quote=True)}"'
+           f'data-index={i}>'
            f'<div class="text">'
            f'<div class="message-body">{converted_visible[1]}</div>'
-           f'{copy_button}'
-           f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
-           f'{continue_button if i == len(history["visible"]) - 1 else ""}'
-           f'{remove_button if i == len(history["visible"]) - 1 else ""}'
-           f'{versioning_nav_bot}'
+           f'{assistant_attachments}'
+           f'{actions_html(history, i, info_message_assistant)}'
            f'</div>'
            f'</div>
' ) @@ -408,10 +470,17 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= row_visible = history['visible'][i] row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] - versioning_nav_user = message_versioning.get_message_version_nav_elements(i, 0) versioning_nav_bot = message_versioning.get_message_version_nav_elements(i, 1) + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages selected_class = " selected-message" if message_versioning.is_message_selected(i, 0) else "" output += ( @@ -420,28 +489,25 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= f'data-raw="{html.escape(row_internal[0], quote=True)}">' f'
<div class="circle-you">{img_me}</div>'
                f'<div class="text">'
-               f'<div class="username">{name1}</div>'
+               f'<div class="username">{name1}{user_timestamp}</div>'
                f'<div class="message-body">{converted_visible[0]}</div>'
-               f'{copy_button}'
-               f'{versioning_nav_user}'
+               f'{user_attachments}'
+               f'<div class="message-actions">{copy_button}</div>'
                f'</div>
' f'' ) selected_class = " selected-message" if message_versioning.is_message_selected(i, 1) else "" output += ( - f'
' + f'
' f'
<div class="circle-bot">{img_bot}</div>'
                f'<div class="text">'
-               f'<div class="username">{name2}</div>'
+               f'<div class="username">{name2}{assistant_timestamp}</div>'
                f'<div class="message-body">{converted_visible[1]}</div>'
-               f'{copy_button}'
-               f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
-               f'{continue_button if i == len(history["visible"]) - 1 else ""}'
-               f'{remove_button if i == len(history["visible"]) - 1 else ""}'
-               f'{versioning_nav_bot}'
+               f'{assistant_attachments}'
+               f'{actions_html(history, i)}'
                f'</div>'
                f'</div>
' ) @@ -457,20 +523,40 @@ def generate_chat_html(history, name1, name2, reset_cache=False): row_visible = history['visible'][i] row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] - versioning_nav_user = message_versioning.get_message_version_nav_elements(i, 0) versioning_nav_bot = message_versioning.get_message_version_nav_elements(i, 1) + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages selected_class = " selected-message" if message_versioning.is_message_selected(i, 0) else "" output += ( f'
' f'
' f'
<div class="message-body">{converted_visible[0]}</div>'
-               f'{copy_button}'
-               f'{versioning_nav_user}'
+               f'{user_attachments}'
+               f'<div class="message-actions">{copy_button}{info_message_user}</div>'
                f'</div>'
                f'</div>
' ) @@ -478,15 +564,12 @@ def generate_chat_html(history, name1, name2, reset_cache=False): selected_class = " selected-message" if message_versioning.is_message_selected(i, 1) else "" output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
<div class="text-bot">'
                f'<div class="message-body">{converted_visible[1]}</div>'
-               f'{copy_button}'
-               f'{refresh_button if i == len(history["visible"]) - 1 else ""}'
-               f'{continue_button if i == len(history["visible"]) - 1 else ""}'
-               f'{remove_button if i == len(history["visible"]) - 1 else ""}'
-               f'{versioning_nav_bot}'
+               f'{assistant_attachments}'
+               f'{actions_html(history, i, info_message_assistant)}'
                f'</div>'
                f'</div>
' ) diff --git a/modules/loaders.py b/modules/loaders.py index 79a7a4a3..6fbd2198 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -90,11 +90,6 @@ loaders_and_params = OrderedDict({ 'ctx_size_draft', 'speculative_decoding_accordion', ], - 'HQQ': [ - 'hqq_backend', - 'trust_remote_code', - 'no_use_fast', - ], 'TensorRT-LLM': [ 'ctx_size', 'cpp_runner', @@ -158,7 +153,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'HQQ': transformers_samplers(), 'ExLlamav3_HF': { 'temperature', 'dynatemp_low', diff --git a/modules/models.py b/modules/models.py index 9ecee803..4218d58c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,7 +21,6 @@ def load_model(model_name, loader=None): 'ExLlamav3_HF': ExLlamav3_HF_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, - 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -102,21 +101,6 @@ def ExLlamav2_loader(model_name): return model, tokenizer -def HQQ_loader(model_name): - try: - from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.models.hf.base import AutoHQQHFModel - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'hqq'. Please install it manually following the instructions in the HQQ GitHub repository.") - - logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") - - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - model = AutoHQQHFModel.from_quantized(str(model_dir)) - HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) - return model - - def TensorRT_LLM_loader(model_name): try: from modules.tensorrt_llm import TensorRTLLMModel diff --git a/modules/models_settings.py b/modules/models_settings.py index 6b9493ca..df5a8e8d 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -2,7 +2,7 @@ import functools import json import re import subprocess -from math import exp +from math import floor from pathlib import Path import gradio as gr @@ -154,10 +154,11 @@ def get_model_metadata(model): for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): for k in settings[pat]: + new_k = k if k == 'n_gpu_layers': - k = 'gpu_layers' + new_k = 'gpu_layers' - model_settings[k] = settings[pat][k] + model_settings[new_k] = settings[pat][k] # Load instruction template if defined by name rather than by value if model_settings['instruction_template'] != 'Custom (obtained from model metadata)': @@ -182,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None): loader = 'ExLlamav3_HF' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' - elif re.match(r'.*-hqq', model_name.lower()): - return 'HQQ' else: loader = 'Transformers' @@ -331,8 +330,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): n_layers = None n_kv_heads = None embedding_dim = None - context_length = None - feed_forward_dim = None for key, value in metadata.items(): if key.endswith('.block_count'): @@ -341,10 +338,6 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): n_kv_heads = value elif key.endswith('.embedding_length'): embedding_dim = value - elif key.endswith('.context_length'): - context_length = value - elif key.endswith('.feed_forward_length'): - feed_forward_dim = value if gpu_layers > n_layers: gpu_layers = n_layers @@ -359,22 +352,16 @@ def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): # Derived features size_per_layer = size_in_mb / max(n_layers, 1e-6) - context_per_layer = context_length / 
max(n_layers, 1e-6) - ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6) kv_cache_factor = n_kv_heads * cache_type * ctx_size - - # Helper function for smaller - def smaller(x, y): - return 1 if x < y else 0 + embedding_per_context = embedding_dim / ctx_size # Calculate VRAM using the model # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/ vram = ( - (size_per_layer - 21.19195204848197) - * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845)) - + 0.0006621544775632052 * context_per_layer - + 3.34664386576376e-05 * kv_cache_factor - ) * (1.363306170123392 + gpu_layers) + 1255.163594536052 + (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor) + * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632))) + + 1516.522943869404 + ) return vram @@ -451,7 +438,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage """ if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): - vram_info = "
Estimated VRAM to load the model:" + vram_info = "
Estimated VRAM to load the model:
" if for_ui: return (vram_info, gr.update()) if auto_adjust else vram_info else: @@ -485,7 +472,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True available_vram = get_nvidia_vram(return_free=return_free) if available_vram > 0: - tolerance = 906 + tolerance = 577 while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: current_layers -= 1 @@ -493,7 +480,7 @@ def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) if for_ui: - vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB" + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB
" if auto_adjust: return vram_info, gr.update(value=current_layers, maximum=max_layers) else: diff --git a/modules/shared.py b/modules/shared.py index 4e0a20db..d2305f30 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -47,6 +47,7 @@ settings = { 'max_new_tokens_max': 4096, 'prompt_lookup_num_tokens': 0, 'max_tokens_second': 0, + 'max_updates_second': 12, 'auto_max_new_tokens': True, 'ban_eos_token': False, 'add_bos_token': True, @@ -86,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -151,10 +152,6 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# HQQ -group = parser.add_argument_group('HQQ') -group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') - # TensorRT-LLM group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') @@ -262,8 +259,6 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']: return 'ExLlamav3_HF' - elif name in ['hqq']: - return 'HQQ' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: return 'TensorRT-LLM' diff --git a/modules/text_generation.py b/modules/text_generation.py index 00b9275a..962311df 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -65,39 +65,41 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap all_stop_strings += st shared.stop_everything = False + last_update = -1 reply = '' is_stream = state['stream'] if len(all_stop_strings) > 0 and not state['stream']: state = copy.deepcopy(state) state['stream'] = True + min_update_interval = 0 + if state.get('max_updates_second', 0) > 0: + min_update_interval = 1 / state['max_updates_second'] + # Generate - last_update = -1 - latency_threshold = 1 / 1000 for reply in generate_func(question, original_question, state, stopping_strings, is_chat=is_chat): - cur_time = time.monotonic() reply, stop_found = apply_stopping_strings(reply, all_stop_strings) if escape_html: reply = html.escape(reply) if is_stream: + cur_time = time.time() + # Limit number of tokens/second to make text readable in real time if state['max_tokens_second'] > 0: diff = 1 / state['max_tokens_second'] - (cur_time - last_update) if diff > 0: time.sleep(diff) - last_update = 
time.monotonic() + last_update = time.time() yield reply # Limit updates to avoid lag in the Gradio UI # API updates are not limited else: - # If 'generate_func' takes less than 0.001 seconds to yield the next token - # (equivalent to more than 1000 tok/s), assume that the UI is lagging behind and skip yielding - if (cur_time - last_update) > latency_threshold: + if cur_time - last_update > min_update_interval: + last_update = cur_time yield reply - last_update = time.monotonic() if stop_found or (state['max_tokens_second'] > 0 and shared.stop_everything): break diff --git a/modules/ui.py b/modules/ui.py index eeb6ce92..5e8fa14e 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -109,7 +109,6 @@ def list_model_elements(): 'threads', 'threads_batch', 'batch_size', - 'hqq_backend', 'ctx_size', 'cache_type', 'tensor_split', @@ -192,6 +191,7 @@ def list_interface_input_elements(): 'max_new_tokens', 'prompt_lookup_num_tokens', 'max_tokens_second', + 'max_updates_second', 'do_sample', 'dynamic_temperature', 'temperature_last', @@ -210,6 +210,7 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', + 'branch_index' ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index d2954a4b..502b19a0 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -24,7 +24,8 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(elem_id='past-chats-buttons'): - shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu) + shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True) shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -47,13 +48,13 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') @@ -79,8 +80,8 @@ def create_ui(): shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') with gr.Row(): - shared.gradio['send-chat-to-default'] = gr.Button('Send to default') - shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook') + shared.gradio['send-chat-to-default'] = gr.Button('Send to Default') + shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook') with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): @@ -195,7 +196,7 @@ def create_event_handlers(): shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -203,7 +204,7 @@ def create_event_handlers(): shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -271,7 +272,7 @@ def create_event_handlers(): shared.gradio['branch_chat'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False) shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index 5b7dfdd8..862b3893 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -39,11 +39,9 @@ def create_ui(): with gr.Row(): with gr.Column(): shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') - shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. 
⚠️ Lower this value if you can\'t load the model.') + shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - with gr.Column(): shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') @@ -312,7 +310,7 @@ def get_initial_vram_info(): for_ui=True ) - return "
Estimated VRAM to load the model:"
+ return "Estimated VRAM to load the model:
" def get_initial_gpu_layers_max(): diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 84f9fbfc..733d0901 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -71,6 +71,8 @@ def create_ui(default_preset): shared.gradio['max_new_tokens'] = gr.Slider(minimum=shared.settings['max_new_tokens_min'], maximum=shared.settings['max_new_tokens_max'], value=shared.settings['max_new_tokens'], step=1, label='max_new_tokens', info='⚠️ Setting this too high can cause prompt truncation.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') + shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') + with gr.Column(): with gr.Row(): with gr.Column(): diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index af5f7d8a..afb5f9d4 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -13,6 +13,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -30,8 +31,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 4e011989..46c33034 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,7 +30,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index a3bd1350..c8e94cbd 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,7 +30,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 363365bf..dc403ae2 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,7 +30,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and 
platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 2843fed2..5c643c4c 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,8 +30,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index bd7c4a4f..ccabea84 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,5 +30,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index b5aa1cf7..7e9da47f 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,5 +30,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 89947cbe..fdf5cd0e 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -13,6 +13,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -30,8 +31,8 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git 
a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2e631bf0..22d39ded 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -12,6 +12,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 79959398..ec9bafc6 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index ca16e4c7..025a737e 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 18e1c506..32644e87 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,6 +16,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_14_0_arm64.whl; 
platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 693f4712..bd5c1d9b 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index 8635d11e..51f2b7d9 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index e844596e..aad6bf5a 100644 --- a/requirements/portable/requirements_noavx2.txt +++ 
b/requirements/portable/requirements_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 6f9566ba..4c055426 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 9b7435d1..3d98d1b0 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 513b7a15..f954b8d2 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -4,6 +4,7 @@ jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +16,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.12.0/llama_cpp_binaries-0.12.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and 
platform_machine == "x86_64" diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index db481e84..ce0f77e1 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -18,6 +18,7 @@ max_new_tokens_min: 1 max_new_tokens_max: 4096 prompt_lookup_num_tokens: 0 max_tokens_second: 0 +max_updates_second: 12 auto_max_new_tokens: true ban_eos_token: false add_bos_token: true
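
For context on the PyPDF2==3.0.1 pin added to every requirements file above: it is presumably there to extract text from the new chat file attachments. A minimal sketch of that API (the function and file path below are illustrative, not taken from the patch):

# Illustrative only -- not code from this patch.
# PyPDF2 3.x exposes PdfReader; page text is pulled with extract_text().
from PyPDF2 import PdfReader

def extract_pdf_text(path):
    reader = PdfReader(path)
    # Join the text of every page; pages without extractable text yield "".
    return "\n".join((page.extract_text() or "") for page in reader.pages)

# Example usage: text = extract_pdf_text("user_data/cache/attachment.pdf")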
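
The new max_updates_second slider and settings key cap how often the UI re-renders while a reply is streaming. A generic sketch of that kind of throttle (not the webui's actual implementation; names are illustrative):

# Illustrative only -- a generic rate limiter, not code from this patch.
import time

def throttle_stream(chunks, max_updates_second=12):
    """Yield the accumulated text at most max_updates_second times per second."""
    min_interval = 1.0 / max_updates_second if max_updates_second > 0 else 0.0
    last_emit = 0.0
    text = ""
    for chunk in chunks:
        text += chunk
        now = time.monotonic()
        if now - last_emit >= min_interval:
            last_emit = now
            yield text
    yield text  # always emit the final, complete text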