diff --git a/.github/workflows/build-portable-release-cuda.yml b/.github/workflows/build-portable-release-cuda.yml index fb9e61b0..571cbac0 100644 --- a/.github/workflows/build-portable-release-cuda.yml +++ b/.github/workflows/build-portable-release-cuda.yml @@ -102,6 +102,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables CUDA_VERSION="${{ matrix.cuda }}" diff --git a/.github/workflows/build-portable-release-vulkan.yml b/.github/workflows/build-portable-release-vulkan.yml index 8de29791..4e88d4d9 100644 --- a/.github/workflows/build-portable-release-vulkan.yml +++ b/.github/workflows/build-portable-release-vulkan.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/.github/workflows/build-portable-release.yml b/.github/workflows/build-portable-release.yml index bdf96cec..6910ce2c 100644 --- a/.github/workflows/build-portable-release.yml +++ b/.github/workflows/build-portable-release.yml @@ -101,6 +101,8 @@ jobs: shell: bash run: | rm -rf .git cmd* update_wizard* Colab-TextGen-GPU.ipynb docker + allowed=("character_bias" "gallery" "openai" "sd_api_pictures") + find extensions/ -mindepth 1 -maxdepth 1 -type d | grep -v -E "$(printf '%s|' "${allowed[@]}" | sed 's/|$//')" | xargs rm -rf # Define common variables AVX_SUPPORT="${{ matrix.avx }}" diff --git a/README.md b/README.md index 4b541b9e..55df33d2 100644 --- a/README.md +++ b/README.md @@ -12,18 +12,20 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github. ## Features -- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). - - [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is also supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile). - - Additional quantization libraries like [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) can be used with the Transformers loader if you install them manually. -- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for llama.cpp GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory that doesn't interfere with your system environment. -- UI that resembles the original ChatGPT style. -- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. -- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`. -- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. 
You can send formatted conversations from the Chat tab to these. -- Multiple sampling parameters and generation options for sophisticated text generation control. -- Switch between different models easily in the UI without restarting, with fine control over settings. -- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). +- Supports multiple text generation backends in one UI/API, including [llama.cpp](https://github.com/ggerganov/llama.cpp), [Transformers](https://github.com/huggingface/transformers), [ExLlamaV3](https://github.com/turboderp-org/exllamav3), [ExLlamaV2](https://github.com/turboderp-org/exllamav2), and [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the latter via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile)). +- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory. - 100% offline and private, with zero telemetry, external resources, or remote update requests. +- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats. +- **File attachments**: Upload text files and PDF documents to talk about their contents. +- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation. +- Aesthetic UI with dark and light themes. +- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters. +- Edit messages, navigate between message versions, and branch conversations at any point. +- Multiple sampling parameters and generation options for sophisticated text generation control. +- Switch between different models in the UI without restarting. +- Automatic GPU layers for GGUF models (on NVIDIA GPUs). +- Free-form text generation in the Default/Notebook tabs without being limited to chat turns. +- OpenAI-compatible API with Chat and Completions endpoints, including tool-calling support – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples). - Extension support, with numerous built-in and user-contributed extensions available. See the [wiki](https://github.com/oobabooga/text-generation-webui/wiki/07-%E2%80%90-Extensions) and [extensions directory](https://github.com/oobabooga/text-generation-webui-extensions) for details. ## How to install @@ -44,7 +46,7 @@ Download from: https://github.com/oobabooga/text-generation-webui/releases To restart the web UI later, just run the same `start_` script. If you need to reinstall, delete the `installer_files` folder created during setup and run the script again. -You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, `update_wizard_macos.sh`, or `update_wizard_wsl.bat`. +You can use command-line flags, like `./start_linux.sh --help`, or add them to `user_data/CMD_FLAGS.txt` (such as `--api` to enable API use). To update the project, run `update_wizard_linux.sh`, `update_wizard_windows.bat`, or `update_wizard_macos.sh`.
@@ -55,12 +57,12 @@ Setup details and information about installing manually The script uses Miniconda to set up a Conda environment in the `installer_files` folder. -If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, `cmd_macos.sh`, or `cmd_wsl.bat`. +If you ever need to install something manually in the `installer_files` environment, you can launch an interactive shell using the cmd script: `cmd_linux.sh`, `cmd_windows.bat`, or `cmd_macos.sh`. * There is no need to run any of those scripts (`start_`, `update_wizard_`, or `cmd_`) as admin/root. * To install the requirements for extensions, you can use the `extensions_reqs` script for your OS. At the end, this script will install the main requirements for the project to make sure that they take precedence in case of version conflicts. * For additional instructions about AMD and WSL setup, consult [the documentation](https://github.com/oobabooga/text-generation-webui/wiki). -* For automated installation, you can use the `GPU_CHOICE`, `USE_CUDA118`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. +* For automated installation, you can use the `GPU_CHOICE`, `LAUNCH_AFTER_INSTALL`, and `INSTALL_EXTENSIONS` environment variables. For instance: `GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh`. ### Manual installation using Conda @@ -90,7 +92,7 @@ conda activate textgen |--------|---------|---------| | Linux/WSL | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Linux/WSL | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu` | -| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.1` | +| Linux | AMD | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/rocm6.2.4` | | MacOS + MPS | Any | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | | Windows | NVIDIA | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu124` | | Windows | CPU only | `pip3 install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0` | @@ -146,14 +148,14 @@ The `requirements*.txt` above contain various wheels precompiled through GitHub For NVIDIA GPU: ln -s docker/{nvidia/Dockerfile,nvidia/docker-compose.yml,.dockerignore} . For AMD GPU: -ln -s docker/{amd/Dockerfile,intel/docker-compose.yml,.dockerignore} . +ln -s docker/{amd/Dockerfile,amd/docker-compose.yml,.dockerignore} . For Intel GPU: ln -s docker/{intel/Dockerfile,amd/docker-compose.yml,.dockerignore} . For CPU only ln -s docker/{cpu/Dockerfile,cpu/docker-compose.yml,.dockerignore} . 
cp docker/.env.example .env #Create logs/cache dir : -mkdir -p logs cache +mkdir -p user_data/logs user_data/cache # Edit .env and set: # TORCH_CUDA_ARCH_LIST based on your GPU model # APP_RUNTIME_GID your host user's group id (run `id -g` in a terminal) @@ -187,13 +189,13 @@ usage: server.py [-h] [--multi-user] [--character CHARACTER] [--model MODEL] [-- [--extensions EXTENSIONS [EXTENSIONS ...]] [--verbose] [--idle-timeout IDLE_TIMEOUT] [--loader LOADER] [--cpu] [--cpu-memory CPU_MEMORY] [--disk] [--disk-cache-dir DISK_CACHE_DIR] [--load-in-8bit] [--bf16] [--no-cache] [--trust-remote-code] [--force-safetensors] [--no_use_fast] [--use_flash_attention_2] [--use_eager_attention] [--torch-compile] [--load-in-4bit] [--use_double_quant] [--compute_dtype COMPUTE_DTYPE] [--quant_type QUANT_TYPE] [--flash-attn] [--threads THREADS] [--threads-batch THREADS_BATCH] [--batch-size BATCH_SIZE] [--no-mmap] - [--mlock] [--n-gpu-layers N_GPU_LAYERS] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] + [--mlock] [--gpu-layers N] [--tensor-split TENSOR_SPLIT] [--numa] [--no-kv-offload] [--row-split] [--extra-flags EXTRA_FLAGS] [--streaming-llm] [--ctx-size N] [--cache-type N] [--model-draft MODEL_DRAFT] [--draft-max DRAFT_MAX] [--gpu-layers-draft GPU_LAYERS_DRAFT] [--device-draft DEVICE_DRAFT] [--ctx-size-draft CTX_SIZE_DRAFT] [--gpu-split GPU_SPLIT] - [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--hqq-backend HQQ_BACKEND] [--cpp-runner] - [--cache_type CACHE_TYPE] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] - [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] - [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--subpath SUBPATH] [--old-colors] [--api] [--public-api] - [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] + [--autosplit] [--cfg-cache] [--no_flash_attn] [--no_xformers] [--no_sdpa] [--num_experts_per_token N] [--enable_tp] [--cpp-runner] [--deepspeed] [--nvme-offload-dir NVME_OFFLOAD_DIR] + [--local_rank LOCAL_RANK] [--alpha_value ALPHA_VALUE] [--rope_freq_base ROPE_FREQ_BASE] [--compress_pos_emb COMPRESS_POS_EMB] [--listen] [--listen-port LISTEN_PORT] + [--listen-host LISTEN_HOST] [--share] [--auto-launch] [--gradio-auth GRADIO_AUTH] [--gradio-auth-path GRADIO_AUTH_PATH] [--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] + [--subpath SUBPATH] [--old-colors] [--portable] [--api] [--public-api] [--public-api-id PUBLIC_API_ID] [--api-port API_PORT] [--api-key API_KEY] [--admin-key ADMIN_KEY] + [--api-enable-ipv6] [--api-disable-ipv4] [--nowebui] Text generation web UI @@ -215,7 +217,7 @@ Basic settings: --idle-timeout IDLE_TIMEOUT Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again. Model loader: - --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, + --loader LOADER Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM. Transformers/Accelerate: @@ -246,16 +248,18 @@ llama.cpp: --batch-size BATCH_SIZE Maximum number of prompt tokens to batch together when calling llama_eval. --no-mmap Prevent mmap from being used. --mlock Force the system to keep the model in RAM. - --n-gpu-layers N_GPU_LAYERS Number of layers to offload to the GPU. + --gpu-layers N, --n-gpu-layers N Number of layers to offload to the GPU. --tensor-split TENSOR_SPLIT Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40. --numa Activate NUMA task allocation for llama.cpp. --no-kv-offload Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance. --row-split Split the model by rows across GPUs. This may improve multi-gpu performance. - --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1;flag2;flag3=value3". Example: "override-tensor=exps=CPU" + --extra-flags EXTRA_FLAGS Extra flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU" --streaming-llm Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed. -Context and cache management: +Context and cache: --ctx-size N, --n_ctx N, --max_seq_len N Context size in tokens. + --cache-type N, --cache_type N KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits + separately, e.g. q4_q8). Speculative decoding: --model-draft MODEL_DRAFT Path to the draft model for speculative decoding. @@ -274,15 +278,9 @@ ExLlamaV2: --num_experts_per_token N Number of experts to use for generation. Applies to MoE models like Mixtral. --enable_tp Enable Tensor Parallelism (TP) in ExLlamaV2. -HQQ: - --hqq-backend HQQ_BACKEND Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN. - TensorRT-LLM: --cpp-runner Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn't support streaming yet. -Cache: - --cache_type CACHE_TYPE KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4. - DeepSpeed: --deepspeed Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration. --nvme-offload-dir NVME_OFFLOAD_DIR DeepSpeed: Directory to use for ZeRO-3 NVME offloading. @@ -305,6 +303,7 @@ Gradio: --ssl-certfile SSL_CERTFILE The path to the SSL certificate cert file. --subpath SUBPATH Customize the subpath for gradio, use with reverse proxy --old-colors Use the legacy Gradio colors, before the December/2024 update. + --portable Hide features not available in portable mode like training. API: --api Enable the API extension. 
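The README and flag listing above advertise the new tool-calling support on the OpenAI-compatible API, and the `docs/12 - OpenAI API.md` hunk further down in this diff adds a `curl` request/response example for it. As a rough companion sketch (not taken from the patch itself), the same request can be issued from Python with `requests`, assuming the server was started with `--api` on the default port 5000 and a tool-capable model is loaded; the model-independent endpoint and response shape below follow that documentation, while the tool definition and prompt are placeholders.

```python
import json

import requests

# Assumed default API address; adjust if --api-port or --listen were changed.
URL = "http://127.0.0.1:5000/v1/chat/completions"

payload = {
    "messages": [{"role": "user", "content": "What time is it currently in New York City?"}],
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_current_time",
            "description": "Get current time in a specific timezone",
            "parameters": {
                "type": "object",
                "required": ["timezone"],
                "properties": {"timezone": {"type": "string", "description": "IANA timezone name"}},
            },
        },
    }],
}

# Non-streaming request; the prompt is formatted server-side from the model's Jinja2 template.
choice = requests.post(URL, json=payload, timeout=120).json()["choices"][0]

if choice["finish_reason"] == "tool_calls":
    # In the documented sample response, "tool_calls" sits alongside "message"
    # and each call's "arguments" field is a JSON-encoded string.
    for call in choice["tool_calls"]:
        print(call["function"]["name"], json.loads(call["function"]["arguments"]))
else:
    print(choice["message"]["content"])
```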
diff --git a/css/chat_style-Dark.css b/css/chat_style-Dark.css index 368a2a16..6a4784cc 100644 --- a/css/chat_style-Dark.css +++ b/css/chat_style-Dark.css @@ -1,7 +1,9 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 22px; + padding-top: 6px; font-size: 18px; font-family: Roboto, Arial, sans-serif; /* Modern font */ line-height: 1.5; @@ -102,6 +104,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-TheEncrypted777.css b/css/chat_style-TheEncrypted777.css index 6404f41d..fbd47072 100644 --- a/css/chat_style-TheEncrypted777.css +++ b/css/chat_style-TheEncrypted777.css @@ -2,8 +2,10 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 28px; + padding-bottom: 21px; + padding-top: 7px; font-size: 18px; font-family: 'Noto Sans', Arial, sans-serif; line-height: 1.428571429; @@ -100,6 +102,7 @@ @media screen and (width <= 688px) { .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); padding-bottom: 25px; font-size: 15px; diff --git a/css/chat_style-cai-chat-square.css b/css/chat_style-cai-chat-square.css index 854fff60..291a1209 100644 --- a/css/chat_style-cai-chat-square.css +++ b/css/chat_style-cai-chat-square.css @@ -16,6 +16,7 @@ } .message { - padding-bottom: 2em; + padding-bottom: 1.5em; + padding-top: 0.5em; grid-template-columns: 70px minmax(0, 1fr); } diff --git a/css/chat_style-cai-chat.css b/css/chat_style-cai-chat.css index 93276bd3..b06b1269 100644 --- a/css/chat_style-cai-chat.css +++ b/css/chat_style-cai-chat.css @@ -1,7 +1,9 @@ .message { display: grid; + align-items: start; grid-template-columns: 60px minmax(0, 1fr); - padding-bottom: 2em; + padding-bottom: 1.5em; + padding-top: 0.5em; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 22.5px !important; diff --git a/css/chat_style-messenger.css b/css/chat_style-messenger.css index f0fd1578..65af5f7a 100644 --- a/css/chat_style-messenger.css +++ b/css/chat_style-messenger.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 22px; + padding-top: 3px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/chat_style-wpp.css b/css/chat_style-wpp.css index 30ca61f3..353201c2 100644 --- a/css/chat_style-wpp.css +++ b/css/chat_style-wpp.css @@ -1,5 +1,6 @@ .message { - padding-bottom: 25px; + padding-bottom: 22px; + padding-top: 3px; font-size: 15px; font-family: 'Noto Sans', Helvetica, Arial, sans-serif; line-height: 1.428571429; diff --git a/css/html_instruct_style.css b/css/html_instruct_style.css index 4613b380..6ad250aa 100644 --- a/css/html_instruct_style.css +++ b/css/html_instruct_style.css @@ -8,10 +8,6 @@ padding-top: 0 !important; } -.chat > .messages > :last-child { - margin-bottom: 1.7rem !important; -} - .chat .message-body p, .chat .message-body li { font-size: 1rem !important; line-height: 28px !important; @@ -46,7 +42,7 @@ } .chat .user-message { - background: #f5f5f5; + background: #f3f4f6; padding: 1.5rem 1rem; padding-bottom: 2rem; border-radius: 0; @@ -61,16 +57,16 @@ } .dark .chat .user-message { - background: transparent; + background: var(--light-gray); } .dark .chat .assistant-message { - background: var(--light-gray); + background: transparent; } .chat .user-message 
.text, .chat .assistant-message .text { - max-width: 645px; + max-width: 700px; margin-left: auto; margin-right: auto; } diff --git a/css/main.css b/css/main.css index d6e5ac83..967d94ed 100644 --- a/css/main.css +++ b/css/main.css @@ -1,11 +1,11 @@ :root { --darker-gray: #202123; - --dark-gray: #343541; - --light-gray: #444654; - --light-theme-gray: #f5f5f5; + --dark-gray: #2A2B32; + --light-gray: #373943; + --light-theme-gray: #f9fbff; --border-color-dark: #525252; --header-width: 112px; - --selected-item-color-dark: #32333e; + --selected-item-color-dark: #2E2F38; } @font-face { @@ -131,7 +131,7 @@ gradio-app > :first-child { } .header_bar { - box-shadow: 0 0 3px rgba(22 22 22 / 35%); + border-right: var(--input-border-width) solid var(--input-border-color); margin-bottom: 0; overflow-x: scroll; text-wrap: nowrap; @@ -265,7 +265,7 @@ button { .dark .pretty_scrollbar::-webkit-scrollbar-thumb, .dark .pretty_scrollbar::-webkit-scrollbar-thumb:hover { - background: #ccc; + background: rgb(255 255 255 / 10%); border-radius: 10px; } @@ -389,8 +389,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { .chat { margin-left: auto; margin-right: auto; - min-height: var(--chat-height); - overflow-y: auto; + flex: 1; + overflow-y: hidden; display: flex; flex-direction: column; word-break: break-word; @@ -401,10 +401,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .chat-parent { - height: calc(100dvh - 98px - var(--input-delta)); + flex: 1; overflow: auto !important; border-radius: 0 !important; - margin-bottom: var(--input-delta) !important; } .chat-parent .prose { @@ -420,14 +419,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding-right: 1rem; } +.chat .message .timestamp { + font-size: 0.7em; + display: inline-block; + font-weight: normal; + opacity: 0.7; + margin-left: 5px; +} + .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta)) !important; - margin-bottom: var(--input-delta) !important; + flex: 1; } .chat > .messages { display: flex; flex-direction: column; + min-height: calc(100vh - 102px); } .chat > .messages > :first-child { @@ -546,7 +553,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border-radius: 5px; font-size: 82%; padding: 1px 3px; - background: white !important; + background: #f3f4f6 !important; color: #1f2328; } @@ -560,18 +567,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { padding: 15px; } -.message-body :not(pre) > code::before { - content: "`"; -} - -.message-body :not(pre) > code::after { - content: "`"; -} - .message-body :not(pre) > code { white-space: normal !important; font-weight: bold; - font-family: unset; + font-size: 0.95em; + font-family: Menlo,"Roboto Mono","Courier New",Courier,monospace,Inter,sans-serif; + padding: .15rem .3rem; + background-color: #ececec; +} + +.dark .message-body :not(pre) > code { + background-color: rgb(255 255 255 / 10%); } #chat-input { @@ -582,7 +588,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input textarea { + background: #f3f4f6; padding: 0.65rem 2.5rem; + border: 0; + box-shadow: 0; + border-radius: 8px; } #chat-input textarea::placeholder { @@ -602,9 +612,17 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { display: none; } +#chat-input .submit-button { + display: none; +} + +#chat-input .upload-button { + margin-right: 16px; + margin-bottom: 7px; + background: transparent; +} + .chat-input-positioned { - position: absolute; - bottom: 0; max-width: 54rem; left: 50%; transform: translateX(-50%); @@ -744,7 +762,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 
.hover-menu button { width: 100%; - background: transparent !important; + background: white !important; border-radius: 0 !important; justify-content: space-between; margin: 0 !important; @@ -760,7 +778,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .hover-menu button:hover { - background: var(--button-secondary-background-fill-hover) !important; + background: #dbeafe !important; +} + +.dark .hover-menu button:hover { + background: var(--selected-item-color-dark) !important; } .transparent-substring { @@ -789,6 +811,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-container { + display: flex; + flex-direction: column; min-width: 0 !important; } @@ -798,9 +822,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-input-row { - padding-bottom: 1.5em; - padding-left: 1rem; - padding-right: 1rem; + padding: 1rem; + padding-top: 0; } #chat-input-row.bigchat { @@ -808,27 +831,22 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } #chat-col { - padding-bottom: 100px; + height: 100dvh; + display: flex; + flex-direction: column; + padding-bottom: 0; + gap: 0; } @media screen and (width <= 924px) { #chat-col { - padding-bottom: 100px; margin-top: 32px; - position: relative; /* Ensure positioning for the pseudo-element */ - } - - .chat-parent { - height: calc(100dvh - 98px - var(--input-delta) - 32px); - } - - .chat-parent.bigchat { - height: calc(100dvh - 98px - var(--input-delta) - 32px) !important; + height: calc(100dvh - 32px); } } #chat-col.bigchat { - padding-bottom: 80px !important; + padding-bottom: 15px !important; } .message-body ol, .message-body ul { @@ -985,6 +1003,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { cursor: pointer; } +#past-chats .selected, +#past-chats label:hover { + background-color: #dbeafe !important; +} + #past-chats-buttons, #delete-chat-row, #rename-row { @@ -993,7 +1016,6 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { gap: 9px; } - #past-chats-row, #chat-controls { width: 260px; @@ -1111,12 +1133,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { color: #9ca3af; } -.dark .hover-menu { - background-color: var(--darker-gray); -} - .dark .hover-menu button { border-color: var(--border-color-primary); + background-color: var(--darker-gray) !important; } .dark #chat-controls, @@ -1125,8 +1144,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { border: 0 !important; } -.dark #past-chats .selected, -.dark #past-chats label:hover { +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats .selected, +.dark gradio-app .gradio-container.gradio-container-4-37-2 .contain #past-chats label:hover { background-color: var(--selected-item-color-dark) !important; } @@ -1163,7 +1182,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { } .header_bar button.selected { - background: #E0E0E0; + background: #dbeafe; } #chat-controls, @@ -1171,11 +1190,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { background-color: var(--light-theme-gray); } -#chat-controls { +.dark #chat-controls { border-left: 1px solid #d9d9d0; } -#past-chats-row { +.dark #past-chats-row { border-right: 1px solid #d9d9d0; } @@ -1236,42 +1255,31 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { position: relative; } -.footer-button { +/* New container for the buttons */ +.message-actions { position: absolute; + bottom: -23px; + left: 0; + display: flex; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.footer-button { padding: 0; margin: 0; border: none; border-radius: 3px; cursor: pointer; - opacity: 0; display: flex; 
align-items: center; - transition: opacity 0.2s; + justify-content: center; } -.footer-button.footer-copy-button { - bottom: -23px; - left: 0; -} - -.footer-button.footer-refresh-button { - bottom: -23px; - left: 25px; -} - -.footer-button.footer-continue-button { - bottom: -23px; - left: 50px; -} - -.footer-button.footer-remove-button { - bottom: -23px; - left: 75px; -} - -.message:hover .footer-button, -.user-message:hover .footer-button, -.assistant-message:hover .footer-button { +.message:hover .message-actions, +.user-message:hover .message-actions, +.assistant-message:hover .message-actions { opacity: 1; } @@ -1362,6 +1370,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { contain: layout; } +.chat .message-body .thinking-content p, +.chat .message-body .thinking-content li { + font-size: 15px !important; +} + /* Animation for opening thinking blocks */ @keyframes fadeIn { from { opacity: 0; } @@ -1382,3 +1395,163 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* { 50% { opacity: 1; } 100% { opacity: 0.6; } } + +strong { + font-weight: bold; +} + +.min.svelte-1ybaih5 { + min-height: 0; +} + +#vram-info .value { + color: #008d00; +} + +.dark #vram-info .value { + color: #07ff07; +} + +.message-attachments { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 8px; + padding-bottom: 6px; +} + +.attachment-box { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + padding: 8px; + background: rgb(0 0 0 / 5%); + border-radius: 6px; + border: 1px solid rgb(0 0 0 / 10%); + min-width: 80px; + max-width: 120px; +} + +.attachment-icon { + margin-bottom: 4px; + color: #555; +} + +.attachment-name { + font-size: 0.8em; + text-align: center; + word-break: break-word; + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +.dark .attachment-box { + background: rgb(255 255 255 / 5%); + border: 1px solid rgb(255 255 255 / 10%); +} + +.dark .attachment-icon { + color: #ccc; +} + +/* Message Editing Styles */ +.editing-textarea { + width: 100%; + min-height: 200px; + max-height: 65vh; + padding: 10px; + border-radius: 5px; + border: 1px solid #ccc; + background-color: var(--light-theme-gray); + font-family: inherit; + font-size: inherit; + resize: vertical; +} + +.dark .editing-textarea { + border: 1px solid var(--border-color-dark); + background-color: var(--darker-gray); +} + +.editing-textarea:focus { + outline: none; + border-color: var(--selected-item-color-dark); +} + +.edit-controls-container { + margin-top: 0; + display: flex; + gap: 8px; + padding-bottom: 8px; +} + +.edit-control-button { + padding: 6px 12px; + border: 1px solid #ccc; + border-radius: 4px; + cursor: pointer; + background-color: #f8f9fa; + color: #212529; + font-size: 12px; + margin: 0; +} + +.dark .edit-control-button { + border: 1px solid var(--border-color-dark); + background-color: var(--light-gray); + color: #efefef; +} + +/* --- Simple Version Navigation --- */ +.version-navigation { + position: absolute; + bottom: -23px; + right: 0; + display: flex; + align-items: center; + gap: 5px; + opacity: 0; + transition: opacity 0.2s; +} + +.message:hover .version-navigation, +.user-message:hover .version-navigation, +.assistant-message:hover .version-navigation { + opacity: 1; +} + +.version-nav-button { + padding: 2px 6px; + font-size: 12px; + min-width: auto; +} + +.version-nav-button[disabled] { + opacity: 0.3; + cursor: not-allowed; +} + +.version-position { + font-size: 11px; + color: 
currentcolor; + font-family: monospace; + min-width: 35px; + text-align: center; + opacity: 0.8; + user-select: none; +} + +.token-display { + font-family: monospace; + font-size: 13px; + color: var(--body-text-color-subdued); + margin-top: 4px; +} + +button:focus { + outline: none; +} diff --git a/docker/amd/Dockerfile b/docker/amd/Dockerfile index 66e5863c..c23083f7 100644 --- a/docker/amd/Dockerfile +++ b/docker/amd/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=B LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/amd/docker-compose.yml b/docker/amd/docker-compose.yml index 4709ae94..a727ca3e 100644 --- a/docker/amd/docker-compose.yml +++ b/docker/amd/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: @@ -41,14 +41,4 @@ services: security_opt: - seccomp=unconfined volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data diff --git a/docker/cpu/docker-compose.yml b/docker/cpu/docker-compose.yml index c9d415ae..9aba314a 100644 --- a/docker/cpu/docker-compose.yml +++ b/docker/cpu/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: diff --git a/docker/intel/Dockerfile b/docker/intel/Dockerfile index cab62442..4a709803 100644 --- a/docker/intel/Dockerfile +++ b/docker/intel/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=D LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} # set umask to ensure group read / write at runtime WORKDIR /home/app/text-generation-webui diff --git a/docker/intel/docker-compose.yml b/docker/intel/docker-compose.yml index 31e9dde0..bb48dd22 100644 --- a/docker/intel/docker-compose.yml +++ b/docker/intel/docker-compose.yml @@ -22,7 +22,7 @@ services: 
TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: @@ -41,12 +41,4 @@ services: security_opt: - seccomp=unconfined volumes: - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data diff --git a/docker/nvidia/Dockerfile b/docker/nvidia/Dockerfile index 900a4329..82594a26 100644 --- a/docker/nvidia/Dockerfile +++ b/docker/nvidia/Dockerfile @@ -14,7 +14,7 @@ WORKDIR /home/app/ RUN git clone https://github.com/oobabooga/text-generation-webui.git WORKDIR /home/app/text-generation-webui RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose -COPY CMD_FLAGS.txt /home/app/text-generation-webui/ +COPY /user_data/CMD_FLAGS.txt /home/app/text-generation-webui/user_data EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005} WORKDIR /home/app/text-generation-webui # set umask to ensure group read / write at runtime diff --git a/docker/nvidia/docker-compose.yml b/docker/nvidia/docker-compose.yml index 835dd838..23d5cacc 100644 --- a/docker/nvidia/docker-compose.yml +++ b/docker/nvidia/docker-compose.yml @@ -22,7 +22,7 @@ services: TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST:-7.5} BUILD_EXTENSIONS: ${BUILD_EXTENSIONS:-} APP_GID: ${APP_GID:-6972} - APP_UID: ${APP_UID-6972} + APP_UID: ${APP_UID:-6972} env_file: .env user: "${APP_RUNTIME_UID:-6972}:${APP_RUNTIME_GID:-6972}" ports: @@ -31,17 +31,7 @@ services: stdin_open: true tty: true volumes: - - ./cache:/home/app/text-generation-webui/cache - - ./characters:/home/app/text-generation-webui/characters - - ./extensions:/home/app/text-generation-webui/extensions - - ./loras:/home/app/text-generation-webui/loras - - ./logs:/home/app/text-generation-webui/logs - - ./models:/home/app/text-generation-webui/models - - ./presets:/home/app/text-generation-webui/presets - - ./prompts:/home/app/text-generation-webui/prompts - - ./softprompts:/home/app/text-generation-webui/softprompts - - ./training:/home/app/text-generation-webui/training - - ./cloudflared:/etc/cloudflared + - ./user_data:/home/app/text-generation-webui/user_data deploy: resources: reservations: diff --git a/docs/12 - OpenAI API.md b/docs/12 - OpenAI API.md index 364c6b09..db9befed 100644 --- a/docs/12 - OpenAI API.md +++ b/docs/12 - OpenAI API.md @@ -257,6 +257,85 @@ headers = { in any of the examples above. +#### Tool/Function Calling Example + +You need to use a model with tools support. The prompt will be automatically formatted using the model's Jinja2 template. + +Request: + +``` +curl http://127.0.0.1:5000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "What time is it currently in New York City?" 
+ } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Get current time in a specific timezone", + "parameters": { + "type": "object", + "required": ["timezone"], + "properties": { + "timezone": { + "type": "string", + "description": "IANA timezone name (e.g., America/New_York, Europe/London). Use Europe/Berlin as local timezone if no timezone provided by the user." + } + } + } + } + } + ] + }' +``` + +Sample response: + +``` +{ + "id": "chatcmpl-1746532051477984256", + "object": "chat.completion", + "created": 1746532051, + "model": "qwen2.5-coder-14b-instruct-q4_k_m.gguf", + "choices": [ + { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "content": "```xml\n<tool_call>\n{\n \"name\": \"get_current_time\",\n \"arguments\": {\n \"timezone\": \"America/New_York\"\n }\n}\n</tool_call>\n```" + }, + "tool_calls": [ + { + "type": "function", + "function": { + "name": "get_current_time", + "arguments": "{\"timezone\": \"America/New_York\"}" + }, + "id": "call_52ij07mh", + "index": "0" + } + ] + } + ], + "usage": { + "prompt_tokens": 224, + "completion_tokens": 38, + "total_tokens": 262 + } +} +``` + ### Environment variables The following environment variables can be used (they take precedence over everything else): diff --git a/extensions/openai/completions.py b/extensions/openai/completions.py index 75e2cc11..5181b18b 100644 --- a/extensions/openai/completions.py +++ b/extensions/openai/completions.py @@ -1,16 +1,14 @@ -import base64 import copy -import re +import json import time from collections import deque -from io import BytesIO -import requests import tiktoken -from PIL import Image +from pydantic import ValidationError from extensions.openai.errors import InvalidRequestError -from extensions.openai.utils import debug_msg +from extensions.openai.typing import ToolDefinition +from extensions.openai.utils import debug_msg, getToolCallId, parseToolCall from modules import shared from modules.chat import ( generate_chat_prompt, @@ -96,72 +94,32 @@ def convert_history(history): user_input_last = True system_message = "" - # Multimodal: convert OpenAI format to multimodal extension format - if any('content' in entry and isinstance(entry['content'], list) for entry in history): - new_history = [] - for entry in history: - if isinstance(entry['content'], list): - for item in entry['content']: - if not isinstance(item, dict): - continue - - image_url = None - content = None - if item['type'] == 'image_url' and isinstance(item['image_url'], dict): - image_url = item['image_url']['url'] - elif item['type'] == 'text' and isinstance(item['text'], str): - content = item['text'] - if image_url: - new_history.append({"image_url": image_url, "role": "user"}) - if content: - new_history.append({"content": content, "role": "user"}) - else: - new_history.append(entry) - - history = new_history - for entry in history: - if "image_url" in entry: - image_url = entry['image_url'] - if "base64" in image_url: - image_url = re.sub('^data:image/.+;base64,', '', image_url) - img = Image.open(BytesIO(base64.b64decode(image_url))) - else: - try: - my_res = requests.get(image_url) - img = Image.open(BytesIO(my_res.content)) - except Exception: - raise 'Image cannot be loaded from the URL!'
- - buffered = BytesIO() - if img.mode in ("RGBA", "P"): - img = img.convert("RGB") - - img.save(buffered, format="JPEG") - img_str = base64.b64encode(buffered.getvalue()).decode('utf-8') - content = f'' - else: - content = entry["content"] - + content = entry["content"] role = entry["role"] if role == "user": user_input = content user_input_last = True if current_message: - chat_dialogue.append([current_message, '']) + chat_dialogue.append([current_message, '', '']) current_message = "" current_message = content elif role == "assistant": + if "tool_calls" in entry and isinstance(entry["tool_calls"], list) and len(entry["tool_calls"]) > 0 and content.strip() == "": + continue # skip tool calls current_reply = content user_input_last = False if current_message: - chat_dialogue.append([current_message, current_reply]) + chat_dialogue.append([current_message, current_reply, '']) current_message = "" current_reply = "" else: - chat_dialogue.append(['', current_reply]) + chat_dialogue.append(['', current_reply, '']) + elif role == "tool": + user_input_last = False + chat_dialogue.append(['', '', content]) elif role == "system": system_message += f"\n{content}" if system_message else content @@ -181,6 +139,10 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if 'messages' not in body: raise InvalidRequestError(message="messages is required", param='messages') + tools = None + if 'tools' in body and body['tools'] is not None and isinstance(body['tools'], list) and len(body['tools']) > 0: + tools = validateTools(body['tools']) # raises InvalidRequestError if validation fails + messages = body['messages'] for m in messages: if 'role' not in m: @@ -238,6 +200,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p 'custom_system_message': custom_system_message, 'chat_template_str': chat_template_str, 'chat-instruct_command': chat_instruct_command, + 'tools': tools, 'history': history, 'stream': stream }) @@ -250,7 +213,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p requested_model = generate_params.pop('model') logprob_proc = generate_params.pop('logprob_proc', None) - def chat_streaming_chunk(content): + def chat_streaming_chunk(content, chunk_tool_calls=None): # begin streaming chunk = { "id": cmpl_id, @@ -260,7 +223,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": None, - "delta": {'role': 'assistant', 'content': content}, + "delta": {'role': 'assistant', 'content': content, 'tool_calls': chunk_tool_calls}, }], } @@ -269,6 +232,7 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p chunk[resp_list][0]["logprobs"] = {'top_logprobs': [top_logprobs]} # else: # chunk[resp_list][0]["logprobs"] = None + return chunk # generate reply ####################################### @@ -277,8 +241,6 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p yield {'prompt': prompt} return - debug_msg({'prompt': prompt, 'generate_params': generate_params}) - if stream: yield chat_streaming_chunk('') @@ -288,8 +250,23 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p answer = '' seen_content = '' + tool_calls = [] + end_last_tool_call = 0 + supported_tools = [x["function"]["name"] for x in tools] if tools is not None else None + for a in generator: answer = a['internal'][-1][1] + + if supported_tools is not None: + tool_call = 
parseToolCall(answer[end_last_tool_call:], supported_tools) if len(answer) > 0 else [] + if len(tool_call) > 0: + for tc in tool_call: + tc["id"] = getToolCallId() + tc["index"] = str(len(tool_calls)) + tc["function"]["arguments"] = json.dumps(tc["function"]["arguments"]) + tool_calls.append(tc) + end_last_tool_call = len(answer) + if stream: len_seen = len(seen_content) new_content = answer[len_seen:] @@ -297,18 +274,25 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p if not new_content or chr(0xfffd) in new_content: # partial unicode character, don't send it yet. continue - seen_content = answer chunk = chat_streaming_chunk(new_content) + + seen_content = answer yield chunk + # stop generation if tool_calls were generated previously + if len(tool_calls) > 0: + break + token_count = len(encode(prompt)[0]) completion_token_count = len(encode(answer)[0]) stop_reason = "stop" + if len(tool_calls) > 0: + stop_reason = "tool_calls" if token_count + completion_token_count >= generate_params['truncation_length'] or completion_token_count >= generate_params['max_new_tokens']: stop_reason = "length" if stream: - chunk = chat_streaming_chunk('') + chunk = chat_streaming_chunk('', tool_calls) chunk[resp_list][0]['finish_reason'] = stop_reason chunk['usage'] = { "prompt_tokens": token_count, @@ -326,7 +310,8 @@ def chat_completions_common(body: dict, is_legacy: bool = False, stream=False, p resp_list: [{ "index": 0, "finish_reason": stop_reason, - "message": {"role": "assistant", "content": answer} + "message": {"role": "assistant", "content": answer}, + "tool_calls": tool_calls }], "usage": { "prompt_tokens": token_count, @@ -515,3 +500,19 @@ def completions(body: dict, is_legacy: bool = False) -> dict: def stream_completions(body: dict, is_legacy: bool = False): for resp in completions_common(body, is_legacy, stream=True): yield resp + + +def validateTools(tools: list[dict]): + # Validate each tool definition in the JSON array + valid_tools = None + for idx in range(len(tools)): + tool = tools[idx] + try: + tool_definition = ToolDefinition(**tool) + if valid_tools is None: + valid_tools = [] + valid_tools.append(tool) + except ValidationError: + raise InvalidRequestError(message=f"Invalid tool specification at index {idx}.", param='tools') + + return valid_tools diff --git a/extensions/openai/script.py b/extensions/openai/script.py index a995da9d..24bcd69d 100644 --- a/extensions/openai/script.py +++ b/extensions/openai/script.py @@ -14,6 +14,7 @@ from fastapi.requests import Request from fastapi.responses import JSONResponse from pydub import AudioSegment from sse_starlette import EventSourceResponse +from starlette.concurrency import iterate_in_threadpool import extensions.openai.completions as OAIcompletions import extensions.openai.images as OAIimages @@ -114,18 +115,28 @@ async def openai_completions(request: Request, request_data: CompletionRequest): if request_data.stream: async def generator(): async with streaming_semaphore: - response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: - disconnected = await request.is_disconnected() - if disconnected: - break + try: + response = OAIcompletions.stream_completions(to_dict(request_data), is_legacy=is_legacy) + async for resp in iterate_in_threadpool(response): + disconnected = await request.is_disconnected() + if disconnected: + break - yield {"data": json.dumps(resp)} + yield {"data": json.dumps(resp)} + finally: + stop_everything_event() + 
response.close() + return return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) @@ -137,18 +148,28 @@ async def openai_chat_completions(request: Request, request_data: ChatCompletion if request_data.stream: async def generator(): async with streaming_semaphore: - response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) - for resp in response: - disconnected = await request.is_disconnected() - if disconnected: - break + try: + response = OAIcompletions.stream_chat_completions(to_dict(request_data), is_legacy=is_legacy) + async for resp in iterate_in_threadpool(response): + disconnected = await request.is_disconnected() + if disconnected: + break - yield {"data": json.dumps(resp)} + yield {"data": json.dumps(resp)} + finally: + stop_everything_event() + response.close() + return return EventSourceResponse(generator()) # SSE streaming else: - response = OAIcompletions.chat_completions(to_dict(request_data), is_legacy=is_legacy) + response = await asyncio.to_thread( + OAIcompletions.chat_completions, + to_dict(request_data), + is_legacy=is_legacy + ) + return JSONResponse(response) @@ -436,7 +457,7 @@ def run_server(): # Start server logging.getLogger("uvicorn.error").propagate = False - uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile) + uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile, access_log=False) def setup(): diff --git a/extensions/openai/typing.py b/extensions/openai/typing.py index b1979cbc..b28ebb4e 100644 --- a/extensions/openai/typing.py +++ b/extensions/openai/typing.py @@ -1,8 +1,8 @@ import json import time -from typing import Dict, List +from typing import Dict, List, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, validator class GenerationOptions(BaseModel): @@ -54,6 +54,48 @@ class GenerationOptions(BaseModel): grammar_string: str = "" +class ToolDefinition(BaseModel): + function: 'ToolFunction' + type: str + + +class ToolFunction(BaseModel): + description: str + name: str + parameters: 'ToolParameters' + + +class ToolParameters(BaseModel): + properties: Optional[Dict[str, 'ToolProperty']] = None + required: Optional[list[str]] = None + type: str + description: Optional[str] = None + + +class ToolProperty(BaseModel): + description: Optional[str] = None + type: Optional[str] = None # we are faced with definitions like anyOf, e.g. 
{'type': 'function', 'function': {'name': 'git_create_branch', 'description': 'Creates a new branch from an optional base branch', 'parameters': {'type': 'object', 'properties': {'repo_path': {'title': 'Repo Path', 'type': 'string'}, 'branch_name': {'title': 'Branch Name', 'type': 'string'}, 'base_branch': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'default': None, 'title': 'Base Branch'}}, 'required': ['repo_path', 'branch_name'], 'title': 'GitCreateBranch'}}} + + +class FunctionCall(BaseModel): + name: str + arguments: Optional[str] = None + parameters: Optional[str] = None + + @validator('arguments', allow_reuse=True) + def checkPropertyArgsOrParams(cls, v, values, **kwargs): + if not v and not values.get('parameters'): + raise ValueError("At least one of 'arguments' or 'parameters' must be provided as property in FunctionCall type") + return v + + +class ToolCall(BaseModel): + id: str + index: int + type: str + function: FunctionCall + + class CompletionRequestParams(BaseModel): model: str | None = Field(default=None, description="Unused parameter. To change the model, use the /v1/internal/model/load endpoint.") prompt: str | List[str] @@ -92,6 +134,7 @@ class ChatCompletionRequestParams(BaseModel): frequency_penalty: float | None = 0 function_call: str | dict | None = Field(default=None, description="Unused parameter.") functions: List[dict] | None = Field(default=None, description="Unused parameter.") + tools: List[dict] | None = Field(default=None, description="Tools signatures passed via MCP.") logit_bias: dict | None = None max_tokens: int | None = None n: int | None = Field(default=1, description="Unused parameter.") diff --git a/extensions/openai/utils.py b/extensions/openai/utils.py index 2b414769..9a1de2e7 100644 --- a/extensions/openai/utils.py +++ b/extensions/openai/utils.py @@ -1,5 +1,8 @@ import base64 +import json import os +import random +import re import time import traceback from typing import Callable, Optional @@ -52,3 +55,94 @@ def _start_cloudflared(port: int, tunnel_id: str, max_attempts: int = 3, on_star time.sleep(3) raise Exception('Could not start cloudflared.') + + +def getToolCallId() -> str: + letter_bytes = "abcdefghijklmnopqrstuvwxyz0123456789" + b = [random.choice(letter_bytes) for _ in range(8)] + return "call_" + "".join(b).lower() + + +def checkAndSanitizeToolCallCandidate(candidate_dict: dict, tool_names: list[str]): + # check if property 'function' exists and is a dictionary, otherwise adapt dict + if 'function' not in candidate_dict and 'name' in candidate_dict and isinstance(candidate_dict['name'], str): + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], str): + candidate_dict['name'] = candidate_dict['function'] + del candidate_dict['function'] + candidate_dict = {"type": "function", "function": candidate_dict} + if 'function' in candidate_dict and isinstance(candidate_dict['function'], dict): + # check if 'name' exists within 'function' and is part of known tools + if 'name' in candidate_dict['function'] and candidate_dict['function']['name'] in tool_names: + candidate_dict["type"] = "function" # ensure required property 'type' exists and has the right value + # map property 'parameters' used by some older models to 'arguments' + if "arguments" not in candidate_dict["function"] and "parameters" in candidate_dict["function"]: + candidate_dict["function"]["arguments"] = candidate_dict["function"]["parameters"] + del 
candidate_dict["function"]["parameters"] + return candidate_dict + return None + + +def parseToolCall(answer: str, tool_names: list[str]): + matches = [] + + # abort on very short answers to save computation cycles + if len(answer) < 10: + return matches + + # Define the regex pattern to find the JSON content wrapped in XML-like tags (such as <tool_call>) observed from various models + patterns = [r"(```[^\n]*)\n(.*?)```", r"<([^>]+)>(.*?)</\1>"] + + for pattern in patterns: + for match in re.finditer(pattern, answer, re.DOTALL): + # print(match.group(2)) + if match.group(2) is None: + continue + # remove backtick wraps if present + candidate = re.sub(r"^```(json|xml|python[^\n]*)\n", "", match.group(2).strip()) + candidate = re.sub(r"```$", "", candidate.strip()) + # unwrap inner tags + candidate = re.sub(pattern, r"\2", candidate.strip(), flags=re.DOTALL) + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + + candidates = [] + try: + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + except json.JSONDecodeError: + # Ignore invalid JSON silently + continue + + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + + # last resort if nothing has been mapped: LLM might have produced plain json tool call without xml-like tags + if len(matches) == 0: + try: + candidate = answer + # llm might have generated multiple json objects separated by linebreaks, check for this pattern and try parsing each object individually + if re.search(r"\}\s*\n\s*\{", candidate) is not None: + candidate = re.sub(r"\}\s*\n\s*\{", "},\n{", candidate) + if not candidate.strip().startswith("["): + candidate = "[" + candidate + "]" + # parse the candidate JSON into a dictionary + candidates = json.loads(candidate) + if not isinstance(candidates, list): + candidates = [candidates] + for candidate_dict in candidates: + checked_candidate = checkAndSanitizeToolCallCandidate(candidate_dict, tool_names) + if checked_candidate is not None: + matches.append(checked_candidate) + except json.JSONDecodeError: + # Ignore invalid JSON silently + pass + + return matches diff --git a/extensions/superboogav2/chromadb.py b/extensions/superboogav2/chromadb.py index 6e93dd92..9344e25c 100644 --- a/extensions/superboogav2/chromadb.py +++ b/extensions/superboogav2/chromadb.py @@ -1,10 +1,11 @@ import math import random import threading -import torch + import chromadb import numpy as np import posthog +import torch from chromadb.config import Settings from chromadb.utils import embedding_functions @@ -292,6 +293,8 @@ class ChromaCollector(): for doc in documents: doc_tokens = encode(doc)[0] + if isinstance(doc_tokens, np.ndarray): + doc_tokens = doc_tokens.tolist() doc_token_count = len(doc_tokens) if current_token_count + doc_token_count > max_token_count: # If adding this document would exceed the max token count, diff --git a/js/global_scope_js.js b/js/global_scope_js.js index 29d2d8bd..3274f47e 100644 --- a/js/global_scope_js.js +++ b/js/global_scope_js.js @@ -1,3 +1,7 @@ +// ------------------------------------------------- +// Event handlers
+// ------------------------------------------------- + function copyToClipboard(element) { if (!element) return; @@ -18,6 +22,201 @@ function copyToClipboard(element) { }); } +function branchHere(element) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + const branchIndexInput = document.getElementById("Branch-index").querySelector("input"); + if (!branchIndexInput) { + console.error("Element with ID 'Branch-index' not found."); + return; + } + const branchButton = document.getElementById("Branch"); + + if (!branchButton) { + console.error("Required element 'Branch' not found."); + return; + } + + branchIndexInput.value = index; + + // Trigger any 'change' or 'input' events Gradio might be listening for + const event = new Event("input", { bubbles: true }); + branchIndexInput.dispatchEvent(event); + + branchButton.click(); +} + +// ------------------------------------------------- +// Message Editing Functions +// ------------------------------------------------- + +function editHere(buttonElement) { + if (!buttonElement) return; + + const messageElement = buttonElement.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const messageBody = messageElement.querySelector(".message-body"); + if (!messageBody) return; + + // If already editing, focus the textarea + const existingTextarea = messageBody.querySelector(".editing-textarea"); + if (existingTextarea) { + existingTextarea.focus(); + return; + } + + // Determine role based on message element - handle different chat modes + const isUserMessage = messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") !== null || + messageElement.querySelector(".circle-you") !== null; + + startEditing(messageElement, messageBody, isUserMessage); +} + +function startEditing(messageElement, messageBody, isUserMessage) { + const rawText = messageElement.getAttribute("data-raw") || messageBody.textContent; + const originalHTML = messageBody.innerHTML; + + // Create editing interface + const editingInterface = createEditingInterface(rawText); + + // Replace message content + messageBody.innerHTML = ""; + messageBody.appendChild(editingInterface.textarea); + messageBody.appendChild(editingInterface.controls); + + editingInterface.textarea.focus(); + editingInterface.textarea.setSelectionRange(rawText.length, rawText.length); + + // Setup event handlers + setupEditingHandlers(editingInterface.textarea, messageElement, originalHTML, messageBody, isUserMessage); +} + +function createEditingInterface(text) { + const textarea = document.createElement("textarea"); + textarea.value = text; + textarea.className = "editing-textarea"; + textarea.rows = Math.max(3, text.split("\n").length); + + const controls = document.createElement("div"); + controls.className = "edit-controls-container"; + + const saveButton = document.createElement("button"); + saveButton.textContent = "Save"; + saveButton.className = "edit-control-button"; + saveButton.type = "button"; + + const cancelButton = document.createElement("button"); + cancelButton.textContent = "Cancel"; + cancelButton.className = "edit-control-button edit-cancel-button"; + cancelButton.type = "button"; + + controls.appendChild(saveButton); + controls.appendChild(cancelButton); + + return { textarea, controls, saveButton, cancelButton }; +} + +function 
setupEditingHandlers(textarea, messageElement, originalHTML, messageBody, isUserMessage) { + const saveButton = messageBody.querySelector(".edit-control-button:not(.edit-cancel-button)"); + const cancelButton = messageBody.querySelector(".edit-cancel-button"); + + const submitEdit = () => { + const index = messageElement.getAttribute("data-index"); + if (!index || !submitMessageEdit(index, textarea.value, isUserMessage)) { + cancelEdit(); + } + }; + + const cancelEdit = () => { + messageBody.innerHTML = originalHTML; + }; + + // Event handlers + saveButton.onclick = submitEdit; + cancelButton.onclick = cancelEdit; + + textarea.onkeydown = (e) => { + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + submitEdit(); + } else if (e.key === "Escape") { + e.preventDefault(); + cancelEdit(); + } + }; +} + +function submitMessageEdit(index, newText, isUserMessage) { + const editIndexInput = document.getElementById("Edit-message-index")?.querySelector("input"); + const editTextInput = document.getElementById("Edit-message-text")?.querySelector("textarea"); + const editRoleInput = document.getElementById("Edit-message-role")?.querySelector("textarea"); + const editButton = document.getElementById("Edit-message"); + + if (!editIndexInput || !editTextInput || !editRoleInput || !editButton) { + console.error("Edit elements not found"); + return false; + } + + editIndexInput.value = index; + editTextInput.value = newText; + editRoleInput.value = isUserMessage ? "user" : "assistant"; + + editIndexInput.dispatchEvent(new Event("input", { bubbles: true })); + editTextInput.dispatchEvent(new Event("input", { bubbles: true })); + editRoleInput.dispatchEvent(new Event("input", { bubbles: true })); + + editButton.click(); + return true; +} + +function navigateVersion(element, direction) { + if (!element) return; + + const messageElement = element.closest(".message, .user-message, .assistant-message"); + if (!messageElement) return; + + const index = messageElement.getAttribute("data-index"); + if (!index) return; + + // Determine role based on message element classes + let role = "assistant"; // Default role + if (messageElement.classList.contains("user-message") || + messageElement.querySelector(".text-you") || + messageElement.querySelector(".circle-you")) { + role = "user"; + } + + const indexInput = document.getElementById("Navigate-message-index")?.querySelector("input"); + const directionInput = document.getElementById("Navigate-direction")?.querySelector("textarea"); + const roleInput = document.getElementById("Navigate-message-role")?.querySelector("textarea"); + const navigateButton = document.getElementById("Navigate-version"); + + if (!indexInput || !directionInput || !roleInput || !navigateButton) { + console.error("Navigation control elements (index, direction, role, or button) not found."); + return; + } + + indexInput.value = index; + directionInput.value = direction; + roleInput.value = role; + + // Trigger 'input' events for Gradio to pick up changes + const event = new Event("input", { bubbles: true }); + indexInput.dispatchEvent(event); + directionInput.dispatchEvent(event); + roleInput.dispatchEvent(event); + + navigateButton.click(); +} + function regenerateClick() { document.getElementById("Regenerate").click(); } diff --git a/js/main.js b/js/main.js index 33b7d6bd..f23dc246 100644 --- a/js/main.js +++ b/js/main.js @@ -1,3 +1,7 @@ +// ------------------------------------------------ +// Main +// ------------------------------------------------ + let main_parent = 
document.getElementById("chat-tab").parentNode; let extensions = document.getElementById("extensions"); @@ -39,9 +43,24 @@ document.querySelector(".header_bar").addEventListener("click", function(event) //------------------------------------------------ // Keyboard shortcuts //------------------------------------------------ + +// --- Helper functions --- // +function isModifiedKeyboardEvent() { + return (event instanceof KeyboardEvent && + event.shiftKey || + event.ctrlKey || + event.altKey || + event.metaKey); +} + +function isFocusedOnEditableTextbox() { + if (event.target.tagName === "INPUT" || event.target.tagName === "TEXTAREA") { + return !!event.target.value; + } +} + let previousTabId = "chat-tab-button"; document.addEventListener("keydown", function(event) { - // Stop generation on Esc pressed if (event.key === "Escape") { // Find the element with id 'stop' and click it @@ -49,10 +68,15 @@ document.addEventListener("keydown", function(event) { if (stopButton) { stopButton.click(); } + return; + } + + if (!document.querySelector("#chat-tab").checkVisibility() ) { + return; } // Show chat controls on Ctrl + S - else if (event.ctrlKey && event.key == "s") { + if (event.ctrlKey && event.key == "s") { event.preventDefault(); var showControlsElement = document.getElementById("show-controls"); @@ -82,24 +106,29 @@ document.addEventListener("keydown", function(event) { document.getElementById("Remove-last").click(); } - // Copy last on Ctrl + Shift + K - else if (event.ctrlKey && event.shiftKey && event.key === "K") { - event.preventDefault(); - document.getElementById("Copy-last").click(); - } - - // Replace last on Ctrl + Shift + L - else if (event.ctrlKey && event.shiftKey && event.key === "L") { - event.preventDefault(); - document.getElementById("Replace-last").click(); - } - // Impersonate on Ctrl + Shift + M else if (event.ctrlKey && event.shiftKey && event.key === "M") { event.preventDefault(); document.getElementById("Impersonate").click(); } + // --- Simple version navigation --- // + if (!isFocusedOnEditableTextbox()) { + // Version navigation on Arrow keys (horizontal) + if (!isModifiedKeyboardEvent() && event.key === "ArrowLeft") { + event.preventDefault(); + navigateLastAssistantMessage("left"); + } + + else if (!isModifiedKeyboardEvent() && event.key === "ArrowRight") { + event.preventDefault(); + if (!navigateLastAssistantMessage("right")) { + // If can't navigate right (last version), regenerate + document.getElementById("Regenerate").click(); + } + } + } + }); //------------------------------------------------ @@ -132,8 +161,6 @@ targetElement.addEventListener("scroll", function() { // Create a MutationObserver instance const observer = new MutationObserver(function(mutations) { - updateCssProperties(); - if (targetElement.classList.contains("_generating")) { typing.parentNode.classList.add("visible-dots"); document.getElementById("stop").style.display = "flex"; @@ -144,12 +171,24 @@ const observer = new MutationObserver(function(mutations) { document.getElementById("Generate").style.display = "flex"; } - doSyntaxHighlighting(); if (!isScrolled && targetElement.scrollTop !== targetElement.scrollHeight) { targetElement.scrollTop = targetElement.scrollHeight; } + + const chatElement = document.getElementById("chat"); + if (chatElement && chatElement.getAttribute("data-mode") === "instruct") { + const messagesContainer = chatElement.querySelector(".messages"); + const lastChild = messagesContainer?.lastElementChild; + const prevSibling = lastChild?.previousElementSibling; 
+ if (lastChild && prevSibling) { + lastChild.style.setProperty("margin-bottom", + `max(0px, calc(max(70vh, 100vh - ${prevSibling.offsetHeight}px - 102px) - ${lastChild.offsetHeight}px))`, + "important" + ); + } + } }); // Configure the observer to watch for changes in the subtree and attributes @@ -436,38 +475,6 @@ const chatInput = document.querySelector("#chat-input textarea"); // Variables to store current dimensions let currentChatInputHeight = chatInput.clientHeight; -// Update chat layout based on chat and input dimensions -function updateCssProperties() { - const chatInputHeight = chatInput.clientHeight; - - // Check if the chat container is visible - if (chatContainer.clientHeight > 0) { - const chatContainerParentHeight = chatContainer.parentNode.clientHeight; - const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`; - - document.documentElement.style.setProperty("--chat-height", newChatHeight); - document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`); - - // Adjust scrollTop based on input height change - if (chatInputHeight !== currentChatInputHeight) { - const deltaHeight = chatInputHeight - currentChatInputHeight; - if (!isScrolled && deltaHeight < 0) { - chatContainer.scrollTop = chatContainer.scrollHeight; - } else { - chatContainer.scrollTop += deltaHeight; - } - - currentChatInputHeight = chatInputHeight; - } - } -} - -// Observe textarea size changes and call update function -new ResizeObserver(updateCssProperties).observe(document.querySelector("#chat-input textarea")); - -// Handle changes in window size -window.addEventListener("resize", updateCssProperties); - //------------------------------------------------ // Focus on the rename text area when it becomes visible //------------------------------------------------ @@ -720,7 +727,7 @@ function isMobile() { // Function to initialize sidebars function initializeSidebars() { const isOnMobile = isMobile(); - + if (isOnMobile) { // Mobile state: Hide sidebars and set closed states [pastChatsRow, chatControlsRow, headerBar].forEach(el => { @@ -813,3 +820,55 @@ function createMobileTopBar() { } createMobileTopBar(); + +//------------------------------------------------ +// Simple Navigation Functions +//------------------------------------------------ + +function navigateLastAssistantMessage(direction) { + const chat = document.querySelector("#chat"); + if (!chat) return false; + + const messages = chat.querySelectorAll("[data-index]"); + if (messages.length === 0) return false; + + // Find the last assistant message (starting from the end) + let lastAssistantMessage = null; + for (let i = messages.length - 1; i >= 0; i--) { + const msg = messages[i]; + if ( + msg.classList.contains("assistant-message") || + msg.querySelector(".circle-bot") || + msg.querySelector(".text-bot") + ) { + lastAssistantMessage = msg; + break; + } + } + + if (!lastAssistantMessage) return false; + + const buttons = lastAssistantMessage.querySelectorAll(".version-nav-button"); + + for (let i = 0; i < buttons.length; i++) { + const button = buttons[i]; + const onclick = button.getAttribute("onclick"); + const disabled = button.hasAttribute("disabled"); + + const isLeft = onclick && onclick.includes("'left'"); + const isRight = onclick && onclick.includes("'right'"); + + if (!disabled) { + if (direction === "left" && isLeft) { + navigateVersion(button, direction); + return true; + } + if (direction === "right" && isRight) { + navigateVersion(button, direction); + return true; + } + } + } + + 
return false; +} diff --git a/modules/chat.py b/modules/chat.py index 4becb7f5..2db72f36 100644 --- a/modules/chat.py +++ b/modules/chat.py @@ -5,6 +5,7 @@ import html import json import pprint import re +import time from datetime import datetime from functools import partial from pathlib import Path @@ -30,12 +31,37 @@ from modules.text_generation import ( get_max_prompt_length ) from modules.utils import delete_file, get_available_characters, save_file +from modules.web_search import add_web_search_attachments def strftime_now(format): return datetime.now().strftime(format) +def get_current_timestamp(): + """Returns the current time in 24-hour format""" + return datetime.now().strftime('%b %d, %Y %H:%M') + + +def update_message_metadata(metadata_dict, role, index, **fields): + """ + Updates or adds metadata fields for a specific message. + + Args: + metadata_dict: The metadata dictionary + role: The role (user, assistant, etc) + index: The message index + **fields: Arbitrary metadata fields to update/add + """ + key = f"{role}_{index}" + if key not in metadata_dict: + metadata_dict[key] = {} + + # Update with provided fields + for field_name, field_value in fields.items(): + metadata_dict[key][field_name] = field_value + + jinja_env = ImmutableSandboxedEnvironment( trim_blocks=True, lstrip_blocks=True, @@ -132,7 +158,9 @@ def generate_chat_prompt(user_input, state, **kwargs): impersonate = kwargs.get('impersonate', False) _continue = kwargs.get('_continue', False) also_return_rows = kwargs.get('also_return_rows', False) - history = kwargs.get('history', state['history'])['internal'] + history_data = kwargs.get('history', state['history']) + history = history_data['internal'] + metadata = history_data.get('metadata', {}) # Templates chat_template_str = state['chat_template_str'] @@ -145,7 +173,7 @@ def generate_chat_prompt(user_input, state, **kwargs): instruct_renderer = partial( instruction_template.render, builtin_tools=None, - tools=None, + tools=state['tools'] if 'tools' in state else None, tools_in_user_message=False, add_generation_prompt=False ) @@ -171,18 +199,62 @@ def generate_chat_prompt(user_input, state, **kwargs): messages.append({"role": "system", "content": context}) insert_pos = len(messages) - for user_msg, assistant_msg in reversed(history): - user_msg = user_msg.strip() - assistant_msg = assistant_msg.strip() + for i, entry in enumerate(reversed(history)): + user_msg = entry[0].strip() + assistant_msg = entry[1].strip() + tool_msg = entry[2].strip() if len(entry) > 2 else '' + + row_idx = len(history) - i - 1 + + if tool_msg: + messages.insert(insert_pos, {"role": "tool", "content": tool_msg}) if assistant_msg: messages.insert(insert_pos, {"role": "assistant", "content": assistant_msg}) if user_msg not in ['', '<|BEGIN-VISIBLE-CHAT|>']: - messages.insert(insert_pos, {"role": "user", "content": user_msg}) + # Check for user message attachments in metadata + user_key = f"user_{row_idx}" + enhanced_user_msg = user_msg + + # Add attachment content if present + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + enhanced_user_msg = f"{user_msg}\n\nATTACHMENTS:\n{attachments_text}" + + messages.insert(insert_pos, {"role": "user", "content": enhanced_user_msg}) user_input = 
user_input.strip() - if user_input and not impersonate and not _continue: + + # Check if we have attachments even with empty input + has_attachments = False + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + has_attachments = user_key in metadata and "attachments" in metadata[user_key] + + if (user_input or has_attachments) and not impersonate and not _continue: + # For the current user input being processed, check if we need to add attachments + if not impersonate and not _continue and len(history_data.get('metadata', {})) > 0: + current_row_idx = len(history) + user_key = f"user_{current_row_idx}" + + if user_key in metadata and "attachments" in metadata[user_key]: + attachments_text = "" + for attachment in metadata[user_key]["attachments"]: + filename = attachment.get("name", "file") + content = attachment.get("content", "") + attachments_text += f"\nName: {filename}\nContents:\n\n=====\n{content}\n=====\n\n" + + if attachments_text: + user_input = f"{user_input}\n\nATTACHMENTS:\n{attachments_text}" + messages.append({"role": "user", "content": user_input}) def make_prompt(messages): @@ -251,7 +323,6 @@ def generate_chat_prompt(user_input, state, **kwargs): # Resort to truncating the user input else: - user_message = messages[-1]['content'] # Bisect the truncation point @@ -288,6 +359,50 @@ def generate_chat_prompt(user_input, state, **kwargs): return prompt +def count_prompt_tokens(text_input, state): + """Count tokens for current history + input including attachments""" + if shared.tokenizer is None: + return "Tokenizer not available" + + try: + # Handle dict format with text and files + files = [] + if isinstance(text_input, dict): + files = text_input.get('files', []) + text = text_input.get('text', '') + else: + text = text_input + files = [] + + # Create temporary history copy to add attachments + temp_history = copy.deepcopy(state['history']) + if 'metadata' not in temp_history: + temp_history['metadata'] = {} + + # Process attachments if any + if files: + row_idx = len(temp_history['internal']) + for file_path in files: + add_message_attachment(temp_history, row_idx, file_path, is_user=True) + + # Create temp state with modified history + temp_state = copy.deepcopy(state) + temp_state['history'] = temp_history + + # Build prompt using existing logic + prompt = generate_chat_prompt(text, temp_state) + current_tokens = get_encoded_length(prompt) + max_tokens = temp_state['truncation_length'] + + percentage = (current_tokens / max_tokens) * 100 if max_tokens > 0 else 0 + + return f"History + Input:
{current_tokens:,} / {max_tokens:,} tokens ({percentage:.1f}%)" + + except Exception as e: + logger.error(f"Error counting tokens: {e}") + return f"Error: {str(e)}" + + def get_stopping_strings(state): stopping_strings = [] renderers = [] @@ -336,6 +451,114 @@ def get_stopping_strings(state): return result +def add_message_version(history, role, row_idx, is_current=True): + key = f"{role}_{row_idx}" + if 'metadata' not in history: + history['metadata'] = {} + if key not in history['metadata']: + history['metadata'][key] = {} + + if "versions" not in history['metadata'][key]: + history['metadata'][key]["versions"] = [] + + # Determine which index to use for content based on role + content_idx = 0 if role == 'user' else 1 + current_content = history['internal'][row_idx][content_idx] + current_visible = history['visible'][row_idx][content_idx] + + history['metadata'][key]["versions"].append({ + "content": current_content, + "visible_content": current_visible, + "timestamp": get_current_timestamp() + }) + + if is_current: + # Set the current_version_index to the newly added version (which is now the last one). + history['metadata'][key]["current_version_index"] = len(history['metadata'][key]["versions"]) - 1 + + +def add_message_attachment(history, row_idx, file_path, is_user=True): + """Add a file attachment to a message in history metadata""" + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{'user' if is_user else 'assistant'}_{row_idx}" + + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + # Get file info using pathlib + path = Path(file_path) + filename = path.name + file_extension = path.suffix.lower() + + try: + # Handle different file types + if file_extension == '.pdf': + # Process PDF file + content = extract_pdf_text(path) + file_type = "application/pdf" + else: + # Default handling for text files + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + file_type = "text/plain" + + # Add attachment + attachment = { + "name": filename, + "type": file_type, + "content": content, + } + + history['metadata'][key]["attachments"].append(attachment) + return content # Return the content for reuse + except Exception as e: + logger.error(f"Error processing attachment {filename}: {e}") + return None + + +def extract_pdf_text(pdf_path): + """Extract text from a PDF file""" + import PyPDF2 + + text = "" + try: + with open(pdf_path, 'rb') as file: + pdf_reader = PyPDF2.PdfReader(file) + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text += page.extract_text() + "\n\n" + + return text.strip() + except Exception as e: + logger.error(f"Error extracting text from PDF: {e}") + return f"[Error extracting PDF text: {str(e)}]" + + +def generate_search_query(user_message, state): + """Generate a search query from user message using the LLM""" + # Augment the user message with search instruction + augmented_message = f"{user_message}\n\n=====\n\nPlease turn the message above into a short web search query in the same language as the message. Respond with only the search query, nothing else." 
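+    # Because the prompt built below reuses the regular chat template and the full chat
+    # history, a follow-up question (e.g. "what about tomorrow?") should still produce a
+    # context-aware search query rather than a standalone one.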
+ + # Use a minimal state for search query generation but keep the full history + search_state = state.copy() + search_state['max_new_tokens'] = 64 + search_state['auto_max_new_tokens'] = False + search_state['enable_thinking'] = False + + # Generate the full prompt using existing history + augmented message + formatted_prompt = generate_chat_prompt(augmented_message, search_state) + + query = "" + for reply in generate_reply(formatted_prompt, search_state, stopping_strings=[], is_chat=True): + query = reply.strip() + + return query + + def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_message=True, for_ui=False): # Handle dict format with text and files files = [] @@ -509,16 +732,19 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess yield output -def impersonate_wrapper(text, state): +def impersonate_wrapper(textbox, state): + text = textbox['text'] static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) prompt = generate_chat_prompt('', state, impersonate=True) stopping_strings = get_stopping_strings(state) - yield text + '...', static_output + textbox['text'] = text + '...' + yield textbox, static_output reply = None for reply in generate_reply(prompt + text, state, stopping_strings=stopping_strings, is_chat=True): - yield (text + reply).lstrip(' '), static_output + textbox['text'] = (text + reply).lstrip(' ') + yield textbox, static_output if shared.stop_everything: return @@ -564,56 +790,81 @@ def generate_chat_reply_wrapper(text, state, regenerate=False, _continue=False): send_dummy_reply(state['start_with'], state) history = state['history'] + last_save_time = time.monotonic() + save_interval = 8 for i, history in enumerate(generate_chat_reply(text, state, regenerate, _continue, loading_message=True, for_ui=True)): yield chat_html_wrapper(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']), history + current_time = time.monotonic() + # Save on first iteration or if save_interval seconds have passed + if i == 0 or (current_time - last_save_time) >= save_interval: + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + last_save_time = current_time + save_history(history, state['unique_id'], state['character_menu'], state['mode']) def remove_last_message(history): + if 'metadata' not in history: + history['metadata'] = {} + if len(history['visible']) > 0 and history['internal'][-1][0] != '<|BEGIN-VISIBLE-CHAT|>': + row_idx = len(history['internal']) - 1 last = history['visible'].pop() history['internal'].pop() + + # Remove metadata directly by known keys + if f"user_{row_idx}" in history['metadata']: + del history['metadata'][f"user_{row_idx}"] + if f"assistant_{row_idx}" in history['metadata']: + del history['metadata'][f"assistant_{row_idx}"] else: last = ['', ''] return html.unescape(last[0]), history -def send_last_reply_to_input(history): - if len(history['visible']) > 0: - return html.unescape(history['visible'][-1][1]) - else: - return '' - - -def replace_last_reply(text, state): - history = state['history'] - - if len(text.strip()) == 0: - return history - elif len(history['visible']) > 0: - history['visible'][-1][1] = html.escape(text) - history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) - - return history - - def send_dummy_message(text, state): history = state['history'] + + # Handle both dict and string inputs + if 
isinstance(text, dict): + text = text['text'] + + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + + row_idx = len(history['internal']) history['visible'].append([html.escape(text), '']) history['internal'].append([apply_extensions('input', text, state, is_chat=True), '']) + update_message_metadata(history['metadata'], "user", row_idx, timestamp=get_current_timestamp()) + return history def send_dummy_reply(text, state): history = state['history'] + + # Handle both dict and string inputs + if isinstance(text, dict): + text = text['text'] + + # Initialize metadata if not present + if 'metadata' not in history: + history['metadata'] = {} + if len(history['visible']) > 0 and not history['visible'][-1][1] == '': + row_idx = len(history['internal']) history['visible'].append(['', '']) history['internal'].append(['', '']) + # We don't need to add system metadata + row_idx = len(history['internal']) - 1 history['visible'][-1][1] = html.escape(text) history['internal'][-1][1] = apply_extensions('input', text, state, is_chat=True) + update_message_metadata(history['metadata'], "assistant", row_idx, timestamp=get_current_timestamp()) + return history @@ -623,7 +874,8 @@ def redraw_html(history, name1, name2, mode, style, character, reset_cache=False def start_new_chat(state): mode = state['mode'] - history = {'internal': [], 'visible': []} + # Initialize with empty metadata dictionary + history = {'internal': [], 'visible': [], 'metadata': {}} if mode != 'instruct': greeting = replace_character_names(state['greeting'], state['name1'], state['name2']) @@ -631,6 +883,9 @@ def start_new_chat(state): history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]] history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]] + # Add timestamp for assistant's greeting + update_message_metadata(history['metadata'], "assistant", 0, timestamp=get_current_timestamp()) + unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, unique_id, state['character_menu'], state['mode']) @@ -811,6 +1066,16 @@ def load_history(unique_id, character, mode): 'visible': f['data_visible'] } + # Add metadata if it doesn't exist + if 'metadata' not in history: + history['metadata'] = {} + # Add placeholder timestamps for existing messages + for i, (user_msg, asst_msg) in enumerate(history['internal']): + if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>': + update_message_metadata(history['metadata'], "user", i, timestamp="") + if asst_msg: + update_message_metadata(history['metadata'], "assistant", i, timestamp="") + return history @@ -826,6 +1091,16 @@ def load_history_json(file, history): 'visible': f['data_visible'] } + # Add metadata if it doesn't exist + if 'metadata' not in history: + history['metadata'] = {} + # Add placeholder timestamps + for i, (user_msg, asst_msg) in enumerate(history['internal']): + if user_msg and user_msg != '<|BEGIN-VISIBLE-CHAT|>': + update_message_metadata(history['metadata'], "user", i, timestamp="") + if asst_msg: + update_message_metadata(history['metadata'], "assistant", i, timestamp="") + return history except: return history @@ -1147,20 +1422,12 @@ def my_yaml_output(data): return result -def handle_replace_last_reply_click(text, state): - history = replace_last_reply(text, state) - save_history(history, state['unique_id'], state['character_menu'], state['mode']) - html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], 
state['character_menu']) - - return [history, html, ""] - - def handle_send_dummy_message_click(text, state): history = send_dummy_message(text, state) save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_send_dummy_reply_click(text, state): @@ -1168,7 +1435,7 @@ def handle_send_dummy_reply_click(text, state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, ""] + return [history, html, {"text": "", "files": []}] def handle_remove_last_click(state): @@ -1176,7 +1443,7 @@ def handle_remove_last_click(state): save_history(history, state['unique_id'], state['character_menu'], state['mode']) html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) - return [history, html, last_input] + return [history, html, {"text": last_input, "files": []}] def handle_unique_id_select(state): @@ -1222,7 +1489,13 @@ def handle_delete_chat_confirm_click(state): def handle_branch_chat_click(state): - history = state['history'] + branch_from_index = state['branch_index'] + if branch_from_index == -1: + history = state['history'] + else: + history = state['history'] + history['visible'] = history['visible'][:branch_from_index + 1] + history['internal'] = history['internal'][:branch_from_index + 1] new_unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S') save_history(history, new_unique_id, state['character_menu'], state['mode']) @@ -1233,7 +1506,93 @@ def handle_branch_chat_click(state): past_chats_update = gr.update(choices=histories, value=new_unique_id) - return [history, html, past_chats_update] + return [history, html, past_chats_update, -1] + + +def handle_edit_message_click(state): + history = state['history'] + message_index = int(state['edit_message_index']) + new_text = state['edit_message_text'] + role = state['edit_message_role'] # "user" or "assistant" + + if message_index >= len(history['internal']): + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html_output] + + role_idx = 0 if role == "user" else 1 + + if 'metadata' not in history: + history['metadata'] = {} + + key = f"{role}_{message_index}" + if key not in history['metadata']: + history['metadata'][key] = {} + + # If no versions exist yet for this message, store the current (pre-edit) content as the first version. 
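+        # Each version entry stores 'content', 'visible_content' and a 'timestamp', and
+        # 'current_version_index' points at the entry currently shown in the chat.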
+ if "versions" not in history['metadata'][key] or not history['metadata'][key]["versions"]: + original_content = history['internal'][message_index][role_idx] + original_visible = history['visible'][message_index][role_idx] + original_timestamp = history['metadata'][key].get('timestamp', get_current_timestamp()) + + history['metadata'][key]["versions"] = [{ + "content": original_content, + "visible_content": original_visible, + "timestamp": original_timestamp + }] + + history['internal'][message_index][role_idx] = apply_extensions('input', new_text, state, is_chat=True) + history['visible'][message_index][role_idx] = html.escape(new_text) + + add_message_version(history, role, message_index, is_current=True) + + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + html_output = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + + return [history, html_output] + + +def handle_navigate_version_click(state): + history = state['history'] + message_index = int(state['navigate_message_index']) + direction = state['navigate_direction'] + role = state['navigate_message_role'] + + if not role: + logger.error("Role not provided for version navigation.") + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + key = f"{role}_{message_index}" + if 'metadata' not in history or key not in history['metadata'] or 'versions' not in history['metadata'][key]: + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + metadata = history['metadata'][key] + versions = metadata['versions'] + # Default to the last version if current_version_index is not set + current_idx = metadata.get('current_version_index', len(versions) - 1 if versions else 0) + + if direction == 'left': + new_idx = max(0, current_idx - 1) + else: # right + new_idx = min(len(versions) - 1, current_idx + 1) + + if new_idx == current_idx: + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + return [history, html] + + msg_content_idx = 0 if role == 'user' else 1 # 0 for user content, 1 for assistant content in the pair + version_to_load = versions[new_idx] + history['internal'][message_index][msg_content_idx] = version_to_load['content'] + history['visible'][message_index][msg_content_idx] = version_to_load['visible_content'] + metadata['current_version_index'] = new_idx + update_message_metadata(history['metadata'], role, message_index, timestamp=version_to_load['timestamp']) + + # Redraw and save + html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu']) + save_history(history, state['unique_id'], state['character_menu'], state['mode']) + + return [history, html] def handle_rename_chat_click(): @@ -1375,7 +1734,7 @@ def handle_your_picture_change(picture, state): def handle_send_instruction_click(state): state['mode'] = 'instruct' - state['history'] = {'internal': [], 'visible': []} + state['history'] = {'internal': [], 'visible': [], 'metadata': {}} output = generate_chat_prompt("Input", state) diff --git a/modules/exllamav3_hf.py b/modules/exllamav3_hf.py index 12b22f64..1254ff5d 100644 --- a/modules/exllamav3_hf.py +++ b/modules/exllamav3_hf.py @@ -119,7 +119,7 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): reset = True 
# Maximum number of tokens to process in a single forward pass - max_chunk_size = 2048 + max_chunk_size = 256 # Make the forward call if labels is None: @@ -245,3 +245,20 @@ class Exllamav3HF(PreTrainedModel, GenerationMixin): pretrained_model_name_or_path = Path(f'{shared.args.model_dir}') / Path(pretrained_model_name_or_path) return Exllamav3HF(pretrained_model_name_or_path) + + def unload(self): + """Properly unload the ExllamaV3 model and free GPU memory.""" + if hasattr(self, 'ex_model') and self.ex_model is not None: + self.ex_model.unload() + self.ex_model = None + + if hasattr(self, 'ex_cache') and self.ex_cache is not None: + self.ex_cache = None + + # Clean up any additional ExllamaV3 resources + if hasattr(self, 'past_seq'): + self.past_seq = None + if hasattr(self, 'past_seq_negative'): + self.past_seq_negative = None + if hasattr(self, 'ex_cache_negative'): + self.ex_cache_negative = None diff --git a/modules/html_generator.py b/modules/html_generator.py index 67d15b6e..cbf3e19c 100644 --- a/modules/html_generator.py +++ b/modules/html_generator.py @@ -169,11 +169,7 @@ def convert_to_markdown(string, message_id=None): thinking_block = f'''
- - - - - + {info_svg_small} {title_text}
{thinking_html}
@@ -339,41 +335,164 @@ copy_svg = '''''' continue_svg = '''''' remove_svg = '''''' +branch_svg = '''''' +edit_svg = '''''' +info_svg = '''''' +info_svg_small = '''''' +attachment_svg = '''''' copy_button = f'' +branch_button = f'' +edit_button = f'' refresh_button = f'' continue_button = f'' remove_button = f'' +info_button = f'' + + +def format_message_timestamp(history, role, index): + """Get a formatted timestamp HTML span for a message if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and history['metadata'][key].get('timestamp'): + timestamp = history['metadata'][key]['timestamp'] + return f"{timestamp}" + + return "" + + +def format_message_attachments(history, role, index): + """Get formatted HTML for message attachments if available""" + key = f"{role}_{index}" + if 'metadata' in history and key in history['metadata'] and 'attachments' in history['metadata'][key]: + attachments = history['metadata'][key]['attachments'] + if not attachments: + return "" + + attachments_html = '
' + for attachment in attachments: + name = html.escape(attachment["name"]) + + # Make clickable if URL exists + if "url" in attachment: + name = f'{name}' + + attachments_html += ( + f'
' + f'
{attachment_svg}
' + f'
{name}
' + f'
' + ) + attachments_html += '
' + return attachments_html + + return "" + + +def get_version_navigation_html(history, i, role): + """Generate simple navigation arrows for message versions""" + key = f"{role}_{i}" + metadata = history.get('metadata', {}) + + if key not in metadata or 'versions' not in metadata[key]: + return "" + + versions = metadata[key]['versions'] + # Default to the last version if current_version_index isn't set in metadata + current_idx = metadata[key].get('current_version_index', len(versions) - 1 if versions else 0) + + if len(versions) <= 1: + return "" + + left_disabled = ' disabled' if current_idx == 0 else '' + right_disabled = ' disabled' if current_idx >= len(versions) - 1 else '' + + left_arrow = f'' + right_arrow = f'' + position = f'{current_idx + 1}/{len(versions)}' + + return f'
{left_arrow}{position}{right_arrow}
' + + +def actions_html(history, i, role, info_message=""): + action_buttons = "" + version_nav_html = "" + + if role == "assistant": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + f'{refresh_button if i == len(history["visible"]) - 1 else ""}' + f'{continue_button if i == len(history["visible"]) - 1 else ""}' + f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{branch_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "assistant") + elif role == "user": + action_buttons = ( + f'{copy_button}' + f'{edit_button}' + ) + + version_nav_html = get_version_navigation_html(history, i, "user") + + return (f'
' + f'{action_buttons}' + f'{info_message}' + f'
' + f'{version_nav_html}') def generate_instruct_html(history): - output = f'
' + output = f'
' for i in range(len(history['visible'])): row_visible = history['visible'][i] row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'{user_attachments}' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{assistant_attachments}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) @@ -401,30 +520,39 @@ def generate_cai_chat_html(history, name1, name2, style, character, reset_cache= row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
{img_me}
' f'
' - f'
{name1}
' + f'
{name1}{user_timestamp}
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'{user_attachments}' + f'{actions_html(history, i, "user")}' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
{img_bot}
' f'
' - f'
{name2}
' + f'
{name2}{assistant_timestamp}
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{assistant_attachments}' + f'{actions_html(history, i, "assistant")}' f'
' f'
' ) @@ -441,26 +569,48 @@ def generate_chat_html(history, name1, name2, reset_cache=False): row_internal = history['internal'][i] converted_visible = [convert_to_markdown_wrapped(entry, message_id=i, use_cache=i != len(history['visible']) - 1) for entry in row_visible] + # Get timestamps + user_timestamp = format_message_timestamp(history, "user", i) + assistant_timestamp = format_message_timestamp(history, "assistant", i) + + # Get attachments + user_attachments = format_message_attachments(history, "user", i) + assistant_attachments = format_message_attachments(history, "assistant", i) + + # Create info buttons for timestamps if they exist + info_message_user = "" + if user_timestamp != "": + # Extract the timestamp value from the span + user_timestamp_value = user_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_user = info_button.replace("message", user_timestamp_value) + + info_message_assistant = "" + if assistant_timestamp != "": + # Extract the timestamp value from the span + assistant_timestamp_value = assistant_timestamp.split('>', 1)[1].split('<', 1)[0] + info_message_assistant = info_button.replace("message", assistant_timestamp_value) + if converted_visible[0]: # Don't display empty user messages output += ( f'
' + f'data-raw="{html.escape(row_internal[0], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[0]}
' - f'{copy_button}' + f'{user_attachments}' + f'{actions_html(history, i, "user", info_message_user)}' f'
' f'
' ) output += ( f'
' + f'data-raw="{html.escape(row_internal[1], quote=True)}"' + f'data-index={i}>' f'
' f'
{converted_visible[1]}
' - f'{copy_button}' - f'{refresh_button if i == len(history["visible"]) - 1 else ""}' - f'{continue_button if i == len(history["visible"]) - 1 else ""}' - f'{remove_button if i == len(history["visible"]) - 1 else ""}' + f'{assistant_attachments}' + f'{actions_html(history, i, "assistant", info_message_assistant)}' f'
' f'
' ) diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index d9187db8..d695c74e 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -66,7 +66,7 @@ class LlamaServer: "top_k": state["top_k"], "top_p": state["top_p"], "min_p": state["min_p"], - "tfs_z": state["tfs"], + "top_n_sigma": state["top_n_sigma"] if state["top_n_sigma"] > 0 else -1, "typical_p": state["typical_p"], "repeat_penalty": state["repetition_penalty"], "repeat_last_n": state["repetition_penalty_range"], @@ -102,8 +102,10 @@ class LlamaServer: penalty_found = False for s in samplers: - if s.strip() in ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]: + if s.strip() in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc"]: filtered_samplers.append(s.strip()) + elif s.strip() == "typical_p": + filtered_samplers.append("typ_p") elif not penalty_found and s.strip() == "repetition_penalty": filtered_samplers.append("penalties") penalty_found = True @@ -144,8 +146,9 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled using a context manager - with self.session.post(url, json=payload, stream=True) as response: + # Make the generation request + response = self.session.post(url, json=payload, stream=True) + try: response.raise_for_status() # Raise an exception for HTTP errors full_text = "" @@ -182,6 +185,8 @@ class LlamaServer: print(f"JSON decode error: {e}") print(f"Problematic line: {line}") continue + finally: + response.close() def generate(self, prompt, state): output = "" @@ -210,14 +215,15 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - response = self.session.post(url, json=payload) - result = response.json() + for retry in range(5): + response = self.session.post(url, json=payload) + result = response.json() - if "completion_probabilities" in result: - if use_samplers: - return result["completion_probabilities"][0]["top_probs"] - else: - return result["completion_probabilities"][0]["top_logprobs"] + if "completion_probabilities" in result: + if use_samplers: + return result["completion_probabilities"][0]["top_probs"] + else: + return result["completion_probabilities"][0]["top_logprobs"] else: raise Exception(f"Unexpected response format: 'completion_probabilities' not found in {result}") @@ -255,9 +261,10 @@ class LlamaServer: self.server_path, "--model", self.model_path, "--ctx-size", str(shared.args.ctx_size), - "--n-gpu-layers", str(shared.args.n_gpu_layers), + "--gpu-layers", str(shared.args.gpu_layers), "--batch-size", str(shared.args.batch_size), "--port", str(self.port), + "--no-webui", ] if shared.args.flash_attn: @@ -278,8 +285,10 @@ class LlamaServer: cmd.append("--no-kv-offload") if shared.args.row_split: cmd += ["--split-mode", "row"] + cache_type = "fp16" if shared.args.cache_type != "fp16" and shared.args.cache_type in llamacpp_valid_cache_types: cmd += ["--cache-type-k", shared.args.cache_type, "--cache-type-v", shared.args.cache_type] + cache_type = shared.args.cache_type if shared.args.compress_pos_emb != 1: cmd += ["--rope-freq-scale", str(1.0 / shared.args.compress_pos_emb)] if shared.args.rope_freq_base > 0: @@ -316,9 +325,15 @@ class LlamaServer: for flag_item in extra_flags.split(','): if '=' in flag_item: flag, value = flag_item.split('=', 1) - cmd += [f"--{flag}", value] + if len(flag) <= 3: + cmd += [f"-{flag}", value] + else: + cmd += [f"--{flag}", value] else: 
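+                    # Flag names of three characters or fewer are treated as llama-server
+                    # short options (single dash, e.g. "-fa"); longer names keep "--".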
- cmd.append(f"--{flag_item}") + if len(flag_item) <= 3: + cmd.append(f"-{flag_item}") + else: + cmd.append(f"--{flag_item}") env = os.environ.copy() if os.name == 'posix': @@ -333,6 +348,7 @@ class LlamaServer: print(' '.join(str(item) for item in cmd[1:])) print() + logger.info(f"Using gpu_layers={shared.args.gpu_layers} | ctx_size={shared.args.ctx_size} | cache_type={cache_type}") # Start the server with pipes for output self.process = subprocess.Popen( cmd, diff --git a/modules/loaders.py b/modules/loaders.py index 738198b1..6fbd2198 100644 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -5,7 +5,7 @@ import gradio as gr loaders_and_params = OrderedDict({ 'llama.cpp': [ - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', @@ -28,6 +28,7 @@ loaders_and_params = OrderedDict({ 'device_draft', 'ctx_size_draft', 'speculative_decoding_accordion', + 'vram_info', ], 'Transformers': [ 'gpu_split', @@ -84,17 +85,11 @@ loaders_and_params = OrderedDict({ 'no_flash_attn', 'no_xformers', 'no_sdpa', - 'exllamav2_info', 'model_draft', 'draft_max', 'ctx_size_draft', 'speculative_decoding_accordion', ], - 'HQQ': [ - 'hqq_backend', - 'trust_remote_code', - 'no_use_fast', - ], 'TensorRT-LLM': [ 'ctx_size', 'cpp_runner', @@ -158,7 +153,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), - 'HQQ': transformers_samplers(), 'ExLlamav3_HF': { 'temperature', 'dynatemp_low', @@ -299,7 +293,7 @@ loaders_samplers = { 'typical_p', 'xtc_threshold', 'xtc_probability', - 'tfs', + 'top_n_sigma', 'dry_multiplier', 'dry_allowed_length', 'dry_base', diff --git a/modules/logits.py b/modules/logits.py index 32aef7ae..56a20572 100644 --- a/modules/logits.py +++ b/modules/logits.py @@ -7,6 +7,7 @@ from modules import models, shared from modules.logging_colors import logger from modules.models import load_model from modules.text_generation import generate_reply +from modules.utils import check_model_loaded global_scores = None @@ -33,9 +34,9 @@ def get_next_logits(*args, **kwargs): def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): - if shared.model is None: - logger.error("No model is loaded! Select one in the Model tab.") - return 'Error: No model is loaded1 Select one in the Model tab.', previous + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: + return error_message, previous # llama.cpp case if shared.model.__class__.__name__ == 'LlamaServer': diff --git a/modules/models.py b/modules/models.py index d0b0402a..d329ae3c 100644 --- a/modules/models.py +++ b/modules/models.py @@ -21,7 +21,6 @@ def load_model(model_name, loader=None): 'ExLlamav3_HF': ExLlamav3_HF_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, 'ExLlamav2': ExLlamav2_loader, - 'HQQ': HQQ_loader, 'TensorRT-LLM': TensorRT_LLM_loader, } @@ -71,7 +70,6 @@ def llama_cpp_server_loader(model_name): else: model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] - logger.info(f"llama.cpp weights detected: \"{model_file}\"") try: model = LlamaServer(model_file) return model, model @@ -103,21 +101,6 @@ def ExLlamav2_loader(model_name): return model, tokenizer -def HQQ_loader(model_name): - try: - from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.models.hf.base import AutoHQQHFModel - except ModuleNotFoundError: - raise ModuleNotFoundError("Failed to import 'hqq'. 
Please install it manually following the instructions in the HQQ GitHub repository.") - - logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") - - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - model = AutoHQQHFModel.from_quantized(str(model_dir)) - HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) - return model - - def TensorRT_LLM_loader(model_name): try: from modules.tensorrt_llm import TensorRTLLMModel @@ -133,10 +116,13 @@ def unload_model(keep_model_name=False): return is_llamacpp = (shared.model.__class__.__name__ == 'LlamaServer') + if shared.args.loader == 'ExLlamav3_HF': + shared.model.unload() shared.model = shared.tokenizer = None shared.lora_names = [] shared.model_dirty_from_training = False + if not is_llamacpp: from modules.torch_utils import clear_torch_cache clear_torch_cache() diff --git a/modules/models_settings.py b/modules/models_settings.py index ae589bb3..c914bdea 100644 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -1,7 +1,11 @@ +import functools import json import re +import subprocess +from math import floor from pathlib import Path +import gradio as gr import yaml from modules import chat, loaders, metadata_gguf, shared, ui @@ -54,7 +58,7 @@ def get_model_metadata(model): else: model_file = list(path.glob('*.gguf'))[0] - metadata = metadata_gguf.load_metadata(model_file) + metadata = load_gguf_metadata_with_cache(model_file) for k in metadata: if k.endswith('context_length'): @@ -67,7 +71,8 @@ def get_model_metadata(model): elif k.endswith('rope.scaling.factor'): model_settings['compress_pos_emb'] = metadata[k] elif k.endswith('block_count'): - model_settings['n_gpu_layers'] = metadata[k] + 1 + model_settings['gpu_layers'] = metadata[k] + 1 + model_settings['max_gpu_layers'] = metadata[k] + 1 if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] @@ -149,7 +154,11 @@ def get_model_metadata(model): for pat in settings: if re.match(pat.lower(), Path(model).name.lower()): for k in settings[pat]: - model_settings[k] = settings[pat][k] + new_k = k + if k == 'n_gpu_layers': + new_k = 'gpu_layers' + + model_settings[new_k] = settings[pat][k] # Load instruction template if defined by name rather than by value if model_settings['instruction_template'] != 'Custom (obtained from model metadata)': @@ -174,8 +183,6 @@ def infer_loader(model_name, model_settings, hf_quant_method=None): loader = 'ExLlamav3_HF' elif re.match(r'.*exl2', model_name.lower()): loader = 'ExLlamav2_HF' - elif re.match(r'.*-hqq', model_name.lower()): - return 'HQQ' else: loader = 'Transformers' @@ -209,15 +216,27 @@ def apply_model_settings_to_state(model, state): model_settings = get_model_metadata(model) if 'loader' in model_settings: loader = model_settings.pop('loader') - - # If the user is using an alternative loader for the same model type, let them keep using it if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2']): state['loader'] = loader for k in model_settings: - if k in state: + if k in state and k != 'gpu_layers': # Skip gpu_layers, handle separately state[k] = model_settings[k] + # Handle GPU layers and VRAM update for llama.cpp + if state['loader'] == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_info, gpu_layers_update = update_gpu_layers_and_vram( + state['loader'], + model, + model_settings['gpu_layers'], + state['ctx_size'], + state['cache_type'], + auto_adjust=True + ) + + state['gpu_layers'] = gpu_layers_update + state['vram_info'] = vram_info 
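+        # With auto_adjust=True the returned layer count is presumably clamped so that the
+        # estimate_vram() prediction fits in the free VRAM reported by get_nvidia_vram();
+        # update_gpu_layers_and_vram() itself is defined further down in this file.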
+ return state @@ -277,3 +296,197 @@ def save_instruction_template(model, template): yield (f"Instruction template for `{model}` unset in `{p}`, as the value for template was `{template}`.") else: yield (f"Instruction template for `{model}` saved to `{p}` as `{template}`.") + + +@functools.lru_cache(maxsize=1) +def load_gguf_metadata_with_cache(model_file): + return metadata_gguf.load_metadata(model_file) + + +def get_model_size_mb(model_file: Path) -> float: + filename = model_file.name + + # Check for multipart pattern + match = re.match(r'(.+)-\d+-of-\d+\.gguf$', filename) + + if match: + # It's a multipart file, find all matching parts + base_pattern = match.group(1) + part_files = sorted(model_file.parent.glob(f'{base_pattern}-*-of-*.gguf')) + total_size = sum(p.stat().st_size for p in part_files) + else: + # Single part + total_size = model_file.stat().st_size + + return total_size / (1024 ** 2) # Return size in MB + + +def estimate_vram(gguf_file, gpu_layers, ctx_size, cache_type): + model_file = Path(f'{shared.args.model_dir}/{gguf_file}') + metadata = load_gguf_metadata_with_cache(model_file) + size_in_mb = get_model_size_mb(model_file) + + # Extract values from metadata + n_layers = None + n_kv_heads = None + embedding_dim = None + + for key, value in metadata.items(): + if key.endswith('.block_count'): + n_layers = value + elif key.endswith('.attention.head_count_kv'): + n_kv_heads = max(value) if isinstance(value, list) else value + elif key.endswith('.embedding_length'): + embedding_dim = value + + if gpu_layers > n_layers: + gpu_layers = n_layers + + # Convert cache_type to numeric + if cache_type == 'q4_0': + cache_type = 4 + elif cache_type == 'q8_0': + cache_type = 8 + else: + cache_type = 16 + + # Derived features + size_per_layer = size_in_mb / max(n_layers, 1e-6) + kv_cache_factor = n_kv_heads * cache_type * ctx_size + embedding_per_context = embedding_dim / ctx_size + + # Calculate VRAM using the model + # Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/ + vram = ( + (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor) + * (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632))) + + 1516.522943869404 + ) + + return vram + + +def get_nvidia_vram(return_free=True): + """ + Calculates VRAM statistics across all NVIDIA GPUs by parsing nvidia-smi output. + + Args: + return_free (bool): If True, returns free VRAM. If False, returns total VRAM. + + Returns: + int: Either the total free VRAM or total VRAM in MiB summed across all detected NVIDIA GPUs. + Returns -1 if nvidia-smi command fails (not found, error, etc.). + Returns 0 if nvidia-smi succeeds but no GPU memory info found. 
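The `estimate_vram` helper above implements the fitted GGUF VRAM formula referenced in the comment. As a rough, self-contained illustration, the same arithmetic with made-up inputs (a ~4.2 GB single-file GGUF, 32 layers, 8 KV heads, 4096-dim embeddings, 8192-token context, fp16 cache) looks like this:

```python
from math import floor

# Hypothetical inputs; real values come from the GGUF header and the file size on disk.
size_in_mb = 4200.0      # model size in MB (multipart files are summed, as above)
n_layers = 32            # *.block_count
n_kv_heads = 8           # *.attention.head_count_kv
embedding_dim = 4096     # *.embedding_length
ctx_size = 8192
cache_bits = 16          # fp16 cache; q8_0 -> 8, q4_0 -> 4
gpu_layers = 33          # block_count + 1, capped at n_layers

gpu_layers = min(gpu_layers, n_layers)
size_per_layer = size_in_mb / max(n_layers, 1e-6)
kv_cache_factor = n_kv_heads * cache_bits * ctx_size
embedding_per_context = embedding_dim / ctx_size

# Same fitted formula as in the diff (https://oobabooga.github.io/blog/posts/gguf-vram-formula/)
vram_mb = (
    (size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
    * (gpu_layers + max(0.9690636483914102, cache_bits - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
    + 1516.522943869404
)
print(f"{vram_mb:.0f} MiB")  # a ballpark projection, not a measurement
```

Further down, `update_gpu_layers_and_vram` compares this estimate against the VRAM reported by `nvidia-smi` and decrements `gpu_layers` until the projection fits, keeping a fixed safety margin (577 MiB in the diff).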
+ """ + try: + # Execute nvidia-smi command + result = subprocess.run( + ['nvidia-smi'], + capture_output=True, + text=True, + check=False + ) + + # Check if nvidia-smi returned an error + if result.returncode != 0: + return -1 + + # Parse the output for memory usage patterns + output = result.stdout + + # Find memory usage like "XXXXMiB / YYYYMiB" + # Captures used and total memory for each GPU + matches = re.findall(r"(\d+)\s*MiB\s*/\s*(\d+)\s*MiB", output) + + if not matches: + # No GPUs found in expected format + return 0 + + total_vram_mib = 0 + total_free_vram_mib = 0 + + for used_mem_str, total_mem_str in matches: + try: + used_mib = int(used_mem_str) + total_mib = int(total_mem_str) + total_vram_mib += total_mib + total_free_vram_mib += (total_mib - used_mib) + except ValueError: + # Skip malformed entries + pass + + # Return either free or total VRAM based on the flag + return total_free_vram_mib if return_free else total_vram_mib + + except FileNotFoundError: + # nvidia-smi not found (likely no NVIDIA drivers installed) + return -1 + except Exception: + # Handle any other unexpected exceptions + return -1 + + +def update_gpu_layers_and_vram(loader, model, gpu_layers, ctx_size, cache_type, auto_adjust=False, for_ui=True): + """ + Unified function to handle GPU layers and VRAM updates. + + Args: + for_ui: If True, returns Gradio updates. If False, returns raw values. + + Returns: + - If for_ui=True: (vram_info_update, gpu_layers_update) or just vram_info_update + - If for_ui=False: (vram_usage, adjusted_layers) or just vram_usage + """ + if loader != 'llama.cpp' or model in ["None", None] or not model.endswith(".gguf"): + vram_info = "
Estimated VRAM to load the model:
" + if for_ui: + return (vram_info, gr.update()) if auto_adjust else vram_info + else: + return (0, gpu_layers) if auto_adjust else 0 + + current_layers = gpu_layers + max_layers = gpu_layers + + if auto_adjust: + # Get model settings including user preferences + model_settings = get_model_metadata(model) + + # Get the true maximum layers + max_layers = model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', gpu_layers)) + + # Check if this is a user-saved setting + user_config = shared.user_config + model_regex = Path(model).name + '$' + has_user_setting = model_regex in user_config and 'gpu_layers' in user_config[model_regex] + + if has_user_setting: + # For user settings, just use the current value (which already has user pref) + # but ensure the slider maximum is correct + current_layers = gpu_layers # Already has user setting + else: + # No user setting, auto-adjust from the maximum + current_layers = max_layers # Start from max + + # Auto-adjust based on available/total VRAM + # If a model is loaded and it's for the UI, use the total VRAM to avoid confusion + return_free = False if (for_ui and shared.model_name not in [None, 'None']) else True + available_vram = get_nvidia_vram(return_free=return_free) + if available_vram > 0: + tolerance = 577 + while current_layers > 0 and estimate_vram(model, current_layers, ctx_size, cache_type) > available_vram - tolerance: + current_layers -= 1 + + # Calculate VRAM with current layers + vram_usage = estimate_vram(model, current_layers, ctx_size, cache_type) + + if for_ui: + vram_info = f"
Estimated VRAM to load the model: {vram_usage:.0f} MiB
" + if auto_adjust: + return vram_info, gr.update(value=current_layers, maximum=max_layers) + else: + return vram_info + else: + if auto_adjust: + return vram_usage, current_layers + else: + return vram_usage diff --git a/modules/presets.py b/modules/presets.py index a432bf52..cf706605 100644 --- a/modules/presets.py +++ b/modules/presets.py @@ -11,7 +11,7 @@ from modules.logging_colors import logger def default_preset(): - return { + result = { 'temperature': 1, 'dynatemp_low': 1, 'dynatemp_high': 1, @@ -46,10 +46,17 @@ def default_preset(): 'do_sample': True, 'dynamic_temperature': False, 'temperature_last': False, - 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', + 'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntop_n_sigma\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram', 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', } + if shared.args.portable: + samplers = result['sampler_priority'].split('\n') + samplers = [sampler for sampler in samplers if sampler in ["dry", "top_k", "top_p", "top_n_sigma", "min_p", "temperature", "xtc", "typical_p", "repetition_penalty"]] + result['sampler_priority'] = '\n'.join(samplers) + + return result + def presets_params(): return [k for k in default_preset()] diff --git a/modules/shared.py b/modules/shared.py index fb10c014..d2305f30 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -60,7 +60,6 @@ settings = { 'custom_stopping_strings': '', 'custom_token_bans': '', 'negative_prompt': '', - 'autoload_model': False, 'dark_theme': True, 'default_extensions': [], 'instruction_template_str': "{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if not ns.found -%}\n {{- '' + 'Below is an instruction that describes a task. Write a response that appropriately completes the request.' + '\\n\\n' -}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' -%}\n {{- '' + message['content'] + '\\n\\n' -}}\n {%- else -%}\n {%- if message['role'] == 'user' -%}\n {{-'### Instruction:\\n' + message['content'] + '\\n\\n'-}}\n {%- else -%}\n {{-'### Response:\\n' + message['content'] + '\\n\\n' -}}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{-'### Response:\\n'-}}\n{%- endif -%}", @@ -88,7 +87,7 @@ group.add_argument('--idle-timeout', type=int, default=0, help='Unload model aft # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, HQQ, TensorRT-LLM.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. 
Valid options: Transformers, llama.cpp, ExLlamav3_HF, ExLlamav2_HF, ExLlamav2, TensorRT-LLM.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -121,7 +120,7 @@ group.add_argument('--threads-batch', type=int, default=0, help='Number of threa group.add_argument('--batch-size', type=int, default=256, help='Maximum number of prompt tokens to batch together when calling llama_eval.') group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') -group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') +group.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N', help='Number of layers to offload to the GPU.') group.add_argument('--tensor-split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--no-kv-offload', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') @@ -130,9 +129,9 @@ group.add_argument('--extra-flags', type=str, default=None, help='Extra flags to group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') # Cache -group = parser.add_argument_group('Context and cache management') +group = parser.add_argument_group('Context and cache') group.add_argument('--ctx-size', '--n_ctx', '--max_seq_len', type=int, default=8192, metavar='N', help='Context size in tokens.') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') +group.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8 (can specify k_bits and v_bits separately, e.g. q4_q8).') # Speculative decoding group = parser.add_argument_group('Speculative decoding') @@ -153,18 +152,10 @@ group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to n group.add_argument('--num_experts_per_token', type=int, default=2, metavar='N', help='Number of experts to use for generation. Applies to MoE models like Mixtral.') group.add_argument('--enable_tp', action='store_true', help='Enable Tensor Parallelism (TP) in ExLlamaV2.') -# HQQ -group = parser.add_argument_group('HQQ') -group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. 
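The renamed flags above keep their old spellings as aliases, so existing launch scripts and `CMD_FLAGS.txt` entries keep working; argparse derives the attribute name from the first long option. A quick standalone check (not project code):

```python
import argparse

parser = argparse.ArgumentParser()
# The first long option ('--gpu-layers') determines the attribute name: args.gpu_layers
parser.add_argument('--gpu-layers', '--n-gpu-layers', type=int, default=256, metavar='N')
parser.add_argument('--cache-type', '--cache_type', type=str, default='fp16', metavar='N')

args = parser.parse_args(['--n-gpu-layers', '33', '--cache_type', 'q8_0'])
print(args.gpu_layers, args.cache_type)  # 33 q8_0
```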
Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') - # TensorRT-LLM group = parser.add_argument_group('TensorRT-LLM') group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') -# Cache -group = parser.add_argument_group('Cache') -group.add_argument('--cache_type', type=str, default='fp16', help='KV cache type; valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4.') - # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -190,6 +181,7 @@ group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certific group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') group.add_argument('--old-colors', action='store_true', help='Use the legacy Gradio colors, before the December/2024 update.') +group.add_argument('--portable', action='store_true', help='Hide features not available in portable mode like training.') # API group = parser.add_argument_group('API') @@ -267,8 +259,6 @@ def fix_loader_name(name): return 'ExLlamav2_HF' elif name in ['exllamav3-hf', 'exllamav3_hf', 'exllama-v3-hf', 'exllama_v3_hf', 'exllama-v3_hf', 'exllama3-hf', 'exllama3_hf', 'exllama-3-hf', 'exllama_3_hf', 'exllama-3_hf']: return 'ExLlamav3_HF' - elif name in ['hqq']: - return 'HQQ' elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: return 'TensorRT-LLM' @@ -311,11 +301,13 @@ if args.api or args.public_api: add_extension('openai', last=True) # Load model-specific settings -with Path(f'{args.model_dir}/config.yaml') as p: - if p.exists(): - model_config = yaml.safe_load(open(p, 'r').read()) - else: - model_config = {} +p = Path(f'{args.model_dir}/config.yaml') +if p.exists(): + model_config = yaml.safe_load(open(p, 'r').read()) +else: + model_config = {} +del p + # Load custom model-specific settings user_config = load_user_config() diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py index 73178c39..0527d493 100644 --- a/modules/tensorrt_llm.py +++ b/modules/tensorrt_llm.py @@ -1,15 +1,15 @@ from pathlib import Path -import torch - import tensorrt_llm +import torch +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp + from modules import shared from modules.logging_colors import logger from modules.text_generation import ( get_max_prompt_length, get_reply_from_output_ids ) -from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp class TensorRTLLMModel: diff --git a/modules/text_generation.py b/modules/text_generation.py index 8d091868..1fd6d810 100644 --- a/modules/text_generation.py +++ b/modules/text_generation.py @@ -14,6 +14,7 @@ from modules.callbacks import Iteratorize from modules.extensions import apply_extensions from modules.html_generator import generate_basic_html from modules.logging_colors import logger +from modules.utils import check_model_loaded def generate_reply(*args, **kwargs): @@ -34,8 +35,8 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap # Find the appropriate generation function generate_func = apply_extensions('custom_generate_reply') if generate_func is None: - if shared.model_name == 'None' or shared.model is None: - 
logger.error("No model is loaded! Select one in the Model tab.") + model_is_loaded, error_message = check_model_loaded() + if not model_is_loaded: yield '' return @@ -471,7 +472,7 @@ def generate_reply_HF(question, original_question, state, stopping_strings=None, t1 = time.time() original_tokens = len(original_input_ids[0]) new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0) - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return @@ -480,7 +481,7 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N For models that do not use the transformers library for sampling """ - seed = set_manual_seed(state['seed']) + state['seed'] = set_manual_seed(state['seed']) t0 = time.time() reply = '' try: @@ -500,15 +501,15 @@ def generate_reply_custom(question, original_question, state, stopping_strings=N t1 = time.time() original_tokens = len(encode(original_question)[0]) new_tokens = len(encode(original_question + reply)[0]) - original_tokens - print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') + logger.info(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {state["seed"]})') return -def print_prompt(prompt, max_chars=2000): +def print_prompt(prompt, max_chars=-1): DARK_YELLOW = "\033[38;5;3m" RESET = "\033[0m" - if len(prompt) > max_chars: + if max_chars > 0 and len(prompt) > max_chars: half_chars = max_chars // 2 hidden_len = len(prompt[half_chars:-half_chars]) hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}" diff --git a/modules/ui.py b/modules/ui.py index fb016f87..9f4d67cb 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -61,7 +61,7 @@ if not shared.args.old_colors: background_fill_primary_dark='var(--darker-gray)', body_background_fill="white", block_background_fill="transparent", - body_text_color="#333", + body_text_color='rgb(64, 64, 64)', button_secondary_background_fill="#f4f4f4", button_secondary_border_color="var(--border-color-primary)", @@ -71,6 +71,7 @@ if not shared.args.old_colors: block_background_fill_dark='transparent', block_border_color_dark='transparent', input_border_color_dark='var(--border-color-dark)', + input_border_color_focus_dark='var(--border-color-dark)', checkbox_border_color_dark='var(--border-color-dark)', border_color_primary_dark='var(--border-color-dark)', button_secondary_border_color_dark='var(--border-color-dark)', @@ -89,6 +90,8 @@ if not shared.args.old_colors: checkbox_label_shadow='none', block_shadow='none', block_shadow_dark='none', + input_shadow_focus='none', + input_shadow_focus_dark='none', button_large_radius='0.375rem', button_large_padding='6px 12px', input_radius='0.375rem', @@ -105,11 +108,10 @@ def list_model_elements(): 'filter_by_loader', 'loader', 'cpu_memory', - 'n_gpu_layers', + 'gpu_layers', 'threads', 'threads_batch', 'batch_size', - 'hqq_backend', 'ctx_size', 'cache_type', 'tensor_split', @@ -211,6 +213,15 @@ def list_interface_input_elements(): 'negative_prompt', 'dry_sequence_breakers', 'grammar_string', + 'navigate_message_index', + 'navigate_direction', + 'navigate_message_role', + 'edit_message_index', + 
'edit_message_text', + 'edit_message_role', + 'branch_index', + 'enable_web_search', + 'web_search_pages', ] # Chat elements diff --git a/modules/ui_chat.py b/modules/ui_chat.py index 0d588549..d79aa523 100644 --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -24,7 +24,8 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(elem_id='past-chats-buttons'): - shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', interactive=not mu) + shared.gradio['branch_chat'] = gr.Button('Branch', elem_classes='refresh-button', elem_id='Branch', interactive=not mu) + shared.gradio['branch_index'] = gr.Number(value=-1, precision=0, visible=False, elem_id="Branch-index", interactive=True) shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['Start new chat'] = gr.Button('New chat', elem_classes=['refresh-button', 'focus-on-chat-input']) @@ -46,14 +47,14 @@ def create_ui(): with gr.Row(): with gr.Column(elem_id='chat-col'): - shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': []}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) shared.gradio['display'] = gr.JSON(value={}, visible=False) # Hidden buffer + shared.gradio['html_display'] = gr.HTML(value=chat_html_wrapper({'internal': [], 'visible': [], 'metadata': {}}, '', '', 'chat', 'cai-chat', '')['html'], visible=True) with gr.Row(elem_id="chat-input-row"): with gr.Column(scale=1, elem_id='gr-hover-container'): gr.HTML(value='
', elem_id='gr-hover') with gr.Column(scale=10, elem_id='chat-input-container'): - shared.gradio['textbox'] = gr.Textbox(label='', placeholder='Send a message', elem_id='chat-input', elem_classes=['add_scrollbar']) + shared.gradio['textbox'] = gr.MultimodalTextbox(label='', placeholder='Send a message', file_types=['text', '.pdf'], file_count="multiple", elem_id='chat-input', elem_classes=['add_scrollbar']) shared.gradio['show_controls'] = gr.Checkbox(value=shared.settings['show_controls'], label='Show controls (Ctrl+S)', elem_id='show-controls') shared.gradio['typing-dots'] = gr.HTML(value='
', label='typing', elem_id='typing-container') @@ -70,8 +71,6 @@ def create_ui(): shared.gradio['Remove last'] = gr.Button('Remove last reply (Ctrl + Shift + Backspace)', elem_id='Remove-last') with gr.Row(): - shared.gradio['Replace last reply'] = gr.Button('Replace last reply (Ctrl + Shift + L)', elem_id='Replace-last') - shared.gradio['Copy last reply'] = gr.Button('Copy last reply (Ctrl + Shift + K)', elem_id='Copy-last') shared.gradio['Impersonate'] = gr.Button('Impersonate (Ctrl + Shift + M)', elem_id='Impersonate') with gr.Row(): @@ -79,14 +78,20 @@ def create_ui(): shared.gradio['Send dummy reply'] = gr.Button('Send dummy reply') with gr.Row(): - shared.gradio['send-chat-to-default'] = gr.Button('Send to default') - shared.gradio['send-chat-to-notebook'] = gr.Button('Send to notebook') + shared.gradio['send-chat-to-default'] = gr.Button('Send to Default') + shared.gradio['send-chat-to-notebook'] = gr.Button('Send to Notebook') with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): with gr.Row(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['enable_web_search'] = gr.Checkbox(value=shared.settings.get('enable_web_search', False), label='Activate web search') + + with gr.Row(visible=shared.settings.get('enable_web_search', False)) as shared.gradio['web_search_row']: + shared.gradio['web_search_pages'] = gr.Number(value=shared.settings.get('web_search_pages', 3), precision=0, label='Number of pages to download', minimum=1, maximum=10) + with gr.Row(): shared.gradio['mode'] = gr.Radio(choices=['instruct', 'chat-instruct', 'chat'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. 
In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') @@ -96,6 +101,22 @@ def create_ui(): with gr.Row(): shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) + with gr.Row(): + shared.gradio['count_tokens'] = gr.Button('Count tokens', size='sm') + + shared.gradio['token_display'] = gr.HTML(value='', elem_classes='token-display') + + # Hidden elements for version navigation and editing + with gr.Row(visible=False): + shared.gradio['navigate_message_index'] = gr.Number(value=-1, precision=0, elem_id="Navigate-message-index") + shared.gradio['navigate_direction'] = gr.Textbox(value="", elem_id="Navigate-direction") + shared.gradio['navigate_message_role'] = gr.Textbox(value="", elem_id="Navigate-message-role") + shared.gradio['navigate_version'] = gr.Button(elem_id="Navigate-version") + shared.gradio['edit_message_index'] = gr.Number(value=-1, precision=0, elem_id="Edit-message-index") + shared.gradio['edit_message_text'] = gr.Textbox(value="", elem_id="Edit-message-text") + shared.gradio['edit_message_role'] = gr.Textbox(value="", elem_id="Edit-message-role") + shared.gradio['edit_message'] = gr.Button(elem_id="Edit-message") + def create_chat_settings_ui(): mu = shared.args.multi_user @@ -185,7 +206,7 @@ def create_event_handlers(): shared.gradio['Generate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -193,7 +214,7 @@ def create_event_handlers(): shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( + lambda x: (x, {"text": "", "files": []}), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( lambda: None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.add("_generating")').then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( @@ -221,10 +242,6 @@ def create_event_handlers(): None, None, None, js='() => document.getElementById("chat").parentNode.parentNode.parentNode.classList.remove("_generating")').then( None, None, None, js=f'() => {{{ui.audio_notification_js}}}') - shared.gradio['Replace last reply'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_replace_last_reply_click, 
gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) - shared.gradio['Send dummy message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) @@ -258,7 +275,7 @@ def create_event_handlers(): shared.gradio['branch_chat'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) + chat.handle_branch_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id', 'branch_index'), show_progress=False) shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename-row'), show_progress=False) shared.gradio['rename_to-cancel'].click(lambda: gr.update(visible=False), None, gradio('rename-row'), show_progress=False) @@ -290,7 +307,14 @@ def create_event_handlers(): None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}") shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) - shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) + + shared.gradio['navigate_version'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_navigate_version_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) + + shared.gradio['edit_message'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_edit_message_click, gradio('interface_state'), gradio('history', 'display'), show_progress=False) # Save/delete a character shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) @@ -347,3 +371,13 @@ def create_event_handlers(): None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + + shared.gradio['count_tokens'].click( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.count_prompt_tokens, gradio('textbox', 'interface_state'), gradio('token_display'), show_progress=False) + + shared.gradio['enable_web_search'].change( + lambda x: gr.update(visible=x), + gradio('enable_web_search'), + gradio('web_search_row') + ) diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py index d13bcff7..862b3893 100644 --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -14,6 +14,7 @@ from modules.models_settings import ( get_model_metadata, save_instruction_template, save_model_settings, + update_gpu_layers_and_vram, update_model_parameters ) from modules.utils import gradio @@ -26,71 +27,34 @@ def create_ui(): with gr.Row(): with gr.Column(): with gr.Row(): - with gr.Column(): - with gr.Row(): - shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, 
label='Model', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu) - shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) - shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) + shared.gradio['model_menu'] = gr.Dropdown(choices=utils.get_available_models(), value=lambda: shared.model_name, label='Model', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) + shared.gradio['load_model'] = gr.Button("Load", elem_classes='refresh-button', interactive=not mu) + shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) + shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) - with gr.Column(): - with gr.Row(): - shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) - ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) - shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) - - with gr.Row(): - with gr.Column(): - shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys(), value=None) + shared.gradio['loader'] = gr.Dropdown(label="Model loader", choices=loaders.loaders_and_params.keys() if not shared.args.portable else ['llama.cpp'], value=None) with gr.Blocks(): + gr.Markdown("## Main options") with gr.Row(): with gr.Column(): - shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) - shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) - shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) - shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) - shared.gradio['ctx_size'] = gr.Number(label='ctx-size', precision=0, step=256, value=shared.args.ctx_size, info='Context length. ⚠️ Lower this value if you can\'t load the model. Common values: 2048, 4096, 8192, 16384, 32768, 65536.') - shared.gradio['cache_type'] = gr.Dropdown(label="cache_type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. 
For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. q4_q8).') - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') + shared.gradio['gpu_layers'] = gr.Slider(label="gpu-layers", minimum=0, maximum=get_initial_gpu_layers_max(), step=1, value=shared.args.gpu_layers, info='Must be greater than 0 for the GPU to be used. ⚠️ Lower this value if you can\'t load the model.') + shared.gradio['ctx_size'] = gr.Slider(label='ctx-size', minimum=256, maximum=131072, step=256, value=shared.args.ctx_size, info='Context length. Common values: 4096, 8192, 16384, 32768, 65536, 131072. ⚠️ Lower this value if you can\'t load the model.') shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) - shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) - shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') - shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') - shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.') - shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') - shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') - shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') - + shared.gradio['cache_type'] = gr.Dropdown(label="cache-type", choices=['fp16', 'q8_0', 'q4_0', 'fp8', 'q8', 'q7', 'q6', 'q5', 'q4', 'q3', 'q2'], value=shared.args.cache_type, allow_custom_value=True, info='Valid options: llama.cpp - fp16, q8_0, q4_0; ExLlamaV2 - fp16, fp8, q8, q6, q4; ExLlamaV3 - fp16, q2 to q8. For ExLlamaV3, you can type custom combinations for separate k/v bits (e.g. 
q4_q8).') with gr.Column(): + shared.gradio['vram_info'] = gr.HTML(value=get_initial_vram_info()) + shared.gradio['flash_attn'] = gr.Checkbox(label="flash-attn", value=shared.args.flash_attn, info='Use flash-attention.') + shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming-llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit) shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit) shared.gradio['torch_compile'] = gr.Checkbox(label="torch-compile", value=shared.args.torch_compile, info='Compile the model with torch.compile for improved performance.') - shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.') shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.') - shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') - shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') - shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) - shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') - shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. 
This saves VRAM but reduces the performance.') - shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) - shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) - shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant, info='Used by load-in-4bit.') - shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') - shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') shared.gradio['enable_tp'] = gr.Checkbox(label="enable_tp", value=shared.args.enable_tp, info='Enable Tensor Parallelism (TP).') - shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) - shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) - shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) - shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) - shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') - shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `ctx_size` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') # Speculative decoding @@ -99,15 +63,50 @@ def create_ui(): shared.gradio['model_draft'] = gr.Dropdown(label="model-draft", choices=utils.get_available_models(), value=lambda: shared.args.model_draft, elem_classes='slim-dropdown', info='Draft model. 
Speculative decoding only works with models sharing the same vocabulary (e.g., same model family).', interactive=not mu) ui.create_refresh_button(shared.gradio['model_draft'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) - shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding.') shared.gradio['gpu_layers_draft'] = gr.Slider(label="gpu-layers-draft", minimum=0, maximum=256, value=shared.args.gpu_layers_draft, info='Number of layers to offload to the GPU for the draft model.') + shared.gradio['draft_max'] = gr.Number(label="draft-max", precision=0, step=1, value=shared.args.draft_max, info='Number of tokens to draft for speculative decoding. Recommended value: 4.') shared.gradio['device_draft'] = gr.Textbox(label="device-draft", value=shared.args.device_draft, info='Comma-separated list of devices to use for offloading the draft model. Example: CUDA0,CUDA1') shared.gradio['ctx_size_draft'] = gr.Number(label="ctx-size-draft", precision=0, step=256, value=shared.args.ctx_size_draft, info='Size of the prompt context for the draft model. If 0, uses the same as the main model.') - with gr.Column(): - with gr.Row(): - shared.gradio['autoload_model'] = gr.Checkbox(value=shared.settings['autoload_model'], label='Autoload the model', info='Whether to load the model as soon as it is selected in the Model dropdown.', interactive=not mu) + gr.Markdown("## Other options") + with gr.Accordion("See more options", open=False, elem_classes='tgw-accordion'): + with gr.Row(): + with gr.Column(): + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) + shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) + shared.gradio['batch_size'] = gr.Slider(label="batch_size", minimum=1, maximum=4096, step=1, value=shared.args.batch_size) + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 60,40') + shared.gradio['extra_flags'] = gr.Textbox(label='extra-flags', info='Additional flags to pass to llama-server. Format: "flag1=value1,flag2,flag3=value3". Example: "override-tensor=exps=CPU"', value=shared.args.extra_flags) + shared.gradio['cpu_memory'] = gr.Number(label="Maximum CPU memory in GiB. Use this for CPU offloading.", value=shared.args.cpu_memory) + shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') + shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') + shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
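The RoPE-related fields above are tied together by simple formulas: `rope_freq_base = 10000 * alpha_value ** (64 / 63)`, and `compress_pos_emb` is the target context divided by the model's native context (the inverse of `rope_freq_scale`). A small numeric illustration with assumed values:

```python
# Assumed example values, not recommendations.
alpha_value = 2.5                      # the UI hint suggests 2.5 for ~2x context
rope_freq_base = 10000 * alpha_value ** (64 / 63)
print(round(rope_freq_base))           # ~25366

original_ctx = 4096                    # hypothetical native context of the model
target_ctx = 8192
compress_pos_emb = target_ctx / original_ctx
print(compress_pos_emb)                # 2.0, i.e. 1 / rope_freq_scale
```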
Equal to 1/rope_freq_scale.') + shared.gradio['compute_dtype'] = gr.Dropdown(label="compute_dtype", choices=["bfloat16", "float16", "float32"], value=shared.args.compute_dtype, info='Used by load-in-4bit.') + shared.gradio['quant_type'] = gr.Dropdown(label="quant_type", choices=["nf4", "fp4"], value=shared.args.quant_type, info='Used by load-in-4bit.') + shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') + with gr.Column(): + shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.') + shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) + shared.gradio['row_split'] = gr.Checkbox(label="row_split", value=shared.args.row_split, info='Split the model by rows across GPUs. This may improve multi-gpu performance.') + shared.gradio['no_kv_offload'] = gr.Checkbox(label="no_kv_offload", value=shared.args.no_kv_offload, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') + shared.gradio['no_mmap'] = gr.Checkbox(label="no-mmap", value=shared.args.no_mmap) + shared.gradio['mlock'] = gr.Checkbox(label="mlock", value=shared.args.mlock) + shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') + shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation= eager while loading the model.') + shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) + shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) + shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) + shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) + shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['no_use_fast'] = gr.Checkbox(label="no_use_fast", value=shared.args.no_use_fast, info='Set use_fast=False while loading the tokenizer.') + if not shared.args.portable: + with gr.Row(): + shared.gradio['lora_menu'] = gr.Dropdown(multiselect=True, choices=utils.get_available_loras(), value=shared.lora_names, label='LoRA(s)', elem_classes='slim-dropdown', interactive=not mu) + ui.create_refresh_button(shared.gradio['lora_menu'], lambda: None, lambda: {'choices': utils.get_available_loras(), 'value': shared.lora_names}, 'refresh-button', interactive=not mu) + shared.gradio['lora_menu_apply'] = gr.Button(value='Apply LoRAs', elem_classes='refresh-button', interactive=not mu) + + with gr.Column(): with gr.Tab("Download"): shared.gradio['custom_model_menu'] = gr.Textbox(label="Download model or LoRA", info="Enter the Hugging Face username/model path, for instance: facebook/galactica-125m. To specify a branch, add it at the end after a \":\" character like this: facebook/galactica-125m:main. 
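The download field described above now also accepts direct Hugging Face GGUF URLs, handled in the `download_model_wrapper` hunk that follows. A sketch of the same string parsing, using a hypothetical URL:

```python
# Hypothetical URL; the repo path and filename are extracted the same way as in the diff.
url = "https://huggingface.co/TheOrg/SomeModel-GGUF/resolve/main/somemodel.Q4_K_M.gguf?download=true"

path = url.split("huggingface.co/")[1]
parts = path.split("/")
repo_id = f"{parts[0]}/{parts[1]}" if len(parts) >= 2 else path

filename = url.split("/")[-1].replace("?download=true", "")

print(repo_id)   # TheOrg/SomeModel-GGUF
print(filename)  # somemodel.Q4_K_M.gguf
```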
To download a single file, enter its name in the second box.", interactive=not mu) shared.gradio['download_specific_file'] = gr.Textbox(placeholder="File name (for GGUF models)", show_label=False, max_lines=1, interactive=not mu) @@ -132,11 +131,10 @@ def create_event_handlers(): # In this event handler, the interface state is read and updated # with the model defaults (if any), and then the model is loaded - # unless "autoload_model" is unchecked shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( - load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=True).success( + partial(load_model_wrapper, autoload=False), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( @@ -145,15 +143,31 @@ def create_event_handlers(): partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=True).success( handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) - shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) + shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False).then( + partial(update_gpu_layers_and_vram, auto_adjust=True), gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), gradio('vram_info', 'gpu_layers'), show_progress=False) + shared.gradio['save_model_settings'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) - shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + # For ctx_size and cache_type - auto-adjust GPU layers + for param in ['ctx_size', 'cache_type']: + shared.gradio[param].change( + partial(update_gpu_layers_and_vram, auto_adjust=True), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info', 'gpu_layers'), show_progress=False) + + # For manual gpu_layers changes - only update VRAM + shared.gradio['gpu_layers'].change( + partial(update_gpu_layers_and_vram, auto_adjust=False), + gradio('loader', 'model_menu', 'gpu_layers', 'ctx_size', 'cache_type'), + gradio('vram_info'), show_progress=False) + + if not shared.args.portable: + shared.gradio['lora_menu_apply'].click(load_lora_wrapper, gradio('lora_menu'), gradio('model_status'), show_progress=False) + shared.gradio['download_model_button'].click(download_model_wrapper, gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) shared.gradio['get_file_list'].click(partial(download_model_wrapper, return_links=True), gradio('custom_model_menu', 'download_specific_file'), gradio('model_status'), show_progress=True) - shared.gradio['autoload_model'].change(lambda x: gr.update(visible=not x), gradio('autoload_model'), 
gradio('load_model')) shared.gradio['customized_template_submit'].click(save_instruction_template, gradio('model_menu', 'customized_template'), gradio('model_status'), show_progress=True) @@ -192,6 +206,26 @@ def load_lora_wrapper(selected_loras): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): try: + # Handle direct GGUF URLs + if repo_id.startswith("https://") and ("huggingface.co" in repo_id) and (repo_id.endswith(".gguf") or repo_id.endswith(".gguf?download=true")): + try: + path = repo_id.split("huggingface.co/")[1] + + # Extract the repository ID (first two parts of the path) + parts = path.split("/") + if len(parts) >= 2: + extracted_repo_id = f"{parts[0]}/{parts[1]}" + + # Extract the filename (last part of the path) + filename = repo_id.split("/")[-1] + if "?download=true" in filename: + filename = filename.replace("?download=true", "") + + repo_id = extracted_repo_id + specific_file = filename + except: + pass + if repo_id == "": yield ("Please enter a model path") return @@ -205,6 +239,18 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur yield ("Getting the download links from Hugging Face") links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=False, specific_file=specific_file) + + # Check for multiple GGUF files + gguf_files = [link for link in links if link.lower().endswith('.gguf')] + if len(gguf_files) > 1 and not specific_file: + output = "Multiple GGUF files found. Please copy one of the following filenames to the 'File name' field:\n\n```\n" + for link in gguf_files: + output += f"{Path(link).name}\n" + + output += "```" + yield output + return + if return_links: output = "```\n" for link in links: @@ -252,10 +298,34 @@ def update_truncation_length(current_length, state): return current_length +def get_initial_vram_info(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + return update_gpu_layers_and_vram( + shared.args.loader, + shared.model_name, + shared.args.gpu_layers, + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=False, + for_ui=True + ) + + return "
Estimated VRAM to load the model:
" + + +def get_initial_gpu_layers_max(): + if shared.model_name != 'None' and shared.args.loader == 'llama.cpp': + model_settings = get_model_metadata(shared.model_name) + return model_settings.get('max_gpu_layers', model_settings.get('gpu_layers', 256)) + + return 256 + + def handle_load_model_event_initial(model, state): state = apply_model_settings_to_state(model, state) output = ui.apply_interface_values(state) - update_model_parameters(state) + update_model_parameters(state) # This updates the command-line flags + return output + [state] diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py index 3f609d71..733d0901 100644 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -21,7 +21,7 @@ def create_ui(default_preset): shared.gradio['random_preset'] = gr.Button('🎲', elem_classes='refresh-button') with gr.Column(): - shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()), value="All", elem_classes='slim-dropdown') + shared.gradio['filter_by_loader'] = gr.Dropdown(label="Filter by loader", choices=["All"] + list(loaders.loaders_and_params.keys()) if not shared.args.portable else ['llama.cpp'], value="All", elem_classes='slim-dropdown') with gr.Row(): with gr.Column(): @@ -82,7 +82,7 @@ def create_ui(default_preset): shared.gradio['auto_max_new_tokens'] = gr.Checkbox(value=shared.settings['auto_max_new_tokens'], label='auto_max_new_tokens', info='Expand max_new_tokens to the available context length.') shared.gradio['ban_eos_token'] = gr.Checkbox(value=shared.settings['ban_eos_token'], label='Ban the eos_token', info='Forces the model to never end the generation prematurely.') shared.gradio['add_bos_token'] = gr.Checkbox(value=shared.settings['add_bos_token'], label='Add the bos_token to the beginning of prompts', info='Disabling this can make the replies more creative.') - shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='enable_thinking', info='Used by Qwen3 to toggle mode.') + shared.gradio['enable_thinking'] = gr.Checkbox(value=shared.settings['enable_thinking'], label='Enable thinking', info='Used by Qwen3 to toggle mode.') shared.gradio['skip_special_tokens'] = gr.Checkbox(value=shared.settings['skip_special_tokens'], label='Skip special tokens', info='Some specific models need this unset.') shared.gradio['stream'] = gr.Checkbox(value=shared.settings['stream'], label='Activate text streaming') shared.gradio['static_cache'] = gr.Checkbox(value=shared.settings['static_cache'], label='Static KV cache', info='Use a static cache for improved performance.') diff --git a/modules/ui_session.py b/modules/ui_session.py index 7cf9f6e6..a4eba667 100644 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -23,11 +23,15 @@ def create_ui(): shared.gradio['bool_menu'] = gr.CheckboxGroup(choices=get_boolean_arguments(), value=get_boolean_arguments(active=True), label="Boolean command-line flags", elem_classes='checkboxgroup-table') with gr.Column(): - extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. 
-            extension_status = gr.Markdown()
+            if not shared.args.portable:
+                extension_name = gr.Textbox(lines=1, label='Install or update an extension', info='Enter the GitHub URL below and press Enter. For a list of extensions, see: https://github.com/oobabooga/text-generation-webui-extensions ⚠️ WARNING ⚠️ : extensions can execute arbitrary code. Make sure to inspect their source code before activating them.', interactive=not mu)
+                extension_status = gr.Markdown()
+            else:
+                pass
 
     shared.gradio['theme_state'] = gr.Textbox(visible=False, value='dark' if shared.settings['dark_theme'] else 'light')
 
-    extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
+    if not shared.args.portable:
+        extension_name.submit(clone_or_pull_repository, extension_name, extension_status, show_progress=False)
 
     # Reset interface event
     shared.gradio['reset_interface'].click(
diff --git a/modules/utils.py b/modules/utils.py
index 77324139..577c55b8 100644
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -72,6 +72,20 @@ def natural_keys(text):
     return [atoi(c) for c in re.split(r'(\d+)', text)]
 
 
+def check_model_loaded():
+    if shared.model_name == 'None' or shared.model is None:
+        if len(get_available_models()) == 0:
+            error_msg = "No model is loaded.\n\nTo get started:\n1) Place a GGUF file in your user_data/models folder\n2) Go to the Model tab and select it"
+            logger.error(error_msg)
+            return False, error_msg
+        else:
+            error_msg = "No model is loaded. Please select one in the Model tab."
+            logger.error(error_msg)
+            return False, error_msg
+
+    return True, None
+
+
 def get_available_models():
     # Get all GGUF files
     gguf_files = get_available_ggufs()
@@ -123,7 +137,7 @@ def get_available_models():
     model_dirs = sorted(model_dirs, key=natural_keys)
 
-    return ['None'] + filtered_gguf_files + model_dirs
+    return filtered_gguf_files + model_dirs
 
 
 def get_available_ggufs():
diff --git a/modules/web_search.py b/modules/web_search.py
new file mode 100644
index 00000000..1f670349
--- /dev/null
+++ b/modules/web_search.py
@@ -0,0 +1,129 @@
+import concurrent.futures
+from concurrent.futures import as_completed
+from datetime import datetime
+
+import requests
+from bs4 import BeautifulSoup
+from duckduckgo_search import DDGS
+
+from modules.logging_colors import logger
+
+
+def get_current_timestamp():
+    """Returns the current time in 24-hour format"""
+    return datetime.now().strftime('%b %d, %Y %H:%M')
+
+
+def download_web_page(url, timeout=5):
+    """Download and extract text from a web page"""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=timeout)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+
+        # Get text and clean it up
+        text = soup.get_text()
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+    except Exception as e:
+        logger.error(f"Error downloading {url}: {e}")
+        return f"[Error downloading content from {url}: {str(e)}]"
+
+
+def perform_web_search(query, num_pages=3, max_workers=5):
+    """Perform web search and return results with content"""
try: + with DDGS() as ddgs: + results = list(ddgs.text(query, max_results=num_pages)) + + # Prepare download tasks + download_tasks = [] + for i, result in enumerate(results): + url = result.get('href', '') + title = result.get('title', f'Search Result {i+1}') + download_tasks.append((url, title, i)) + + search_results = [None] * len(download_tasks) # Pre-allocate to maintain order + + # Download pages in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all download tasks + future_to_task = { + executor.submit(download_web_page, task[0]): task + for task in download_tasks + } + + # Collect results as they complete + for future in as_completed(future_to_task): + url, title, index = future_to_task[future] + try: + content = future.result() + search_results[index] = { + 'title': title, + 'url': url, + 'content': content + } + except Exception as e: + logger.error(f"Error downloading {url}: {e}") + # Include failed downloads with empty content + search_results[index] = { + 'title': title, + 'url': url, + 'content': '' + } + + return search_results + + except Exception as e: + logger.error(f"Error performing web search: {e}") + return [] + + +def add_web_search_attachments(history, row_idx, user_message, search_query, state): + """Perform web search and add results as attachments""" + if not search_query: + logger.warning("No search query provided") + return + + try: + logger.info(f"Using search query: {search_query}") + + # Perform web search + num_pages = int(state.get('web_search_pages', 3)) + search_results = perform_web_search(search_query, num_pages) + + if not search_results: + logger.warning("No search results found") + return + + # Add search results as attachments + key = f"user_{row_idx}" + if key not in history['metadata']: + history['metadata'][key] = {"timestamp": get_current_timestamp()} + if "attachments" not in history['metadata'][key]: + history['metadata'][key]["attachments"] = [] + + for result in search_results: + attachment = { + "name": result['title'], + "type": "text/html", + "url": result['url'], + "content": result['content'] + } + history['metadata'][key]["attachments"].append(attachment) + + logger.info(f"Added {len(search_results)} web search results as attachments") + + except Exception as e: + logger.error(f"Error in web search: {e}") diff --git a/one_click.py b/one_click.py index 065afd99..482a6aa9 100644 --- a/one_click.py +++ b/one_click.py @@ -126,7 +126,7 @@ def check_env(): sys.exit(1) # Ensure this is a new environment and not the base environment - if os.environ["CONDA_DEFAULT_ENV"] == "base": + if os.environ.get("CONDA_DEFAULT_ENV", "") == "base": print("Create an environment for this project and activate it. 
Exiting...") sys.exit(1) @@ -222,7 +222,7 @@ def update_pytorch_and_python(): if "+cu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cu124" elif "+rocm" in torver: - install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.1" + install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/rocm6.2.4" elif "+cpu" in torver: install_cmd = f"{base_cmd} --index-url https://download.pytorch.org/whl/cpu" elif "+cxx11" in torver: @@ -273,7 +273,7 @@ def install_webui(): "What is your GPU?", { 'A': 'NVIDIA - CUDA 12.4', - 'B': 'AMD - Linux/macOS only, requires ROCm 6.1', + 'B': 'AMD - Linux/macOS only, requires ROCm 6.2.4', 'C': 'Apple M Series', 'D': 'Intel Arc (beta)', 'N': 'CPU mode' @@ -314,7 +314,7 @@ def install_webui(): if selected_gpu == "NVIDIA": install_pytorch += "--index-url https://download.pytorch.org/whl/cu124" elif selected_gpu == "AMD": - install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.1" + install_pytorch += "--index-url https://download.pytorch.org/whl/rocm6.2.4" elif selected_gpu in ["APPLE", "NONE"]: install_pytorch += "--index-url https://download.pytorch.org/whl/cpu" elif selected_gpu == "INTEL": diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 6f265eba..2c322715 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -13,6 +15,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -30,12 +33,12 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index c8e75ee7..6aeb325e 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,6 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index e54d6d9c..3b052423 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ 
-1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,6 +32,7 @@ sse-starlette==1.6.5 tiktoken # AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.1.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index d714ea3d..8c51459e 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,7 +32,7 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl 
+https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 89f4f576..b9f15d45 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,8 +32,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6-py3-none-any.whl -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9-py3-none-any.whl +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 47ad5759..0877d968 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == 
"3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 334f11df..cab78237 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -29,5 +32,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index e216c9cd..dfd42577 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,7 +1,9 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -13,6 +15,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -30,12 +33,12 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a6/exllamav3-0.0.1a6+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" 
and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav3/releases/download/v0.0.1a9/exllamav3-0.0.1a9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" https://github.com/oobabooga/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 2e631bf0..5d9f84ce 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,6 +1,8 @@ accelerate==1.5.* +beautifulsoup4==4.13.4 colorama datasets +duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* @@ -12,6 +14,7 @@ peft==0.15.* Pillow>=9.5.0 psutil pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index c720daa7..fdae681d 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-win_amd64.whl; platform_system == "Windows" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt deleted file mode 100644 index 7d9c00c0..00000000 --- a/requirements/portable/requirements_amd.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_amd_noavx2.txt b/requirements/portable/requirements_amd_noavx2.txt deleted file mode 100644 index d718c1b1..00000000 --- a/requirements/portable/requirements_amd_noavx2.txt +++ /dev/null @@ -1,18 +0,0 @@ -fastapi==0.112.4 -gradio==4.37.* -jinja2==3.1.6 -markdown -numpy==1.26.* -pydantic==2.8.2 -pyyaml -requests -rich -tqdm - -# API -flask_cloudflared==0.0.14 -sse-starlette==1.6.5 -tiktoken - -# AMD wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+rocm6.1.2avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 9e184b53..a58f39f7 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index ec059716..91ea3a6d 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,6 +18,6 @@ sse-starlette==1.6.5 tiktoken # Mac wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_15_0_arm64.whl; 
platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0-py3-none-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index d473b824..37e5aa40 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx2-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index d3fffb43..dcb2884b 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # llama.cpp (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" 
+https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cpuavx-py3-none-win_amd64.whl; platform_system == "Windows" diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index cdfa6a01..8f1295bb 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+cu124avx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 6f9566ba..21805fe2 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 1a7ce6ed..858b4488 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkan-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 4737321d..569bae99 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,9 +1,12 @@ +beautifulsoup4==4.13.4 +duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* jinja2==3.1.6 markdown numpy==1.26.* pydantic==2.8.2 +PyPDF2==3.0.1 pyyaml requests rich @@ -15,5 +18,5 @@ sse-starlette==1.6.5 tiktoken # CUDA wheels 
-https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" -https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.8.0/llama_cpp_binaries-0.8.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-win_amd64.whl; platform_system == "Windows" +https://github.com/oobabooga/llama-cpp-binaries/releases/download/v0.14.0/llama_cpp_binaries-0.14.0+vulkanavx-py3-none-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" diff --git a/server.py b/server.py index 169578a5..c22ed1f1 100644 --- a/server.py +++ b/server.py @@ -51,6 +51,7 @@ from modules.models import load_model, unload_model_if_idle from modules.models_settings import ( get_fallback_settings, get_model_metadata, + update_gpu_layers_and_vram, update_model_parameters ) from modules.shared import do_cmd_flags_warnings @@ -90,7 +91,7 @@ def create_interface(): 'instruction_template_str': shared.settings['instruction_template_str'], 'prompt_menu-default': shared.settings['prompt-default'], 'prompt_menu-notebook': shared.settings['prompt-notebook'], - 'filter_by_loader': shared.args.loader or 'All' + 'filter_by_loader': (shared.args.loader or 'All') if not shared.args.portable else 'llama.cpp' }) if Path("user_data/cache/pfp_character.png").exists(): @@ -127,7 +128,8 @@ def create_interface(): ui_parameters.create_ui(shared.settings['preset']) # Parameters tab ui_model_menu.create_ui() # Model tab - training.create_ui() # Training tab + if not shared.args.portable: + training.create_ui() # Training tab ui_session.create_ui() # Session tab # Generation events @@ -247,6 +249,20 @@ if __name__ == "__main__": model_settings = get_model_metadata(model_name) update_model_parameters(model_settings, initial=True) # hijack the command-line arguments + # Auto-adjust GPU layers if not provided by user and it's a llama.cpp model + if 'gpu_layers' not in shared.provided_arguments and shared.args.loader == 'llama.cpp' and 'gpu_layers' in model_settings: + vram_usage, adjusted_layers = update_gpu_layers_and_vram( + shared.args.loader, + model_name, + model_settings['gpu_layers'], + shared.args.ctx_size, + shared.args.cache_type, + auto_adjust=True, + for_ui=False + ) + + shared.args.gpu_layers = adjusted_layers + # Load the model shared.model, shared.tokenizer = load_model(model_name) if shared.args.lora: diff --git a/start_linux.sh b/start_linux.sh index 00082f07..e2b00558 100755 --- a/start_linux.sh +++ b/start_linux.sh @@ -1,10 +1,15 @@ #!/usr/bin/env bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch "$@" exit $? fi @@ -61,10 +66,6 @@ if [ ! 
-e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_macos.sh b/start_macos.sh index 628f59cc..bff11bc1 100755 --- a/start_macos.sh +++ b/start_macos.sh @@ -1,10 +1,15 @@ #!/bin/bash +# environment isolation +export PYTHONNOUSERSITE=1 +unset PYTHONPATH +unset PYTHONHOME + cd "$(dirname "${BASH_SOURCE[0]}")" # Portable install case if [ -d "portable_env" ]; then - ./portable_env/bin/python3 server.py --api --auto-launch --api-port 5005 "$@" + ./portable_env/bin/python3 server.py --portable --api --auto-launch --api-port 5005 "$@" exit $? fi @@ -61,10 +66,6 @@ if [ ! -e "$INSTALL_ENV_DIR/bin/python" ]; then exit fi -# environment isolation -export PYTHONNOUSERSITE=1 -unset PYTHONPATH -unset PYTHONHOME export CUDA_PATH="$INSTALL_ENV_DIR" export CUDA_HOME="$CUDA_PATH" diff --git a/start_windows.bat b/start_windows.bat index 451b85e0..f5e66ec2 100755 --- a/start_windows.bat +++ b/start_windows.bat @@ -1,11 +1,16 @@ @echo off setlocal enabledelayedexpansion +@rem environment isolation +set PYTHONNOUSERSITE=1 +set PYTHONPATH= +set PYTHONHOME= + cd /D "%~dp0" @rem Portable install case if exist "portable_env" ( - .\portable_env\python.exe server.py --api --auto-launch %* + .\portable_env\python.exe server.py --portable --api --auto-launch %* exit /b %errorlevel% ) @@ -87,10 +92,6 @@ if not exist "%INSTALL_ENV_DIR%" ( @rem check if conda environment was actually created if not exist "%INSTALL_ENV_DIR%\python.exe" ( echo. && echo Conda environment is empty. && goto end ) -@rem environment isolation -set PYTHONNOUSERSITE=1 -set PYTHONPATH= -set PYTHONHOME= set "CUDA_PATH=%INSTALL_ENV_DIR%" set "CUDA_HOME=%CUDA_PATH%" diff --git a/user_data/settings-template.yaml b/user_data/settings-template.yaml index 20896da3..ce0f77e1 100644 --- a/user_data/settings-template.yaml +++ b/user_data/settings-template.yaml @@ -31,7 +31,6 @@ seed: -1 custom_stopping_strings: '' custom_token_bans: '' negative_prompt: '' -autoload_model: false dark_theme: true default_extensions: [] instruction_template_str: |-
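As a quick way to sanity-check the new modules/web_search.py module outside the UI, here is a minimal sketch that calls its two entry points directly. It assumes the dependencies added above (duckduckgo_search, beautifulsoup4, requests) are installed and that it is run from the repository root so the modules package is importable; the search query is only an example, and the history/state dictionaries are simplified stand-ins for the structures the chat code actually passes in, not the exact objects used by the UI.

```python
# Minimal sketch: exercise modules/web_search.py in isolation (assumptions noted above).
from modules.web_search import add_web_search_attachments, perform_web_search

# perform_web_search() returns a list of {'title', 'url', 'content'} dicts,
# preserved in result order even though pages are downloaded in parallel.
results = perform_web_search("llama.cpp gpu offloading", num_pages=2)
for result in results:
    print(result['title'], result['url'], len(result['content']))

# add_web_search_attachments() stores the same results as attachments under
# history['metadata']['user_<row_idx>'] (the key is only created if results were found).
history = {'metadata': {}}          # illustrative stand-in for the chat history dict
state = {'web_search_pages': 2}     # illustrative stand-in for the UI state dict
add_web_search_attachments(history, 0, "user message", "llama.cpp gpu offloading", state)
for attachment in history['metadata'].get('user_0', {}).get('attachments', []):
    print(attachment['name'], attachment['url'])
```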