Merge pull request #6797 from oobabooga/dev

Merge dev branch
This commit is contained in:
oobabooga 2025-03-15 00:11:25 -03:00 committed by GitHub
commit 80cdbe4e09
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 676 additions and 228 deletions

View file

@ -249,8 +249,8 @@ button {
}
.pretty_scrollbar::-webkit-scrollbar {
width: 7px;
height: 7px;
width: 8px;
height: 8px;
}
.pretty_scrollbar::-webkit-scrollbar-track {
@ -295,7 +295,7 @@ audio {
width: 0;
text-align: left;
direction: rtl;
right: 5px;
right: 13px;
}
#default-token-counter {
@ -1163,7 +1163,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}
.header_bar button.selected {
background: white;
background: #E0E0E0;
}
#chat-controls,

View file

@ -21,6 +21,7 @@ class GenerationOptions(BaseModel):
eta_cutoff: float = 0
tfs: float = 1
top_a: float = 0
top_n_sigma: float = 0
dry_multiplier: float = 0
dry_allowed_length: int = 2
dry_base: float = 1.75

View file

@ -1,9 +1,14 @@
import time
import html
import functools
import re
import gradio
import numpy as np
import torch
from transformers import LogitsProcessor
import colorsys
from modules import html_generator, shared
@ -28,7 +33,7 @@ class PerplexityLogits(LogitsProcessor):
self.verbose = verbose
def __call__(self, input_ids, scores):
# t0 = time.time()
#t0 = time.time()
probs = torch.softmax(scores, dim=-1, dtype=torch.float)
log_probs = torch.nan_to_num(torch.log(probs)) # Note: This is to convert log(0) nan to 0, but probs*log_probs makes this 0 not affect the perplexity.
entropy = -torch.sum(probs * log_probs)
@ -42,9 +47,8 @@ class PerplexityLogits(LogitsProcessor):
if len(self.selected_probs) > 0:
# Is the selected token in the top tokens?
if self.verbose:
print('Probs: Token after', shared.tokenizer.decode(last_token_id))
print('Probs:', [shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1][0]])
print('Probs:', [round(float(prob), 4) for prob in self.top_probs_list[-1][0]])
print(shared.tokenizer.decode(last_token_id), [shared.tokenizer.decode(token_id) for token_id in self.top_token_ids_list[-1][0]],
[round(float(prob), 4) for prob in self.top_probs_list[-1][0]])
if last_token_id in self.top_token_ids_list[-1][0]:
idx = self.top_token_ids_list[-1][0].index(last_token_id)
self.selected_probs.append(self.top_probs_list[-1][0][idx])
@ -60,7 +64,7 @@ class PerplexityLogits(LogitsProcessor):
pplbar = "-"
if not np.isnan(perplexity):
pplbar = "*" * round(perplexity)
print(f"PPL: Token after {shared.tokenizer.decode(last_token_id)}\t{perplexity:.2f}\t{pplbar}")
print(f"PPL for token after {shared.tokenizer.decode(last_token_id)}: {perplexity:.2f} {pplbar}")
# Get top 5 probabilities
top_tokens_and_probs = torch.topk(probs, 5)
@ -73,14 +77,15 @@ class PerplexityLogits(LogitsProcessor):
probs = probs.cpu().numpy().flatten()
self.last_probs = probs # Need to keep this as a reference for top probs
# t1 = time.time()
# print(f"PPL Processor: {(t1-t0):.3f} s")
#t1 = time.time()
#print(f"PPL Processor: {(t1-t0):.3f} s")
# About 1 ms, though occasionally up to around 100 ms, not sure why...
# Doesn't actually modify the logits!
return scores
# Stores the perplexity and top probabilities
# global ppl_logits_processor
ppl_logits_processor = None
@ -91,130 +96,192 @@ def logits_processor_modifier(logits_processor_list, input_ids):
logits_processor_list.append(ppl_logits_processor)
def get_last_token(text, tokens_list, token_ids_list, token_probs_list):
for token, token_id, prob in zip(tokens_list, token_ids_list, token_probs_list):
if text.strip().endswith(token.strip()): # Whitespace could be a problem
return token, token_id, prob
# Unknown?
print("Last token not found in list:", tokens_list)
return '', -1, 0.0
def output_modifier(text):
global ppl_logits_processor
# t0 = time.time()
#t0 = time.time()
original_text = text
if not params['active']:
if not params['active'] or ppl_logits_processor is None:
return text
# Space at the beginning to account for tokenization spaces...
text = ' ' + html.unescape(text)
# TODO: It's probably more efficient to do this above rather than modifying all these lists
# Remove last element of perplexities_list, top_token_ids_list, top_tokens_list, top_probs_list since everything is off by one because this extension runs before generation
perplexities = ppl_logits_processor.perplexities_list[:-1]
top_token_ids_list = ppl_logits_processor.top_token_ids_list[:-1]
perplexities = ppl_logits_processor.perplexities_list
top_token_ids_list = ppl_logits_processor.top_token_ids_list
top_tokens_list = [[shared.tokenizer.decode(token_id) for token_id in top_token_ids[0]] for top_token_ids in top_token_ids_list]
top_probs_list = ppl_logits_processor.top_probs_list[:-1]
top_probs_list = ppl_logits_processor.top_probs_list
# Remove first element of generated_token_ids, generated_tokens, selected_probs because they are for the last token of the prompt
gen_token_ids = ppl_logits_processor.generated_token_ids[1:]
# Add last sampled token, if possible (it could be past the end of the top 5 list)
last_token, last_token_id, last_prob = get_last_token(text, top_tokens_list[-1], top_token_ids_list[-1][0], top_probs_list[-1][0])
if last_token_id != -1:
gen_token_ids.append(last_token_id)
gen_tokens = [shared.tokenizer.decode(token_id) for token_id in gen_token_ids]
sel_probs = ppl_logits_processor.selected_probs[1:]
if last_token_id != -1:
sel_probs.append(last_prob)
end_part = '</div></div>' if params['probability_dropdown'] else '</span>' # Helps with finding the index after replacing part of the text.
i = 0
for token, prob, ppl, top_tokens, top_probs in zip(gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
# Initial space added to deal with some tokenizers...
# Used to find where the message started generating, for working with "continue" generations
# Doesn't work for longer messages... Not sure how I should handle this
full_msg = shared.tokenizer.decode([token_id for token_id in gen_token_ids[:-1]]).strip()
# There was an issue with tab lengths being off by one...
# Seems like it might be model-dependent...
#text = re.sub(r'( {3,})', r'\1 ', text)
# Subtracting 2 to hopefully help with the tokenization spaces and continue issues,
# Though it's possible it could overwrite the previous token if it's the same in the last 2 chars
i = text.find(full_msg) - 2
if i < 0:
# Backup, try removing the extra whitespace (needed for continue)
i = text.find(full_msg.strip()) - 2
if i < 0:
i = 0
#i = 0
# Add token index for ability to regenerate from there
nonwhitespace_token_found = False
missing_token_count = 0
for index, token, prob, ppl, top_tokens, top_probs in zip(range(len(gen_tokens)), gen_tokens, sel_probs, perplexities, top_tokens_list, top_probs_list):
# Somehow this works without issues, but not sure how...
if not nonwhitespace_token_found and token.strip() == '':
#print('Ignoring initial whitespace token...')
continue
nonwhitespace_token_found = True
max_prob = top_probs[0][0]
color = 'ffffff'
if params['color_by_probability'] and params['color_by_perplexity']:
color = probability_perplexity_color_scale(prob, ppl)
color = probability_perplexity_color_scale(prob, max_prob, ppl)
elif params['color_by_perplexity']:
color = perplexity_color_scale(ppl)
elif params['color_by_probability']:
color = probability_color_scale(prob)
if token in text[i:]:
if token.strip() in text[i:]:
if params['probability_dropdown']:
text = text[:i] + text[i:].replace(token, add_dropdown_html(token, color, top_tokens, top_probs[0], ppl), 1)
text = text[:i] + text[i:].replace(token.replace('\n', ''), add_dropdown_html(token, index, i, color, top_tokens, top_probs[0], ppl), 1)
else:
text = text[:i] + text[i:].replace(token, add_color_html(token, color), 1)
text = text[:i] + text[i:].replace(token.replace('\n', ''), add_color_html(token, color), 1)
# This might be slightly inefficient
i += text[i:].find(end_part) + len(end_part)
else:
missing_token_count += 1
print('Missing token:', token, '...', text[i:i+20])
# If there are any missing tokens, then either the tokenization was off, or this is the start of a conversation, or something else went wrong
if missing_token_count > 5:
print("Canceling token coloring...")
return original_text
# Use full perplexity list for calculating the average here.
print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4))
# t1 = time.time()
# print(f"Modifier: {(t1-t0):.3f} s")
# Fix issue with mean of empty slice
if len(ppl_logits_processor.perplexities_list) > 1:
print('Average perplexity:', round(np.mean(ppl_logits_processor.perplexities_list[:-1]), 4))
#t1 = time.time()
#print(f"Output modifier: {(t1-t0):.3f} s")
# About 50 ms
return text
return text.strip() # Remove extra beginning whitespace that some tokenizers add
def probability_color_scale(prob):
'''
Green-yellow-red color scale
'''
# hue (0.0 = red, 0.33 = green)
# saturation (0.0 = gray / white, 1.0 = normal, just leave at 1.0)
# brightness (0.0 = black, 1.0 = brightest, use something in between for better readability if you want...)
hue = prob * 0.33
rv, gv, bv = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
# to hex
hex_col = f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
rv = 0
gv = 0
if prob <= 0.5:
rv = 'ff'
gv = hex(int(255 * prob * 2))[2:]
if len(gv) < 2:
gv = '0' * (2 - len(gv)) + gv
else:
rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:]
gv = 'ff'
if len(rv) < 2:
rv = '0' * (2 - len(rv)) + rv
return rv + gv + '00'
return hex_col
def perplexity_color_scale(ppl):
'''
Red component only, white for 0 perplexity (sorry if you're not in dark mode)
'''
value = hex(max(int(255.0 - params['ppl_scale'] * (float(ppl) - 1.0)), 0))[2:]
if len(value) < 2:
value = '0' * (2 - len(value)) + value
# hue (0.0 = red)
# saturation (1.0 = red)
# brightness (0.0 = black, 1.0 = red)
# scale saturation from white to red the higher the perplexity
return 'ff' + value + value
ppl = min(ppl, params['ppl_scale']) # clip ppl to 0-params['ppl_scale'] for color scaling. 15 should be fine for clipping and scaling
sat = ppl / params['ppl_scale']
rv, gv, bv = colorsys.hsv_to_rgb(0.0, sat, 1.0)
# to hex
hex_col = f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
return hex_col
def probability_perplexity_color_scale(prob, ppl):
def probability_perplexity_color_scale(prob, max_prob, ppl):
'''
Green-yellow-red for probability and blue component for perplexity
Green-yellow-red for relative probability compared to maximum for the current token, and blue component for perplexity
'''
rv = 0
gv = 0
bv = hex(min(max(int(params['ppl_scale'] * (float(ppl) - 1.0)), 0), 255))[2:]
if len(bv) < 2:
bv = '0' * (2 - len(bv)) + bv
if prob <= 0.5:
rv = 'ff'
gv = hex(int(255 * prob * 2))[2:]
if len(gv) < 2:
gv = '0' * (2 - len(gv)) + gv
else:
rv = hex(int(255 - 255 * (prob - 0.5) * 2))[2:]
gv = 'ff'
if len(rv) < 2:
rv = '0' * (2 - len(rv)) + rv
return rv + gv + bv
hue = prob/max_prob * 0.33
rv, gv, _ = colorsys.hsv_to_rgb(hue, 1.0, 1.0)
ppl = min(ppl, params['ppl_scale']) # clip ppl to 0-params['ppl_scale'] for color scaling. 15 should be fine for clipping and scaling
bv = ppl / params['ppl_scale']
# to hex
hex_col = f"{int(rv*255):02x}{int(gv*255):02x}{int(bv*255):02x}"
return hex_col
def add_color_html(token, color):
return f'<span style="color: #{color}">{token}</span>'
output = ''
output += f'<span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span>'
#if '\n' in token or '\r' in token: #token.isspace():
# output += '<br>'
return output
# TODO: Major issue: Applying this to too many tokens will cause a permanent slowdown in generation speed until the messages are removed from the history.
# TODO: Might also need message index for the click-to-regenerate feature to work... For now it only works in the last message, which I think is fine.
# TODO: Major issue: Applying this to too many tokens will cause a permanent slowdown in generation speed until the messages are removed from the history. The slowdown seems to be mostly resolved in the current version though
# I think the issue is from HTML elements taking up space in the visible history, and things like history deepcopy add latency proportional to the size of the history.
# Potential solution is maybe to modify the main generation code to send just the internal text and not the visible history, to avoid moving too much around.
# I wonder if we can also avoid using deepcopy here.
def add_dropdown_html(token, color, top_tokens, top_probs, perplexity=0):
html = f'<div class="hoverable"><span style="color: #{color}">{token}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
for token_option, prob in zip(top_tokens, top_probs):
def add_dropdown_html(token, index, msg_position, color, top_tokens, top_probs, perplexity=0):
#print("Token:", token, token.isspace(), '\n' in token or '\r' in token)
output = ''
# Use the repr to get characters like \n visible. Exclude the quotes around it
output += f'<div class="hoverable" name="tok_{index}_{msg_position}"><span style="color: #{color}">{html.escape(repr(token)[1:-1])}</span><div class="dropdown"><table class="dropdown-content"><tbody>'
for i, token_option, prob in zip(range(len(top_tokens)), top_tokens, top_probs):
# TODO: Bold for selected token?
# Using divs prevented the problem of divs inside spans causing issues.
# Now the problem is that divs show the same whitespace of one space between every token.
# There is probably some way to fix this in CSS that I don't know about.
row_color = probability_color_scale(prob)
row_class = ' class="selected"' if token_option == token else ''
html += f'<tr{row_class}><td style="color: #{row_color}">{token_option}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
# This time we want to include the quotes around it so that we can see where the spaces are.
output += f'<tr{row_class}><td name="opt_{index}_{i}_{msg_position}" style="color: #{row_color}">{html.escape(repr(token_option))}</td><td style="color: #{row_color}">{prob:.4f}</td></tr>'
if perplexity != 0:
ppl_color = perplexity_color_scale(perplexity)
html += f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>'
html += '</tbody></table></div></div>'
return html # About 750 characters per token...
output += f'<tr><td>Perplexity:</td><td style="color: #{ppl_color}">{perplexity:.4f}</td></tr>'
output += '</tbody></table></div></div>'
#if '\n' in token or '\r' in token: #token.isspace():
# output += '<br>' # I imagine this will cause problems sometimes
return output # About 750 characters per token...
def custom_css():
@ -223,8 +290,8 @@ def custom_css():
display: none;
position: absolute;
z-index: 50;
background-color: var(--block-background-fill);
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
background-color: var(--background-fill-secondary);
box-shadow: 0px 8px 16px 0px rgba(0,0,0,1.0);
width: max-content;
overflow: visible;
padding: 5px;
@ -238,7 +305,7 @@ def custom_css():
}
.dropdown-content tr.selected {
background-color: var(--block-label-background-fill);
background-color: var(--background-fill-primary);
}
.dropdown-content td {
@ -267,21 +334,111 @@ def custom_css():
# TODO: This makes the hover menus extend outside the bounds of the chat area, which is good.
# However, it also makes the scrollbar disappear, which is bad.
# The scroll bar needs to still be present. So for now, we can't see dropdowns that extend past the edge of the chat area.
#.chat {
# overflow-y: auto;
#}
.chat {
overflow-y: auto;
}
"""
def custom_js():
return """
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
// Note that this will only work as intended on the last agent message
document.addEventListener("click", async function(event) {
//console.log(event.target);
const name = event.target.getAttribute("name");
if (name != null && name.includes("opt_")) {
const name_parts = name.split("_");
const token_index = name_parts[1];
const option_index = name_parts[2];
const msg_pos = name_parts[3];
// Exclude the quotes and convert newlines... Not sure about the newlines though
// TODO: Seems like continuing generation from a newline causes problems whether you add it or not!
const token_string = event.target.innerHTML.substring(1, event.target.innerHTML.length-1).replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
//console.log(token_index + ", " + option_index + ", " + token_string);
// Get all the previous text (I'm sure there is a more efficient way to do this)
var msg_text = ""
const msg_html = event.target.parentElement.parentElement.parentElement.parentElement.parentElement.parentElement;
var msg_parts = msg_html.childNodes;
for (var i = 0; i < msg_parts.length; i++) {
var msg_part = msg_parts[i];
if (msg_part.nodeType === Node.ELEMENT_NODE) {
if (msg_part.nodeName == "DIV") {
msg_part_name = msg_part.getAttribute("name")
if (msg_part_name != null) {
var current_token_index = msg_part_name.split("_")[1];
var current_message_pos = msg_part_name.split("_")[2];
if (current_token_index == token_index && current_message_pos == msg_pos) {
// Use the replacement token
// TODO: Don't have access to the tokenizer here, and sometimes there needs to be a space added before this token
msg_text += token_string //.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '');
break;
}
else {
// Replace here or at the end?
var text = msg_part.firstChild.innerHTML.replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"r", "g"), '').replace(new RegExp(String.fromCharCode(92)+String.fromCharCode(92)+"n", "g"), '')
msg_text += text;
}
}
}
else {
// Break tag (hacky workaround because the newline literal can't be parsed here)
//msg_text += String.fromCharCode(10);
// Do nothing???
}
}
else if (msg_part.nodeType === Node.TEXT_NODE) {
msg_text += msg_part.textContent;
}
}
var textbox = document.querySelector("#chat-input textarea");
textbox.focus();
textbox.value = msg_text.trimStart() // Fix initial tokenization spaces
//console.log(textbox.value);
// Add some delays to make sure it's processed correctly. Without these, there's a chance the events don't go through correctly and it doesn't work
// It's unknown how long this will take, and probably depends on the size of the message...
// It would be better to somehow wait for gradio to update instead of waiting a fixed amount of time.
// Hopefully 1 second of delay before starting generation isn't unacceptable.
var inputEvent = new Event('input', {
bubbles: true,
cancelable: true,
});
textbox.dispatchEvent(inputEvent);
var changeEvent = new Event('change', {
bubbles: true,
cancelable: true,
});
textbox.dispatchEvent(changeEvent);
await sleep(250);
document.getElementById("Replace-last").click();
// This can take a while to execute
await sleep(750);
document.getElementById("Continue").click();
}
});
console.log("Custom JS for perplexity_colors loaded");
"""
# Monkeypatch applied to html_generator.py
# We simply don't render markdown into HTML. We wrap everything in <pre> tags to preserve whitespace
# formatting. If you're coloring tokens by perplexity or probability, or especially if you're using
# the probability dropdown, you probably care more about seeing the tokens the model actually outputted
# rather than rendering ```code blocks``` or *italics*.
@functools.lru_cache(maxsize=4096)
def convert_to_markdown(string):
return '<pre>' + string + '</pre>'
def convert_to_markdown_wrapped(string, use_cache=True):
if use_cache:
return convert_to_markdown(string)
return convert_to_markdown.__wrapped__(string)
# This is still necessary for formatting to work correctly
html_generator.convert_to_markdown = convert_to_markdown
@ -298,7 +455,7 @@ def ui():
def update_prob_dropdown_check(x):
params.update({'probability_dropdown': x})
active_check = gradio.Checkbox(value=True, label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with exllama or llama.cpp.")
active_check = gradio.Checkbox(value=True, label="Compute probabilities and perplexity scores", info="Activate this extension. Note that this extension currently does not work with llama.cpp, but it does work with ExLlamav2_HF and llamacpp_HF when set up correctly")
color_by_ppl_check = gradio.Checkbox(value=False, label="Color by perplexity", info="Higher perplexity is more red. If also showing probability, higher perplexity has more blue component.")
color_by_prob_check = gradio.Checkbox(value=False, label="Color by probability", info="Green-yellow-red linear scale, with 100% green, 50% yellow, 0% red.")
prob_dropdown_check = gradio.Checkbox(value=False, label="Probability dropdown", info="Hover over a token to show a dropdown of top token probabilities. Currently slightly buggy with whitespace between tokens.")

View file

@ -1,5 +1,41 @@
# superboogav2
# SuperboogaV2
For a description, please see the comments in this Pull Request:
Enhance your LLM with additional information from text, URLs, and files for more accurate and context-aware responses.
https://github.com/oobabooga/text-generation-webui/pull/3272
---
## Installation and Activation
1. Start the conda environment by running `cmd_windows.bat` or the equivalent for your system in the root directory of `text-generation-webui`.
2. Install the necessary packages:
```
pip install -r extensions/superboogav2/requirements.txt
```
3. Activate the extension in the `Session` tab of the web UI.
4. Click on `Apply flags/extensions and restart`. Optionally save the configuration by clicking on `Save UI defaults to settings.yaml`.
## Usage and Features
After activation, you can scroll further down in the chat UI to reveal the SuperboogaV2 interface. Here, you can add extra information to your chats through text input, multiple URLs, or by providing multiple files subject to the context window limit of your model.
The extra information and the current date and time are provided to the model as embeddings that persist across conversations. To clear them, click the `Clear Data` button and start a new chat. You can adjust the text extraction parameters and other options in the `Settings`.
## Supported File Formats
SuperboogaV2 utilizes MuPDF, pandas, python-docx, and python-pptx to extract text from various file formats, including:
- TXT
- PDF
- EPUB
- HTML
- CSV
- ODT/ODS/ODP
- DOCX/PPTX/XLSX
## Additional Information
SuperboogaV2 processes your data into context-aware chunks, applies cleaning techniques, and stores them as embeddings to minimize redundant computations. Relevance is determined using distance calculations and prioritization of recent information.
For a detailed description and more information, refer to the comments in this pull request: [https://github.com/oobabooga/text-generation-webui/pull/3272](https://github.com/oobabooga/text-generation-webui/pull/3272)

View file

@ -1,7 +1,7 @@
import math
import random
import threading
import torch
import chromadb
import numpy as np
import posthog
@ -16,9 +16,6 @@ from modules.text_generation import decode, encode
posthog.capture = lambda *args, **kwargs: None
embedder = embedding_functions.SentenceTransformerEmbeddingFunction("sentence-transformers/all-mpnet-base-v2")
class Info:
def __init__(self, start_index, text_with_context, distance, id):
self.text_with_context = text_with_context
@ -77,11 +74,23 @@ class Info:
class ChromaCollector():
def __init__(self):
name = ''.join(random.choice('ab') for _ in range(10))
name = "".join(random.choice("ab") for _ in range(10))
self.name = name
self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
self.collection = self.chroma_client.create_collection(name=name, embedding_function=embedder)
self.embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
"sentence-transformers/all-mpnet-base-v2",
device=("cuda" if torch.cuda.is_available() else "cpu"),
)
chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
self.collection = chroma_client.create_collection(
name=self.name,
embedding_function=self.embedder,
metadata={
"hnsw:search_ef": 200,
"hnsw:construction_ef": 200,
"hnsw:M": 64,
},
)
self.ids = []
self.id_to_info = {}
@ -110,7 +119,7 @@ class ChromaCollector():
# If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead.
if non_existing_texts:
non_existing_embeddings = embedder(non_existing_texts)
non_existing_embeddings = self.embedder(non_existing_texts)
for text, embedding in zip(non_existing_texts, non_existing_embeddings):
self.embeddings_cache[text] = embedding
@ -139,7 +148,7 @@ class ChromaCollector():
id_ = new_ids[i]
metadata = metadatas[i] if metadatas is not None else None
embedding = self.embeddings_cache.get(text)
if embedding:
if embedding is not None and embedding.any():
existing_texts.append(text)
existing_embeddings.append(embedding)
existing_ids.append(id_)
@ -323,6 +332,8 @@ class ChromaCollector():
def delete(self, ids_to_delete: list[str], where: dict):
with self.lock:
ids_to_delete = self.collection.get(ids=ids_to_delete, where=where)['ids']
if not ids_to_delete:
return
self.collection.delete(ids=ids_to_delete, where=where)
# Remove the deleted ids from self.ids and self.id_to_info
@ -335,12 +346,7 @@ class ChromaCollector():
def clear(self):
with self.lock:
self.chroma_client.reset()
self.ids = []
self.chroma_client.delete_collection(name=self.name)
self.collection = self.chroma_client.create_collection(name=self.name, embedding_function=embedder)
self.__init__() # reinitialize the collector
logger.info('Successfully cleared all records and reset chromaDB.')

View file

@ -127,6 +127,9 @@
"default": "\n\n<<document end>>\n\n"
},
"manual": {
"default": false
},
"add_date_time": {
"default": true
},
"add_chat_to_data": {

View file

@ -6,6 +6,7 @@ It will only include full words.
import bisect
import re
from datetime import datetime
import extensions.superboogav2.parameters as parameters
@ -154,6 +155,13 @@ def process_and_add_to_collector(corpus: str, collector: ChromaCollector, clear_
data_chunks_with_context = []
data_chunk_starting_indices = []
if parameters.get_add_date_time():
now = datetime.now()
date_time_chunk = f"Current time is {now.strftime('%H:%M:%S')}. Today is {now.strftime('%A')}. The current date is {now.strftime('%Y-%m-%d')}."
data_chunks.append(date_time_chunk)
data_chunks_with_context.append(date_time_chunk)
data_chunk_starting_indices.append(0)
# Handling chunk_regex
if parameters.get_chunk_regex():
if parameters.get_chunk_separator():

View file

@ -39,11 +39,11 @@ def _markdown_hyperparams():
# Convert numpy types to python types.
def _convert_np_types(params):
for key in params:
if type(params[key]) == np.bool_:
if isinstance(params[key], np.bool_):
params[key] = bool(params[key])
elif type(params[key]) == np.int64:
elif isinstance(params[key], np.int64):
params[key] = int(params[key])
elif type(params[key]) == np.float64:
elif isinstance(params[key], np.float64):
params[key] = float(params[key])
return params

View file

@ -251,6 +251,10 @@ def get_is_manual() -> bool:
return bool(Parameters.getInstance().hyperparameters['manual']['default'])
def get_add_date_time() -> bool:
return bool(Parameters.getInstance().hyperparameters['add_date_time']['default'])
def get_add_chat_to_data() -> bool:
return bool(Parameters.getInstance().hyperparameters['add_chat_to_data']['default'])
@ -331,6 +335,10 @@ def set_manual(value: bool):
Parameters.getInstance().hyperparameters['manual']['default'] = value
def set_add_date_time(value: bool):
Parameters.getInstance().hyperparameters['add_date_time']['default'] = value
def set_add_chat_to_data(value: bool):
Parameters.getInstance().hyperparameters['add_chat_to_data']['default'] = value

View file

@ -1,10 +1,16 @@
beautifulsoup4==4.12.2
chromadb==0.4.24
beautifulsoup4==4.13.3
chromadb==0.6.3
lxml
nltk
optuna
pandas==2.0.3
posthog==2.4.2
sentence_transformers==2.2.2
pandas
posthog==3.13.0
sentence_transformers==3.3.1
spacy
pytextrank
num2words
PyMuPDF
python-docx
python-pptx
openpyxl
odfpy

View file

@ -9,6 +9,13 @@ os.environ['NLTK_DATA'] = str(Path("extensions/superboogav2/nltk_data").resolve(
import codecs
import textwrap
import docx
import pptx
import fitz
fitz.TOOLS.mupdf_display_errors(False)
import pandas as pd
from odf.opendocument import load
from odf.draw import Page
import gradio as gr
@ -46,11 +53,123 @@ def _feed_data_into_collector(corpus):
yield '### Done.'
def _feed_file_into_collector(file):
yield '### Reading and processing the input dataset...'
text = file.decode('utf-8')
process_and_add_to_collector(text, collector, False, create_metadata_source('file'))
yield '### Done.'
def _feed_file_into_collector(files):
if not files:
logger.warning("No files selected.")
return
def read_binary_file(file_path):
try:
with open(file_path, 'rb') as f:
return f.read()
except Exception:
logger.error(f"Failed to read {file_path}.")
return None
def extract_with_utf8(text):
try:
return text.decode('utf-8')
except Exception:
return ""
def extract_with_fitz(file_content):
try:
with fitz.open(stream=file_content, filetype=None) as doc:
num_pages = doc.page_count
text = "\n".join(block[4] for page in doc for block in page.get_text("blocks") if block[6] == 0)
logger.info(f"Extracted text from {num_pages} pages with fitz.")
return text
except Exception:
return ""
def extract_with_docx(file_path):
try:
paragraphs = docx.Document(file_path).paragraphs
text = "\n".join(para.text for para in paragraphs)
logger.info(f"Extracted text from {len(paragraphs)} paragraphs with docx.")
return text
except Exception:
return ""
def extract_with_pptx(file_path):
try:
slides = pptx.Presentation(file_path).slides
text = "\n".join(
shape.text for slide in slides for shape in slide.shapes if hasattr(shape, "text")
)
logger.info(f"Extracted text from {len(slides)} slides with pptx.")
return text
except Exception:
return ""
def extract_with_odf(file_path):
if not file_path.endswith(".odp"):
return ""
try:
doc = load(file_path)
text_content = []
def extract_text(element):
parts = []
if hasattr(element, "childNodes"):
for node in element.childNodes:
if node.nodeType == node.TEXT_NODE:
parts.append(node.data)
else:
parts.append(extract_text(node))
return "".join(parts)
for slide in doc.getElementsByType(Page):
slide_text = extract_text(slide)
if slide_text.strip():
text_content.append(slide_text.strip())
text = "\n".join(text_content)
logger.info(f"Extracted text from {len(doc.getElementsByType(Page))} slides with odf.")
return text
except Exception as e:
logger.error(f"Failed to extract text from {file_path}: {str(e)}")
return ""
def extract_with_pandas(file_path):
try:
df = pd.read_excel(file_path)
text = "\n".join(str(cell) for col in df.columns for cell in df[col])
logger.info(f"Extracted text from {df.shape[0]}x{df.shape[1]} cells with pandas.")
return text
except Exception:
return ""
for index, file in enumerate(files, start=1):
file_name = os.path.basename(file)
logger.info(f"Processing {file_name}...")
file_content = read_binary_file(file)
if not file_content:
continue
text_extractors = [
lambda: extract_with_utf8(file_content),
lambda: extract_with_fitz(file_content),
lambda: extract_with_docx(file),
lambda: extract_with_pptx(file),
lambda: extract_with_odf(file),
lambda: extract_with_pandas(file),
]
for extractor in text_extractors:
text = extractor()
if text:
break
if not text:
logger.error(f"Failed to extract text from {file_name}, unsupported format.")
continue
process_and_add_to_collector(text, collector, False, create_metadata_source(f"file-{index}"))
logger.info("Done.")
yield "### Done."
def _feed_url_into_collector(urls):
@ -107,7 +226,7 @@ def _get_optimizable_settings() -> list:
def _apply_settings(optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, add_date_time, postfix, data_separator, prefix, max_token_count,
chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup):
logger.debug('Applying settings.')
@ -124,6 +243,7 @@ def _apply_settings(optimization_steps, time_power, time_steepness, significant_
parameters.set_injection_strategy(injection_strategy)
parameters.set_add_chat_to_data(add_chat_to_data)
parameters.set_manual(manual)
parameters.set_add_date_time(add_date_time)
parameters.set_postfix(codecs.decode(postfix, 'unicode_escape'))
parameters.set_data_separator(codecs.decode(data_separator, 'unicode_escape'))
parameters.set_prefix(codecs.decode(prefix, 'unicode_escape'))
@ -237,11 +357,11 @@ def ui():
url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
strong_cleanup = gr.Checkbox(value=parameters.get_is_strong_cleanup(), label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
threads = gr.Number(value=parameters.get_num_threads(), label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
update_url = gr.Button('Load data')
update_urls = gr.Button('Load data')
with gr.Tab("File input"):
file_input = gr.File(label='Input file', type='binary')
update_file = gr.Button('Load data')
file_input = gr.File(label="Input file", type="filepath", file_count="multiple")
update_files = gr.Button('Load data')
with gr.Tab("Settings"):
with gr.Accordion("Processing settings", open=True):
@ -258,6 +378,7 @@ def ui():
postfix = gr.Textbox(value=codecs.encode(parameters.get_postfix(), 'unicode_escape').decode(), label='Postfix', info='What to put after the injection point.')
with gr.Row():
manual = gr.Checkbox(value=parameters.get_is_manual(), label="Is Manual", info="Manually specify when to use ChromaDB. Insert `!c` at the start or end of the message to trigger a query.", visible=shared.is_chat())
add_date_time = gr.Checkbox(value=parameters.get_add_date_time(), label="Add date and time to Data", info="Make the current date and time available to the model.", visible=shared.is_chat())
add_chat_to_data = gr.Checkbox(value=parameters.get_add_chat_to_data(), label="Add Chat to Data", info="Automatically feed the chat history as you chat.", visible=shared.is_chat())
injection_strategy = gr.Radio(choices=[parameters.PREPEND_TO_LAST, parameters.APPEND_TO_LAST, parameters.HIJACK_LAST_IN_CONTEXT], value=parameters.get_injection_strategy(), label='Injection Strategy', info='Where to inject the messages in chat or instruct mode.', visible=shared.is_chat())
with gr.Row():
@ -313,14 +434,14 @@ def ui():
last_updated = gr.Markdown()
all_params = [optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, add_date_time, postfix, data_separator, prefix, max_token_count,
chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup]
optimizable_params = [time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
preprocess_pipeline, chunk_count, context_len, chunk_len]
update_data.click(_feed_data_into_collector, [data_input], last_updated, show_progress=False)
update_url.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False)
update_file.click(_feed_file_into_collector, [file_input], last_updated, show_progress=False)
update_urls.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False)
update_files.click(_feed_file_into_collector, [file_input], last_updated, show_progress=False)
benchmark_button.click(_begin_benchmark, [], last_updated, show_progress=True)
optimize_button.click(_begin_optimization, [], [last_updated] + optimizable_params, show_progress=True)
clear_button.click(_clear_data, [], last_updated, show_progress=False)
@ -339,6 +460,7 @@ def ui():
api_on.input(fn=_apply_settings, inputs=all_params, show_progress=False)
injection_strategy.input(fn=_apply_settings, inputs=all_params, show_progress=False)
add_chat_to_data.input(fn=_apply_settings, inputs=all_params, show_progress=False)
add_date_time.input(fn=_apply_settings, inputs=all_params, show_progress=False)
manual.input(fn=_apply_settings, inputs=all_params, show_progress=False)
postfix.input(fn=_apply_settings, inputs=all_params, show_progress=False)
data_separator.input(fn=_apply_settings, inputs=all_params, show_progress=False)

View file

@ -11,6 +11,7 @@ from pathlib import Path
import gradio as gr
import yaml
from jinja2.ext import loopcontrols
from jinja2.sandbox import ImmutableSandboxedEnvironment
from PIL import Image
@ -35,7 +36,11 @@ def strftime_now(format):
return datetime.now().strftime(format)
jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
jinja_env = ImmutableSandboxedEnvironment(
trim_blocks=True,
lstrip_blocks=True,
extensions=[loopcontrols]
)
jinja_env.globals["strftime_now"] = strftime_now

View file

@ -121,5 +121,45 @@ def monkey_patch_llama_cpp_python(lib):
lib.Llama.original_generate = lib.Llama.generate
lib.Llama.generate = my_generate
# Also patch Jinja2ChatFormatter to handle loop controls
if hasattr(lib, 'llama_chat_format') and hasattr(lib.llama_chat_format, 'Jinja2ChatFormatter'):
Formatter = lib.llama_chat_format.Jinja2ChatFormatter
if not getattr(Formatter, '_is_patched', False):
def patched_init(self, *args, **kwargs):
    """Drop-in replacement for Jinja2ChatFormatter.__init__ that builds the
    chat-template environment with the `loopcontrols` extension enabled,
    so templates using {% break %} / {% continue %} render instead of erroring.
    """
    # Accept both positional and keyword calling conventions, mirroring the
    # original signature: (template, eos_token, bos_token,
    # add_generation_prompt=True, stop_token_ids=None).
    if args:
        self.template = args[0]
        self.eos_token = args[1] if len(args) > 1 else kwargs.get('eos_token')
        self.bos_token = args[2] if len(args) > 2 else kwargs.get('bos_token')
        self.add_generation_prompt = args[3] if len(args) > 3 else kwargs.get('add_generation_prompt', True)
        self.stop_token_ids = args[4] if len(args) > 4 else kwargs.get('stop_token_ids')
    else:
        self.template = kwargs.get('template')
        self.eos_token = kwargs.get('eos_token')
        self.bos_token = kwargs.get('bos_token')
        self.add_generation_prompt = kwargs.get('add_generation_prompt', True)
        self.stop_token_ids = kwargs.get('stop_token_ids')

    # Normalize stop tokens to a set, as the original __init__ does.
    self.stop_token_ids = (
        set(self.stop_token_ids) if self.stop_token_ids is not None else None
    )

    # Import the sandbox submodule explicitly: `import jinja2` alone does not
    # guarantee the `jinja2.sandbox` attribute exists — it is only set on the
    # package once the submodule has been imported somewhere.
    import jinja2
    import jinja2.sandbox
    from jinja2.ext import loopcontrols

    # Recreate the template environment with loopcontrols enabled.
    self._environment = jinja2.sandbox.ImmutableSandboxedEnvironment(
        loader=jinja2.BaseLoader(),
        trim_blocks=True,
        lstrip_blocks=True,
        extensions=[loopcontrols]
    ).from_string(self.template)
# Replace the original __init__ with our patched version
Formatter.__init__ = patched_init
Formatter._is_patched = True
# Set the flag to indicate that the patch has been applied
lib.Llama._is_patched = True

View file

@ -137,6 +137,7 @@ def transformers_samplers():
'eta_cutoff',
'tfs',
'top_a',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',
@ -224,6 +225,7 @@ loaders_samplers = {
'eta_cutoff',
'tfs',
'top_a',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',
@ -288,6 +290,7 @@ loaders_samplers = {
'eta_cutoff',
'tfs',
'top_a',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',

View file

@ -28,6 +28,7 @@ def default_preset():
'eta_cutoff': 0,
'tfs': 1,
'top_a': 0,
'top_n_sigma': 0,
'dry_multiplier': 0,
'dry_allowed_length': 2,
'dry_base': 1.75,
@ -45,7 +46,7 @@ def default_preset():
'do_sample': True,
'dynamic_temperature': False,
'temperature_last': False,
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'sampler_priority': 'repetition_penalty\npresence_penalty\nfrequency_penalty\ndry\ntemperature\ndynamic_temperature\nquadratic_sampling\ntop_n_sigma\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat\nxtc\nencoder_repetition_penalty\nno_repeat_ngram',
'dry_sequence_breakers': '"\\n", ":", "\\"", "*"',
}

View file

@ -5,7 +5,6 @@ import random
import torch
import transformers
from transformers import LogitsWarper
from transformers.generation.logits_process import (
LogitNormalization,
LogitsProcessor,
@ -19,7 +18,7 @@ from modules.models import get_device
global_scores = None
class TemperatureLogitsWarperCustom(LogitsWarper):
class TemperatureLogitsWarperCustom(LogitsProcessor):
'''
A copy of the original Transformers temperature logits warper.
'''
@ -42,7 +41,7 @@ class TemperatureLogitsWarperCustom(LogitsWarper):
return scores
class DynamicTemperatureLogitsWarper(LogitsWarper):
class DynamicTemperatureLogitsWarper(LogitsProcessor):
'''
Dynamic temperature.
'''
@ -100,7 +99,7 @@ class DynamicTemperatureLogitsWarper(LogitsWarper):
return scores
class QuadraticSamplingLogitsWarper(LogitsWarper):
class QuadraticSamplingLogitsWarper(LogitsProcessor):
'''
Quadratic sampling with smoothing factor and smoothing curve parameters.
'''
@ -127,7 +126,7 @@ class QuadraticSamplingLogitsWarper(LogitsWarper):
return transformed_logits
class TailFreeLogitsWarper(LogitsWarper):
class TailFreeLogitsWarper(LogitsProcessor):
def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
tfs = float(tfs)
if tfs < 0 or tfs > 1.0:
@ -167,7 +166,7 @@ class TailFreeLogitsWarper(LogitsWarper):
return scores
class TopALogitsWarper(LogitsWarper):
class TopALogitsWarper(LogitsProcessor):
def __init__(self, top_a: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
top_a = float(top_a)
if top_a < 0 or top_a > 1.0:
@ -193,8 +192,48 @@ class TopALogitsWarper(LogitsWarper):
return scores
class TopNSigmaLogitsWarper(LogitsProcessor):
    """Top-nσ sampling: keep only the tokens whose logits lie within
    `n_sigma` standard deviations of the per-row maximum logit; everything
    below that threshold is set to `filter_value`.
    """

    def __init__(self, n_sigma: float = 2.0, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
        # n_sigma: threshold multiplier for the standard deviation.
        # filter_value: logit value assigned to filtered-out tokens.
        # min_tokens_to_keep: lower bound on the number of surviving tokens.
        if n_sigma < 0:
            raise ValueError(f"`n_sigma` must be a non-negative float, but is {n_sigma}")

        self.n_sigma = n_sigma
        self.filter_value = filter_value
        self.min_tokens_to_keep = min_tokens_to_keep

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Reference point: the largest logit in each row.
        peak = torch.max(scores, dim=-1, keepdim=True)[0]

        # Standard deviation over finite entries only: non-finite logits
        # (e.g. values already filtered to -inf) are zeroed before the
        # reduction so they cannot turn the statistic into NaN/inf.
        zeroed = torch.where(torch.isfinite(scores), scores, torch.zeros_like(scores))
        spread = torch.std(zeroed, dim=-1, keepdim=True)

        # Tokens with logits below max - n_sigma * std get filtered.
        cutoff = peak - self.n_sigma * spread
        drop = scores < cutoff

        # Guarantee at least min_tokens_to_keep survivors by un-marking the
        # top-k scoring positions.
        if self.min_tokens_to_keep > 1:
            keep_idx = torch.topk(scores, self.min_tokens_to_keep, dim=-1)[1]
            drop.scatter_(-1, keep_idx, False)

        return scores.masked_fill(drop, self.filter_value)
# Exclude Top Choices (XTC)
class XTCLogitsWarper(LogitsWarper):
class XTCLogitsWarper(LogitsProcessor):
def __init__(self, threshold: float, probability: float, filter_value: float = -float("Inf")):
self.threshold = threshold
self.probability = probability
@ -312,7 +351,7 @@ class DRYLogitsProcessor(LogitsProcessor):
return scores
class MirostatLogitsWarper(LogitsWarper):
class MirostatLogitsWarper(LogitsProcessor):
def __init__(self, mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
if mirostat_mode not in [2]:
raise ValueError(f"`mirostat` has to be a an integer 2, but is {mirostat_mode}")
@ -361,7 +400,7 @@ class MirostatLogitsWarper(LogitsWarper):
return scores
class SpyLogitsWarper(LogitsWarper):
class SpyLogitsWarper(LogitsProcessor):
def __init__(self):
pass
@ -525,6 +564,14 @@ def get_logits_processor_patch(self, **kwargs):
)
)
if generation_config.top_n_sigma is not None and generation_config.top_n_sigma > 0.0:
warpers_to_add.append(
TopNSigmaLogitsWarper(
n_sigma=generation_config.top_n_sigma,
min_tokens_to_keep=min_tokens_to_keep
)
)
if generation_config.xtc_probability is not None and generation_config.xtc_probability > 0:
warpers_to_add.append(
XTCLogitsWarper(
@ -589,6 +636,7 @@ def get_logits_processor_patch(self, **kwargs):
'TailFreeLogitsWarper': 'tfs',
'TemperatureLogitsWarperCustom': 'temperature',
'TopALogitsWarper': 'top_a',
'TopNSigmaLogitsWarper': 'top_n_sigma',
'TopKLogitsWarper': 'top_k',
'TopPLogitsWarper': 'top_p',
'TypicalLogitsWarper': 'typical_p',
@ -636,6 +684,7 @@ def generation_config_init_patch(self, **kwargs):
self.smoothing_curve = kwargs.pop("smoothing_curve", 1.0)
self.tfs = kwargs.pop("tfs", 1.0)
self.top_a = kwargs.pop("top_a", 0.0)
self.top_n_sigma = kwargs.pop("top_n_sigma", 0.0)
self.mirostat_mode = kwargs.pop("mirostat_mode", 0)
self.mirostat_eta = kwargs.pop("mirostat_eta", 0.1)
self.mirostat_tau = kwargs.pop("mirostat_tau", 5)
@ -649,7 +698,7 @@ def generation_config_init_patch(self, **kwargs):
self.xtc_threshold = kwargs.pop("xtc_threshold", 0.1)
self.xtc_probability = kwargs.pop("xtc_probability", 0)
self.temperature_last = kwargs.pop("temperature_last", False)
self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'xtc', 'encoder_repetition_penalty', 'no_repeat_ngram'])
self.sampler_priority = kwargs.pop("sampler_priority", ['repetition_penalty', 'presence_penalty', 'frequency_penalty', 'dry', 'temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_n_sigma', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat', 'xtc', 'encoder_repetition_penalty', 'no_repeat_ngram'])
def hijack_samplers():

View file

@ -302,6 +302,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
'xtc_probability',
'tfs',
'top_a',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',

View file

@ -183,6 +183,7 @@ def list_interface_input_elements():
'eta_cutoff',
'tfs',
'top_a',
'top_n_sigma',
'dry_multiplier',
'dry_allowed_length',
'dry_base',

View file

@ -37,6 +37,7 @@ def create_ui(default_preset):
gr.Markdown('## Curve cutoff')
shared.gradio['min_p'] = gr.Slider(0.0, 1.0, value=generate_params['min_p'], step=0.01, label='min_p')
shared.gradio['top_n_sigma'] = gr.Slider(0.0, 5.0, value=generate_params['top_n_sigma'], step=0.01, label='top_n_sigma')
shared.gradio['top_p'] = gr.Slider(0.0, 1.0, value=generate_params['top_p'], step=0.01, label='top_p')
shared.gradio['top_k'] = gr.Slider(0, 200, value=generate_params['top_k'], step=1, label='top_k')
shared.gradio['typical_p'] = gr.Slider(0.0, 1.0, value=generate_params['typical_p'], step=0.01, label='typical_p')

View file

@ -1,11 +1,11 @@
accelerate==1.3.*
accelerate==1.4.*
bitsandbytes==0.45.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -21,7 +21,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -32,29 +32,29 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# CUDA wheels
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -31,14 +31,14 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# AMD wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.7+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.7+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.8+rocm6.1.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.3.8+rocm6.1.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -31,12 +31,12 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# AMD wheels
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.1.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64"

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -31,8 +31,8 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_15_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -31,10 +31,10 @@ sse-starlette==1.6.5
tiktoken
# Mac wheels
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.7-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_15_0_arm64.whl; platform_system == "Darwin" and platform_release >= "24.0.0" and platform_release < "25.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.3.8-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -31,7 +31,7 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -31,7 +31,7 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"

View file

@ -1,11 +1,11 @@
accelerate==1.3.*
accelerate==1.4.*
bitsandbytes==0.45.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -21,7 +21,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb
@ -32,29 +32,29 @@ sse-starlette==1.6.5
tiktoken
# llama-cpp-python (CPU only, no AVX2)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.7+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.3.8+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
# llama-cpp-python (CUDA, with GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.7+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.3.8+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# llama-cpp-python (CUDA, without GGML_CUDA_FORCE_MMQ)
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.7+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.3.8+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
# CUDA wheels
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.7/exllamav2-0.2.7-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu121.torch2.4.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
https://github.com/oobabooga/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
https://github.com/oobabooga/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu122torch2.4.1cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"

View file

@ -1,10 +1,10 @@
accelerate==1.3.*
accelerate==1.4.*
colorama
datasets
einops
fastapi==0.112.4
gradio==4.37.*
jinja2==3.1.5
jinja2==3.1.6
markdown
numba==0.59.*
numpy==1.26.*
@ -20,7 +20,7 @@ safetensors==0.5.*
scipy
sentencepiece
tensorboard
transformers==4.48.*
transformers==4.49.*
tqdm
wandb