Attempt at making the llama-server streaming more efficient.

This commit is contained in:
oobabooga 2025-04-18 18:04:49 -07:00
parent 4fabd729c9
commit 5ad080ff25
10 changed files with 37 additions and 26 deletions

View file

@@ -8,6 +8,7 @@ import time
import llama_cpp_binaries import llama_cpp_binaries
import requests import requests
import sseclient
from modules import shared from modules import shared
from modules.logging_colors import logger from modules.logging_colors import logger
@@ -138,41 +139,42 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print() print()
# Make a direct request with streaming enabled # Configure headers for Server-Sent Events
response = requests.post(url, json=payload, stream=True) headers = {
response.raise_for_status() # Raise an exception for HTTP errors 'Content-Type': 'application/json',
'Accept': 'text/event-stream'
}
response = requests.post(url, json=payload, stream=True, headers=headers)
response.raise_for_status()
# Initialize SSE client for proper event stream parsing
client = sseclient.SSEClient(response)
full_text = "" full_text = ""
# Process the streaming response for event in client.events():
for line in response.iter_lines():
if shared.stop_everything: if shared.stop_everything:
break break
if line:
try: try:
# Check if the line starts with "data: " and remove it # Handle stream termination marker
line_str = line.decode('utf-8') if event.data == '[DONE]':
if line_str.startswith('data: '): break
line_str = line_str[6:] # Remove the "data: " prefix
# Parse the JSON data data = json.loads(event.data)
data = json.loads(line_str)
# Extract the token content
if 'content' in data: if 'content' in data:
token_text = data['content'] token_text = data['content']
full_text += token_text full_text += token_text
yield full_text yield full_text
# Check if generation is complete
if data.get('stop', False): if data.get('stop', False):
break break
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
# Log the error and the problematic line
print(f"JSON decode error: {e}") print(f"JSON decode error: {e}")
print(f"Problematic line: {line}") print(f"Problematic data: {event.data}")
continue continue
def generate(self, prompt, state): def generate(self, prompt, state):

View file

@@ -19,6 +19,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -19,6 +19,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*