Attempt to make llama-server streaming more efficient by parsing the response with sseclient instead of decoding and stripping "data: " lines by hand.

2025-06-07 14:17:09 -04:00 · 2025-04-18 18:04:49 -07:00 · 2025-04-18 18:04:49 -07:00 · 5ad080ff25
commit 5ad080ff25
parent 4fabd729c9
10 changed files with 37 additions and 26 deletions
--- a/modules/llama_cpp_server.py
+++ b/modules/llama_cpp_server.py
@@ -8,6 +8,7 @@ import time

 import llama_cpp_binaries
 import requests
+import sseclient

 from modules import shared
 from modules.logging_colors import logger
@@ -138,42 +139,43 @@ class LlamaServer:
            pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
            print()

-        # Make a direct request with streaming enabled
-        response = requests.post(url, json=payload, stream=True)
-        response.raise_for_status()  # Raise an exception for HTTP errors
+        # Configure headers for Server-Sent Events
+        headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'text/event-stream'
+        }
+
+        response = requests.post(url, json=payload, stream=True, headers=headers)
+        response.raise_for_status()
+
+        # Initialize SSE client for proper event stream parsing
+        client = sseclient.SSEClient(response)

        full_text = ""

-        # Process the streaming response
-        for line in response.iter_lines():
+        for event in client.events():
            if shared.stop_everything:
                break

-            if line:
-                try:
-                    # Check if the line starts with "data: " and remove it
-                    line_str = line.decode('utf-8')
-                    if line_str.startswith('data: '):
-                        line_str = line_str[6:]  # Remove the "data: " prefix
+            try:
+                # Handle stream termination marker
+                if event.data == '[DONE]':
+                    break

-                    # Parse the JSON data
-                    data = json.loads(line_str)
+                data = json.loads(event.data)

-                    # Extract the token content
-                    if 'content' in data:
-                        token_text = data['content']
-                        full_text += token_text
-                        yield full_text
+                if 'content' in data:
+                    token_text = data['content']
+                    full_text += token_text
+                    yield full_text

-                    # Check if generation is complete
-                    if data.get('stop', False):
-                        break
+                if data.get('stop', False):
+                    break

-                except json.JSONDecodeError as e:
-                    # Log the error and the problematic line
-                    print(f"JSON decode error: {e}")
-                    print(f"Problematic line: {line}")
-                    continue
+            except json.JSONDecodeError as e:
+                print(f"JSON decode error: {e}")
+                print(f"Problematic data: {event.data}")
+                continue

    def generate(self, prompt, state):
        output = ""
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_amd.txt
+++ b/requirements_amd.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_amd_noavx2.txt
+++ b/requirements_amd_noavx2.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_apple_intel.txt
+++ b/requirements_apple_intel.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_apple_silicon.txt
+++ b/requirements_apple_silicon.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -19,6 +19,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -18,6 +18,7 @@ requests
 rich
 safetensors==0.5.*
 scipy
+sseclient-py==1.8.0
 sentencepiece
 tensorboard
 transformers==4.50.*