diff --git a/modules/llama_cpp_server.py b/modules/llama_cpp_server.py index 123f9471..5319e7af 100644 --- a/modules/llama_cpp_server.py +++ b/modules/llama_cpp_server.py @@ -8,6 +8,7 @@ import time import llama_cpp_binaries import requests +import sseclient from modules import shared from modules.logging_colors import logger @@ -138,42 +139,43 @@ class LlamaServer: pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) print() - # Make a direct request with streaming enabled - response = requests.post(url, json=payload, stream=True) - response.raise_for_status() # Raise an exception for HTTP errors + # Configure headers for Server-Sent Events + headers = { + 'Content-Type': 'application/json', + 'Accept': 'text/event-stream' + } + + response = requests.post(url, json=payload, stream=True, headers=headers) + response.raise_for_status() + + # Initialize SSE client for proper event stream parsing + client = sseclient.SSEClient(response) full_text = "" - # Process the streaming response - for line in response.iter_lines(): + for event in client.events(): if shared.stop_everything: break - if line: - try: - # Check if the line starts with "data: " and remove it - line_str = line.decode('utf-8') - if line_str.startswith('data: '): - line_str = line_str[6:] # Remove the "data: " prefix + try: + # Handle stream termination marker + if event.data == '[DONE]': + break - # Parse the JSON data - data = json.loads(line_str) + data = json.loads(event.data) - # Extract the token content - if 'content' in data: - token_text = data['content'] - full_text += token_text - yield full_text + if 'content' in data: + token_text = data['content'] + full_text += token_text + yield full_text - # Check if generation is complete - if data.get('stop', False): - break + if data.get('stop', False): + break - except json.JSONDecodeError as e: - # Log the error and the problematic line - print(f"JSON decode error: {e}") - print(f"Problematic line: {line}") - continue + except json.JSONDecodeError as e: + print(f"JSON decode error: {e}") + print(f"Problematic data: {event.data}") + continue def generate(self, prompt, state): output = "" diff --git a/requirements.txt b/requirements.txt index 607efda0..b6759806 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_amd.txt b/requirements_amd.txt index b242d4ad..e156bc55 100644 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt index b6105209..6becd514 100644 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt index ce730f63..1223b4d3 100644 --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt index a7be282d..f47f9991 100644 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt index 2437c2ae..007f9ef1 100644 --- a/requirements_cpu_only.txt +++ b/requirements_cpu_only.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt index cbaa8e96..4219273d 100644 --- a/requirements_cpu_only_noavx2.txt +++ b/requirements_cpu_only_noavx2.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt index cce27aa2..58e0e5a1 100644 --- a/requirements_noavx2.txt +++ b/requirements_noavx2.txt @@ -19,6 +19,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.* diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt index 3b61ca39..c210f6a0 100644 --- a/requirements_nowheels.txt +++ b/requirements_nowheels.txt @@ -18,6 +18,7 @@ requests rich safetensors==0.5.* scipy +sseclient-py==1.8.0 sentencepiece tensorboard transformers==4.50.*