Attempt at making the llama-server streaming more efficient.

This commit is contained in:
oobabooga 2025-04-18 18:04:49 -07:00
parent 4fabd729c9
commit 5ad080ff25
10 changed files with 37 additions and 26 deletions

View file

@@ -8,6 +8,7 @@ import time
import llama_cpp_binaries import llama_cpp_binaries
import requests import requests
import sseclient
from modules import shared from modules import shared
from modules.logging_colors import logger from modules.logging_colors import logger
@@ -138,41 +139,42 @@ class LlamaServer:
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload) pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
print() print()
# Make a direct request with streaming enabled # Configure headers for Server-Sent Events
response = requests.post(url, json=payload, stream=True) headers = {
response.raise_for_status() # Raise an exception for HTTP errors 'Content-Type': 'application/json',
'Accept': 'text/event-stream'
}
response = requests.post(url, json=payload, stream=True, headers=headers)
response.raise_for_status()
# Initialize SSE client for proper event stream parsing
client = sseclient.SSEClient(response)
full_text = "" full_text = ""
# Process the streaming response for event in client.events():
for line in response.iter_lines():
if shared.stop_everything: if shared.stop_everything:
break break
if line:
try: try:
# Check if the line starts with "data: " and remove it # Handle stream termination marker
line_str = line.decode('utf-8') if event.data == '[DONE]':
if line_str.startswith('data: '): break
line_str = line_str[6:] # Remove the "data: " prefix
# Parse the JSON data data = json.loads(event.data)
data = json.loads(line_str)
# Extract the token content
if 'content' in data: if 'content' in data:
token_text = data['content'] token_text = data['content']
full_text += token_text full_text += token_text
yield full_text yield full_text
# Check if generation is complete
if data.get('stop', False): if data.get('stop', False):
break break
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
# Log the error and the problematic line
print(f"JSON decode error: {e}") print(f"JSON decode error: {e}")
print(f"Problematic line: {line}") print(f"Problematic data: {event.data}")
continue continue
def generate(self, prompt, state): def generate(self, prompt, state):

View file

@@ -19,6 +19,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -19,6 +19,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*

View file

@@ -18,6 +18,7 @@ requests
rich rich
safetensors==0.5.* safetensors==0.5.*
scipy scipy
sseclient-py==1.8.0
sentencepiece sentencepiece
tensorboard tensorboard
transformers==4.50.* transformers==4.50.*