Mirror of https://github.com/oobabooga/text-generation-webui.git (synced 2025-06-07 22:25:54 -04:00)
Attempt at making the llama-server streaming more efficient.
This commit is contained in:
Parent: 4fabd729c9
Commit: 5ad080ff25
10 changed files with 37 additions and 26 deletions
|
@ -8,6 +8,7 @@ import time
|
||||||
|
|
||||||
import llama_cpp_binaries
|
import llama_cpp_binaries
|
||||||
import requests
|
import requests
|
||||||
|
import sseclient
|
||||||
|
|
||||||
from modules import shared
|
from modules import shared
|
||||||
from modules.logging_colors import logger
|
from modules.logging_colors import logger
|
||||||
|
@ -138,41 +139,42 @@ class LlamaServer:
|
||||||
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
|
pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(printable_payload)
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# Make a direct request with streaming enabled
|
# Configure headers for Server-Sent Events
|
||||||
response = requests.post(url, json=payload, stream=True)
|
headers = {
|
||||||
response.raise_for_status() # Raise an exception for HTTP errors
|
'Content-Type': 'application/json',
|
||||||
|
'Accept': 'text/event-stream'
|
||||||
|
}
|
||||||
|
|
||||||
|
response = requests.post(url, json=payload, stream=True, headers=headers)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
# Initialize SSE client for proper event stream parsing
|
||||||
|
client = sseclient.SSEClient(response)
|
||||||
|
|
||||||
full_text = ""
|
full_text = ""
|
||||||
|
|
||||||
# Process the streaming response
|
for event in client.events():
|
||||||
for line in response.iter_lines():
|
|
||||||
if shared.stop_everything:
|
if shared.stop_everything:
|
||||||
break
|
break
|
||||||
|
|
||||||
if line:
|
|
||||||
try:
|
try:
|
||||||
# Check if the line starts with "data: " and remove it
|
# Handle stream termination marker
|
||||||
line_str = line.decode('utf-8')
|
if event.data == '[DONE]':
|
||||||
if line_str.startswith('data: '):
|
break
|
||||||
line_str = line_str[6:] # Remove the "data: " prefix
|
|
||||||
|
|
||||||
# Parse the JSON data
|
data = json.loads(event.data)
|
||||||
data = json.loads(line_str)
|
|
||||||
|
|
||||||
# Extract the token content
|
|
||||||
if 'content' in data:
|
if 'content' in data:
|
||||||
token_text = data['content']
|
token_text = data['content']
|
||||||
full_text += token_text
|
full_text += token_text
|
||||||
yield full_text
|
yield full_text
|
||||||
|
|
||||||
# Check if generation is complete
|
|
||||||
if data.get('stop', False):
|
if data.get('stop', False):
|
||||||
break
|
break
|
||||||
|
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
# Log the error and the problematic line
|
|
||||||
print(f"JSON decode error: {e}")
|
print(f"JSON decode error: {e}")
|
||||||
print(f"Problematic line: {line}")
|
print(f"Problematic data: {event.data}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
def generate(self, prompt, state):
|
def generate(self, prompt, state):
|
||||||
|
|
|
@ -19,6 +19,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -19,6 +19,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
|
@ -18,6 +18,7 @@ requests
|
||||||
rich
|
rich
|
||||||
safetensors==0.5.*
|
safetensors==0.5.*
|
||||||
scipy
|
scipy
|
||||||
|
sseclient-py==1.8.0
|
||||||
sentencepiece
|
sentencepiece
|
||||||
tensorboard
|
tensorboard
|
||||||
transformers==4.50.*
|
transformers==4.50.*
|
||||||
|
|
Loading…
Add table
Reference in a new issue