Add support for .docx attachments

This commit is contained in:
oobabooga 2025-05-31 20:15:07 -07:00
parent dc8ed6dbe7
commit 1d88456659
20 changed files with 69 additions and 1 deletions

View file

@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
- Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
- 100% offline and private, with zero telemetry, external resources, or remote update requests.
- Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
- **File attachments**: Upload text files and PDF documents to talk about their contents.
- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
- **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
- Aesthetic UI with dark and light themes.
- `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.

View file

@ -500,6 +500,9 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
# Process PDF file
content = extract_pdf_text(path)
file_type = "application/pdf"
elif file_extension == '.docx':
content = extract_docx_text(path)
file_type = "application/docx"
else:
# Default handling for text files
with open(path, 'r', encoding='utf-8') as f:
@ -538,6 +541,53 @@ def extract_pdf_text(pdf_path):
return f"[Error extracting PDF text: {str(e)}]"
def _docx_header_footer_text(sections, kind):
    """Collect non-empty paragraph text from section headers or footers.

    Args:
        sections: The document's sections (``Document.sections``).
        kind: Either ``"header"`` or ``"footer"`` -- the section attribute
            to read.

    Returns:
        A list of stripped, non-empty paragraph strings in section order.
    """
    collected = []
    for section in sections:
        part = getattr(section, kind)
        # A header/footer linked to the previous section has no content of
        # its own: python-docx resolves it to the prior section's definition,
        # so reading it would emit the same text once per section.
        if part.is_linked_to_previous:
            continue
        for para in part.paragraphs:
            text = para.text.strip()
            if text:
                collected.append(text)
    return collected


def extract_docx_text(docx_path):
    """
    Extract text from a .docx file, including headers,
    body (paragraphs and tables), and footers.

    Output order is: all section headers, then the body blocks in document
    order, then all section footers. Table rows are emitted one per line
    with cell texts joined by tabs. Headers/footers that merely inherit from
    the previous section are skipped so their text is not repeated.

    Args:
        docx_path: Path of the .docx file, passed to ``docx.Document``.

    Returns:
        The extracted text joined with newlines, or a bracketed error
        message string if anything fails (the error is also logged).
    """
    try:
        # Imported lazily so a missing/broken python-docx install is reported
        # through the same error path as a malformed document.
        import docx
        from docx.oxml.table import CT_Tbl
        from docx.oxml.text.paragraph import CT_P
        from docx.table import Table
        from docx.text.paragraph import Paragraph

        doc = docx.Document(docx_path)

        # 1) Header paragraphs from each section that defines its own header
        parts = _docx_header_footer_text(doc.sections, "header")

        # 2) Body blocks (paragraphs and tables) in document order
        for child in doc.element.body.iterchildren():
            if isinstance(child, CT_P):
                text = Paragraph(child, doc).text.strip()
                if text:
                    parts.append(text)
            elif isinstance(child, CT_Tbl):
                for row in Table(child, doc).rows:
                    parts.append("\t".join(cell.text.strip() for cell in row.cells))

        # 3) Footer paragraphs from each section that defines its own footer
        parts.extend(_docx_header_footer_text(doc.sections, "footer"))

        return "\n".join(parts)
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return f"[Error extracting DOCX text: {str(e)}]"
def generate_search_query(user_message, state):
"""Generate a search query from user message using the LLM"""
# Augment the user message with search instruction

View file

@ -16,6 +16,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -16,6 +16,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -15,6 +15,7 @@ Pillow>=9.5.0
psutil
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich

View file

@ -7,6 +7,7 @@ markdown
numpy==1.26.*
pydantic==2.8.2
PyPDF2==3.0.1
python-docx==1.1.2
pyyaml
requests
rich