Add support for .docx attachments

2025-06-07 14:17:09 -04:00 · 2025-05-31 20:15:07 -07:00 · 2025-05-31 20:15:07 -07:00 · 1d88456659
commit 1d88456659
parent dc8ed6dbe7
20 changed files with 69 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -16,7 +16,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.
 - Easy setup: Choose between **portable builds** (zero setup, just unzip and run) for GGUF models on Windows/Linux/macOS, or the one-click installer that creates a self-contained `installer_files` directory.
 - 100% offline and private, with zero telemetry, external resources, or remote update requests.
 - Automatic prompt formatting using Jinja2 templates. You don't need to ever worry about prompt formats.
- **File attachments**: Upload text files and PDF documents to talk about their contents.
+- **File attachments**: Upload text files, PDF documents, and .docx documents to talk about their contents.
 - **Web search**: Optionally search the internet with LLM-generated queries to add context to the conversation.
 - Aesthetic UI with dark and light themes.
 - `instruct` mode for instruction-following (like ChatGPT), and `chat-instruct`/`chat` modes for talking to custom characters.
--- a/modules/chat.py
+++ b/modules/chat.py
@ -500,6 +500,9 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
            # Process PDF file
            content = extract_pdf_text(path)
            file_type = "application/pdf"
+        elif file_extension == '.docx':
+            content = extract_docx_text(path)
+            file_type = "application/docx"
        else:
            # Default handling for text files
            with open(path, 'r', encoding='utf-8') as f:
@ -538,6 +541,53 @@ def extract_pdf_text(pdf_path):
        return f"[Error extracting PDF text: {str(e)}]"


+def extract_docx_text(docx_path):
+    """
+    Extract text from a .docx file: headers, body (paragraphs and tables
+    in document order), then footers. A header/footer linked to the previous
+    section is skipped so inherited text is not repeated per section.
+
+    Returns the newline-joined text (table rows are tab-joined), or a
+    bracketed error message string if anything fails.
+    """
+    try:
+        import docx
+
+        doc = docx.Document(docx_path)
+        parts = []
+
+        def add_paragraphs(paragraphs):
+            # Keep only paragraphs that still have text after stripping
+            for para in paragraphs:
+                text = para.text.strip()
+                if text:
+                    parts.append(text)
+
+        # 1) Header paragraphs; a linked header mirrors the prior section's
+        for section in doc.sections:
+            if not section.header.is_linked_to_previous:
+                add_paragraphs(section.header.paragraphs)
+
+        # 2) Body blocks (paragraphs and tables) in document order
+        for child in doc.element.body.iterchildren():
+            if isinstance(child, docx.oxml.text.paragraph.CT_P):
+                add_paragraphs([docx.text.paragraph.Paragraph(child, doc)])
+            elif isinstance(child, docx.oxml.table.CT_Tbl):
+                for row in docx.table.Table(child, doc).rows:
+                    parts.append("\t".join(c.text.strip() for c in row.cells))
+
+        # 3) Footer paragraphs, with the same linked-section rule as headers
+        for section in doc.sections:
+            if not section.footer.is_linked_to_previous:
+                add_paragraphs(section.footer.paragraphs)
+
+        return "\n".join(parts)
+
+    except Exception as e:
+        logger.error(f"Error extracting text from DOCX: {e}")
+        return f"[Error extracting DOCX text: {str(e)}]"
+
+
 def generate_search_query(user_message, state):
    """Generate a search query from user message using the LLM"""
    # Augment the user message with search instruction
--- a/requirements/full/requirements.txt
+++ b/requirements/full/requirements.txt
@ -16,6 +16,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_amd.txt
+++ b/requirements/full/requirements_amd.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_amd_noavx2.txt
+++ b/requirements/full/requirements_amd_noavx2.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_apple_intel.txt
+++ b/requirements/full/requirements_apple_intel.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_apple_silicon.txt
+++ b/requirements/full/requirements_apple_silicon.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_cpu_only.txt
+++ b/requirements/full/requirements_cpu_only.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_cpu_only_noavx2.txt
+++ b/requirements/full/requirements_cpu_only_noavx2.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_noavx2.txt
+++ b/requirements/full/requirements_noavx2.txt
@ -16,6 +16,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/full/requirements_nowheels.txt
+++ b/requirements/full/requirements_nowheels.txt
@ -15,6 +15,7 @@ Pillow>=9.5.0
 psutil
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements.txt
+++ b/requirements/portable/requirements.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_apple_intel.txt
+++ b/requirements/portable/requirements_apple_intel.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_apple_silicon.txt
+++ b/requirements/portable/requirements_apple_silicon.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_cpu_only.txt
+++ b/requirements/portable/requirements_cpu_only.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_cpu_only_noavx2.txt
+++ b/requirements/portable/requirements_cpu_only_noavx2.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_noavx2.txt
+++ b/requirements/portable/requirements_noavx2.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_nowheels.txt
+++ b/requirements/portable/requirements_nowheels.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_vulkan.txt
+++ b/requirements/portable/requirements_vulkan.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich
--- a/requirements/portable/requirements_vulkan_noavx2.txt
+++ b/requirements/portable/requirements_vulkan_noavx2.txt
@ -7,6 +7,7 @@ markdown
 numpy==1.26.*
 pydantic==2.8.2
 PyPDF2==3.0.1
+python-docx==1.1.2
 pyyaml
 requests
 rich