From 476c2fc5d112166e05bd641a3236034e514f21a8 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Wed, 3 Jun 2026 13:47:47 +0000
Subject: [PATCH] feat(upload): accept legacy .doc, convert via LibreOffice in
 container
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Legacy Hebrew .doc precedents (e.g. nevo.co.il CP1255 OLE2) can now be
uploaded directly through the precedent-library, missing-precedent, and
training upload paths — the frontend already advertised .doc, but the
backend gate rejected the file before it reached the extractor.

- web/app.py: add .doc to ALLOWED_EXTENSIONS (covers all paths that share
  the set: precedent library, missing-precedent, training).
- Dockerfile: install libreoffice-writer-nogui (no X11/Java) so the
  extractor's existing _extract_doc LibreOffice conversion works in the
  Coolify container (was missing → would fail at runtime).
- extractor.py: isolate the LibreOffice user profile per call to avoid a
  profile-lock failure on concurrent .doc conversions.

Verified in python:3.12-slim (prod base): .doc→.docx→text yields text
byte-identical to a native Word .docx save (103 paragraphs, 24,341 chars).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Dockerfile                                     | 5 +++--
 mcp-server/src/legal_mcp/services/extractor.py | 9 ++++++++-
 web/app.py                                     | 2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 66725d8..9ccf0d8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,9 +32,10 @@ RUN pip install --no-cache-dir ./mcp-server
 FROM python:3.12-slim AS runner
 WORKDIR /app
 
-# Install Node.js 20.x
+# Install Node.js 20.x + LibreOffice Writer (headless .doc→.docx conversion
+# in extractor.py:_extract_doc — needed for legacy Hebrew .doc precedents).
 RUN apt-get update && apt-get install -y --no-install-recommends \
-      curl ca-certificates git \
+      curl ca-certificates git libreoffice-writer-nogui \
  && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
  && apt-get install -y --no-install-recommends nodejs \
  && rm -rf /var/lib/apt/lists/*
diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py
index c882570..385e9a5 100644
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -262,8 +262,15 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
 def _extract_doc(path: Path) -> str:
     """Extract text from legacy .doc file by converting to .docx via LibreOffice."""
     with tempfile.TemporaryDirectory() as tmp_dir:
+        # Isolate the LibreOffice user profile per call: headless soffice
+        # locks a single shared profile, so concurrent .doc conversions would
+        # otherwise fail with a profile-lock error.
         result = subprocess.run(
-            ["libreoffice", "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir],
+            [
+                "libreoffice",
+                f"-env:UserInstallation=file://{tmp_dir}/lo-profile",
+                "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir,
+            ],
             capture_output=True, text=True, timeout=120,
         )
         if result.returncode != 0:
diff --git a/web/app.py b/web/app.py
index e917ee8..f62fb2d 100644
--- a/web/app.py
+++ b/web/app.py
@@ -72,7 +72,7 @@ from web.paperclip_client import (
 logger = logging.getLogger(__name__)
 
 UPLOAD_DIR = config.DATA_DIR / "uploads"
-ALLOWED_EXTENSIONS = {".pdf", ".docx", ".rtf", ".txt", ".md"}
+ALLOWED_EXTENSIONS = {".pdf", ".docx", ".doc", ".rtf", ".txt", ".md"}
 MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB
 
 # Progress tracking — backed by Redis with TTL.