diff --git a/Dockerfile b/Dockerfile index 66725d8..9ccf0d8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,9 +32,10 @@ RUN pip install --no-cache-dir ./mcp-server FROM python:3.12-slim AS runner WORKDIR /app -# Install Node.js 20.x +# Install Node.js 20.x + LibreOffice Writer (headless .doc→.docx conversion +# in extractor.py:_extract_doc — needed for legacy Hebrew .doc precedents). RUN apt-get update && apt-get install -y --no-install-recommends \ - curl ca-certificates git \ + curl ca-certificates git libreoffice-writer-nogui \ && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ && apt-get install -y --no-install-recommends nodejs \ && rm -rf /var/lib/apt/lists/* diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py index c882570..385e9a5 100644 --- a/mcp-server/src/legal_mcp/services/extractor.py +++ b/mcp-server/src/legal_mcp/services/extractor.py @@ -262,8 +262,15 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str: def _extract_doc(path: Path) -> str: """Extract text from legacy .doc file by converting to .docx via LibreOffice.""" with tempfile.TemporaryDirectory() as tmp_dir: + # Isolate the LibreOffice user profile per call: headless soffice + # locks a single shared profile, so concurrent .doc conversions would + # otherwise fail with a profile-lock error. result = subprocess.run( - ["libreoffice", "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir], + [ + "libreoffice", + f"-env:UserInstallation=file://{tmp_dir}/lo-profile", + "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir, + ], capture_output=True, text=True, timeout=120, ) if result.returncode != 0: diff --git a/web/app.py b/web/app.py index e917ee8..f62fb2d 100644 --- a/web/app.py +++ b/web/app.py @@ -72,7 +72,7 @@ from web.paperclip_client import ( logger = logging.getLogger(__name__) UPLOAD_DIR = config.DATA_DIR / "uploads" -ALLOWED_EXTENSIONS = {".pdf", ".docx", ".rtf", ".txt", ".md"} +ALLOWED_EXTENSIONS = {".pdf", ".docx", ".doc", ".rtf", ".txt", ".md"} MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB # Progress tracking — backed by Redis with TTL.