Merge pull request 'feat(upload): accept legacy .doc, convert via LibreOffice in container' (#53) from feat/doc-upload-support into main
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 2m3s
This commit was merged in pull request #53.
This commit is contained in:
@@ -32,9 +32,10 @@ RUN pip install --no-cache-dir ./mcp-server
|
|||||||
FROM python:3.12-slim AS runner
|
FROM python:3.12-slim AS runner
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
# Install Node.js 20.x
|
# Install Node.js 20.x + LibreOffice Writer (headless .doc→.docx conversion
|
||||||
|
# in extractor.py:_extract_doc — needed for legacy Hebrew .doc precedents).
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
curl ca-certificates git \
|
curl ca-certificates git libreoffice-writer-nogui \
|
||||||
&& curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
&& curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
||||||
&& apt-get install -y --no-install-recommends nodejs \
|
&& apt-get install -y --no-install-recommends nodejs \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|||||||
@@ -262,8 +262,15 @@ def _ocr_with_google_vision(image_bytes: bytes, page_num: int) -> str:
|
|||||||
def _extract_doc(path: Path) -> str:
|
def _extract_doc(path: Path) -> str:
|
||||||
"""Extract text from legacy .doc file by converting to .docx via LibreOffice."""
|
"""Extract text from legacy .doc file by converting to .docx via LibreOffice."""
|
||||||
with tempfile.TemporaryDirectory() as tmp_dir:
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||||
|
# Isolate the LibreOffice user profile per call: headless soffice
|
||||||
|
# locks a single shared profile, so concurrent .doc conversions would
|
||||||
|
# otherwise fail with a profile-lock error.
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["libreoffice", "--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir],
|
[
|
||||||
|
"libreoffice",
|
||||||
|
f"-env:UserInstallation=file://{tmp_dir}/lo-profile",
|
||||||
|
"--headless", "--convert-to", "docx", str(path), "--outdir", tmp_dir,
|
||||||
|
],
|
||||||
capture_output=True, text=True, timeout=120,
|
capture_output=True, text=True, timeout=120,
|
||||||
)
|
)
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ from web.paperclip_client import (
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
UPLOAD_DIR = config.DATA_DIR / "uploads"
|
UPLOAD_DIR = config.DATA_DIR / "uploads"
|
||||||
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".rtf", ".txt", ".md"}
|
ALLOWED_EXTENSIONS = {".pdf", ".docx", ".doc", ".rtf", ".txt", ".md"}
|
||||||
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
||||||
|
|
||||||
# Progress tracking — backed by Redis with TTL.
|
# Progress tracking — backed by Redis with TTL.
|
||||||
|
|||||||
Reference in New Issue
Block a user