Improve document processing pipeline and agent workflows

- Add delete_document_chunks for reprocessing, save extracted text to disk
- Expand case directory structure (original/extracted/proofread/backup)
- Update classifier patterns (תגובה, הודעת עמדה)
- Fix proofreader agent paths for new directory layout
- Update HEARTBEAT to notify on every task completion
- Improve bidi_table with LRE/PDF directional embedding
- Add Paperclip project verification and auto-close setup issue
- Add auto-sync-cases.sh for Gitea synchronization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-09 16:45:49 +00:00
parent 63c9ca184b
commit 3f759d3610
10 changed files with 164 additions and 19 deletions
--- a/mcp-server/src/legal_mcp/services/processor.py
+++ b/mcp-server/src/legal_mcp/services/processor.py
@@ -3,6 +3,7 @@
 from __future__ import annotations

 import logging
+from pathlib import Path
 from uuid import UUID

 from legal_mcp.services import chunker, db, embeddings, extractor, references_extractor
@@ -37,6 +38,17 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
            page_count=page_count,
        )

+        # Save extracted text to documents/extracted/ directory
+        original_path = Path(doc["file_path"])
+        extracted_dir = original_path.parent.parent / "extracted"
+        extracted_dir.mkdir(parents=True, exist_ok=True)
+        txt_path = extracted_dir / (original_path.stem + ".txt")
+        try:
+            txt_path.write_text(text, encoding="utf-8")
+            logger.info("Saved extracted text to %s", txt_path)
+        except Exception as e:
+            logger.warning("Failed to save text file (non-fatal): %s", e)
+
        # Step 1.5: Classify document — local rules first, Claude Code headless fallback
        classification_result = {}
        try: