Improve document processing pipeline and agent workflows
- Add delete_document_chunks for reprocessing, save extracted text to disk
- Expand case directory structure (original/extracted/proofread/backup)
- Update classifier patterns (תגובה, הודעת עמדה)
- Fix proofreader agent paths for new directory layout
- Update HEARTBEAT to notify on every task completion
- Improve bidi_table with LRE/PDF directional embedding
- Add Paperclip project verification and auto-close setup issue
- Add auto-sync-cases.sh for Gitea synchronization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -687,6 +687,16 @@ async def update_decision(decision_id: UUID, **fields) -> None:
|
||||
|
||||
# ── Chunks & Vectors ───────────────────────────────────────────────
|
||||
|
||||
async def delete_document_chunks(document_id: UUID) -> int:
    """Remove every chunk row of one document and report how many were removed.

    Meant to be called before a document is reprocessed, so that stale
    chunks do not linger next to the freshly generated ones.

    Args:
        document_id: Value bound to the ``document_id`` column filter.

    Returns:
        The number parsed from the trailing token of the status string
        returned by the DELETE statement.
    """
    delete_sql = "DELETE FROM document_chunks WHERE document_id = $1"
    pool = await get_pool()
    async with pool.acquire() as conn:
        status = await conn.execute(delete_sql, document_id)
        # The status string looks like "DELETE 5"; its last
        # whitespace-separated token is the count we return.
        tokens = status.split()
        return int(tokens[-1])
|
||||
|
||||
|
||||
async def store_chunks(
|
||||
document_id: UUID,
|
||||
case_id: UUID | None,
|
||||
|
||||
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)
|
||||
_FILENAME_RULES: list[tuple[str, str, float]] = [
|
||||
# (regex pattern on filename, doc_type, confidence)
|
||||
(r"כתב.ערר|כתב-ערר", "appeal", 1.0),
|
||||
(r"תשובה|תשובת|תגובת|השלמת.טיעון|בקשה.להשלמת", "response", 1.0),
|
||||
(r"תשובה|תשובת|תגובה|תגובת|השלמת.טיעון|בקשה.להשלמת|הודעת.עמדה", "response", 1.0),
|
||||
(r"פרוטוקול", "protocol", 1.0),
|
||||
(r"החלטת?.ביניים|החלטה.לתיקון", "decision", 0.95),
|
||||
(r"הוראות.תכנית|תכנית", "plan", 1.0),
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor, references_extractor
|
||||
@@ -37,6 +38,17 @@ async def process_document(document_id: UUID, case_id: UUID) -> dict:
|
||||
page_count=page_count,
|
||||
)
|
||||
|
||||
# Save extracted text to documents/extracted/ directory
|
||||
original_path = Path(doc["file_path"])
|
||||
extracted_dir = original_path.parent.parent / "extracted"
|
||||
extracted_dir.mkdir(parents=True, exist_ok=True)
|
||||
txt_path = extracted_dir / (original_path.stem + ".txt")
|
||||
try:
|
||||
txt_path.write_text(text, encoding="utf-8")
|
||||
logger.info("Saved extracted text to %s", txt_path)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to save text file (non-fatal): %s", e)
|
||||
|
||||
# Step 1.5: Classify document — local rules first, Claude Code headless fallback
|
||||
classification_result = {}
|
||||
try:
|
||||
|
||||
@@ -62,7 +62,12 @@ async def case_create(
|
||||
# Initialize git repo for the case
|
||||
case_dir = config.find_case_dir(case_number)
|
||||
case_dir.mkdir(parents=True, exist_ok=True)
|
||||
(case_dir / "documents").mkdir(exist_ok=True)
|
||||
docs_dir = case_dir / "documents"
|
||||
docs_dir.mkdir(exist_ok=True)
|
||||
(docs_dir / "original").mkdir(exist_ok=True)
|
||||
(docs_dir / "extracted").mkdir(exist_ok=True)
|
||||
(docs_dir / "proofread").mkdir(exist_ok=True)
|
||||
(docs_dir / "backup").mkdir(exist_ok=True)
|
||||
(case_dir / "drafts").mkdir(exist_ok=True)
|
||||
|
||||
# Save case metadata
|
||||
|
||||
Reference in New Issue
Block a user