Files
legal-ai/mcp-server/src/legal_mcp/tools/documents.py
Chaim 81ccf3a888
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 6m33s
feat(retrieval): track page_number on text chunks for multimodal hybrid boost
The legacy chunker did not track which PDF page each chunk came from.
Stored chunks had page_number=NULL, which blocked the multimodal
hybrid retriever's text+image boost — it joins (chunk, image) on
(document_id, page_number) and the join could never fire.

This change:

- extractor.extract_text now returns (text, page_count, page_offsets);
  page_offsets[i] is the start char offset of page (i+1) in the joined
  text. None for non-PDFs.
- chunker.chunk_document accepts an optional page_offsets and tags
  each chunk with the page that contains its first character (uses
  the existing chunker logic; pages assigned post-hoc by content
  search to keep the diff minimal).
- processor.process_document and precedent_library.ingest_precedent
  forward page_offsets through the chunker. New uploads now carry
  accurate page_number on every chunk.
- Other extract_text callers (tools/documents, tools/workflow,
  web/app.py) updated to unpack the third element (ignored).
- scripts/backfill_chunk_pages.py: per-case retrofit. Re-extracts each
  PDF (re-OCRs via Google Vision if needed, ~$0.0015/page), computes
  page_offsets, and updates page_number on every chunk by content
  search. Idempotent; --force re-runs on already-tagged docs.

Forward-only would leave the 419 image embeddings backfilled on
cases 8174-24 + 8137-24 unable to boost their corresponding text
chunks. The retrofit script closes that gap (cost ~$0.60).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-03 19:49:41 +00:00

481 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""MCP tools for document management and processing."""
from __future__ import annotations
import json
import shutil
from pathlib import Path
from uuid import UUID
from legal_mcp import config
from legal_mcp.services import db, git_sync, processor
async def document_upload(
    case_number: str,
    file_path: str,
    doc_type: str = "auto",
    title: str = "",
) -> str:
    """Upload a document into an appeal case and run it through processing.

    The file is copied into the case's ``documents/originals`` directory, a
    document record is created, and ``processor.process_document`` is run on
    it (extract -> classify -> chunk -> embed -> store, per the original
    notes). A best-effort git commit + push of the case directory follows.

    Args:
        case_number: Appeal case number.
        file_path: Full path to the file (PDF, DOCX, RTF, TXT).
        doc_type: Document type. ``auto`` requests automatic classification;
            otherwise one of appeal, response, protocol, plan, permit,
            court_decision, decision, appraisal, objection, exhibit,
            reference.
        title: Document title. When empty, the file name without its
            extension is used.

    Returns:
        A Hebrew error message when the case or the file is missing;
        otherwise a JSON string with the keys ``document`` and
        ``processing``.
    """
    import logging

    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    source = Path(file_path)
    if not source.exists():
        return f"קובץ לא נמצא: {file_path}"

    case_id = UUID(case["id"])
    if not title:
        title = source.stem

    # Copy the file into the case directory. Skip the copy when the file
    # already lives there: shutil.copy2 raises SameFileError if source and
    # destination are the same file.
    originals_dir = config.find_case_dir(case_number) / "documents" / "originals"
    originals_dir.mkdir(parents=True, exist_ok=True)
    dest = originals_dir / source.name
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

    # For auto classification, start with "reference"; the record is
    # re-tagged below once processing reports a type.
    initial_doc_type = doc_type if doc_type != "auto" else "reference"

    doc = await db.create_document(
        case_id=case_id,
        doc_type=initial_doc_type,
        title=title,
        file_path=str(dest),
    )
    doc_id = UUID(doc["id"])

    result = await processor.process_document(doc_id, case_id)

    # If auto-classification was requested, adopt the classified doc_type.
    actual_doc_type = initial_doc_type
    if doc_type == "auto" and result.get("classification"):
        classified_type = (
            result["classification"].get("classification", {}).get("doc_type", "")
        )
        if classified_type:
            actual_doc_type = classified_type
            await db.update_document(doc_id, doc_type=classified_type)
            doc["doc_type"] = classified_type

    # Git commit + push is best-effort: a failure (e.g. git not available in
    # the container) must not fail the upload, but it is logged rather than
    # silently discarded.
    try:
        repo_dir = config.find_case_dir(case_number)
        if repo_dir.exists():
            doc_type_hebrew = {
                "appeal": "כתב ערר",
                "response": "תשובה",
                "protocol": "פרוטוקול",
                "plan": "תכנית",
                "permit": "היתר",
                "court_decision": "פסק דין",
                "decision": "החלטה",
                "appraisal": "שומה",
                "objection": "התנגדות",
                "exhibit": "נספח",
                "reference": "מסמך עזר",
            }.get(actual_doc_type, actual_doc_type)
            git_sync.commit_and_push(repo_dir, f"הוספת {doc_type_hebrew}: {title}")
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "git sync skipped for case %s: %s", case_number, exc
        )

    return json.dumps({
        "document": doc,
        "processing": result,
    }, default=str, ensure_ascii=False, indent=2)
async def document_upload_training(
file_path: str,
decision_number: str = "",
decision_date: str = "",
subject_categories: list[str] | None = None,
title: str = "",
practice_area: str = "appeals_committee",
appeal_subtype: str = "",
) -> str:
"""העלאת החלטה קודמת של דפנה לקורפוס הסגנון (training).
Args:
file_path: נתיב מלא לקובץ ההחלטה
decision_number: מספר ההחלטה
decision_date: תאריך ההחלטה (YYYY-MM-DD)
subject_categories: קטגוריות - אפשר לבחור כמה (בנייה, שימוש חורג, תכנית, היתר, הקלה, חלוקה, תמ"א 38, היטל השבחה, פיצויים 197)
title: שם המסמך
practice_area: תחום משפטי (appeals_committee / national_insurance / labor_law)
appeal_subtype: סוג ערר (building_permit / betterment_levy / compensation_197).
ריק = יוסק אוטומטית ממספר ההחלטה
"""
from datetime import date as date_type
from legal_mcp.services import chunker, embeddings, extractor, practice_area as pa
source = Path(file_path)
if not source.exists():
return f"קובץ לא נמצא: {file_path}"
if not title:
title = source.stem
# Resolve subtype: explicit > derived from decision_number > 'unknown'
if not appeal_subtype:
appeal_subtype = pa.derive_subtype(decision_number, practice_area)
pa.validate(practice_area, appeal_subtype)
# Copy to training directory, organized by subtype
_SUBTYPE_DIRS = {
"betterment_levy": "cmpa",
"compensation_197": "cmpa",
"building_permit": "cmp",
}
subdir = _SUBTYPE_DIRS.get(appeal_subtype, "")
training_dest = config.TRAINING_DIR / subdir if subdir else config.TRAINING_DIR
training_dest.mkdir(parents=True, exist_ok=True)
dest = training_dest / source.name
if source.resolve() != dest.resolve():
shutil.copy2(str(source), str(dest))
# Extract text and strip Nevo preamble
text, page_count, _ = await extractor.extract_text(str(dest))
text = extractor.strip_nevo_preamble(text)
# Parse date
d_date = None
if decision_date:
d_date = date_type.fromisoformat(decision_date)
# Add to style corpus (tagged by domain so block-writer can filter)
corpus_id = await db.add_to_style_corpus(
document_id=None,
decision_number=decision_number,
decision_date=d_date,
subject_categories=subject_categories or [],
full_text=text,
practice_area=practice_area,
appeal_subtype=appeal_subtype,
)
# Chunk and embed for RAG search over training corpus
chunks = chunker.chunk_document(text)
if chunks:
# Create a document record (no case association — tag explicitly)
doc = await db.create_document(
case_id=None,
doc_type="decision",
title=f"[קורפוס] {title}",
file_path=str(dest),
page_count=page_count,
)
doc_id = UUID(doc["id"])
await db.update_document(
doc_id, extracted_text=text, extraction_status="completed",
metadata={"practice_area": practice_area, "appeal_subtype": appeal_subtype},
)
# Generate embeddings and store chunks
texts = [c.content for c in chunks]
embs = await embeddings.embed_texts(texts, input_type="document")
chunk_dicts = [
{
"content": c.content,
"section_type": c.section_type,
"embedding": emb,
"page_number": c.page_number,
"chunk_index": c.chunk_index,
}
for c, emb in zip(chunks, embs)
]
await db.store_chunks(doc_id, None, chunk_dicts)
return json.dumps({
"corpus_id": str(corpus_id),
"title": title,
"pages": page_count,
"text_length": len(text),
"chunks": len(chunks) if chunks else 0,
}, default=str, ensure_ascii=False, indent=2)
async def document_get_text(case_number: str, doc_title: str = "") -> str:
    """Return the stored text of one or more documents in a case.

    Args:
        case_number: Appeal case number.
        doc_title: Case-insensitive substring of the document title. When
            empty, every document in the case is returned.

    Returns:
        A Hebrew message when the case, its documents, or a title match is
        missing; otherwise a JSON list of ``title`` / ``doc_type`` / ``text``
        objects. Each text is cut to its first 10000 characters, and a
        placeholder is used when a document has no text.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    documents = await db.list_documents(UUID(case["id"]))
    if not documents:
        return f"אין מסמכים בתיק {case_number}."

    if doc_title:
        needle = doc_title.lower()
        documents = [entry for entry in documents if needle in entry["title"].lower()]
        if not documents:
            return f"מסמך '{doc_title}' לא נמצא בתיק."

    payload = []
    for entry in documents:
        full_text = await db.get_document_text(UUID(entry["id"]))
        # Cap the returned text so a single tool response stays bounded.
        shown = full_text[:10000] if full_text else "(ללא טקסט)"
        payload.append({
            "title": entry["title"],
            "doc_type": entry["doc_type"],
            "text": shown,
        })
    return json.dumps(payload, ensure_ascii=False, indent=2)
async def document_list(case_number: str) -> str:
    """List the documents that belong to a case.

    Args:
        case_number: Appeal case number.

    Returns:
        A Hebrew message when the case does not exist or has no documents;
        otherwise the document records serialized as a JSON string.
    """
    case = await db.get_case_by_number(case_number)
    if case:
        documents = await db.list_documents(UUID(case["id"]))
        if documents:
            return json.dumps(documents, default=str, ensure_ascii=False, indent=2)
        return f"אין מסמכים בתיק {case_number}."
    return f"תיק {case_number} לא נמצא."
async def extract_references(
    case_number: str,
    doc_title: str = "",
) -> str:
    """Identify plans, case law and legislation cited in a case's documents.

    Each selected document that has stored text is passed to
    ``references_extractor.extract_and_link_references``; documents without
    text are skipped.

    Args:
        case_number: Appeal case number.
        doc_title: Case-insensitive substring of a specific document title.
            When empty, all documents in the case are scanned.

    Returns:
        A Hebrew message when the case, its documents, or a title match is
        missing; otherwise a JSON list with one entry per processed document
        (``document``, ``plans``, ``case_law``, ``case_law_linked``,
        ``legislation``).
    """
    from legal_mcp.services import references_extractor

    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    case_id = UUID(case["id"])
    docs = await db.list_documents(case_id)
    if not docs:
        return f"אין מסמכים בתיק {case_number}."

    if doc_title:
        needle = doc_title.lower()
        docs = [d for d in docs if needle in d["title"].lower()]
        # Report an unmatched title explicitly. Returning an empty list here
        # would be indistinguishable from "documents contained no references".
        if not docs:
            return f"מסמך '{doc_title}' לא נמצא בתיק."

    results = []
    for doc in docs:
        doc_id = UUID(doc["id"])
        text = await db.get_document_text(doc_id)
        if not text:
            continue
        refs = await references_extractor.extract_and_link_references(
            doc_id, case_id, text,
        )
        results.append({
            "document": doc["title"],
            "plans": refs["plans"],
            "case_law": refs["case_law"],
            "case_law_linked": refs["case_law_linked"],
            "legislation": refs["legislation"],
        })
    return json.dumps(results, default=str, ensure_ascii=False, indent=2)
async def extract_claims(
    case_number: str,
    doc_title: str = "",
    party_hint: str = "",
) -> str:
    """Extract claims from a case's pleadings and store them in the DB.

    Args:
        case_number: Appeal case number.
        doc_title: Case-insensitive substring of a specific document title.
            When empty, every document typed appeal, response or objection
            is processed.
        party_hint: Name of the filing party, if known.

    Returns:
        A Hebrew message when the case, its documents, or any matching
        pleading is missing; otherwise a JSON list holding the result of
        ``claims_extractor.extract_and_store_claims`` for each document that
        has stored text.
    """
    from legal_mcp.services import claims_extractor

    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    case_id = UUID(case["id"])
    all_docs = await db.list_documents(case_id)
    if not all_docs:
        return f"אין מסמכים בתיק {case_number}."

    # Select either the documents matching the requested title, or all
    # pleading-type documents (appeal, response, objection).
    if doc_title:
        wanted = doc_title.lower()
        selected = [d for d in all_docs if wanted in d["title"].lower()]
    else:
        pleading_types = ("appeal", "response", "objection")
        selected = [d for d in all_docs if d["doc_type"] in pleading_types]
    if not selected:
        return "לא נמצאו כתבי טענות בתיק."

    outcomes = []
    for pleading in selected:
        pleading_id = UUID(pleading["id"])
        body = await db.get_document_text(pleading_id)
        if body:
            outcome = await claims_extractor.extract_and_store_claims(
                case_id=case_id,
                document_id=pleading_id,
                text=body,
                doc_type=pleading["doc_type"],
                party_hint=party_hint,
            )
            outcomes.append(outcome)
    return json.dumps(outcomes, default=str, ensure_ascii=False, indent=2)
async def get_claims(case_number: str, party_role: str = "") -> str:
    """Fetch the claims already extracted for a case.

    Args:
        case_number: Appeal case number.
        party_role: Filter by party (appellant / respondent / committee /
            permit_applicant). Empty means no filter.

    Returns:
        A Hebrew message when the case does not exist or has no claims;
        otherwise a JSON list of ``party`` / ``claim`` / ``source`` objects,
        where ``party`` is the Hebrew label for a known role or the raw role
        value otherwise.
    """
    case = await db.get_case_by_number(case_number)
    if not case:
        return f"תיק {case_number} לא נמצא."

    # An empty filter is passed to the DB layer as None.
    rows = await db.get_claims(UUID(case["id"]), party_role=party_role or None)
    if not rows:
        return f"אין טענות בתיק {case_number}."

    # Display labels for the known party roles.
    role_hebrew = {
        "appellant": "עוררים",
        "respondent": "משיבים",
        "committee": "ועדה מקומית",
        "permit_applicant": "מבקשי היתר",
        "appraiser": "שמאי",
    }
    formatted = [
        {
            "party": role_hebrew.get(row["party_role"], row["party_role"]),
            "claim": row["claim_text"],
            "source": row.get("source_document", ""),
        }
        for row in rows
    ]
    return json.dumps(formatted, default=str, ensure_ascii=False, indent=2)
# Accepted doc_type values (kept in step with web/app.py:DOC_TYPE_NAMES).
ALLOWED_DOC_TYPES = {
    "appeal",
    "appraisal",
    "court_decision",
    "decision",
    "exhibit",
    "objection",
    "permit",
    "plan",
    "protocol",
    "reference",
    "response",
}

# Accepted appraiser_side values; the empty string stands for "tag cleared".
ALLOWED_APPRAISER_SIDES = {
    "",
    "appellant",
    "committee",
    "deciding",
}
async def document_update(
    case_number: str,
    doc_id: str,
    doc_type: str = "",
    appraiser_side: str = "",
) -> str:
    """Update a document's tagging: ``doc_type`` and/or ``appraiser_side``.

    Per the original notes, validation mirrors the PATCH endpoint in
    web/app.py, and ``appraiser_side`` is stored inside the document's
    ``metadata`` (read from there by extract_appraiser_facts).

    Args:
        case_number: Appeal case number; used to confirm the document
            belongs to this case.
        doc_id: UUID of the document.
        doc_type: New value (appeal / response / protocol / plan / decision /
            court_decision / permit / appraisal / exhibit / objection /
            reference). Empty means no change.
        appraiser_side: New value (committee / appellant / deciding). Empty
            means no change. This tool cannot clear the tag, because an empty
            string is indistinguishable from "not provided"; clearing
            requires editing the metadata directly.

    Returns:
        A JSON string. ``status`` is ``"error"`` (with ``message`` and, for
        invalid values, ``allowed``), ``"noop"`` when nothing was requested,
        or ``"completed"`` together with the re-read ``doc_type`` and
        ``metadata``.
    """
    def _error(message: str, **extra) -> str:
        # Uniform error payload; key order matches the original responses.
        return json.dumps({"status": "error", "message": message, **extra},
                          ensure_ascii=False, indent=2)

    case = await db.get_case_by_number(case_number)
    if not case:
        return _error(f"תיק {case_number} לא נמצא.")

    try:
        doc_uuid = UUID(doc_id)
    except ValueError:
        return _error(f"doc_id לא תקין: {doc_id}")

    doc = await db.get_document(doc_uuid)
    if not doc:
        return _error(f"מסמך {doc_id} לא נמצא.")

    # Compare string forms: the id types returned by the db layer are not
    # visible here, and a UUID-vs-str mismatch would reject every document.
    if str(doc.get("case_id")) != str(case["id"]):
        return _error(f"מסמך {doc_id} לא שייך לתיק {case_number}.")

    updates: dict = {}

    if doc_type:
        if doc_type not in ALLOWED_DOC_TYPES:
            return _error(f"doc_type לא תקין: {doc_type}",
                          allowed=sorted(ALLOWED_DOC_TYPES))
        updates["doc_type"] = doc_type

    # appraiser_side is optional. The MCP tool can't distinguish "skip" from
    # "set to empty string", so the convention is: only update if non-empty.
    # To clear, the operator must edit metadata directly (rare).
    if appraiser_side:
        if appraiser_side not in ALLOWED_APPRAISER_SIDES:
            return _error(f"appraiser_side לא תקין: {appraiser_side}",
                          allowed=sorted(s for s in ALLOWED_APPRAISER_SIDES if s))
        metadata = doc.get("metadata") or {}
        if isinstance(metadata, str):
            # Metadata may arrive as a JSON-encoded string.
            metadata = json.loads(metadata)
        # Work on a copy so the fetched record is not mutated in place.
        metadata = dict(metadata)
        metadata["appraiser_side"] = appraiser_side
        updates["metadata"] = metadata

    if not updates:
        return json.dumps({"status": "noop", "message": "אין שינוי לבצע."},
                          ensure_ascii=False, indent=2)

    await db.update_document(doc_uuid, **updates)

    # Re-read so the response reflects what was actually stored; tolerate the
    # record having disappeared in the meantime.
    fresh = await db.get_document(doc_uuid) or {}
    return json.dumps({
        "status": "completed",
        "doc_id": doc_id,
        "doc_type": fresh.get("doc_type"),
        "metadata": fresh.get("metadata"),
    }, default=str, ensure_ascii=False, indent=2)