Add unified upload UI with Din Leumi support and auto metadata extraction

Upload interface now supports three targets:
- Ezer Mishpati case documents
- Ezer Mishpati training corpus
- Din Leumi NI court decisions (new)

Din Leumi uploads auto-extract metadata (court, judge, date, parties,
topics, outcome) using Claude API, eliminating manual form filling.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-25 16:07:16 +00:00
parent 1ac0259799
commit 98311eef12
2 changed files with 268 additions and 21 deletions

View File

@@ -16,6 +16,8 @@ from uuid import UUID, uuid4
# Allow importing legal_mcp from the MCP server source
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
# Allow importing din_leumi from its MCP server source
sys.path.insert(0, str(Path.home() / "din-leumi" / "mcp-server" / "src"))
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
@@ -25,6 +27,14 @@ from pydantic import BaseModel
from legal_mcp import config
from legal_mcp.services import chunker, db, embeddings, extractor, processor
# Din Leumi imports (aliased to avoid collision)
from din_leumi import config as dl_config
from din_leumi.services import db as dl_db
from din_leumi.services import processor as dl_processor
from din_leumi.services import extractor as dl_extractor
import anthropic
logger = logging.getLogger(__name__)
UPLOAD_DIR = config.DATA_DIR / "uploads"
@@ -38,12 +48,15 @@ _progress: dict[str, dict] = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare storage and schemas on startup; close both DB pools on shutdown.

    Startup creates ``UPLOAD_DIR`` and the Din Leumi decisions directory, then
    initialises the ``legal_mcp`` schema followed by the ``din_leumi`` schema.
    Shutdown closes the two pools in the same order.

    Args:
        app: The FastAPI application (unused; required by the lifespan protocol).
    """
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    dl_config.DECISIONS_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
    await dl_db.init_schema()
    try:
        yield
    finally:
        # Nested so that a failure while closing the first pool (or an exception
        # thrown into the generator at ``yield``) cannot skip closing the second.
        try:
            await db.close_pool()
        finally:
            await dl_db.close_pool()
app = FastAPI(title="Ezer Mishpati — Upload", lifespan=lifespan)
app = FastAPI(title="העלאת מסמכים משפטיים", lifespan=lifespan)
STATIC_DIR = Path(__file__).parent / "static"
@@ -170,6 +183,11 @@ async def progress_stream(task_id: str):
return StreamingResponse(event_stream(), media_type="text/event-stream")
@app.get("/health")
async def health():
    """Answer with a static ``{"status": "ok"}`` payload."""
    payload = {"status": "ok"}
    return payload
@app.get("/api/cases")
async def list_cases():
"""List existing cases for the dropdown."""
@@ -184,6 +202,94 @@ async def list_cases():
]
# ── Din Leumi Endpoint ────────────────────────────────────────────
class DinLeumiRequest(BaseModel):
    """Request body for ``POST /api/classify-dinleumi``."""

    # Name of the uploaded file; the endpoint resolves it as UPLOAD_DIR / filename.
    filename: str
    # Optional title; an empty value lets the background job pick the title itself.
    title: str = ""
# Strong references to in-flight Din Leumi jobs. The event loop keeps only weak
# references to tasks, so a task nobody else references may be garbage-collected
# before it finishes.
_dinleumi_tasks: set[asyncio.Task] = set()


@app.post("/api/classify-dinleumi")
async def classify_dinleumi(req: DinLeumiRequest):
    """Queue a pending upload for Din Leumi processing with auto metadata extraction.

    Args:
        req: Names the uploaded file (relative to ``UPLOAD_DIR``) and an
            optional title.

    Returns:
        ``{"task_id": ...}``; the id keys the job's entry in ``_progress``.

    Raises:
        HTTPException: 404 when ``req.filename`` is not a regular file located
            directly inside ``UPLOAD_DIR``.
    """
    source = UPLOAD_DIR / req.filename
    # is_file() rather than exists(): a sub-directory of the uploads folder must
    # not be accepted as a document. It is evaluated first because samefile()
    # raises when the parent does not exist. The samefile() test rejects names
    # that escape UPLOAD_DIR (e.g. "../x" or an absolute path).
    if not source.is_file() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    task = asyncio.create_task(_process_dinleumi_decision(task_id, source, req))
    _dinleumi_tasks.add(task)
    # Drop the reference once the job is done so the set does not grow forever.
    task.add_done_callback(_dinleumi_tasks.discard)
    return {"task_id": task_id}
# ── Metadata Extraction ──────────────────────────────────────────
# Hebrew instruction prompt placed ahead of the decision excerpt in the request
# built by _extract_metadata_with_claude. It asks for a JSON-only reply with the
# fields listed below and enumerates the topics the model may choose from.
METADATA_EXTRACTION_PROMPT = """אתה מנתח פסקי דין של בתי דין לעבודה בתחום ביטוח לאומי.
חלץ את המטאדאטא הבאה מתוך פסק הדין והחזר אותה כ-JSON בלבד:
{
"title": "כותרת תיאורית קצרה של פסק הדין",
"court": "שם בית המשפט (למשל: בית הדין האזורי לעבודה תל אביב)",
"decision_date": "YYYY-MM-DD או null אם לא נמצא",
"case_number": "מספר תיק (למשל: בל 12345-06-20)",
"judge": "שם השופט/ת",
"parties_appellant": "שם התובע/מערער",
"parties_respondent": "שם הנתבע/משיב",
"topics": ["רשימת נושאים רלוונטיים מתוך הרשימה למטה"],
"outcome": "accepted/rejected/partial/remanded",
"summary": "תקציר של 2-3 משפטים"
}
נושאים אפשריים: נכות כללית, נכות מעבודה, תאונת עבודה, דמי לידה, דמי אבטלה, גמלת הבטחת הכנסה, גמלת ניידות, גמלת סיעוד, קצבת זקנה, קצבת שאירים, מילואים, דמי פגיעה, נפגעי פעולות איבה
החזר JSON בלבד, ללא טקסט נוסף."""
_anthropic_client: anthropic.Anthropic | None = None


def _get_anthropic() -> anthropic.Anthropic:
    """Return the module-wide Anthropic client, constructing it on first use."""
    global _anthropic_client
    client = _anthropic_client
    if client is None:
        # First call: build the client from the configured API key and cache it.
        client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
        _anthropic_client = client
    return client
def _parse_metadata_json(response_text: str) -> dict:
    """Decode the model's reply into a metadata dict; return ``{}`` when unusable."""
    candidate = response_text.strip()
    # Keep only the outermost {...} span so a Markdown fence or stray prose
    # around the object does not break parsing.
    start = candidate.find("{")
    end = candidate.rfind("}")
    if start != -1 and end > start:
        candidate = candidate[start : end + 1]
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        logger.warning("Failed to parse metadata JSON: %s", response_text[:200])
        return {}
    if not isinstance(parsed, dict):
        # Callers read the result with .get(); a JSON list/string would crash them.
        logger.warning("Metadata JSON is not an object: %s", response_text[:200])
        return {}
    return parsed


async def _extract_metadata_with_claude(text: str) -> dict:
    """Extract metadata from decision text using Claude.

    Args:
        text: Full extracted text of the decision; only the first 5000
            characters are sent.

    Returns:
        The decoded JSON object, or ``{}`` when the reply is empty, is not
        valid JSON, or is not a JSON object.

    Raises:
        Exception: Errors raised by the Anthropic client call propagate to the
            caller unchanged.
    """
    client = _get_anthropic()

    # Use first ~5000 chars (usually contains all metadata)
    excerpt = text[:5000]

    # The client is synchronous; run the request in a worker thread so the
    # event loop (and the SSE progress streams it serves) is not blocked.
    message = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": f"{METADATA_EXTRACTION_PROMPT}\n\nפסק הדין:\n{excerpt}",
            }
        ],
    )

    # Take the first content block that carries text; an empty reply no longer
    # raises IndexError.
    response_text = next(
        (block.text for block in message.content if getattr(block, "text", None)),
        "",
    )
    if not response_text.strip():
        logger.warning("Metadata extraction returned no text content")
        return {}

    return _parse_metadata_json(response_text)
# ── Background Processing ─────────────────────────────────────────
@@ -340,3 +446,117 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
"chunks": chunk_count,
},
}
def _meta_text(metadata: dict, key: str, default: str = "") -> str:
    """Return ``metadata[key]`` as clean text, or *default* if missing, null or blank."""
    value = metadata.get(key)
    if value is None:
        return default
    if isinstance(value, (list, tuple)):
        # e.g. several names returned as a JSON array
        value = ", ".join(str(item) for item in value if item)
    cleaned = str(value).strip()
    return cleaned or default


async def _process_dinleumi_decision(task_id: str, source: Path, req: DinLeumiRequest):
    """Process a National Insurance court decision with auto metadata extraction.

    Copies the upload into the Din Leumi decisions directory, extracts its text,
    asks Claude for metadata, creates and updates the decision record, then
    chunks, embeds and stores the text. Progress and the final result (or the
    error) are published in ``_progress[task_id]``; no exception escapes.

    Args:
        task_id: Key of this job's entry in ``_progress``.
        source: Path of the pending file inside the uploads directory.
        req: Original request; supplies the filename and an optional title.
    """
    from datetime import date as date_type

    dest: Path | None = None
    record_created = False
    try:
        # Step 1: Copy to din-leumi decisions directory
        _progress[task_id] = {"status": "copying", "filename": req.filename}
        # Drop a leading "<digits>_" prefix from the stored name
        # (presumably added when the file was uploaded - confirm).
        original_name = re.sub(r"^\d+_", "", source.name)
        dest = dl_config.DECISIONS_DIR / original_name
        if dest.exists():
            # Avoid overwriting an earlier decision that has the same file name.
            dest = dl_config.DECISIONS_DIR / f"{dest.stem}_{int(time.time())}{dest.suffix}"
        shutil.copy2(str(source), str(dest))

        # Step 2: Extract text
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
        text, page_count = await dl_extractor.extract_text(str(dest))

        # Step 3: Extract metadata with Claude
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting_metadata"}
        metadata = await _extract_metadata_with_claude(text)

        # Parse date; anything that is not an ISO date string is treated as unknown.
        d_date = None
        raw_date = metadata.get("decision_date")
        if isinstance(raw_date, str) and raw_date.strip():
            try:
                d_date = date_type.fromisoformat(raw_date.strip())
            except ValueError:
                d_date = None

        # dict.get(key, default) only applies the default when the key is absent,
        # but the model may return null or "" - normalise every text field.
        court = _meta_text(metadata, "court")
        judge = _meta_text(metadata, "judge")
        case_number = _meta_text(metadata, "case_number")
        outcome = _meta_text(metadata, "outcome")
        summary = _meta_text(metadata, "summary")

        # Topics: keep a list as-is (minus empty entries), wrap a lone string,
        # and treat anything else as "not provided".
        raw_topics = metadata.get("topics")
        if isinstance(raw_topics, list):
            topics = [str(topic) for topic in raw_topics if topic]
        elif isinstance(raw_topics, str) and raw_topics.strip():
            topics = [raw_topics.strip()]
        else:
            topics = None

        # Explicit title wins, then the extracted one, then the file name stem.
        title = req.title or _meta_text(metadata, "title") or original_name.rsplit(".", 1)[0]

        # Step 4: Create decision record
        _progress[task_id] = {"status": "registering", "filename": req.filename}
        decision = await dl_db.create_decision(
            title=title,
            file_path=str(dest),
            court=court,
            decision_date=d_date,
            case_number=case_number,
            judge=judge,
            parties_appellant=_meta_text(metadata, "parties_appellant"),
            parties_respondent=_meta_text(metadata, "parties_respondent", "המוסד לביטוח לאומי"),
            topics=topics,
            outcome=outcome,
        )
        record_created = True
        decision_id = UUID(decision["id"])

        # Update with extracted text
        await dl_db.update_decision(
            decision_id,
            extracted_text=text,
            page_count=page_count,
            summary=summary,
        )

        # Step 5: Chunk
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
        from din_leumi.services import chunker as dl_chunker, embeddings as dl_embeddings
        chunks = dl_chunker.chunk_document(text)

        chunk_count = 0
        if chunks:
            # Step 6: Embed
            _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "embedding"}
            texts = [c.content for c in chunks]
            embs = await dl_embeddings.embed_texts(texts, input_type="document")

            chunk_dicts = [
                {
                    "content": c.content,
                    "section_type": c.section_type,
                    "embedding": emb,
                    "page_number": c.page_number,
                    "chunk_index": c.chunk_index,
                }
                for c, emb in zip(chunks, embs)
            ]
            await dl_db.store_chunks(decision_id, chunk_dicts)
            chunk_count = len(chunks)

        await dl_db.update_decision(decision_id, extraction_status="completed")
        await dl_db.ensure_ivfflat_index()

        # Remove from uploads
        source.unlink(missing_ok=True)

        _progress[task_id] = {
            "status": "completed",
            "filename": req.filename,
            "system": "din-leumi",
            "result": {
                "decision_id": str(decision_id),
                "title": title,
                "pages": page_count,
                "text_length": len(text),
                "chunks": chunk_count,
            },
            "metadata": {
                "court": court,
                "judge": judge,
                "case_number": case_number,
                # Report the date that was actually stored, not the raw model text.
                "decision_date": d_date.isoformat() if d_date else "",
                "outcome": outcome,
                "topics": topics or [],
                "summary": summary,
            },
        }

    except Exception as e:
        logger.exception("Din Leumi processing failed for %s", req.filename)
        if dest is not None and not record_created:
            # No decision row points at the copy yet; remove it so that a retry
            # of the still-pending upload does not accumulate orphaned files.
            try:
                dest.unlink(missing_ok=True)
            except OSError:
                logger.warning("Could not remove orphaned copy %s", dest)
        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}

View File

@@ -3,7 +3,7 @@
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>עוזר משפטי — העלאת מסמכים</title>
<title>העלאת מסמכים משפטיים</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
@@ -234,8 +234,8 @@ header p { opacity: 0.7; margin-top: 4px; font-size: 0.9em; }
<body>
<div class="container">
<header>
<h1>עוזר משפטי — העלאת מסמכים</h1>
<p>העלאה, סיווג ועיבוד מסמכים משפטיים</p>
<h1>העלאת מסמכים משפטיים</h1>
<p>העלאה, סיווג ועיבוד מסמכים — עזר משפטי | דין לאומי</p>
</header>
<!-- Upload Zone -->
@@ -363,8 +363,9 @@ async function loadPending() {
</div>
</div>
<div class="radio-group">
<label><input type="radio" name="cat_${esc(f.filename)}" value="training" onchange="showFields(this)"> החלטה קודמת (אימון)</label>
<label><input type="radio" name="cat_${esc(f.filename)}" value="case" onchange="showFields(this)"> מסמך תיק</label>
<label><input type="radio" name="cat_${esc(f.filename)}" value="case" onchange="showFields(this)"> עזר משפטי — מסמך תיק</label>
<label><input type="radio" name="cat_${esc(f.filename)}" value="training" onchange="showFields(this)"> עזר משפטי — החלטה (אימון)</label>
<label><input type="radio" name="cat_${esc(f.filename)}" value="dinleumi" onchange="showFields(this)"> דין לאומי — פסק דין בל"ל</label>
</div>
<div class="conditional case-fields" id="case_${esc(f.filename)}">
<div class="form-row">
@@ -407,6 +408,9 @@ async function loadPending() {
</div>
</div>
</div>
<div class="conditional dinleumi-fields" id="dinleumi_${esc(f.filename)}">
<p style="font-size:0.85em;color:#636e72;margin:4px 0 8px">המטאדאטא (בית משפט, שופט, תאריך, נושא, תוצאה) תחולץ אוטומטית מתוך פסק הדין</p>
</div>
<div style="margin-top:12px">
<div class="form-group" style="max-width:300px;margin-bottom:8px">
<label>כותרת (אופציונלי)</label>
@@ -420,11 +424,11 @@ async function loadPending() {
/**
 * Show the field group that matches the selected category radio, hide the
 * other groups, and enable the file's process button.
 *
 * @param {HTMLInputElement} radio - The category radio button that changed.
 */
function showFields(radio) {
  const container = radio.closest('.pending-file');
  const selected = radio.value;

  // Category value -> selector of the field group it reveals.
  const groups = {
    case: '.case-fields',
    training: '.training-fields',
    dinleumi: '.dinleumi-fields',
  };
  for (const [category, selector] of Object.entries(groups)) {
    container.querySelector(selector).classList.toggle('active', selected === category);
  }

  container.querySelector('.process-btn').disabled = false;
}
@@ -439,20 +443,30 @@ async function classifyFile(filename) {
const category = container.querySelector('input[type="radio"]:checked')?.value;
if (!category) return toast('יש לבחור סיווג', 'error');
const body = {
filename,
category,
title: container.querySelector('.doc-title').value,
};
let endpoint = API + '/classify';
let body;
if (category === 'case') {
body.case_number = container.querySelector('.case-select').value;
body.doc_type = container.querySelector('.doctype-select').value;
if (!body.case_number) return toast('יש לבחור תיק', 'error');
if (category === 'dinleumi') {
endpoint = API + '/classify-dinleumi';
body = {
filename,
title: container.querySelector('.doc-title').value,
};
} else {
body.decision_number = container.querySelector('.decision-number').value;
body.decision_date = container.querySelector('.decision-date').value;
body.subject_categories = Array.from(container.querySelectorAll('.subject-grid input:checked')).map(cb => cb.value);
body = {
filename,
category,
title: container.querySelector('.doc-title').value,
};
if (category === 'case') {
body.case_number = container.querySelector('.case-select').value;
body.doc_type = container.querySelector('.doctype-select').value;
if (!body.case_number) return toast('יש לבחור תיק', 'error');
} else {
body.decision_number = container.querySelector('.decision-number').value;
body.decision_date = container.querySelector('.decision-date').value;
body.subject_categories = Array.from(container.querySelectorAll('.subject-grid input:checked')).map(cb => cb.value);
}
}
// Disable button
@@ -460,7 +474,7 @@ async function classifyFile(filename) {
container.querySelector('.process-btn').textContent = 'מעבד...';
try {
const res = await fetch(API + '/classify', {
const res = await fetch(endpoint, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(body),
@@ -501,6 +515,7 @@ function trackTask(taskId, displayName) {
const STEP_LABELS = {
extracting: 'מחלץ טקסט',
extracting_metadata: 'מחלץ מטאדאטא מפסק הדין',
corpus: 'מוסיף לקורפוס',
chunking: 'מפצל לקטעים',
embedding: 'יוצר embeddings',
@@ -530,7 +545,19 @@ function trackTask(taskId, displayName) {
div.classList.add('done');
es.close();
const r = data.result || {};
if (r.chunks !== undefined) {
const m = data.metadata;
if (m && data.system === 'din-leumi') {
// Show extracted metadata for din-leumi
let metaHtml = `הושלם — ${r.chunks || 0} קטעים, ${r.pages || '?'} עמודים`;
const parts = [];
if (m.court) parts.push(m.court);
if (m.judge) parts.push('שופט: ' + m.judge);
if (m.decision_date) parts.push(m.decision_date);
if (m.outcome) parts.push({accepted:'התקבלה',rejected:'נדחתה',partial:'חלקית',remanded:'הוחזרה'}[m.outcome] || m.outcome);
if (parts.length) metaHtml += '<br><small style="color:#636e72">' + esc(parts.join(' | ')) + '</small>';
if (m.topics && m.topics.length) metaHtml += '<br><small style="color:#0984e3">' + m.topics.map(t => esc(t)).join(', ') + '</small>';
statusEl.innerHTML = metaHtml;
} else if (r.chunks !== undefined) {
statusEl.textContent = `הושלם — ${r.chunks} קטעים, ${r.pages || '?'} עמודים`;
}
toast('העיבוד הושלם: ' + esc(displayName), 'success');