Add unified upload UI with Din Leumi support and auto metadata extraction
Upload interface now supports three targets:
- Ezer Mishpati case documents
- Ezer Mishpati training corpus
- Din Leumi NI court decisions (new)

Din Leumi uploads auto-extract metadata (court, judge, date, parties, topics, outcome) using the Claude API, eliminating manual form filling.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
222
web/app.py
222
web/app.py
@@ -16,6 +16,8 @@ from uuid import UUID, uuid4
|
||||
|
||||
# Allow importing legal_mcp from the MCP server source
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
# Allow importing din_leumi from its MCP server source
|
||||
sys.path.insert(0, str(Path.home() / "din-leumi" / "mcp-server" / "src"))
|
||||
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from fastapi.responses import FileResponse, StreamingResponse
|
||||
@@ -25,6 +27,14 @@ from pydantic import BaseModel
|
||||
from legal_mcp import config
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor, processor
|
||||
|
||||
# Din Leumi imports (aliased to avoid collision)
|
||||
from din_leumi import config as dl_config
|
||||
from din_leumi.services import db as dl_db
|
||||
from din_leumi.services import processor as dl_processor
|
||||
from din_leumi.services import extractor as dl_extractor
|
||||
|
||||
import anthropic
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
UPLOAD_DIR = config.DATA_DIR / "uploads"
|
||||
@@ -38,12 +48,15 @@ _progress: dict[str, dict] = {}
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare storage and schemas on startup; close both DB pools on shutdown.

    Startup creates the upload directory and the Din Leumi decisions
    directory, then initialises the schema of each database. Shutdown closes
    both connection pools.
    """
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    dl_config.DECISIONS_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
    await dl_db.init_schema()
    try:
        yield
    finally:
        # Nested finally: a failure while closing the first pool must not
        # prevent the second pool from being closed.
        try:
            await db.close_pool()
        finally:
            await dl_db.close_pool()
|
||||
|
||||
|
||||
app = FastAPI(title="Ezer Mishpati — Upload", lifespan=lifespan)
|
||||
app = FastAPI(title="העלאת מסמכים משפטיים", lifespan=lifespan)
|
||||
|
||||
STATIC_DIR = Path(__file__).parent / "static"
|
||||
|
||||
@@ -170,6 +183,11 @@ async def progress_stream(task_id: str):
|
||||
return StreamingResponse(event_stream(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Report that the service is up and able to answer requests."""
    payload = {"status": "ok"}
    return payload
|
||||
|
||||
|
||||
@app.get("/api/cases")
|
||||
async def list_cases():
|
||||
"""List existing cases for the dropdown."""
|
||||
@@ -184,6 +202,94 @@ async def list_cases():
|
||||
]
|
||||
|
||||
|
||||
# ── Din Leumi Endpoint ────────────────────────────────────────────
|
||||
|
||||
|
||||
class DinLeumiRequest(BaseModel):
    """Request body for queuing an uploaded file as a Din Leumi decision."""

    # Name of the file to process; it is looked up inside the uploads directory.
    filename: str
    # Optional title supplied by the user; empty means "not provided".
    title: str = ""
|
||||
|
||||
|
||||
# Strong references to in-flight Din Leumi background tasks. The event loop
# only keeps weak references to tasks, so a task whose result is discarded can
# be garbage-collected before it finishes.
_dinleumi_tasks: set[asyncio.Task] = set()


@app.post("/api/classify-dinleumi")
async def classify_dinleumi(req: DinLeumiRequest):
    """Queue an uploaded decision for Din Leumi processing.

    Validates that ``req.filename`` names an existing entry directly inside
    the uploads directory, registers a progress record, and starts the
    processing coroutine in the background.

    Returns:
        ``{"task_id": <uuid string>}`` — the key under which progress is
        published in ``_progress``.

    Raises:
        HTTPException: 404 when the file does not exist or does not sit
            directly in the uploads directory.
    """
    source = UPLOAD_DIR / req.filename
    # The parent check rejects names that resolve outside the uploads
    # directory (e.g. containing "../" or an absolute path).
    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    task = asyncio.create_task(_process_dinleumi_decision(task_id, source, req))
    # Hold the task until it completes, then drop the reference.
    _dinleumi_tasks.add(task)
    task.add_done_callback(_dinleumi_tasks.discard)

    return {"task_id": task_id}
|
||||
|
||||
|
||||
# ── Metadata Extraction ──────────────────────────────────────────
|
||||
|
||||
METADATA_EXTRACTION_PROMPT = """אתה מנתח פסקי דין של בתי דין לעבודה בתחום ביטוח לאומי.
|
||||
חלץ את המטאדאטא הבאה מתוך פסק הדין והחזר אותה כ-JSON בלבד:
|
||||
|
||||
{
|
||||
"title": "כותרת תיאורית קצרה של פסק הדין",
|
||||
"court": "שם בית המשפט (למשל: בית הדין האזורי לעבודה תל אביב)",
|
||||
"decision_date": "YYYY-MM-DD או null אם לא נמצא",
|
||||
"case_number": "מספר תיק (למשל: בל 12345-06-20)",
|
||||
"judge": "שם השופט/ת",
|
||||
"parties_appellant": "שם התובע/מערער",
|
||||
"parties_respondent": "שם הנתבע/משיב",
|
||||
"topics": ["רשימת נושאים רלוונטיים מתוך הרשימה למטה"],
|
||||
"outcome": "accepted/rejected/partial/remanded",
|
||||
"summary": "תקציר של 2-3 משפטים"
|
||||
}
|
||||
|
||||
נושאים אפשריים: נכות כללית, נכות מעבודה, תאונת עבודה, דמי לידה, דמי אבטלה, גמלת הבטחת הכנסה, גמלת ניידות, גמלת סיעוד, קצבת זקנה, קצבת שאירים, מילואים, דמי פגיעה, נפגעי פעולות איבה
|
||||
|
||||
החזר JSON בלבד, ללא טקסט נוסף."""
|
||||
|
||||
|
||||
_anthropic_client: anthropic.Anthropic | None = None
|
||||
|
||||
|
||||
def _get_anthropic() -> anthropic.Anthropic:
    """Return the module-wide Anthropic client, constructing it on first use."""
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client
    # First call: build the client from the configured API key and cache it.
    _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    return _anthropic_client
|
||||
|
||||
|
||||
def _parse_metadata_json(response_text: str) -> dict:
    """Parse the JSON object embedded in a model reply; return {} on failure."""
    # Slice from the first "{" to the last "}" so that markdown fences or
    # stray text around the object do not break parsing.
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start == -1 or end <= start:
        logger.warning("No JSON object in metadata response: %s", response_text[:200])
        return {}
    try:
        return json.loads(response_text[start:end + 1])
    except json.JSONDecodeError:
        logger.warning("Failed to parse metadata JSON: %s", response_text[:200])
        return {}


async def _extract_metadata_with_claude(text: str) -> dict:
    """Extract metadata from decision text using Claude.

    Sends the extraction prompt plus the first 5000 characters of ``text`` to
    the model and parses the JSON object from the reply.

    Args:
        text: Full extracted text of the decision.

    Returns:
        The parsed metadata dict, or an empty dict when the reply is empty or
        contains no parseable JSON object. Errors raised by the API call
        itself propagate to the caller.
    """
    client = _get_anthropic()
    # Use first ~5000 chars (usually contains all metadata)
    excerpt = text[:5000]

    # The client call is synchronous; run it in a worker thread so the event
    # loop keeps serving other requests while waiting for the API.
    message = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": f"{METADATA_EXTRACTION_PROMPT}\n\nפסק הדין:\n{excerpt}",
            }
        ],
    )

    response_text = ""
    if message.content:
        # Tolerate a first block without text instead of raising.
        response_text = (getattr(message.content[0], "text", "") or "").strip()

    return _parse_metadata_json(response_text)
|
||||
|
||||
|
||||
# ── Background Processing ─────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -340,3 +446,117 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
|
||||
"chunks": chunk_count,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
async def _process_dinleumi_decision(task_id: str, source: Path, req: DinLeumiRequest):
    """Process a National Insurance court decision with auto metadata extraction.

    Copies the upload into the Din Leumi decisions directory, extracts its
    text, asks Claude for metadata, stores the decision record, chunks and
    embeds the text, and finally removes the upload. Progress is published in
    ``_progress[task_id]`` after each step.

    Args:
        task_id: Key under which progress is published.
        source: Path of the uploaded file.
        req: Original request; ``req.title`` overrides the extracted title.

    Any exception is logged and reported as status ``"failed"`` in
    ``_progress`` instead of being raised.
    """
    from datetime import date as date_type

    try:
        # Step 1: Copy to din-leumi decisions directory
        _progress[task_id] = {"status": "copying", "filename": req.filename}
        # Drop a leading "<digits>_" prefix from the stored name
        # (presumably added at upload time — TODO confirm).
        original_name = re.sub(r"^\d+_", "", source.name)
        dest = dl_config.DECISIONS_DIR / original_name
        if dest.exists():
            # Avoid overwriting an existing decision file with the same name.
            dest = dl_config.DECISIONS_DIR / f"{dest.stem}_{int(time.time())}{dest.suffix}"
        shutil.copy2(str(source), str(dest))

        # Step 2: Extract text
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
        text, page_count = await dl_extractor.extract_text(str(dest))

        # Step 3: Extract metadata with Claude
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting_metadata"}
        metadata = await _extract_metadata_with_claude(text)
        if not isinstance(metadata, dict):
            metadata = {}

        def meta_text(key: str, default: str = "") -> str:
            """Return metadata[key] as text; fall back on missing, null or empty."""
            value = metadata.get(key)
            if value is None or value == "":
                return default
            return str(value)

        # The model may return null or a non-list for topics.
        topics = metadata.get("topics")
        if not isinstance(topics, list):
            topics = None

        # Parse date
        d_date = None
        raw_date = metadata.get("decision_date")
        if raw_date:
            try:
                d_date = date_type.fromisoformat(raw_date)
            except (ValueError, TypeError):
                d_date = None

        # User title wins, then the extracted one, then the file name.
        title = req.title or meta_text("title") or Path(original_name).stem

        # Step 4: Create decision record
        _progress[task_id] = {"status": "registering", "filename": req.filename}
        decision = await dl_db.create_decision(
            title=title,
            file_path=str(dest),
            court=meta_text("court"),
            decision_date=d_date,
            case_number=meta_text("case_number"),
            judge=meta_text("judge"),
            parties_appellant=meta_text("parties_appellant"),
            parties_respondent=meta_text("parties_respondent", "המוסד לביטוח לאומי"),
            topics=topics,
            outcome=meta_text("outcome"),
        )

        decision_id = UUID(decision["id"])

        # Update with extracted text
        await dl_db.update_decision(
            decision_id,
            extracted_text=text,
            page_count=page_count,
            summary=meta_text("summary"),
        )

        # Step 5: Chunk
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
        from din_leumi.services import chunker as dl_chunker, embeddings as dl_embeddings
        chunks = dl_chunker.chunk_document(text)

        chunk_count = 0
        if chunks:
            # Step 6: Embed
            _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "embedding"}
            texts = [c.content for c in chunks]
            embs = await dl_embeddings.embed_texts(texts, input_type="document")

            chunk_dicts = [
                {
                    "content": c.content,
                    "section_type": c.section_type,
                    "embedding": emb,
                    "page_number": c.page_number,
                    "chunk_index": c.chunk_index,
                }
                for c, emb in zip(chunks, embs)
            ]
            await dl_db.store_chunks(decision_id, chunk_dicts)
            chunk_count = len(chunks)

        await dl_db.update_decision(decision_id, extraction_status="completed")
        await dl_db.ensure_ivfflat_index()

        # Remove from uploads
        source.unlink(missing_ok=True)

        _progress[task_id] = {
            "status": "completed",
            "filename": req.filename,
            "system": "din-leumi",
            "result": {
                "decision_id": str(decision_id),
                "title": title,
                "pages": page_count,
                "text_length": len(text),
                "chunks": chunk_count,
            },
            "metadata": {
                "court": meta_text("court"),
                "judge": meta_text("judge"),
                "case_number": meta_text("case_number"),
                "decision_date": meta_text("decision_date"),
                "outcome": meta_text("outcome"),
                "topics": topics or [],
                "summary": meta_text("summary"),
            },
        }

    except Exception as e:
        # Top-level boundary of a background task: record the failure for the
        # progress stream rather than letting the exception vanish.
        logger.exception("Din Leumi processing failed for %s", req.filename)
        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>עוזר משפטי — העלאת מסמכים</title>
|
||||
<title>העלאת מסמכים משפטיים</title>
|
||||
<style>
|
||||
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
body {
|
||||
@@ -234,8 +234,8 @@ header p { opacity: 0.7; margin-top: 4px; font-size: 0.9em; }
|
||||
<body>
|
||||
<div class="container">
|
||||
<header>
|
||||
<h1>עוזר משפטי — העלאת מסמכים</h1>
|
||||
<p>העלאה, סיווג ועיבוד מסמכים משפטיים</p>
|
||||
<h1>העלאת מסמכים משפטיים</h1>
|
||||
<p>העלאה, סיווג ועיבוד מסמכים — עזר משפטי | דין לאומי</p>
|
||||
</header>
|
||||
|
||||
<!-- Upload Zone -->
|
||||
@@ -363,8 +363,9 @@ async function loadPending() {
|
||||
</div>
|
||||
</div>
|
||||
<div class="radio-group">
|
||||
<label><input type="radio" name="cat_${esc(f.filename)}" value="training" onchange="showFields(this)"> החלטה קודמת (אימון)</label>
|
||||
<label><input type="radio" name="cat_${esc(f.filename)}" value="case" onchange="showFields(this)"> מסמך תיק</label>
|
||||
<label><input type="radio" name="cat_${esc(f.filename)}" value="case" onchange="showFields(this)"> עזר משפטי — מסמך תיק</label>
|
||||
<label><input type="radio" name="cat_${esc(f.filename)}" value="training" onchange="showFields(this)"> עזר משפטי — החלטה (אימון)</label>
|
||||
<label><input type="radio" name="cat_${esc(f.filename)}" value="dinleumi" onchange="showFields(this)"> דין לאומי — פסק דין בל"ל</label>
|
||||
</div>
|
||||
<div class="conditional case-fields" id="case_${esc(f.filename)}">
|
||||
<div class="form-row">
|
||||
@@ -407,6 +408,9 @@ async function loadPending() {
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="conditional dinleumi-fields" id="dinleumi_${esc(f.filename)}">
|
||||
<p style="font-size:0.85em;color:#636e72;margin:4px 0 8px">המטאדאטא (בית משפט, שופט, תאריך, נושא, תוצאה) תחולץ אוטומטית מתוך פסק הדין</p>
|
||||
</div>
|
||||
<div style="margin-top:12px">
|
||||
<div class="form-group" style="max-width:300px;margin-bottom:8px">
|
||||
<label>כותרת (אופציונלי)</label>
|
||||
@@ -420,11 +424,11 @@ async function loadPending() {
|
||||
|
||||
function showFields(radio) {
    // Show only the field group that matches the selected category, then
    // enable the process button for this pending file.
    const fileCard = radio.closest('.pending-file');
    const selected = radio.value;

    const groups = [
        ['case', '.case-fields'],
        ['training', '.training-fields'],
        ['dinleumi', '.dinleumi-fields'],
    ];
    for (const [category, selector] of groups) {
        fileCard.querySelector(selector).classList.toggle('active', selected === category);
    }

    fileCard.querySelector('.process-btn').disabled = false;
}
|
||||
|
||||
@@ -439,20 +443,30 @@ async function classifyFile(filename) {
|
||||
const category = container.querySelector('input[type="radio"]:checked')?.value;
|
||||
if (!category) return toast('יש לבחור סיווג', 'error');
|
||||
|
||||
const body = {
|
||||
filename,
|
||||
category,
|
||||
title: container.querySelector('.doc-title').value,
|
||||
};
|
||||
let endpoint = API + '/classify';
|
||||
let body;
|
||||
|
||||
if (category === 'case') {
|
||||
body.case_number = container.querySelector('.case-select').value;
|
||||
body.doc_type = container.querySelector('.doctype-select').value;
|
||||
if (!body.case_number) return toast('יש לבחור תיק', 'error');
|
||||
if (category === 'dinleumi') {
|
||||
endpoint = API + '/classify-dinleumi';
|
||||
body = {
|
||||
filename,
|
||||
title: container.querySelector('.doc-title').value,
|
||||
};
|
||||
} else {
|
||||
body.decision_number = container.querySelector('.decision-number').value;
|
||||
body.decision_date = container.querySelector('.decision-date').value;
|
||||
body.subject_categories = Array.from(container.querySelectorAll('.subject-grid input:checked')).map(cb => cb.value);
|
||||
body = {
|
||||
filename,
|
||||
category,
|
||||
title: container.querySelector('.doc-title').value,
|
||||
};
|
||||
if (category === 'case') {
|
||||
body.case_number = container.querySelector('.case-select').value;
|
||||
body.doc_type = container.querySelector('.doctype-select').value;
|
||||
if (!body.case_number) return toast('יש לבחור תיק', 'error');
|
||||
} else {
|
||||
body.decision_number = container.querySelector('.decision-number').value;
|
||||
body.decision_date = container.querySelector('.decision-date').value;
|
||||
body.subject_categories = Array.from(container.querySelectorAll('.subject-grid input:checked')).map(cb => cb.value);
|
||||
}
|
||||
}
|
||||
|
||||
// Disable button
|
||||
@@ -460,7 +474,7 @@ async function classifyFile(filename) {
|
||||
container.querySelector('.process-btn').textContent = 'מעבד...';
|
||||
|
||||
try {
|
||||
const res = await fetch(API + '/classify', {
|
||||
const res = await fetch(endpoint, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify(body),
|
||||
@@ -501,6 +515,7 @@ function trackTask(taskId, displayName) {
|
||||
|
||||
const STEP_LABELS = {
|
||||
extracting: 'מחלץ טקסט',
|
||||
extracting_metadata: 'מחלץ מטאדאטא מפסק הדין',
|
||||
corpus: 'מוסיף לקורפוס',
|
||||
chunking: 'מפצל לקטעים',
|
||||
embedding: 'יוצר embeddings',
|
||||
@@ -530,7 +545,19 @@ function trackTask(taskId, displayName) {
|
||||
div.classList.add('done');
|
||||
es.close();
|
||||
const r = data.result || {};
|
||||
if (r.chunks !== undefined) {
|
||||
const m = data.metadata;
|
||||
if (m && data.system === 'din-leumi') {
|
||||
// Show extracted metadata for din-leumi
|
||||
let metaHtml = `הושלם — ${r.chunks || 0} קטעים, ${r.pages || '?'} עמודים`;
|
||||
const parts = [];
|
||||
if (m.court) parts.push(m.court);
|
||||
if (m.judge) parts.push('שופט: ' + m.judge);
|
||||
if (m.decision_date) parts.push(m.decision_date);
|
||||
if (m.outcome) parts.push({accepted:'התקבלה',rejected:'נדחתה',partial:'חלקית',remanded:'הוחזרה'}[m.outcome] || m.outcome);
|
||||
if (parts.length) metaHtml += '<br><small style="color:#636e72">' + esc(parts.join(' | ')) + '</small>';
|
||||
if (m.topics && m.topics.length) metaHtml += '<br><small style="color:#0984e3">' + m.topics.map(t => esc(t)).join(', ') + '</small>';
|
||||
statusEl.innerHTML = metaHtml;
|
||||
} else if (r.chunks !== undefined) {
|
||||
statusEl.textContent = `הושלם — ${r.chunks} קטעים, ${r.pages || '?'} עמודים`;
|
||||
}
|
||||
toast('העיבוד הושלם: ' + esc(displayName), 'success');
|
||||
|
||||
Reference in New Issue
Block a user