@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare storage and schemas on startup; release DB pools on shutdown.

    Startup creates the upload directory and the Din Leumi decisions
    directory (including missing parents), then calls ``init_schema()`` on
    both database modules. Shutdown calls ``close_pool()`` on both.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    dl_config.DECISIONS_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
    await dl_db.init_schema()
    try:
        yield
    finally:
        # Run shutdown even when the generator is closed by an exception or a
        # cancellation delivered at ``yield``; the nested ``finally`` makes
        # sure the second pool is still closed if closing the first one fails.
        try:
            await db.close_pool()
        finally:
            await dl_db.close_pool()
# Strong references to in-flight Din Leumi background tasks. The event loop
# keeps only weak references to tasks, so a task whose handle is dropped may be
# garbage-collected before it finishes.
_dinleumi_tasks: set = set()


@app.post("/api/classify-dinleumi")
async def classify_dinleumi(req: DinLeumiRequest):
    """Queue an uploaded decision for Din Leumi processing.

    Args:
        req: Request naming a file inside ``UPLOAD_DIR`` and an optional
            title override.

    Returns:
        ``{"task_id": ...}`` — the key under which progress is published in
        ``_progress``.

    Raises:
        HTTPException: 404 when the name does not refer to a regular file
            located directly inside ``UPLOAD_DIR``.
    """
    source = UPLOAD_DIR / req.filename
    # ``is_file`` (rather than ``exists``) also rejects directory names, which
    # would otherwise only fail later inside the background task.
    if not source.is_file() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    task = asyncio.create_task(_process_dinleumi_decision(task_id, source, req))
    # Hold the task until it completes, then drop the reference.
    _dinleumi_tasks.add(task)
    task.add_done_callback(_dinleumi_tasks.discard)

    return {"task_id": task_id}
def _parse_decision_date(raw: object) -> "date | None":
    """Return *raw* parsed as an ISO-format date, or ``None`` if unusable.

    Missing, empty, non-string and malformed values all yield ``None``.
    """
    from datetime import date

    if not raw:
        return None
    try:
        return date.fromisoformat(raw)
    except (ValueError, TypeError):
        return None


def _metadata_text(metadata: dict, key: str, default: str = "") -> str:
    """Return ``metadata[key]`` as a stripped string, or *default*.

    The extracted metadata comes from model-generated JSON, so a field may be
    absent, ``null``, blank, or not a string at all; every such case falls
    back to *default*.
    """
    value = metadata.get(key)
    if isinstance(value, str) and value.strip():
        return value.strip()
    return default


def _metadata_topics(metadata: dict) -> "list[str] | None":
    """Return the ``topics`` field as a list of strings, or ``None``.

    A bare string is wrapped in a list, non-string items are dropped, and any
    other shape (including a missing field) yields ``None``.
    """
    topics = metadata.get("topics")
    if isinstance(topics, str):
        topics = [topics]
    if not isinstance(topics, list):
        return None
    return [topic for topic in topics if isinstance(topic, str)]


async def _process_dinleumi_decision(
    task_id: str, source: Path, req: DinLeumiRequest
) -> None:
    """Process a National Insurance court decision with auto metadata extraction.

    Steps: copy the upload into the Din Leumi decisions directory, extract its
    text, extract metadata, create and update the decision record, chunk and
    embed the text, store the chunks, and finally remove the upload. The
    current stage is published in ``_progress[task_id]`` throughout.

    Args:
        task_id: Key in ``_progress`` under which status is published.
        source: Path of the uploaded file.
        req: Original request; ``req.title`` overrides the extracted title.

    Any exception is logged and reported as a ``"failed"`` status instead of
    propagating. If the failure happens before a decision record was created,
    the copied file is removed again so that a retry does not accumulate
    timestamped duplicates.
    """
    dest = None
    decision = None
    try:
        # Step 1: Copy to din-leumi decisions directory
        _progress[task_id] = {"status": "copying", "filename": req.filename}
        # Strip a leading "<digits>_" prefix from the stored upload name.
        original_name = re.sub(r"^\d+_", "", source.name)
        target = dl_config.DECISIONS_DIR / original_name
        if target.exists():
            # Avoid overwriting an existing decision with the same name.
            target = (
                dl_config.DECISIONS_DIR
                / f"{target.stem}_{int(time.time())}{target.suffix}"
            )
        dest = target
        shutil.copy2(str(source), str(dest))

        # Step 2: Extract text
        _progress[task_id] = {
            "status": "processing",
            "filename": req.filename,
            "step": "extracting",
        }
        text, page_count = await dl_extractor.extract_text(str(dest))

        # Step 3: Extract metadata with Claude
        _progress[task_id] = {
            "status": "processing",
            "filename": req.filename,
            "step": "extracting_metadata",
        }
        # NOTE(review): _extract_metadata_with_claude is declared async but
        # issues its request through a synchronous client call, so the event
        # loop is blocked until the API responds. Fix it in that function.
        metadata = await _extract_metadata_with_claude(text)
        if not isinstance(metadata, dict):
            # Valid JSON that is not an object (e.g. a list) is unusable here.
            metadata = {}

        d_date = _parse_decision_date(metadata.get("decision_date"))
        court = _metadata_text(metadata, "court")
        judge = _metadata_text(metadata, "judge")
        case_number = _metadata_text(metadata, "case_number")
        outcome = _metadata_text(metadata, "outcome")
        summary = _metadata_text(metadata, "summary")
        topics = _metadata_topics(metadata)

        # Precedence: explicit request title, extracted title, file stem.
        fallback_title = original_name.rsplit(".", 1)[0]
        title = req.title or _metadata_text(metadata, "title", fallback_title)

        # Step 4: Create decision record
        _progress[task_id] = {"status": "registering", "filename": req.filename}
        decision = await dl_db.create_decision(
            title=title,
            file_path=str(dest),
            court=court,
            decision_date=d_date,
            case_number=case_number,
            judge=judge,
            parties_appellant=_metadata_text(metadata, "parties_appellant"),
            parties_respondent=_metadata_text(
                metadata, "parties_respondent", "המוסד לביטוח לאומי"
            ),
            topics=topics,
            outcome=outcome,
        )

        decision_id = UUID(decision["id"])

        # Update with extracted text
        await dl_db.update_decision(
            decision_id,
            extracted_text=text,
            page_count=page_count,
            summary=summary,
        )

        # Step 5: Chunk
        _progress[task_id] = {
            "status": "processing",
            "filename": req.filename,
            "step": "chunking",
        }
        from din_leumi.services import chunker as dl_chunker, embeddings as dl_embeddings

        chunks = dl_chunker.chunk_document(text)

        chunk_count = 0
        if chunks:
            # Step 6: Embed
            _progress[task_id] = {
                "status": "processing",
                "filename": req.filename,
                "step": "embedding",
            }
            texts = [c.content for c in chunks]
            embs = await dl_embeddings.embed_texts(texts, input_type="document")

            chunk_dicts = [
                {
                    "content": c.content,
                    "section_type": c.section_type,
                    "embedding": emb,
                    "page_number": c.page_number,
                    "chunk_index": c.chunk_index,
                }
                for c, emb in zip(chunks, embs)
            ]
            await dl_db.store_chunks(decision_id, chunk_dicts)
            chunk_count = len(chunks)

        await dl_db.update_decision(decision_id, extraction_status="completed")
        await dl_db.ensure_ivfflat_index()

        # Remove from uploads
        source.unlink(missing_ok=True)

        _progress[task_id] = {
            "status": "completed",
            "filename": req.filename,
            "system": "din-leumi",
            "result": {
                "decision_id": str(decision_id),
                "title": title,
                "pages": page_count,
                "text_length": len(text),
                "chunks": chunk_count,
            },
            "metadata": {
                "court": court,
                "judge": judge,
                "case_number": case_number,
                "decision_date": metadata.get("decision_date") or "",
                "outcome": outcome,
                "topics": topics or [],
                "summary": summary,
            },
        }

    except Exception as e:
        logger.exception("Din Leumi processing failed for %s", req.filename)
        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}
        # Nothing references the copy until a decision record exists, and the
        # upload is still in place for a retry, so remove the orphaned file.
        if dest is not None and decision is None:
            try:
                dest.unlink(missing_ok=True)
            except OSError:
                logger.warning("Could not remove orphaned copy %s", dest)