Remove din-leumi: fully separate into standalone service

- Removed din-leumi imports, endpoints, and processing from app.py
- Removed bundled din-leumi source from repo
- Simplified Dockerfile (no din-leumi dependency)
- din-leumi now runs as its own Coolify application

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 08:34:35 +00:00
parent 324807ff1d
commit cb41867bc9
16 changed files with 3 additions and 1549 deletions
--- a/web/app.py
+++ b/web/app.py
@@ -16,8 +16,6 @@ from uuid import UUID, uuid4

 # Allow importing legal_mcp from the MCP server source
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
-# Allow importing din_leumi from its MCP server source
-sys.path.insert(0, str(Path.home() / "din-leumi" / "mcp-server" / "src"))

 from fastapi import FastAPI, File, HTTPException, UploadFile
 from fastapi.responses import FileResponse, StreamingResponse
@@ -28,13 +26,6 @@ from legal_mcp import config
 from legal_mcp.services import chunker, db, embeddings, extractor, processor
 from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools

-# Din Leumi imports (aliased to avoid collision)
-from din_leumi import config as dl_config
-from din_leumi.services import db as dl_db
-from din_leumi.services import processor as dl_processor
-from din_leumi.services import extractor as dl_extractor
-
-import anthropic

 logger = logging.getLogger(__name__)

@@ -49,12 +40,9 @@ _progress: dict[str, dict] = {}
@asynccontextmanager
 async def lifespan(app: FastAPI):
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
-    dl_config.DECISIONS_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
-    await dl_db.init_schema()
    yield
    await db.close_pool()
-    await dl_db.close_pool()


 app = FastAPI(title="העלאת מסמכים משפטיים", lifespan=lifespan)
@@ -558,94 +546,6 @@ async def api_document_text(doc_id: str):
    return {"doc_id": doc_id, "text": text}


-# ── Din Leumi Endpoint ────────────────────────────────────────────
-
-
-class DinLeumiRequest(BaseModel):
-    filename: str
-    title: str = ""
-
-
-@app.post("/api/classify-dinleumi")
-async def classify_dinleumi(req: DinLeumiRequest):
-    """Upload a decision to Din Leumi with auto metadata extraction."""
-    source = UPLOAD_DIR / req.filename
-    if not source.exists() or not source.parent.samefile(UPLOAD_DIR):
-        raise HTTPException(404, "File not found in uploads")
-
-    task_id = str(uuid4())
-    _progress[task_id] = {"status": "queued", "filename": req.filename}
-
-    asyncio.create_task(_process_dinleumi_decision(task_id, source, req))
-
-    return {"task_id": task_id}
-
-
-# ── Metadata Extraction ──────────────────────────────────────────
-
-METADATA_EXTRACTION_PROMPT = """אתה מנתח פסקי דין של בתי דין לעבודה בתחום ביטוח לאומי.
-חלץ את המטאדאטא הבאה מתוך פסק הדין והחזר אותה כ-JSON בלבד:
-
-{
-  "title": "כותרת תיאורית קצרה של פסק הדין",
-  "court": "שם בית המשפט (למשל: בית הדין האזורי לעבודה תל אביב)",
-  "decision_date": "YYYY-MM-DD או null אם לא נמצא",
-  "case_number": "מספר תיק (למשל: בל 12345-06-20)",
-  "judge": "שם השופט/ת",
-  "parties_appellant": "שם התובע/מערער",
-  "parties_respondent": "שם הנתבע/משיב",
-  "topics": ["רשימת נושאים רלוונטיים מתוך הרשימה למטה"],
-  "outcome": "accepted/rejected/partial/remanded",
-  "summary": "תקציר של 2-3 משפטים"
-}
-
-נושאים אפשריים: נכות כללית, נכות מעבודה, תאונת עבודה, דמי לידה, דמי אבטלה, גמלת הבטחת הכנסה, גמלת ניידות, גמלת סיעוד, קצבת זקנה, קצבת שאירים, מילואים, דמי פגיעה, נפגעי פעולות איבה
-
-החזר JSON בלבד, ללא טקסט נוסף."""
-
-
-_anthropic_client: anthropic.Anthropic | None = None
-
-
-def _get_anthropic() -> anthropic.Anthropic:
-    global _anthropic_client
-    if _anthropic_client is None:
-        _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
-    return _anthropic_client
-
-
-async def _extract_metadata_with_claude(text: str) -> dict:
-    """Extract metadata from decision text using Claude."""
-    client = _get_anthropic()
-    # Use first ~5000 chars (usually contains all metadata)
-    excerpt = text[:5000]
-
-    message = client.messages.create(
-        model="claude-sonnet-4-20250514",
-        max_tokens=1024,
-        messages=[
-            {
-                "role": "user",
-                "content": f"{METADATA_EXTRACTION_PROMPT}\n\nפסק הדין:\n{excerpt}",
-            }
-        ],
-    )
-
-    response_text = message.content[0].text.strip()
-    # Parse JSON from response (handle potential markdown wrapping)
-    if response_text.startswith("```"):
-        response_text = response_text.split("```")[1]
-        if response_text.startswith("json"):
-            response_text = response_text[4:]
-    try:
-        metadata = json.loads(response_text)
-    except json.JSONDecodeError:
-        logger.warning("Failed to parse metadata JSON: %s", response_text[:200])
-        metadata = {}
-
-    return metadata
-
-
 # ── Background Processing ─────────────────────────────────────────


@@ -802,117 +702,3 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
            "chunks": chunk_count,
        },
    }
-
-
-async def _process_dinleumi_decision(task_id: str, source: Path, req: DinLeumiRequest):
-    """Process a National Insurance court decision with auto metadata extraction."""
-    from datetime import date as date_type
-
-    try:
-        # Step 1: Copy to din-leumi decisions directory
-        _progress[task_id] = {"status": "copying", "filename": req.filename}
-        original_name = re.sub(r"^\d+_", "", source.name)
-        dest = dl_config.DECISIONS_DIR / original_name
-        if dest.exists():
-            dest = dl_config.DECISIONS_DIR / f"{dest.stem}_{int(time.time())}{dest.suffix}"
-        shutil.copy2(str(source), str(dest))
-
-        # Step 2: Extract text
-        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
-        text, page_count = await dl_extractor.extract_text(str(dest))
-
-        # Step 3: Extract metadata with Claude
-        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting_metadata"}
-        metadata = await _extract_metadata_with_claude(text)
-
-        # Parse date
-        d_date = None
-        if metadata.get("decision_date"):
-            try:
-                d_date = date_type.fromisoformat(metadata["decision_date"])
-            except (ValueError, TypeError):
-                d_date = None
-
-        title = req.title or metadata.get("title", original_name.rsplit(".", 1)[0])
-
-        # Step 4: Create decision record
-        _progress[task_id] = {"status": "registering", "filename": req.filename}
-        decision = await dl_db.create_decision(
-            title=title,
-            file_path=str(dest),
-            court=metadata.get("court", ""),
-            decision_date=d_date,
-            case_number=metadata.get("case_number", ""),
-            judge=metadata.get("judge", ""),
-            parties_appellant=metadata.get("parties_appellant", ""),
-            parties_respondent=metadata.get("parties_respondent", "המוסד לביטוח לאומי"),
-            topics=metadata.get("topics"),
-            outcome=metadata.get("outcome", ""),
-        )
-
-        decision_id = UUID(decision["id"])
-
-        # Update with extracted text
-        await dl_db.update_decision(
-            decision_id,
-            extracted_text=text,
-            page_count=page_count,
-            summary=metadata.get("summary", ""),
-        )
-
-        # Step 5: Chunk
-        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
-        from din_leumi.services import chunker as dl_chunker, embeddings as dl_embeddings
-        chunks = dl_chunker.chunk_document(text)
-
-        chunk_count = 0
-        if chunks:
-            # Step 6: Embed
-            _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "embedding"}
-            texts = [c.content for c in chunks]
-            embs = await dl_embeddings.embed_texts(texts, input_type="document")
-
-            chunk_dicts = [
-                {
-                    "content": c.content,
-                    "section_type": c.section_type,
-                    "embedding": emb,
-                    "page_number": c.page_number,
-                    "chunk_index": c.chunk_index,
-                }
-                for c, emb in zip(chunks, embs)
-            ]
-            await dl_db.store_chunks(decision_id, chunk_dicts)
-            chunk_count = len(chunks)
-
-        await dl_db.update_decision(decision_id, extraction_status="completed")
-        await dl_db.ensure_ivfflat_index()
-
-        # Remove from uploads
-        source.unlink(missing_ok=True)
-
-        _progress[task_id] = {
-            "status": "completed",
-            "filename": req.filename,
-            "system": "din-leumi",
-            "result": {
-                "decision_id": str(decision_id),
-                "title": title,
-                "pages": page_count,
-                "text_length": len(text),
-                "chunks": chunk_count,
-            },
-            "metadata": {
-                "court": metadata.get("court", ""),
-                "judge": metadata.get("judge", ""),
-                "case_number": metadata.get("case_number", ""),
-                "decision_date": metadata.get("decision_date", ""),
-                "outcome": metadata.get("outcome", ""),
-                "topics": metadata.get("topics", []),
-                "summary": metadata.get("summary", ""),
-            },
-        }
-
-    except Exception as e:
-        logger.exception("Din Leumi processing failed for %s", req.filename)
-        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}