Remove din-leumi: fully separate into standalone service
- Removed din-leumi imports, endpoints, and processing from app.py
- Removed bundled din-leumi source from repo
- Simplified Dockerfile (no din-leumi dependency)
- din-leumi now runs as its own Coolify application

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
214
web/app.py
214
web/app.py
@@ -16,8 +16,6 @@ from uuid import UUID, uuid4
|
||||
|
||||
# Allow importing legal_mcp from the MCP server source
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
# Allow importing din_leumi from its MCP server source
|
||||
sys.path.insert(0, str(Path.home() / "din-leumi" / "mcp-server" / "src"))
|
||||
|
||||
from fastapi import FastAPI, File, HTTPException, UploadFile
|
||||
from fastapi.responses import FileResponse, StreamingResponse
|
||||
@@ -28,13 +26,6 @@ from legal_mcp import config
|
||||
from legal_mcp.services import chunker, db, embeddings, extractor, processor
|
||||
from legal_mcp.tools import cases as cases_tools, search as search_tools, workflow as workflow_tools, drafting as drafting_tools
|
||||
|
||||
# Din Leumi imports (aliased to avoid collision)
|
||||
from din_leumi import config as dl_config
|
||||
from din_leumi.services import db as dl_db
|
||||
from din_leumi.services import processor as dl_processor
|
||||
from din_leumi.services import extractor as dl_extractor
|
||||
|
||||
import anthropic
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -49,12 +40,9 @@ _progress: dict[str, dict] = {}
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare storage and database schemas on startup; close pools on shutdown.

    Startup creates the upload directory and the Din Leumi decisions
    directory, then initialises the schema of both databases. The code after
    ``yield`` runs when the application shuts down.
    """
    UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
    dl_config.DECISIONS_DIR.mkdir(parents=True, exist_ok=True)
    await db.init_schema()
    await dl_db.init_schema()
    try:
        yield
    finally:
        # Cleanup must run even when an exception is thrown into the
        # generator at ``yield``. The nested try/finally closes the second
        # pool even if closing the first one raises.
        try:
            await db.close_pool()
        finally:
            await dl_db.close_pool()
|
||||
|
||||
|
||||
app = FastAPI(title="העלאת מסמכים משפטיים", lifespan=lifespan)
|
||||
@@ -558,94 +546,6 @@ async def api_document_text(doc_id: str):
|
||||
return {"doc_id": doc_id, "text": text}
|
||||
|
||||
|
||||
# ── Din Leumi Endpoint ────────────────────────────────────────────
|
||||
|
||||
|
||||
# Request body for POST /api/classify-dinleumi.
class DinLeumiRequest(BaseModel):
    # Name of a previously uploaded file; the endpoint resolves it against UPLOAD_DIR.
    filename: str
    # Optional title override; when empty, the background job falls back to
    # the extracted metadata title or the file name.
    title: str = ""
|
||||
|
||||
|
||||
# Strong references to in-flight Din Leumi background tasks. The event loop
# only keeps weak references to tasks, so a task that nothing else references
# may be garbage-collected before it finishes.
_dinleumi_tasks: set = set()


@app.post("/api/classify-dinleumi")
async def classify_dinleumi(req: DinLeumiRequest):
    """Queue an uploaded decision for Din Leumi processing with auto metadata extraction.

    Args:
        req: Names the uploaded file and optionally overrides the title.

    Returns:
        ``{"task_id": ...}``; progress for that id is published in ``_progress``.

    Raises:
        HTTPException: 404 when ``req.filename`` is not a regular file located
            directly inside ``UPLOAD_DIR``.
    """
    source = UPLOAD_DIR / req.filename
    # is_file() rejects directories, which would otherwise pass the check and
    # only fail later inside the background job. samefile() rejects names that
    # resolve to a location outside the upload directory.
    if not source.is_file() or not source.parent.samefile(UPLOAD_DIR):
        raise HTTPException(404, "File not found in uploads")

    task_id = str(uuid4())
    _progress[task_id] = {"status": "queued", "filename": req.filename}

    task = asyncio.create_task(_process_dinleumi_decision(task_id, source, req))
    # Hold a reference until the task completes, then drop it.
    _dinleumi_tasks.add(task)
    task.add_done_callback(_dinleumi_tasks.discard)

    return {"task_id": task_id}
|
||||
|
||||
|
||||
# ── Metadata Extraction ──────────────────────────────────────────
|
||||
|
||||
METADATA_EXTRACTION_PROMPT = """אתה מנתח פסקי דין של בתי דין לעבודה בתחום ביטוח לאומי.
|
||||
חלץ את המטאדאטא הבאה מתוך פסק הדין והחזר אותה כ-JSON בלבד:
|
||||
|
||||
{
|
||||
"title": "כותרת תיאורית קצרה של פסק הדין",
|
||||
"court": "שם בית המשפט (למשל: בית הדין האזורי לעבודה תל אביב)",
|
||||
"decision_date": "YYYY-MM-DD או null אם לא נמצא",
|
||||
"case_number": "מספר תיק (למשל: בל 12345-06-20)",
|
||||
"judge": "שם השופט/ת",
|
||||
"parties_appellant": "שם התובע/מערער",
|
||||
"parties_respondent": "שם הנתבע/משיב",
|
||||
"topics": ["רשימת נושאים רלוונטיים מתוך הרשימה למטה"],
|
||||
"outcome": "accepted/rejected/partial/remanded",
|
||||
"summary": "תקציר של 2-3 משפטים"
|
||||
}
|
||||
|
||||
נושאים אפשריים: נכות כללית, נכות מעבודה, תאונת עבודה, דמי לידה, דמי אבטלה, גמלת הבטחת הכנסה, גמלת ניידות, גמלת סיעוד, קצבת זקנה, קצבת שאירים, מילואים, דמי פגיעה, נפגעי פעולות איבה
|
||||
|
||||
החזר JSON בלבד, ללא טקסט נוסף."""
|
||||
|
||||
|
||||
# Process-wide Anthropic client; stays None until the first request needs it.
_anthropic_client: anthropic.Anthropic | None = None


def _get_anthropic() -> anthropic.Anthropic:
    """Return the shared Anthropic client, building it on first use.

    The client is constructed with ``config.ANTHROPIC_API_KEY`` and cached in
    the module-level ``_anthropic_client`` for all later calls.
    """
    global _anthropic_client
    if _anthropic_client is not None:
        return _anthropic_client
    _anthropic_client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
    return _anthropic_client
|
||||
|
||||
|
||||
async def _extract_metadata_with_claude(text: str) -> dict:
    """Extract metadata from decision text using Claude.

    Only the first 5000 characters of ``text`` are sent, since the opening of
    a decision usually contains all the metadata.

    Args:
        text: Full extracted text of the decision.

    Returns:
        The JSON object from the model's reply as a dict. An empty dict is
        returned when the reply has no content, is not valid JSON, or is
        valid JSON that is not an object.
    """
    client = _get_anthropic()
    # Use first ~5000 chars (usually contains all metadata)
    excerpt = text[:5000]

    # The client call is synchronous; run it in a worker thread so the event
    # loop keeps serving other requests while waiting for the API.
    message = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-20250514",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": f"{METADATA_EXTRACTION_PROMPT}\n\nפסק הדין:\n{excerpt}",
            }
        ],
    )

    if not message.content:
        logger.warning("Metadata extraction returned an empty response")
        return {}

    response_text = message.content[0].text.strip()
    # Parse JSON from response (handle potential markdown wrapping)
    if response_text.startswith("```"):
        response_text = response_text.split("```")[1]
        if response_text.startswith("json"):
            response_text = response_text[4:]
    try:
        metadata = json.loads(response_text)
    except json.JSONDecodeError:
        logger.warning("Failed to parse metadata JSON: %s", response_text[:200])
        return {}

    # Callers use dict.get() on the result, so reject lists/strings/numbers.
    if not isinstance(metadata, dict):
        logger.warning("Metadata JSON is not an object: %s", response_text[:200])
        return {}

    return metadata
|
||||
|
||||
|
||||
# ── Background Processing ─────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -802,117 +702,3 @@ async def _process_training_document(task_id: str, source: Path, req: ClassifyRe
|
||||
"chunks": chunk_count,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
async def _process_dinleumi_decision(task_id: str, source: Path, req: DinLeumiRequest):
    """Process a National Insurance court decision with auto metadata extraction.

    Steps: copy the upload into the Din Leumi decisions directory, extract its
    text, extract metadata with Claude, create and update the decision record,
    chunk / embed / store the text, then delete the upload. The current step
    is published in ``_progress[task_id]`` throughout.

    Args:
        task_id: Key under which progress is reported in ``_progress``.
        source: Path of the uploaded file.
        req: Original request; supplies the file name and optional title.

    Any exception is logged and recorded as status ``"failed"`` in
    ``_progress`` instead of being propagated.
    """
    from datetime import date as date_type

    try:
        # Step 1: Copy to din-leumi decisions directory
        _progress[task_id] = {"status": "copying", "filename": req.filename}
        # Drop a leading "<digits>_" prefix from the stored upload name.
        original_name = re.sub(r"^\d+_", "", source.name)
        dest = dl_config.DECISIONS_DIR / original_name
        if dest.exists():
            # Avoid overwriting an existing decision file with the same name.
            dest = dl_config.DECISIONS_DIR / f"{dest.stem}_{int(time.time())}{dest.suffix}"
        shutil.copy2(str(source), str(dest))

        # Step 2: Extract text
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting"}
        text, page_count = await dl_extractor.extract_text(str(dest))

        # Step 3: Extract metadata with Claude
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "extracting_metadata"}
        metadata = await _extract_metadata_with_claude(text)
        if not isinstance(metadata, dict):
            # Everything below relies on dict.get(); treat anything else as "no metadata".
            metadata = {}

        # Parse date
        d_date = None
        if metadata.get("decision_date"):
            try:
                d_date = date_type.fromisoformat(metadata["decision_date"])
            except (ValueError, TypeError):
                d_date = None

        # `or` rather than a .get() default: the model may return the key with
        # an empty string or null, which must still fall back to the file name.
        fallback_title = original_name.rsplit(".", 1)[0]
        title = req.title or metadata.get("title") or fallback_title

        # Step 4: Create decision record
        _progress[task_id] = {"status": "registering", "filename": req.filename}
        # JSON null values are coalesced to the same defaults used for missing keys.
        decision = await dl_db.create_decision(
            title=title,
            file_path=str(dest),
            court=metadata.get("court") or "",
            decision_date=d_date,
            case_number=metadata.get("case_number") or "",
            judge=metadata.get("judge") or "",
            parties_appellant=metadata.get("parties_appellant") or "",
            parties_respondent=metadata.get("parties_respondent") or "המוסד לביטוח לאומי",
            topics=metadata.get("topics"),
            outcome=metadata.get("outcome") or "",
        )

        decision_id = UUID(decision["id"])

        # Update with extracted text
        await dl_db.update_decision(
            decision_id,
            extracted_text=text,
            page_count=page_count,
            summary=metadata.get("summary") or "",
        )

        # Step 5: Chunk
        _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "chunking"}
        from din_leumi.services import chunker as dl_chunker, embeddings as dl_embeddings
        chunks = dl_chunker.chunk_document(text)

        chunk_count = 0
        if chunks:
            # Step 6: Embed
            _progress[task_id] = {"status": "processing", "filename": req.filename, "step": "embedding"}
            texts = [c.content for c in chunks]
            embs = await dl_embeddings.embed_texts(texts, input_type="document")

            chunk_dicts = [
                {
                    "content": c.content,
                    "section_type": c.section_type,
                    "embedding": emb,
                    "page_number": c.page_number,
                    "chunk_index": c.chunk_index,
                }
                for c, emb in zip(chunks, embs)
            ]
            await dl_db.store_chunks(decision_id, chunk_dicts)
            chunk_count = len(chunks)

        await dl_db.update_decision(decision_id, extraction_status="completed")
        await dl_db.ensure_ivfflat_index()

        # Remove from uploads
        source.unlink(missing_ok=True)

        _progress[task_id] = {
            "status": "completed",
            "filename": req.filename,
            "system": "din-leumi",
            "result": {
                "decision_id": str(decision_id),
                "title": title,
                "pages": page_count,
                "text_length": len(text),
                "chunks": chunk_count,
            },
            "metadata": {
                "court": metadata.get("court", ""),
                "judge": metadata.get("judge", ""),
                "case_number": metadata.get("case_number", ""),
                "decision_date": metadata.get("decision_date", ""),
                "outcome": metadata.get("outcome", ""),
                "topics": metadata.get("topics", []),
                "summary": metadata.get("summary", ""),
            },
        }

    except Exception as e:
        # Top-level boundary of a background task: record the failure for the
        # progress endpoint rather than letting the exception vanish.
        logger.exception("Din Leumi processing failed for %s", req.filename)
        _progress[task_id] = {"status": "failed", "error": str(e), "filename": req.filename}
|
||||
|
||||
Reference in New Issue
Block a user