#!/usr/bin/env python3
"""Extract case law citations from block-yod and link to case_law table."""

import asyncio
import re
import sys
from pathlib import Path
from typing import Any, Iterable, Mapping
from uuid import UUID

sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))

# Patterns for Israeli case law citations
CITATION_PATTERNS = [
    # Court-type prefix (עע"מ, בג"ץ, ע"א, etc.) followed by a case number
    re.compile(r'(עע"מ|בג"ץ|ע"א|בר"ם|עת"מ|עמ"נ|ע"ע|רע"א|דנ"א|בש"א)\s*(\d[\d/\-]+)'),
    # ערר with an optional (possibly parenthesised) district and a number
    re.compile(r'ערר\s*\(?\s*(?:מרכז|ירושלים|חי\'?|ת"א|דרום|צפון)?\s*\)?\s*(\d[\d/\-]+)'),
    # ערר without district
    re.compile(r'ערר\s+(\d{3,5}[\-/]\d{2,4})'),
]

# Number of characters of surrounding text kept before / after each citation.
CONTEXT_CHARS_BEFORE = 50
CONTEXT_CHARS_AFTER = 100


def extract_citations_from_text(text: str) -> list[dict]:
    """Find all case law citations in text.

    Each pattern in CITATION_PATTERNS is scanned in turn. A citation whose
    matched text was already seen (from any pattern) is reported only once,
    with the context of its first occurrence.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with keys ``citation_text`` (the matched text) and
        ``context`` (up to 50 characters before and 100 characters after the
        match, with newlines replaced by spaces). Results are ordered by
        pattern, then by position in the text.
    """
    citations = []
    seen = set()

    for pattern in CITATION_PATTERNS:
        for match in pattern.finditer(text):
            full_match = match.group(0)
            if full_match in seen:
                continue
            seen.add(full_match)

            # Surrounding context window, clamped to the text bounds.
            start = max(0, match.start() - CONTEXT_CHARS_BEFORE)
            end = min(len(text), match.end() + CONTEXT_CHARS_AFTER)
            context = text[start:end].replace("\n", " ")

            citations.append({
                "citation_text": full_match,
                "context": context,
            })

    return citations


def build_case_law_index(rows: Iterable[Mapping[str, Any]]) -> dict[str, Any]:
    """Map each known form of a case number to its case_law id.

    Every row is indexed by its full ``case_number`` and, when the number
    contains whitespace, also by its last whitespace-separated token
    (e.g. "3975/22" from 'עע"מ 3975/22').

    Args:
        rows: Records exposing ``id`` and ``case_number`` by key.

    Returns:
        A dict from case-number form to id. Rows with an empty or missing
        case number are skipped, because an empty key would match every
        citation.
    """
    index: dict[str, Any] = {}
    for row in rows:
        case_number = row["case_number"]
        if not case_number:
            continue
        index[case_number] = row["id"]
        parts = case_number.split()
        if len(parts) > 1:
            index[parts[-1]] = row["id"]
    return index


def _contains_whole(haystack: str, needle: str) -> bool:
    """Return True if needle occurs in haystack not glued to a letter or digit."""
    if not needle:
        return False
    start = haystack.find(needle)
    while start != -1:
        end = start + len(needle)
        clear_before = start == 0 or not haystack[start - 1].isalnum()
        clear_after = end == len(haystack) or not haystack[end].isalnum()
        if clear_before and clear_after:
            return True
        start = haystack.find(needle, start + 1)
    return False


def find_case_law_id(citation_text: str, case_law_map: Mapping[str, Any]) -> Any:
    """Return the id of the first index entry matching the citation, or None.

    An entry matches when its key appears inside the citation, or the
    citation appears inside the key, as a whole token. Plain substring
    containment is not enough: it would link "975/22" to "3975/22" and
    'ע"א 123/20' to 'רע"א 123/20'.

    Args:
        citation_text: The citation text as matched in the decision.
        case_law_map: Index as produced by build_case_law_index.

    Returns:
        The id stored for the first matching key, in the map's iteration
        order, or None when nothing matches.
    """
    for key, case_law_id in case_law_map.items():
        if _contains_whole(citation_text, key) or _contains_whole(key, citation_text):
            return case_law_id
    return None


async def main():
    """Scan every non-empty block-yod, link its citations, and print a summary.

    For each citation that matches a case_law row, a case_law_citations row
    is inserted unless one already exists for that (case_law_id, decision_id)
    pair. The pool is closed even if a query fails.
    """
    # Imported here rather than at module level so the parsing and matching
    # helpers above stay importable without the DB package being installed.
    from legal_mcp.services.db import close_pool, get_pool, init_schema

    await init_schema()
    pool = await get_pool()

    total_citations = 0
    total_linked = 0

    try:
        async with pool.acquire() as conn:
            # Get all block-yod content with decision info
            blocks = await conn.fetch(
                "SELECT db.content, d.id as decision_id, c.case_number "
                "FROM decision_blocks db "
                "JOIN decisions d ON d.id = db.decision_id "
                "JOIN cases c ON c.id = d.case_id "
                "WHERE db.block_id = 'block-yod' AND db.word_count > 0 "
                "ORDER BY c.case_number"
            )

            # Get existing case_law for matching
            case_laws = await conn.fetch("SELECT id, case_number, case_name FROM case_law")

        case_law_map = build_case_law_index(case_laws)

        for block in blocks:
            case_number = block["case_number"]
            decision_id = block["decision_id"]

            citations = extract_citations_from_text(block["content"])
            if not citations:
                continue

            print(f"\n{case_number}: {len(citations)} ציטוטים נמצאו")

            async with pool.acquire() as conn:
                for cit in citations:
                    total_citations += 1

                    # Try to match to case_law table
                    case_law_id = find_case_law_id(cit["citation_text"], case_law_map)
                    if not case_law_id:
                        print(f" ⬜ {cit['citation_text'][:40]} — לא נמצא ב-DB")
                        continue

                    # Insert only if this decision is not already linked to
                    # this case law.
                    existing = await conn.fetchval(
                        "SELECT id FROM case_law_citations "
                        "WHERE case_law_id = $1 AND decision_id = $2",
                        case_law_id, decision_id,
                    )
                    if not existing:
                        await conn.execute(
                            "INSERT INTO case_law_citations "
                            "(case_law_id, decision_id, citation_type, context_text) "
                            "VALUES ($1, $2, 'support', $3)",
                            case_law_id, decision_id, cit["context"],
                        )
                    # Counted as linked whether the row was just inserted or
                    # already existed.
                    total_linked += 1
                    print(f" ✅ {cit['citation_text'][:40]} → קושר לפסיקה")

        # Summary
        async with pool.acquire() as conn:
            total_in_db = await conn.fetchval("SELECT count(*) FROM case_law_citations")
    finally:
        await close_pool()

    print(f"\n{'='*50}")
    print(f"סה\"כ ציטוטים שנמצאו: {total_citations}")
    print(f"סה\"כ קושרו לפסיקה ב-DB: {total_linked}")
    print(f"סה\"כ ב-case_law_citations: {total_in_db}")


if __name__ == "__main__":
    asyncio.run(main())