Files
legal-ai/scripts/.archive/extract-citations.py
Chaim 5c9a5d702a Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh,
notify.py, bidi_table.py

Archived (17): one-time migration/seeding scripts whose functionality
is now in MCP server or web API. Moved to scripts/.archive/

Deleted (5): zero-value scripts (duplicates, hardcoded single-case,
debug scripts)

Added scripts/SCRIPTS.md — registry of all scripts with purpose,
status, and what superseded them. CLAUDE.md updated with rule:
any script change requires SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 16:30:19 +00:00

135 lines
4.6 KiB
Python

#!/usr/bin/env python3
"""Extract case law citations from block-yod and link to case_law table."""
import asyncio
import re
import sys
from pathlib import Path
from uuid import UUID
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
# Patterns for Israeli case law citations
# Compiled regexes used to spot Israeli case-law citations in free text.
# Each one is named individually so it can be read (and debugged) on its own;
# CITATION_PATTERNS below fixes the order in which they are applied.

# A court-proceeding abbreviation (e.g. עע"מ, בג"ץ, ע"א) followed by a case
# number. Group 1 is the abbreviation, group 2 the number.
_COURT_PREFIX_CITATION = re.compile(
    r'(עע"מ|בג"ץ|ע"א|בר"ם|עת"מ|עמ"נ|ע"ע|רע"א|דנ"א|בש"א)\s*(\d[\d/\-]+)'
)

# The word "ערר" with an optional district name (optionally parenthesised)
# before the number. Group 1 is the number.
_APPEAL_WITH_DISTRICT_CITATION = re.compile(
    r'ערר\s*\(?\s*(?:מרכז|ירושלים|חי\'?|ת"א|דרום|צפון)?\s*\)?\s*(\d[\d/\-]+)'
)

# The word "ערר" directly followed by a strictly shaped number: 3-5 digits,
# a "-" or "/" separator, then 2-4 digits. Group 1 is the number.
_APPEAL_PLAIN_CITATION = re.compile(
    r'ערר\s+(\d{3,5}[\-/]\d{2,4})'
)

CITATION_PATTERNS = [
    _COURT_PREFIX_CITATION,
    _APPEAL_WITH_DISTRICT_CITATION,
    _APPEAL_PLAIN_CITATION,
]
def extract_citations_from_text(
    text: str,
    patterns: "list[re.Pattern] | None" = None,
) -> list[dict]:
    """Find all distinct case law citations in ``text``.

    Patterns are applied in order. A match is reported once per distinct
    matched string, and a match is dropped when its span overlaps a span that
    an earlier match already claimed. Without the overlap check, two patterns
    that recognise the same citation with slightly different extents (for
    example a number with and without a trailing ``-23`` suffix) would report
    it twice.

    Args:
        text: The text to scan. A falsy value (``None`` or ``""``) yields an
            empty list.
        patterns: Compiled regexes to apply, in priority order. Defaults to
            the module-level ``CITATION_PATTERNS``.

    Returns:
        A list of dicts, in discovery order, each with:
          * ``"citation_text"`` - the exact matched string;
          * ``"context"`` - up to 50 characters before the match and up to
            100 characters after it, with newlines replaced by spaces.
    """
    if not text:
        return []
    if patterns is None:
        patterns = CITATION_PATTERNS

    chars_before = 50
    chars_after = 100

    citations = []
    seen_texts = set()
    claimed_spans = []  # (start, end) of every match accepted so far

    for pattern in patterns:
        for match in pattern.finditer(text):
            m_start, m_end = match.span()

            # Skip a match that covers text already attributed to a citation.
            overlaps = any(
                m_start < c_end and c_start < m_end
                for c_start, c_end in claimed_spans
            )
            if overlaps:
                continue
            # Claim the span even when the text turns out to be a repeat, so a
            # later pattern cannot re-report a shorter piece of this occurrence.
            claimed_spans.append((m_start, m_end))

            full_match = match.group(0)
            if full_match in seen_texts:
                continue
            seen_texts.add(full_match)

            ctx_start = max(0, m_start - chars_before)
            ctx_end = min(len(text), m_end + chars_after)
            citations.append({
                "citation_text": full_match,
                "context": text[ctx_start:ctx_end].replace("\n", " "),
            })
    return citations
def _build_case_law_index(case_laws) -> dict:
    """Map each usable spelling of a case number to its ``case_law`` id."""
    index = {}
    for row in case_laws:
        case_number = (row["case_number"] or "").strip()
        if not case_number:
            # An empty key is a substring of every citation and would make the
            # fuzzy fallback link everything to this row.
            continue
        index[case_number] = row["id"]
        # Also index by the bare number (e.g. "3975/22" from "עע"מ 3975/22").
        tokens = case_number.split()
        if len(tokens) > 1:
            index[tokens[-1]] = row["id"]
    return index


def _match_case_law(citation_text: str, index: dict):
    """Return the ``case_law`` id for ``citation_text``, or ``None``."""
    # 1. The citation is spelled exactly like a known case number.
    hit = index.get(citation_text)
    if hit is not None:
        return hit
    # 2. The citation's trailing number token is a known case number.
    tokens = citation_text.split()
    if tokens:
        hit = index.get(tokens[-1])
        if hit is not None:
            return hit
    # 3. Fuzzy fallback: a substring relation in either direction. This runs
    #    last because it can pair "975/22" with "3975/22".
    for key, case_law_id in index.items():
        if key in citation_text or citation_text in key:
            return case_law_id
    return None


async def main():
    """Link citations found in 'block-yod' decision blocks to ``case_law`` rows.

    Steps:
      1. Load every ``decision_blocks`` row with ``block_id = 'block-yod'``
         and a positive word count, together with its decision id and the
         owning case number.
      2. Load all ``case_law`` rows and index them by case number.
      3. Extract citations from each block. For every citation that resolves
         to a ``case_law`` row, insert a ``case_law_citations`` row with
         ``citation_type = 'support'`` unless that (case_law, decision) pair
         is already linked.
      4. Print per-case progress and a final summary.

    The pool is closed in a ``finally`` clause so a failing query does not
    leave it open.
    """
    await init_schema()
    pool = await get_pool()

    total_citations = 0
    total_linked = 0
    try:
        # One connection serves the whole run; the work is strictly sequential.
        async with pool.acquire() as conn:
            # Get all block-yod content with decision info
            blocks = await conn.fetch(
                """SELECT db.content, d.id as decision_id, c.case_number
                FROM decision_blocks db
                JOIN decisions d ON d.id = db.decision_id
                JOIN cases c ON c.id = d.case_id
                WHERE db.block_id = 'block-yod' AND db.word_count > 0
                ORDER BY c.case_number"""
            )
            # Get existing case_law for matching
            case_laws = await conn.fetch(
                "SELECT id, case_number, case_name FROM case_law"
            )
            case_law_map = _build_case_law_index(case_laws)

            for block in blocks:
                case_number = block["case_number"]
                decision_id = block["decision_id"]
                citations = extract_citations_from_text(block["content"])
                if not citations:
                    continue
                print(f"\n{case_number}: {len(citations)} ציטוטים נמצאו")

                for cit in citations:
                    total_citations += 1
                    case_law_id = _match_case_law(
                        cit["citation_text"], case_law_map
                    )
                    if case_law_id is None:
                        print(f"{cit['citation_text'][:40]} — לא נמצא ב-DB")
                        continue

                    # Insert only when this (case_law, decision) pair is new.
                    existing = await conn.fetchval(
                        """SELECT id FROM case_law_citations
                        WHERE case_law_id = $1 AND decision_id = $2""",
                        case_law_id, decision_id,
                    )
                    if not existing:
                        await conn.execute(
                            """INSERT INTO case_law_citations
                            (case_law_id, decision_id, citation_type, context_text)
                            VALUES ($1, $2, 'support', $3)""",
                            case_law_id, decision_id, cit["context"],
                        )
                        total_linked += 1
                        print(f"{cit['citation_text'][:40]} → קושר לפסיקה")

            # Summary
            total_in_db = await conn.fetchval(
                "SELECT count(*) FROM case_law_citations"
            )
    finally:
        await close_pool()

    print(f"\n{'='*50}")
    print(f"סה\"כ ציטוטים שנמצאו: {total_citations}")
    print(f"סה\"כ קושרו לפסיקה ב-DB: {total_linked}")
    print(f"סה\"כ ב-case_law_citations: {total_in_db}")
# Script entry point: run the async pipeline only when this file is executed
# directly, never when it is imported.
if __name__ == "__main__":
    asyncio.run(main())