Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh, notify.py, bidi_table.py. Archived (17): one-time migration/seeding scripts whose functionality is now in the MCP server or web API; moved to scripts/.archive/. Deleted (5): zero-value scripts (duplicates, hardcoded single-case, debug scripts). Added scripts/SCRIPTS.md — registry of all scripts with purpose, status, and what superseded them. CLAUDE.md updated with rule: any script change requires SCRIPTS.md update. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
163
scripts/.archive/backfill_pattern_frequency.py
Normal file
163
scripts/.archive/backfill_pattern_frequency.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Backfill style_patterns.frequency with real occurrence counts.
|
||||
|
||||
The analyzer currently stores frequency=1 for every pattern (it only extracts
|
||||
unique patterns, doesn't count occurrences). This script scans the full_text
|
||||
of every decision in style_corpus and updates each pattern's frequency to
|
||||
the true count of decisions containing the pattern_text as a substring.
|
||||
|
||||
Run once after analysis, and again whenever new decisions are added.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
# Load KEY=VALUE pairs from ~/.env into the process environment.
# Variables that are already set win (setdefault), and surrounding quotes
# around a value are removed.
ENV_FILE = Path.home() / ".env"
# A missing ~/.env is not an error: the variables may already be exported.
if ENV_FILE.exists():
    for line in ENV_FILE.read_text().splitlines():
        if "=" in line and not line.startswith("#"):
            k, v = line.split("=", 1)
            os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))
|
||||
|
||||
sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
|
||||
|
||||
from legal_mcp.services import db as db_mod # noqa: E402
|
||||
|
||||
|
||||
def _strip_nikud(text: str) -> str:
    """Return *text* with all combining marks (e.g. Hebrew nikud) removed.

    The text is NFD-decomposed so every mark becomes its own code point,
    then each code point with a non-zero combining class is dropped. The
    result is left in decomposed form (no recomposition is done).
    """
    decomposed = unicodedata.normalize("NFD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
|
||||
|
||||
|
||||
def _extract_searchable_variants(pattern_text: str) -> list[str]:
    """Turn a pattern template into concrete substrings to search for.

    Templates may contain:
      - placeholders in [brackets]: "בפנינו ערר על החלטת [הגוף] מיום [תאריך]"
      - alternatives separated by "/" or by the word "או":
        "נפנה ל... / ראה והשווה / נפנה להחלטה"
      - an ellipsis ("..." or "…") standing for a variable part

    For each alternative the variable parts are cut out and the longest
    remaining fixed segment of at least 4 characters is kept (the first one
    on a tie). If no segment is that long, the whole alternative with the
    variable parts replaced by spaces is used, provided it has at least
    4 characters.

    Returns:
        The chosen substrings, de-duplicated, in order of first appearance.
    """
    candidates: list[str] = []

    for raw_alt in re.split(r"\s*/\s*|\s+או\s+", pattern_text):
        alt = raw_alt.strip()
        if not alt:
            continue

        # Mark every variable part with "|": [placeholders], dot runs, "…".
        alt = re.sub(r"\[[^\]]*\]", "|", alt)
        alt = re.sub(r"\.{2,}", "|", alt)
        alt = alt.replace("…", "|")

        fixed_parts = [part.strip(" ,.:;\"'") for part in alt.split("|")]
        usable = [part for part in fixed_parts if len(part) >= 4]

        if usable:
            # max() returns the first of equally long segments.
            candidates.append(max(usable, key=len))
        elif alt.strip():
            whole = alt.replace("|", " ").strip()
            if len(whole) >= 4:
                candidates.append(whole)

    # dict keys keep insertion order, so this de-duplicates stably.
    return list(dict.fromkeys(candidates))
|
||||
|
||||
|
||||
def _count_decisions_containing(variants: list[str], normalized_decisions: list) -> int:
    """Return how many decisions contain at least one of *variants*.

    Args:
        variants: Substrings to look for.
        normalized_decisions: 3-item sequences whose last item is the text
            that is searched; the first two items are ignored.
    """
    return sum(
        1
        for _first, _second, body in normalized_decisions
        if any(variant in body for variant in variants)
    )
|
||||
|
||||
|
||||
async def main() -> int:
    """Recompute ``style_patterns.frequency`` for every pattern and print a report.

    Loads every decision with non-empty ``full_text`` from ``style_corpus``
    and every row of ``style_patterns``. For each pattern it counts how many
    decisions contain at least one of the pattern's searchable variants
    (combining marks are stripped on both sides), writes all counts back in
    one batch, then prints the 15 most frequent patterns and the frequency
    distribution.

    Returns:
        Always 0; the value is passed to ``sys.exit``.
    """
    pool = await db_mod.get_pool()

    async with pool.acquire() as conn:
        decisions = await conn.fetch(
            "SELECT id, decision_number, full_text FROM style_corpus "
            "WHERE full_text IS NOT NULL AND length(full_text) > 0"
        )
        patterns = await conn.fetch(
            "SELECT id, pattern_text, pattern_type FROM style_patterns"
        )

        print(f"Scanning {len(patterns)} patterns across {len(decisions)} decisions...")

        # Normalize decisions once so each pattern scan reuses the result.
        normalized_decisions = [
            (d["id"], d["decision_number"], _strip_nikud(d["full_text"]))
            for d in decisions
        ]

        updates = []
        for p in patterns:
            pattern_text = p["pattern_text"]
            count = 0
            # Missing patterns, patterns shorter than 3 characters and
            # patterns without a usable variant all get frequency 0.
            if pattern_text and len(pattern_text) >= 3:
                variants = _extract_searchable_variants(_strip_nikud(pattern_text))
                if variants:
                    count = _count_decisions_containing(variants, normalized_decisions)
            updates.append((count, p["id"]))

        await conn.executemany(
            "UPDATE style_patterns SET frequency = $1 WHERE id = $2",
            updates,
        )

        # Show distribution
        rows = await conn.fetch(
            "SELECT pattern_type, pattern_text, frequency "
            "FROM style_patterns "
            "ORDER BY frequency DESC "
            "LIMIT 15"
        )
        print("\nTop 15 patterns by real frequency:")
        for r in rows:
            # The loop above already tolerates a missing pattern_text, so the
            # report must too: slicing None or applying "<22" to None would
            # raise TypeError. Fall back to an empty string instead.
            text = r["pattern_text"] or ""
            ptype = r["pattern_type"] or ""
            print(f" {r['frequency']:>3} [{ptype:<22}] {text[:90]}")

        dist = await conn.fetch(
            "SELECT frequency, count(*) FROM style_patterns "
            "GROUP BY frequency ORDER BY frequency DESC"
        )
        print("\nFrequency distribution:")
        for r in dist:
            print(f" frequency={r['frequency']:>3} → {r['count']} patterns")

    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))
|
||||
349
scripts/.archive/batch_upload_training.py
Normal file
349
scripts/.archive/batch_upload_training.py
Normal file
@@ -0,0 +1,349 @@
|
||||
"""Batch upload proofread training corpus to style DB.
|
||||
|
||||
Two-phase workflow:
|
||||
--preview Extract metadata from all .md files, print review table, don't upload
|
||||
--upload Actually upload all files (with optional --only FILE to run one)
|
||||
|
||||
Metadata extraction:
|
||||
* decision_number: from filename (ARAR-YY-NNNN / ערר NNNN-YY) or decision date year
|
||||
* decision_date: from "ניתנה ... <day> ב<Hebrew month> <YYYY>" near end of text
|
||||
* categories: keyword heuristics on body text
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Directory holding the proofread .md decisions that get uploaded.
PROOFREAD_DIR = Path("/home/chaim/legal-ai/data/training/proofread")

# Manual metadata overrides, keyed by file name, for files where
# auto-extraction can't determine a value.
METADATA_OVERRIDES: dict[str, dict] = {
    "ARAR-25-1067 - יחיעם יפה ואח׳.md": {
        # The file has no "ניתנה" signature line; the date was user-provided.
        "decision_date": "2025-11-27",
    },
}

# Files to skip — already in style_corpus from legacy ingestion
# (verified by exact character-count match with existing DB rows).
SKIP_FILES = {
    "תמא 38-בית הכרם-1126+1141-החלטה.md",  # corpus entry: 1126/1141
    "היתר בניה-בית שמש-1180+1181-החלטה.md",  # corpus entry: 1180/1181
    "היתר בניה-הראל-1043+1054-החלטה.md",  # corpus entry: 1043/1054
    "היתר בניה-הראל-1071+1077-החלטה.md",  # corpus entry: 1071/1077
}

# Load env vars needed by mcp-server from ~/.env, if that file exists.
# Variables already present in the environment are left untouched.
ENV_FILE = Path.home() / ".env"
if ENV_FILE.exists():
    for raw_line in ENV_FILE.read_text().splitlines():
        if raw_line.startswith("#") or "=" not in raw_line:
            continue
        name, _, value = raw_line.partition("=")
        os.environ.setdefault(name.strip(), value.strip().strip('"').strip("'"))

# Make the mcp-server package importable.
sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
|
||||
|
||||
|
||||
# ── Decision number extraction ───────────────────────────────────
|
||||
|
||||
# Filename shapes, tried in this order by decision_number_from_filename().
FILENAME_NUMBER_PATTERNS = [
    # ARAR-YY-NNNN[-X] - title
    re.compile(r"^ARAR-(\d{2})-(\d{3,4})"),
    # "ערר NNNN-YY title"
    re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"),
    # "ערר NNNN - title" (no year in the filename — needs a date lookup)
    re.compile(r"^ערר\s+(\d{3,4})\s*-"),
]

# Legacy merged decisions: "NNNN+NNNN" anywhere in the name.
LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})")


def decision_number_from_filename(stem: str) -> tuple[str | None, str | None]:
    """Derive a decision number from a file stem.

    Returns:
        ``("NNNN/YY", "YY")`` when the stem carries both number and year,
        ``("NNNN/??", None)`` or ``("NNNN+NNNN/??", None)`` when the year is
        missing and has to be completed from the decision date, and
        ``(None, None)`` when no known shape matches.
    """
    arar_re, number_year_re, number_only_re = FILENAME_NUMBER_PATTERNS

    hit = arar_re.match(stem)
    if hit is not None:
        yy, number = hit.groups()
        return f"{number}/{yy}", yy

    hit = number_year_re.match(stem)
    if hit is not None:
        number, yy = hit.groups()
        return f"{number}/{yy}", yy

    hit = number_only_re.match(stem)
    if hit is not None:
        return f"{hit.group(1)}/??", None

    hit = LEGACY_MULTI_PATTERN.search(stem)
    if hit is not None:
        first, second = hit.groups()
        return f"{first}+{second}/??", None

    return None, None
|
||||
|
||||
|
||||
# ── Decision date extraction ─────────────────────────────────────
|
||||
|
||||
# Month names, with and without the "ב" ("in") prefix, mapped to 1-12.
HEBREW_MONTHS = {
    "ינואר": 1, "בינואר": 1,
    "פברואר": 2, "בפברואר": 2,
    "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3,
    "אפריל": 4, "באפריל": 4,
    "מאי": 5, "במאי": 5,
    "יוני": 6, "ביוני": 6,
    "יולי": 7, "ביולי": 7,
    "אוגוסט": 8, "באוגוסט": 8,
    "ספטמבר": 9, "בספטמבר": 9,
    "אוקטובר": 10, "באוקטובר": 10,
    "נובמבר": 11, "בנובמבר": 11,
    "דצמבר": 12, "בדצמבר": 12,
}

# Matches "<day> ב<month>, <year>" or "<day> <month>, <year>" (comma/period optional).
DATE_RE = re.compile(
    r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})"
)

# Marker that opens the signing sentence ("ניתנה ..." / "ניתנו ...").
NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?")


def decision_date_from_text(text: str) -> str | None:
    """Extract the decision date (YYYY-MM-DD) from the signing block.

    Only the last 2500 characters of *text* are searched. Within that tail
    the date following the LAST "ניתנה" marker is preferred: the marker
    regex also matches inside words such as "שניתנה", so an earlier marker
    may belong to a cited decision rather than to the signing block. If no
    marker is followed by a date, the first date in the tail is used.

    Returns:
        The ISO date string, or ``None`` when no date is found or the
        matched day/month/year is not a valid calendar date.
    """
    from datetime import date

    tail = text[-2500:]  # slicing already copes with shorter texts

    m = None
    # Walk the markers from the end so the one closest to the end wins.
    for marker in reversed(list(NITNA_RE.finditer(tail))):
        m = DATE_RE.search(tail, marker.start())
        if m:
            break
    if not m:
        # Fall back: first date anywhere in the tail.
        m = DATE_RE.search(tail)
    if not m:
        return None

    day = int(m.group(1))
    month = HEBREW_MONTHS.get(m.group(2))
    year = int(m.group(3))
    if not month:
        return None
    try:
        return date(year, month, day).isoformat()
    except ValueError:
        # e.g. "31 בפברואר" — matched textually but is not a real date.
        return None
|
||||
|
||||
|
||||
# ── Subject category extraction ──────────────────────────────────
|
||||
|
||||
# Categories as defined in the tool signature.
|
||||
ALL_CATEGORIES = [
    "בנייה", "שימוש חורג", "תכנית", "היתר", "הקלה",
    "חלוקה", 'תמ"א 38', "היטל השבחה", "פיצויים 197",
]


def categorize(text: str) -> list[str]:
    """Guess the subject categories of a decision from its text.

    The opening 2000 characters are treated as the place where the real
    subject is stated; for some categories a repetition count over the whole
    text is used as a second signal. Categories are appended in a fixed
    order, and ``["בנייה"]`` is returned when nothing matched.
    """
    head = text[:2000]
    found: list[str] = []

    def occurrences(pattern: str) -> int:
        """Number of regex matches of *pattern* in the whole text."""
        return len(re.findall(pattern, text))

    def in_head(pattern: str) -> bool:
        """Whether *pattern* matches inside the opening section."""
        return re.search(pattern, head) is not None

    # תמ"א 38 — a single mention anywhere is enough.
    if re.search(r'תמ[״"\']?א\s*38|תמא\s*38', text):
        found.append('תמ"א 38')

    # היטל השבחה — in the opening, or at least 3 mentions overall.
    betterment = r"היטל(?:י)?\s+השבחה"
    if occurrences(betterment) >= 3 or in_head(betterment):
        found.append("היטל השבחה")

    # פיצויים 197 — in the opening, or at least 2 mentions overall.
    section_197 = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197"
    if occurrences(section_197) >= 2 or in_head(section_197):
        found.append("פיצויים 197")

    # שימוש חורג — in the opening, or at least 3 literal mentions overall.
    nonconforming = "שימוש חורג"
    if text.count(nonconforming) >= 3 or nonconforming in head:
        found.append(nonconforming)

    # הקלה — needs BOTH at least 3 mentions and a mention in the opening.
    relief = r"\bהקלה\b|\bהקלות\b"
    if occurrences(relief) >= 3 and in_head(relief):
        found.append("הקלה")

    # חלוקה — specific phrases only, anywhere in the text.
    if re.search(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", text):
        found.append("חלוקה")

    # תכנית — plan-level phrases in the opening.
    plan_subject = (
        r"הפקדת\s+ה?תכנית|"
        r"אישור\s+ה?תכנית|"
        r"המלצה\s+להפקיד|"
        r"להפקיד\s+את\s+ה?תכנית|"
        r"לדון\s+בתכנית|"
        r"דנה\s+בתכנית|"
        r"החלטה\s+לאשר\s+ה?תכנית"
    )
    if in_head(plan_subject):
        found.append("תכנית")

    # היתר — permit request / building permit named in the opening.
    if in_head(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה"):
        found.append("היתר")

    # בנייה accompanies every permit-like subject; it cannot be present yet
    # at this point, so no duplicate check is needed.
    permit_like = {"היתר", "הקלה", 'תמ"א 38'}
    if permit_like.intersection(found):
        found.append("בנייה")

    return found if found else ["בנייה"]
|
||||
|
||||
|
||||
# ── Year fallback from date ──────────────────────────────────────
|
||||
|
||||
|
||||
def finalize_decision_number(number: str | None, date_iso: str | None) -> str:
    """Complete a decision number whose year is unknown.

    Args:
        number: Number derived from the filename; may be empty/None or end
            with the ``"/??"`` placeholder.
        date_iso: Decision date as ``YYYY-MM-DD``, or None/empty if unknown.

    Returns:
        ``"??/YY"`` (or ``""`` without a date) when there is no number; the
        number with ``"/??"`` replaced by ``"/YY"`` (or removed without a
        date) when the year placeholder is present; otherwise *number* as is.
        ``YY`` is characters 2-3 of *date_iso*.
    """
    yy = date_iso[2:4] if date_iso else None

    if not number:
        return f"??/{yy}" if yy is not None else ""

    if not number.endswith("/??"):
        return number

    replacement = f"/{yy}" if yy is not None else ""
    return number.replace("/??", replacement)
|
||||
|
||||
|
||||
# ── Main metadata extraction ─────────────────────────────────────
|
||||
|
||||
|
||||
def extract_metadata(path: Path) -> dict:
    """Build the upload metadata for one proofread decision file.

    Reads the file as UTF-8, derives the decision number from the file name,
    the decision date and the categories from the text, and finally applies
    any manual override registered for the file name.

    Returns:
        A dict with ``file``, ``decision_number``, ``decision_date``
        (``"??"`` when unknown), ``categories`` and ``chars``.
    """
    text = path.read_text(encoding="utf-8")
    overrides = METADATA_OVERRIDES.get(path.name, {})

    num_from_name, _ = decision_number_from_filename(path.stem)
    date_iso = decision_date_from_text(text)

    # A user-supplied date must also feed the year fallback of the decision
    # number; applying the override only afterwards would leave the number
    # without a year whenever auto-extraction found no date.
    date_for_number = overrides.get("decision_date") or date_iso

    meta = {
        "file": path.name,
        "decision_number": finalize_decision_number(num_from_name, date_for_number),
        "decision_date": date_iso or "??",
        "categories": categorize(text),
        "chars": len(text),
    }
    # Manual overrides win over every auto-extracted value.
    meta.update(overrides)
    return meta
|
||||
|
||||
|
||||
def print_preview(results: list[dict]) -> None:
    """Print a review table of the extracted metadata, then list problem files.

    A file counts as incomplete when its date is ``"??"``, its decision
    number is empty, or the number still contains ``"??"``.
    """
    header = f"\n{'#':<3} {'FILE':<55} {'NUMBER':<15} {'DATE':<12} {'CATEGORIES'}"
    print(header)
    print("-" * 130)

    for index, row in enumerate(results, 1):
        name = row["file"]
        # Names longer than 53 characters are cut to 50 plus "...".
        shown = name if len(name) <= 53 else name[:50] + "..."
        joined = ", ".join(row["categories"])
        print(f"{index:<3} {shown:<55} {row['decision_number']:<15} {row['decision_date']:<12} {joined}")
    print()

    def is_incomplete(row: dict) -> bool:
        number = row["decision_number"]
        return row["decision_date"] == "??" or not number or "??" in number

    incomplete = list(filter(is_incomplete, results))
    if not incomplete:
        return

    print(f"⚠️ {len(incomplete)} files with incomplete metadata:")
    for row in incomplete:
        print(f" - {row['file']} → number={row['decision_number']!r} date={row['decision_date']!r}")
|
||||
|
||||
|
||||
# ── Upload ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def upload_one(meta: dict) -> dict:
    """Upload a single proofread file described by *meta*.

    The file is looked up under ``PROOFREAD_DIR``; an unknown date
    (``"??"``) is passed on as an empty string and the file stem is used as
    the title.

    Returns:
        ``{"file": <file name>, "result": <value returned by the upload>}``.
    """
    # Imported lazily: the package only becomes importable after the
    # module-level sys.path setup has run.
    from legal_mcp.tools.documents import document_upload_training

    source = PROOFREAD_DIR / meta["file"]
    number = meta["decision_number"]
    date_value = meta["decision_date"]
    if date_value == "??":
        date_value = ""

    outcome = await document_upload_training(
        file_path=str(source),
        decision_number=number,
        decision_date=date_value,
        subject_categories=meta["categories"],
        title=source.stem,
    )
    return {"file": meta["file"], "result": outcome}
|
||||
|
||||
|
||||
async def upload_all(results: list[dict]) -> None:
    """Upload every file in *results* one after another, reporting progress.

    A failure on one file is printed and does not stop the remaining
    uploads.
    """
    total = len(results)
    for position, meta in enumerate(results, start=1):
        tag = f"[{position}/{total}]"
        try:
            uploaded = await upload_one(meta)
            print(f"{tag} ✓ {meta['file']}")
            print(f" {uploaded['result'][:200]}")
        except Exception as exc:  # deliberate best-effort: report and continue
            print(f"{tag} ✗ {meta['file']}: {exc}")
|
||||
|
||||
|
||||
# ── CLI ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: preview and/or upload the proofread corpus.

    Without ``--upload`` only the preview table is printed. With
    ``--upload`` the files are uploaded, preceded by the preview when
    ``--preview`` is given as well. ``--only NAME`` restricts the run to one
    file name.

    Returns:
        1 when ``--only`` names a file that is not among the candidates,
        otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--preview", action="store_true", help="Show metadata table without uploading")
    parser.add_argument("--upload", action="store_true", help="Upload all files to style corpus")
    parser.add_argument("--only", help="Only process this specific filename")
    args = parser.parse_args()

    candidates = [
        f for f in sorted(PROOFREAD_DIR.glob("*.md")) if f.name not in SKIP_FILES
    ]
    if args.only:
        candidates = [f for f in candidates if f.name == args.only]
        if not candidates:
            print(f"File not found: {args.only}")
            return 1

    results = [extract_metadata(f) for f in candidates]

    if not args.upload:
        # Preview is the default action when no upload was requested.
        print_preview(results)
        return 0

    if args.preview:
        print_preview(results)

    print(f"\n>>> Uploading {len(results)} files to style corpus...\n")
    asyncio.run(upload_all(results))
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
232
scripts/.archive/benchmark_embeddings.py
Normal file
232
scripts/.archive/benchmark_embeddings.py
Normal file
@@ -0,0 +1,232 @@
|
||||
"""Benchmark embedding models on case 1130-25 documents.
|
||||
|
||||
Compares voyage-3-large (current), voyage-4-large, and voyage-law-2
|
||||
on Hebrew legal text retrieval quality, timing, and cost.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import voyageai
|
||||
|
||||
# SECURITY: the Voyage API key is read from the environment only. Never put a
# key in the source as a fallback default — anything committed to the
# repository must be treated as leaked and rotated.
API_KEY = os.environ.get("VOYAGE_API_KEY")
if not API_KEY:
    raise SystemExit("VOYAGE_API_KEY is not set; export it before running this benchmark")
client = voyageai.Client(api_key=API_KEY)
|
||||
|
||||
# Models under comparison.
MODELS = [
    "voyage-3-large",  # current
    "voyage-4-large",  # upgrade candidate
    "voyage-law-2",  # legal specialist
]

# Pricing per 1M tokens (from Voyage AI docs)
PRICING = {
    "voyage-3-large": 0.06,
    "voyage-4-large": 0.12,
    "voyage-law-2": 0.12,
}

DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")

# Display label -> markdown file of the document inside DOCS_DIR.
DOCUMENTS = {
    label: DOCS_DIR / filename
    for label, filename in (
        ("כתב ערר קובר", "2025-08-14-כתב-ערר-קובר.md"),
        ("כתב ערר מטמון", "2025-10-22-כתב-ערר-מטמון.md"),
        ("תשובת ועדת הראל", "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
        ("תשובת ליבמן", "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
    )
}

# Test queries — real questions a judge would ask about this case
QUERIES = [
    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
    "מה עמדת הוועדה המקומית לגבי התכנית?",
    "האם יש פגיעה בזכויות הבנייה של השכנים?",
    "מהם התנאים שנקבעו בהיתר הבנייה?",
    "האם התכנית עומדת בתקן החניה?",
    "מה טענות המשיבים לגבי הגובה והצפיפות?",
    "האם נערך שימוע כדין לפני מתן ההחלטה?",
    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than *chunk_size*.

    Returns:
        The chunks in order, each re-joined with single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is not
            smaller than *chunk_size*. The window would then never advance
            and the loop would not terminate.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_size must be positive and larger than overlap "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
|
||||
|
||||
|
||||
def cosine_sim(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of *a* and *b*.

    Returns 0.0 when either vector has zero length (norm).
    """
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    if not (norm_a and norm_b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (norm_a * norm_b)
|
||||
|
||||
|
||||
def main():
    """Run the embedding benchmark for every model in ``MODELS``.

    Steps: load and chunk the documents in ``DOCUMENTS``; for each model
    embed all chunks and all ``QUERIES``, rank the chunks per query by
    cosine similarity and keep the top 5; print a token/time/cost summary,
    the top-1 document per query and model, and the average top-1 / top-5
    scores; finally save everything as JSON.
    """
    # Load and chunk documents
    print("=" * 70)
    print("Loading and chunking documents...")
    print("=" * 70)

    all_chunks = []  # (doc_name, chunk_index, text)
    for doc_name, doc_path in DOCUMENTS.items():
        text = doc_path.read_text(encoding="utf-8")
        chunks = chunk_text(text)
        for i, chunk in enumerate(chunks):
            all_chunks.append((doc_name, i, chunk))
        print(f" {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks")

    chunk_texts = [c[2] for c in all_chunks]
    total_chunks = len(chunk_texts)
    print(f"\nTotal: {total_chunks} chunks")

    # Estimate tokens (rough: 1 Hebrew word ~ 2-3 tokens)
    total_words = sum(len(t.split()) for t in chunk_texts)
    est_tokens_docs = int(total_words * 2.5)
    total_query_words = sum(len(q.split()) for q in QUERIES)
    est_tokens_queries = int(total_query_words * 2.5)

    print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)")

    results = {}

    for model in MODELS:
        print(f"\n{'=' * 70}")
        print(f"Model: {model}")
        print(f"{'=' * 70}")

        # Embed documents. perf_counter() is monotonic, so the measured
        # duration cannot be distorted by system clock adjustments.
        print(f" Embedding {total_chunks} chunks...")
        t0 = time.perf_counter()
        doc_embeddings = client.embed(
            chunk_texts,
            model=model,
            input_type="document",
        )
        doc_time = time.perf_counter() - t0
        doc_usage = doc_embeddings.total_tokens
        doc_embs = doc_embeddings.embeddings
        print(f" Done in {doc_time:.1f}s — {doc_usage:,} tokens used")

        # Embed queries
        print(f" Embedding {len(QUERIES)} queries...")
        t0 = time.perf_counter()
        query_embeddings = client.embed(
            QUERIES,
            model=model,
            input_type="query",
        )
        query_time = time.perf_counter() - t0
        query_usage = query_embeddings.total_tokens
        query_embs = query_embeddings.embeddings
        print(f" Done in {query_time:.1f}s — {query_usage:,} tokens used")

        total_tokens = doc_usage + query_usage
        cost = total_tokens / 1_000_000 * PRICING[model]

        # Search: for each query, rank chunks by similarity
        print("\n Search results:")
        query_results = []
        for qi, query in enumerate(QUERIES):
            scores = []
            for ci, doc_emb in enumerate(doc_embs):
                sim = cosine_sim(query_embs[qi], doc_emb)
                scores.append((sim, all_chunks[ci][0], all_chunks[ci][1], all_chunks[ci][2][:80]))
            scores.sort(reverse=True)
            top5 = scores[:5]  # (score, doc_name, chunk_index, preview)
            query_results.append({
                "query": query,
                "top5": top5,
            })
            print(f"\n Q{qi+1}: {query}")
            for rank, (score, doc_name, chunk_idx, preview) in enumerate(top5):
                print(f" #{rank+1} [{score:.4f}] {doc_name} (chunk {chunk_idx}): {preview}...")

        results[model] = {
            "doc_time": doc_time,
            "query_time": query_time,
            "doc_tokens": doc_usage,
            "query_tokens": query_usage,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "dimensions": len(doc_embs[0]),
            "query_results": query_results,
        }

    # Summary comparison
    print(f"\n{'=' * 70}")
    print("SUMMARY")
    print(f"{'=' * 70}")
    print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}")
    print("-" * 65)
    for model in MODELS:
        r = results[model]
        print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}")

    # Compare top-1 agreement between models
    print(f"\n{'=' * 70}")
    print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)")
    print(f"{'=' * 70}")
    print(f"\n{'Query':<50}", end="")
    for model in MODELS:
        print(f" {model.split('-')[-1]:>10}", end="")
    print()
    print("-" * 85)

    for qi, query in enumerate(QUERIES):
        short_q = query[:48]
        print(f"{short_q:<50}", end="")
        for model in MODELS:
            top1_doc = results[model]["query_results"][qi]["top5"][0][1]
            # Shorten doc name
            short_doc = top1_doc[:10]
            print(f" {short_doc:>10}", end="")
        print()

    # Score distribution comparison
    print(f"\n{'=' * 70}")
    print("AVERAGE TOP-5 SCORES PER MODEL")
    print(f"{'=' * 70}")
    for model in MODELS:
        all_top5_scores = []
        for qr in results[model]["query_results"]:
            for score, _, _, _ in qr["top5"]:
                all_top5_scores.append(score)
        avg = sum(all_top5_scores) / len(all_top5_scores)
        top1_scores = [qr["top5"][0][0] for qr in results[model]["query_results"]]
        avg_top1 = sum(top1_scores) / len(top1_scores)
        print(f" {model:<25} avg top-1: {avg_top1:.4f} avg top-5: {avg:.4f}")

    # Save full results
    output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json")
    serializable = {}
    for model, r in results.items():
        serializable[model] = {
            "doc_time": r["doc_time"],
            "query_time": r["query_time"],
            "doc_tokens": r["doc_tokens"],
            "query_tokens": r["query_tokens"],
            "total_tokens": r["total_tokens"],
            "cost_usd": r["cost_usd"],
            "dimensions": r["dimensions"],
            "queries": [
                {
                    "query": qr["query"],
                    "top5": [{"score": s, "doc": d, "chunk": c, "preview": p} for s, d, c, p in qr["top5"]],
                }
                for qr in r["query_results"]
            ],
        }
    # ensure_ascii=False emits the Hebrew text verbatim, so the file must be
    # written as UTF-8 explicitly instead of with the locale's default
    # encoding (which can raise UnicodeEncodeError).
    output_path.write_text(
        json.dumps(serializable, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\nFull results saved to {output_path}")


if __name__ == "__main__":
    main()
|
||||
203
scripts/.archive/benchmark_new_vs_old.py
Normal file
203
scripts/.archive/benchmark_new_vs_old.py
Normal file
@@ -0,0 +1,203 @@
|
||||
"""Compare Google Vision extractions vs existing MDs, then benchmark voyage-law-2."""
|
||||
|
||||
import json
import os
import time
from pathlib import Path

import voyageai
|
||||
|
||||
# SECURITY: the Voyage API key is read from the environment only. Never put a
# key in the source — anything committed to the repository must be treated
# as leaked and rotated.
API_KEY = os.environ.get("VOYAGE_API_KEY")
if not API_KEY:
    raise SystemExit("VOYAGE_API_KEY is not set; export it before running this benchmark")
client = voyageai.Client(api_key=API_KEY)
MODEL = "voyage-law-2"
|
||||
|
||||
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
GOOGLE_DIR = DOCS_DIR / "extracted"

# Each pair is (new file name under GOOGLE_DIR, existing file name under DOCS_DIR).
PAIRS = [
    ("מרק קובר-כתב ערר.md",
     "2025-08-14-כתב-ערר-קובר.md"),
    ("תשובה לערר מטעם המשיבים.md",
     "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
    ("תשובת הועדה המרחבית לערר.md",
     "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
    ("תשובת המשיב-יצחק מטמון.md",
     "2025-10-22-כתב-ערר-מטמון.md"),
    ("השלמת טיעון מטעם משיבים 2-3.md",
     "2025-12-23-השלמת-טיעון-ליבמן.md"),
    ("תשובה מטעם העורר להשלמת טיעון.md",
     "2025-12-08-תגובת-קובר-לבקשת-השלמת-טיעון.md"),
    ("בקשה להשלמת טיעון ממשיבים 2-3.md",
     "2025-12-03-בקשה-להשלמת-טיעון-ליבמן.md"),
    ("השלמת טיעון מטעם הוועדה המקומית.md",
     "2026-02-04-השלמת-טיעון-ועדת-הראל.md"),
    ("תגובת העורר לתשובת ועדת הראל להשלמת הטיעון ערר.md",
     "2026-02-10-תגובת-קובר-להשלמת-טיעון-הראל.md"),
    ("כתב תשובה-השלמת טיעון מטעם המשיב יצחק מטמון.md",
     "2026-02-12-כתב-תשובה-השלמת-טיעון-מטמון.md"),
    ("בקשת העורר לדחיית השלמת הטיעון במלואה.md",
     "2026-01-13-תגובת-קובר-לדחיית-השלמת-טיעון.md"),
    ("1130-25-החלטה לתיקון פרוטוקול.md",
     "2025-11-27-החלטה-לתיקון-פרוטוקול.md"),
    ("החלטת ביניים 1130-25.md",
     "2025-12-31-החלטת-ביניים.md"),
    ("1130-25-פרוטוקול ועדת ערר והחלטה.md",
     "2025-10-27-פרוטוקול-דיון-ועדת-ערר.md"),
    ("פרוטוקול ועדה מקומית לדיון בתכנית 152-1257682.md",
     "2025-07-23-פרוטוקול-ועדה-מקומית-הראל.md"),
]

# Queries embedded and scored against both chunk sets.
QUERIES = [
    "מהי הטענה המרכזית של העוררים בנוגע לחניה?",
    "מה עמדת הוועדה המקומית לגבי התכנית?",
    "האם יש פגיעה בזכויות הבנייה של השכנים?",
    "מהם התנאים שנקבעו בהיתר הבנייה?",
    "האם התכנית עומדת בתקן החניה?",
    "מה טענות המשיבים לגבי הגובה והצפיפות?",
    "האם נערך שימוע כדין לפני מתן ההחלטה?",
    "מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
|
||||
|
||||
|
||||
def cosine_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero length (norm).
    """
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    if not (norm_a and norm_b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (norm_a * norm_b)
|
||||
|
||||
|
||||
def chunk_text(text, chunk_size=600, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than *chunk_size*.

    Returns:
        The chunks in order, each re-joined with single spaces. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If *chunk_size* is not positive or *overlap* is not
            smaller than *chunk_size*. The window would then never advance
            and the loop would not terminate.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        raise ValueError(
            f"chunk_size must be positive and larger than overlap "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
|
||||
|
||||
|
||||
def word_overlap(a, b):
    """Return the share of distinct words that *a* and *b* have in common.

    The number of shared distinct words is divided by the size of the
    larger vocabulary. Returns 0.0 when either text has no words.
    """
    vocab_a = set(a.split())
    vocab_b = set(b.split())
    if not (vocab_a and vocab_b):
        return 0.0
    shared = vocab_a.intersection(vocab_b)
    larger = max(len(vocab_a), len(vocab_b))
    return len(shared) / larger
|
||||
|
||||
|
||||
def main():
|
||||
# ── Part 1: Document comparison ──
|
||||
print("=" * 70)
|
||||
print("PART 1: DOCUMENT COMPARISON (Google Vision vs Existing)")
|
||||
print("=" * 70)
|
||||
|
||||
comparison_results = []
|
||||
all_new_chunks = []
|
||||
all_old_chunks = []
|
||||
|
||||
for new_name, old_name in PAIRS:
|
||||
new_path = GOOGLE_DIR / new_name
|
||||
old_path = DOCS_DIR / old_name
|
||||
|
||||
if not new_path.exists():
|
||||
continue
|
||||
if not old_path.exists():
|
||||
print(f" SKIP (no existing): {old_name}")
|
||||
continue
|
||||
|
||||
new_text = new_path.read_text(encoding="utf-8")
|
||||
old_text = old_path.read_text(encoding="utf-8")
|
||||
|
||||
new_words = len(new_text.split())
|
||||
old_words = len(old_text.split())
|
||||
overlap = word_overlap(new_text, old_text)
|
||||
|
||||
short_name = old_name[:40]
|
||||
diff = new_words - old_words
|
||||
diff_pct = (diff / old_words * 100) if old_words else 0
|
||||
|
||||
comparison_results.append({
|
||||
"name": short_name,
|
||||
"old_words": old_words,
|
||||
"new_words": new_words,
|
||||
"diff": diff,
|
||||
"diff_pct": diff_pct,
|
||||
"overlap": overlap,
|
||||
})
|
||||
|
||||
# Chunk for embedding
|
||||
new_chunks = chunk_text(new_text)
|
||||
old_chunks = chunk_text(old_text)
|
||||
for i, c in enumerate(new_chunks):
|
||||
all_new_chunks.append((short_name, i, c))
|
||||
for i, c in enumerate(old_chunks):
|
||||
all_old_chunks.append((short_name, i, c))
|
||||
|
||||
print(f"\n{'Document':<42} {'Old':>6} {'New':>6} {'Diff':>8} {'Overlap':>8}")
|
||||
print("-" * 72)
|
||||
for r in comparison_results:
|
||||
print(f" {r['name']:<40} {r['old_words']:>6} {r['new_words']:>6} {r['diff']:>+7} ({r['diff_pct']:>+.0f}%) {r['overlap']:>7.0%}")
|
||||
|
||||
# ── Part 2: Embedding benchmark ──
|
||||
print(f"\n{'=' * 70}")
|
||||
print("PART 2: VOYAGE-LAW-2 EMBEDDING BENCHMARK")
|
||||
print(f"{'=' * 70}")
|
||||
|
||||
new_texts = [c[2] for c in all_new_chunks]
|
||||
old_texts = [c[2] for c in all_old_chunks]
|
||||
|
||||
print(f"\nNew chunks: {len(new_texts)}, Old chunks: {len(old_texts)}")
|
||||
|
||||
def embed_batched(texts, label):
|
||||
BATCH = 20
|
||||
all_embs = []
|
||||
total_tokens = 0
|
||||
t0 = time.time()
|
||||
for i in range(0, len(texts), BATCH):
|
||||
batch = texts[i:i+BATCH]
|
||||
result = client.embed(batch, model=MODEL, input_type="document")
|
||||
all_embs.extend(result.embeddings)
|
||||
total_tokens += result.total_tokens
|
||||
elapsed = time.time() - t0
|
||||
print(f" {label}: {len(texts)} chunks, {total_tokens:,} tokens, {elapsed:.1f}s")
|
||||
return all_embs, total_tokens, elapsed
|
||||
|
||||
# Embed new
|
||||
print("Embedding NEW (Google Vision) chunks...")
|
||||
new_embs, new_tokens, new_time = embed_batched(new_texts, "NEW")
|
||||
|
||||
# Embed old
|
||||
print("Embedding OLD (existing) chunks...")
|
||||
old_embs, old_tokens, old_time = embed_batched(old_texts, "OLD")
|
||||
|
||||
# Embed queries
|
||||
print(f"Embedding {len(QUERIES)} queries...")
|
||||
q_result = client.embed(QUERIES, model=MODEL, input_type="query")
|
||||
q_embs = q_result.embeddings
|
||||
|
||||
# Search and compare
|
||||
print(f"\n{'=' * 70}")
|
||||
print("PART 3: SEARCH QUALITY COMPARISON")
|
||||
print(f"{'=' * 70}")
|
||||
|
||||
for qi, query in enumerate(QUERIES):
|
||||
# Score against new
|
||||
new_scores = [(cosine_sim(q_embs[qi], e), all_new_chunks[i][0], all_new_chunks[i][2][:60]) for i, e in enumerate(new_embs)]
|
||||
new_scores.sort(reverse=True)
|
||||
|
||||
# Score against old
|
||||
old_scores = [(cosine_sim(q_embs[qi], e), all_old_chunks[i][0], all_old_chunks[i][2][:60]) for i, e in enumerate(old_embs)]
|
||||
old_scores.sort(reverse=True)
|
||||
|
||||
print(f"\nQ{qi+1}: {query}")
|
||||
print(f" {'NEW top-1':>10}: [{new_scores[0][0]:.4f}] {new_scores[0][1]}")
|
||||
print(f" {'OLD top-1':>10}: [{old_scores[0][0]:.4f}] {old_scores[0][1]}")
|
||||
if new_scores[0][0] > old_scores[0][0]:
|
||||
print(f" >> NEW better by {new_scores[0][0] - old_scores[0][0]:.4f}")
|
||||
else:
|
||||
print(f" >> OLD better by {old_scores[0][0] - new_scores[0][0]:.4f}")
|
||||
|
||||
# Summary
|
||||
new_avg = sum(max(cosine_sim(q_embs[qi], e) for e in new_embs) for qi in range(len(QUERIES))) / len(QUERIES)
|
||||
old_avg = sum(max(cosine_sim(q_embs[qi], e) for e in old_embs) for qi in range(len(QUERIES))) / len(QUERIES)
|
||||
|
||||
print(f"\n{'=' * 70}")
|
||||
print("SUMMARY")
|
||||
print(f"{'=' * 70}")
|
||||
print(f" {'Metric':<30} {'Old (existing)':>15} {'New (Google Vision)':>20}")
|
||||
print(f" {'-' * 65}")
|
||||
print(f" {'Total chunks':<30} {len(old_texts):>15} {len(new_texts):>20}")
|
||||
print(f" {'Total tokens':<30} {old_tokens:>15,} {new_tokens:>20,}")
|
||||
print(f" {'Embed time':<30} {old_time:>14.1f}s {new_time:>19.1f}s")
|
||||
print(f" {'Avg top-1 score':<30} {old_avg:>15.4f} {new_avg:>20.4f}")
|
||||
print(f" {'Score difference':<30} {'':>15} {new_avg - old_avg:>+20.4f}")
|
||||
|
||||
est_cost = (new_tokens + old_tokens) / 1_000_000 * 0.12
|
||||
print(f"\n Embedding cost: ${est_cost:.3f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
323
scripts/.archive/decompose-decisions.py
Normal file
323
scripts/.archive/decompose-decisions.py
Normal file
@@ -0,0 +1,323 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Decompose 6 final decisions into 12-block structure.
|
||||
|
||||
Uses heuristic parsing based on known section headers in Dafna's decisions.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
# This script now lives in scripts/.archive/, so "two levels up" is scripts/
# rather than the repository root. Walk up the ancestors until one actually
# contains mcp-server/src; fall back to the historical location otherwise.
_MCP_SRC = next(
    (
        ancestor / "mcp-server" / "src"
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "mcp-server" / "src").is_dir()
    ),
    Path(__file__).parent.parent / "mcp-server" / "src",
)
sys.path.insert(0, str(_MCP_SRC))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Block definitions with detection patterns
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
# (block_id, Hebrew title, generation_type) for the 12 decision blocks, listed
# in document order. block_index is derived from the position (1-based).
_BLOCK_SPECS = (
    ("block-alef", "כותרת מוסדית", "template-fill"),
    ("block-bet", "הרכב הוועדה", "template-fill"),
    ("block-gimel", "צדדים", "template-fill"),
    ("block-dalet", "כותרת החלטה", "template-fill"),
    ("block-he", "פתיחה", "paraphrase"),
    ("block-vav", "רקע עובדתי", "reproduction"),
    ("block-zayin", "טענות הצדדים", "paraphrase"),
    ("block-chet", "הליכים בפני ועדת הערר", "reproduction"),
    ("block-tet", "תכניות חלות", "guided-synthesis"),
    ("block-yod", "דיון והכרעה", "rhetorical-construction"),
    ("block-yod-alef", "סיכום", "paraphrase"),
    ("block-yod-bet", "חתימות", "template-fill"),
)

BLOCKS = [
    {
        "block_id": block_id,
        "block_index": position,
        "title": title,
        "generation_type": generation_type,
    }
    for position, (block_id, title, generation_type) in enumerate(_BLOCK_SPECS, start=1)
]
|
||||
|
||||
# Section header patterns (Hebrew)
|
||||
# Compiled regexes that recognise the Hebrew section headers of a decision.
# Patterns anchored with ^/$ are compiled with re.MULTILINE so they match a
# header on any line of the document, not only at the very start of the text.
SECTION_PATTERNS = {
    "claims": re.compile(r"תמצית\s*טענות\s*הצדדים|טענות\s*הצדדים|טענות\s*העוררי"),
    "proceedings": re.compile(r"ההליכים\s*בפני\s*ועדת\s*הערר|הליכים\s*בפני\s*הוועדה|הדיון\s*בפני\s*ועדת\s*הערר"),
    "plans": re.compile(r"תכניות\s*חלות|המסגרת\s*התכנונית|הוראות\s*התכנית"),
    "discussion": re.compile(r"דיון\s*והכרעה|דיון|הכרעה"),
    "summary": re.compile(r"^סיכום$|^סוף\s*דבר$", re.MULTILINE),
    "appellant_claims": re.compile(r"טענות\s*העוררי|טענות\s*העורר"),
    "respondent_claims": re.compile(r"עמדת\s*הוועדה\s*המקומית|תגובת\s*המשיבה|עמדת\s*המשיב"),
    "permit_applicant": re.compile(r"עמדת\s*מבקש|עמדת\s*מגיש|עמדת\s*היזם"),
    "panel": re.compile(r"בפני[:\s]|יו\"ר"),
    "parties_vs": re.compile(r"\s*נגד\s*"),
    "decision_title": re.compile(r"^החלטה$", re.MULTILINE),
    # The opening paragraph follows the header area, so "^" must be allowed to
    # match at the start of any line; without MULTILINE it could only ever
    # match at offset 0 of the whole text.
    "opening": re.compile(r"^לפנינו\s|^בפנינו\s", re.MULTILINE),
    "signature": re.compile(r"ניתנה?\s*(היום|פה\s*אחד|ביום)|חתימ"),
}
|
||||
|
||||
|
||||
def find_section_start(text: str, pattern: re.Pattern) -> int:
    """Return the offset of the first match of *pattern* in *text*.

    Returns -1 when the pattern does not occur at all.
    """
    hit = pattern.search(text)
    if hit is None:
        return -1
    return hit.start()
|
||||
|
||||
|
||||
def _next_boundary(start: int, candidates: tuple[int, ...], fallback: int) -> int:
    """Return the first candidate offset that lies after *start*, else *fallback*."""
    for position in candidates:
        if position > start:
            return position
    return fallback


def decompose_decision(text: str) -> list[dict]:
    """Split a decision's text into its 12 blocks using section-header heuristics.

    Section starts are located with SECTION_PATTERNS; each block runs from its
    own header to the next detected header (in a fixed priority order).

    Args:
        text: Full extracted text of the decision.

    Returns:
        Twelve ``{"block_id": ..., "content": ...}`` dicts in block order
        (block-alef .. block-yod-bet). A block whose section was not detected
        gets empty content; block-dalet always holds the literal title.
    """
    total_len = len(text)

    def locate(section: str) -> int:
        # -1 means "not found"; 0 is a legitimate position (start of text).
        return find_section_start(text, SECTION_PATTERNS[section])

    pos_claims = locate("claims")
    pos_proceedings = locate("proceedings")
    pos_plans = locate("plans")
    pos_discussion = locate("discussion")
    pos_summary = locate("summary")
    pos_signature = locate("signature")
    pos_opening = locate("opening")
    pos_decision_title = locate("decision_title")
    pos_panel = locate("panel")
    pos_parties = locate("parties_vs")

    blocks: list[dict] = []

    def add(block_id: str, content: str) -> None:
        blocks.append({"block_id": block_id, "content": content})

    # Blocks alef-dalet: header area, i.e. everything before the opening
    # sentence (or before the claims header; 500 chars as a last resort).
    if pos_opening >= 0:
        header_end = pos_opening
    elif pos_claims >= 0:
        header_end = pos_claims
    else:
        header_end = 500

    if 0 <= pos_panel < header_end:
        # Institutional header | panel | parties | decision title
        add("block-alef", text[:pos_panel].strip())
        if 0 <= pos_parties < header_end:
            add("block-bet", text[pos_panel:pos_parties].strip())
            if 0 <= pos_decision_title < header_end:
                add("block-gimel", text[pos_parties:pos_decision_title].strip())
            else:
                add("block-gimel", text[pos_parties:header_end].strip())
        else:
            add("block-bet", text[pos_panel:header_end].strip())
            add("block-gimel", "")
    else:
        # Cannot split the header: keep it whole in block-alef.
        add("block-alef", text[:header_end].strip())
        add("block-bet", "")
        add("block-gimel", "")
    add("block-dalet", "החלטה")

    # Block he (opening) and block vav (factual background).
    if pos_opening >= 0:
        opening_end = _next_boundary(pos_opening, (pos_claims, pos_discussion), total_len)
        # The opening is short: look at no more than 1000 characters ...
        opening_text = text[pos_opening:min(pos_opening + 1000, opening_end)].strip()
        # ... and cut at the second line break found past the first 50 chars.
        para_breaks = [i for i, ch in enumerate(opening_text) if ch == "\n" and i > 50]
        if len(para_breaks) >= 2:
            opening_text = opening_text[:para_breaks[1]].strip()
        add("block-he", opening_text)

        if pos_claims > pos_opening:
            bg_start = pos_opening + len(opening_text)
            add("block-vav", text[bg_start:pos_claims].strip())
        else:
            add("block-vav", "")
    else:
        add("block-he", "")
        add("block-vav", "")

    # Block zayin: parties' claims.
    if pos_claims >= 0:
        claims_end = _next_boundary(
            pos_claims, (pos_proceedings, pos_discussion, pos_summary), total_len
        )
        add("block-zayin", text[pos_claims:claims_end].strip())
    else:
        add("block-zayin", "")

    # Block chet: proceedings before the committee (optional).
    if pos_proceedings >= 0:
        proc_end = _next_boundary(
            pos_proceedings, (pos_plans, pos_discussion, pos_summary), total_len
        )
        add("block-chet", text[pos_proceedings:proc_end].strip())
    else:
        add("block-chet", "")

    # Block tet: applicable plans (optional) - only when the header appears
    # before the discussion section.
    plans_limit = pos_discussion if pos_discussion >= 0 else total_len
    if 0 <= pos_plans < plans_limit:
        plans_end = _next_boundary(pos_plans, (pos_discussion, pos_summary), total_len)
        add("block-tet", text[pos_plans:plans_end].strip())
    else:
        add("block-tet", "")

    # Block yod: discussion and ruling.
    if pos_discussion >= 0:
        disc_end = _next_boundary(pos_discussion, (pos_summary, pos_signature), total_len)
        add("block-yod", text[pos_discussion:disc_end].strip())
    else:
        add("block-yod", "")

    # Block yod-alef: summary.
    if pos_summary >= 0:
        summ_end = _next_boundary(pos_summary, (pos_signature,), total_len)
        add("block-yod-alef", text[pos_summary:summ_end].strip())
    else:
        add("block-yod-alef", "")

    # Block yod-bet: signatures, through the end of the text.
    if pos_signature >= 0:
        add("block-yod-bet", text[pos_signature:].strip())
    else:
        add("block-yod-bet", "")

    return blocks
|
||||
|
||||
|
||||
async def main():
    """Decompose every final decision into 12 blocks and store them in the DB.

    For each decision with status 'final' the extracted text of its decision
    document is split with decompose_decision(), merged with the BLOCKS
    metadata, printed as a per-block summary and written to decision_blocks
    (replacing any previous rows for that decision). The decision's
    total_paragraphs is updated from the discussion block.
    """
    await init_schema()
    pool = await get_pool()

    try:
        async with pool.acquire() as conn:
            decisions = await conn.fetch(
                """SELECT d.id as decision_id, c.case_number, c.title, d.total_words,
                          doc.extracted_text
                   FROM decisions d
                   JOIN cases c ON c.id = d.case_id
                   JOIN documents doc ON doc.case_id = d.case_id AND doc.doc_type = 'decision'
                   WHERE d.status = 'final'
                   ORDER BY c.case_number"""
            )

        for dec in decisions:
            decision_id = dec["decision_id"]
            case_number = dec["case_number"]
            text = dec["extracted_text"]

            # A document without extracted text cannot be decomposed; skip it
            # instead of crashing or wiping the blocks stored earlier.
            if not text:
                print(f"⬜ מדלג: {case_number} — אין טקסט מחולץ")
                continue

            total_words = len(text.split())

            print(f"\n{'='*60}")
            print(f"מפרק: {case_number} — {dec['title']}")
            print(f"{'='*60}")

            # Decompose, then merge each parsed block with its static metadata.
            content_by_id = {}
            for parsed in decompose_decision(text):
                content_by_id.setdefault(parsed["block_id"], parsed["content"])

            block_data = []
            for block_def in BLOCKS:
                content = content_by_id.get(block_def["block_id"], "")
                word_count = len(content.split()) if content else 0
                weight = round((word_count / total_words * 100), 2) if total_words > 0 and word_count > 0 else 0

                block_data.append({
                    **block_def,
                    "content": content,
                    "word_count": word_count,
                    "weight_percent": weight,
                    "status": "final" if content else "empty",
                })

            # Print summary
            for b in block_data:
                status = "✅" if b["word_count"] > 0 else "⬜"
                print(f" {status} {b['block_id']:18s} | {b['title']:25s} | {b['word_count']:5d} מילים | {b['weight_percent']:5.1f}%")

            # Store in DB. Delete + inserts + update run in one transaction so
            # a failure midway cannot leave the decision without its blocks.
            async with pool.acquire() as conn:
                async with conn.transaction():
                    await conn.execute(
                        "DELETE FROM decision_blocks WHERE decision_id = $1", decision_id
                    )

                    for b in block_data:
                        await conn.execute(
                            """INSERT INTO decision_blocks
                               (decision_id, block_id, block_index, title, content,
                                word_count, weight_percent, generation_type, status)
                               VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
                            decision_id,
                            b["block_id"], b["block_index"], b["title"],
                            b["content"], b["word_count"], b["weight_percent"],
                            b["generation_type"], b["status"],
                        )

                    # Count paragraphs (non-blank lines longer than 20 chars)
                    # in the discussion block.
                    discussion = next(b for b in block_data if b["block_id"] == "block-yod")
                    if discussion["content"]:
                        paragraphs = [p.strip() for p in discussion["content"].split("\n") if p.strip() and len(p.strip()) > 20]
                        await conn.execute(
                            "UPDATE decisions SET total_paragraphs = $1 WHERE id = $2",
                            len(paragraphs), decision_id,
                        )

        # Final summary
        async with pool.acquire() as conn:
            block_count = await conn.fetchval("SELECT count(*) FROM decision_blocks")
            non_empty = await conn.fetchval("SELECT count(*) FROM decision_blocks WHERE status = 'final'")
    finally:
        # Release the pool even when a query above fails.
        await close_pool()

    print(f"\n{'='*60}")
    print(f"✅ סה\"כ בלוקים: {block_count} ({non_empty} עם תוכן)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
139
scripts/.archive/export-decision-docx.py
Normal file
139
scripts/.archive/export-decision-docx.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Export a decision from DB to DOCX using the CJS template generator.
|
||||
|
||||
Usage: python export-decision-docx.py <case_number> [output.docx]
|
||||
|
||||
Pulls decision blocks from DB, generates structure JSON,
|
||||
invokes create-decision-structure.cjs to produce DOCX.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# This script now lives in scripts/.archive/, so "two levels up" is scripts/
# rather than the repository root. Walk up the ancestors until one actually
# contains mcp-server/src; fall back to the historical location otherwise.
_MCP_SRC = next(
    (
        ancestor / "mcp-server" / "src"
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "mcp-server" / "src").is_dir()
    ),
    Path(__file__).parent.parent / "mcp-server" / "src",
)
sys.path.insert(0, str(_MCP_SRC))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
# Location of the DOCX generator relative to the repository root.
_CJS_RELATIVE = Path("skills") / "decision" / "scripts" / "create-decision-structure.cjs"

# The script was moved to scripts/.archive/, so the repository root is no
# longer exactly two levels up. Use the first ancestor that really contains
# the generator; fall back to the historical location otherwise.
CJS_SCRIPT = next(
    (
        ancestor / _CJS_RELATIVE
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / _CJS_RELATIVE).is_file()
    ),
    Path(__file__).parent.parent / _CJS_RELATIVE,
)
|
||||
|
||||
|
||||
def block_id_to_hebrew(block_id: str) -> str:
    """Return the Hebrew letter label of a block id ("" for an unknown id)."""
    block_ids = (
        "block-alef", "block-bet", "block-gimel", "block-dalet",
        "block-he", "block-vav", "block-zayin", "block-chet",
        "block-tet", "block-yod", "block-yod-alef", "block-yod-bet",
    )
    letters = ("א", "ב", "ג", "ד", "ה", "ו", "ז", "ח", "ט", "י", "יא", "יב")
    labels = dict(zip(block_ids, letters))
    if block_id in labels:
        return labels[block_id]
    return ""
|
||||
|
||||
|
||||
def _load_parties(raw) -> list:
    """Return a parties column as a list (accepts a JSON string, a list or NULL)."""
    if not raw:
        return []
    parsed = json.loads(raw) if isinstance(raw, str) else raw
    return parsed or []


async def main():
    """Export the final decision of a case to DOCX.

    Command line: ``<case_number> [output.docx]``. The case, its final
    decision and the decision blocks are read from the DB, written as a
    structure JSON next to the output file, and the CJS generator is run with
    node to produce the DOCX.

    Exits with status 1 on bad usage, a missing case or decision, or when the
    generator fails (the JSON file is kept for manual processing).
    """
    if len(sys.argv) < 2:
        print("שימוש: python export-decision-docx.py <מספר_תיק> [output.docx]")
        sys.exit(1)

    case_number = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else f"החלטה-{case_number}.docx"

    await init_schema()
    pool = await get_pool()

    try:
        async with pool.acquire() as conn:
            # Get case info
            case = await conn.fetchrow(
                "SELECT * FROM cases WHERE case_number = $1", case_number
            )
            if not case:
                print(f"תיק {case_number} לא נמצא")
                sys.exit(1)

            # Get decision
            decision = await conn.fetchrow(
                "SELECT * FROM decisions WHERE case_id = $1 AND status = 'final'",
                case["id"],
            )
            if not decision:
                print(f"אין החלטה סופית לתיק {case_number}")
                sys.exit(1)

            # Get blocks
            blocks = await conn.fetch(
                """SELECT block_id, block_index, title, content, word_count
                   FROM decision_blocks
                   WHERE decision_id = $1
                   ORDER BY block_index""",
                decision["id"],
            )
    finally:
        # Also runs on the early sys.exit() paths above.
        await close_pool()

    # Build structure JSON for the CJS script
    appellants = _load_parties(case["appellants"])
    respondents = _load_parties(case["respondents"])

    structure = {
        "metadata": {
            "case_number": case["case_number"],
            "title": case["title"],
            "subject": case["subject"],
            "property_address": case["property_address"],
            "committee": case["committee_type"],
            "outcome": decision["outcome"] or "",
            "decision_date": str(decision["decision_date"]) if decision["decision_date"] else "",
            "author": decision["author"],
        },
        "parties": {
            "appellants": [{"name": a} for a in appellants],
            "respondents": [{"name": r} for r in respondents],
        },
        "blocks": [],
    }

    header_block_ids = ("block-alef", "block-bet", "block-gimel", "block-dalet")
    for block in blocks:
        content = block["content"] or ""
        # Skip empty header blocks
        if block["block_id"] in header_block_ids and not content:
            continue

        paragraphs = [p.strip() for p in content.split("\n") if p.strip()]

        structure["blocks"].append({
            "id": block["block_id"],
            "index": block["block_index"],
            "title": block["title"],
            "hebrew_letter": block_id_to_hebrew(block["block_id"]),
            "word_count": block["word_count"],
            "paragraphs": paragraphs,
        })

    # Write JSON (absolute paths, because node runs with a different cwd)
    output_abs = Path(output_path).resolve()
    json_path = output_abs.with_suffix(".json")
    json_path.parent.mkdir(parents=True, exist_ok=True)

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(structure, f, ensure_ascii=False, indent=2)
    print(f"JSON נוצר: {json_path}")

    # Run CJS script with absolute paths
    result = subprocess.run(
        ["node", str(CJS_SCRIPT), str(json_path), str(output_abs)],
        capture_output=True, text=True,
        cwd=str(CJS_SCRIPT.parent),
    )

    if result.returncode == 0:
        print(f"✅ DOCX נוצר: {output_path}")
    else:
        print("❌ שגיאה ביצירת DOCX:")
        print(result.stderr)
        # JSON is still available for manual processing
        print(f"ה-JSON זמין: {json_path}")
        # Report the failure to the calling shell instead of exiting 0.
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
134
scripts/.archive/extract-citations.py
Normal file
134
scripts/.archive/extract-citations.py
Normal file
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract case law citations from block-yod and link to case_law table."""
|
||||
|
||||
import asyncio
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
# This script now lives in scripts/.archive/, so "two levels up" is scripts/
# rather than the repository root. Walk up the ancestors until one actually
# contains mcp-server/src; fall back to the historical location otherwise.
_MCP_SRC = next(
    (
        ancestor / "mcp-server" / "src"
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "mcp-server" / "src").is_dir()
    ),
    Path(__file__).parent.parent / "mcp-server" / "src",
)
sys.path.insert(0, str(_MCP_SRC))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
# Patterns for Israeli case law citations
|
||||
CITATION_PATTERNS = [
    re.compile(source)
    for source in (
        # Court proceeding abbreviation followed by a case number
        r'(עע"מ|בג"ץ|ע"א|בר"ם|עת"מ|עמ"נ|ע"ע|רע"א|דנ"א|בש"א)\s*(\d[\d/\-]+)',
        # Appeal-committee case ("ערר"), optionally with a district in parentheses
        r'ערר\s*\(?\s*(?:מרכז|ירושלים|חי\'?|ת"א|דרום|צפון)?\s*\)?\s*(\d[\d/\-]+)',
        # Appeal-committee case with a bare number
        r'ערר\s+(\d{3,5}[\-/]\d{2,4})',
    )
]
|
||||
|
||||
|
||||
def extract_citations_from_text(text: str) -> list[dict]:
    """Collect the distinct case-law citations found in *text*.

    Every pattern in CITATION_PATTERNS is scanned in order; a citation string
    is reported once, at its first occurrence.

    Returns:
        A list of ``{"citation_text", "context"}`` dicts, where the context is
        the citation plus up to 50 characters before and 100 after it, with
        newlines replaced by spaces.
    """
    found: list[dict] = []
    already_seen: set[str] = set()
    text_length = len(text)

    for regex in CITATION_PATTERNS:
        for hit in regex.finditer(text):
            cited = hit.group(0)
            if cited not in already_seen:
                already_seen.add(cited)
                window_start = max(0, hit.start() - 50)
                window_end = min(text_length, hit.end() + 100)
                snippet = text[window_start:window_end].replace("\n", " ")
                found.append({"citation_text": cited, "context": snippet})

    return found
|
||||
|
||||
|
||||
def _mentions(haystack: str, needle: str) -> bool:
    """Return True when *needle* occurs in *haystack* outside a longer number.

    A plain substring test would find "3975/22" inside "13975/22"; here the
    occurrence must not be directly preceded or followed by another digit.
    """
    if not needle:
        return False
    return re.search(r"(?<!\d)" + re.escape(needle) + r"(?!\d)", haystack) is not None


async def main():
    """Link citations found in the discussion blocks to known precedents.

    Reads every non-empty block-yod, extracts citations from it, matches each
    one against the case numbers in case_law, and inserts a 'support' row into
    case_law_citations for every (precedent, decision) pair not stored yet.
    Prints per-case results and overall totals.
    """
    await init_schema()
    pool = await get_pool()

    try:
        async with pool.acquire() as conn:
            # Get all block-yod content with decision info
            blocks = await conn.fetch(
                """SELECT db.content, d.id as decision_id, c.case_number
                   FROM decision_blocks db
                   JOIN decisions d ON d.id = db.decision_id
                   JOIN cases c ON c.id = d.case_id
                   WHERE db.block_id = 'block-yod' AND db.word_count > 0
                   ORDER BY c.case_number"""
            )

            # Get existing case_law for matching
            case_laws = await conn.fetch("SELECT id, case_number, case_name FROM case_law")

        case_law_map = {}
        for cl in case_laws:
            number = cl["case_number"]
            if not number:
                # An empty key would match every citation.
                continue
            # Index by the full case number ...
            case_law_map[number] = cl["id"]
            # ... and by the short number (e.g. "3975/22" from "עע"מ 3975/22")
            parts = number.split()
            if len(parts) > 1:
                case_law_map[parts[-1]] = cl["id"]

        total_citations = 0
        total_linked = 0

        for block in blocks:
            case_number = block["case_number"]
            decision_id = block["decision_id"]

            citations = extract_citations_from_text(block["content"])
            if not citations:
                continue

            print(f"\n{case_number}: {len(citations)} ציטוטים נמצאו")

            async with pool.acquire() as conn:
                for cit in citations:
                    total_citations += 1
                    cited = cit["citation_text"]

                    # Try to match to case_law table (first matching key wins)
                    case_law_id = next(
                        (
                            cl_id
                            for key, cl_id in case_law_map.items()
                            if _mentions(cited, key) or _mentions(key, cited)
                        ),
                        None,
                    )

                    if case_law_id:
                        # Insert only when this pair is not linked already
                        existing = await conn.fetchval(
                            """SELECT id FROM case_law_citations
                               WHERE case_law_id = $1 AND decision_id = $2""",
                            case_law_id, decision_id,
                        )
                        if not existing:
                            await conn.execute(
                                """INSERT INTO case_law_citations
                                   (case_law_id, decision_id, citation_type, context_text)
                                   VALUES ($1, $2, 'support', $3)""",
                                case_law_id, decision_id, cit["context"],
                            )
                            total_linked += 1
                            print(f" ✅ {cited[:40]} → קושר לפסיקה")
                    else:
                        print(f" ⬜ {cited[:40]} — לא נמצא ב-DB")

        # Summary
        async with pool.acquire() as conn:
            total_in_db = await conn.fetchval("SELECT count(*) FROM case_law_citations")
    finally:
        # Release the pool even when a query above fails.
        await close_pool()

    print(f"\n{'='*50}")
    print(f"סה\"כ ציטוטים שנמצאו: {total_citations}")
    print(f"סה\"כ קושרו לפסיקה ב-DB: {total_linked}")
    print(f"סה\"כ ב-case_law_citations: {total_in_db}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
228
scripts/.archive/extract-claims.py
Normal file
228
scripts/.archive/extract-claims.py
Normal file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Extract individual claims from block-zayin of each decision.
|
||||
|
||||
Identifies party sub-sections and individual claims (paragraphs).
|
||||
Stores in the claims table with party_role classification.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
# This script now lives in scripts/.archive/, so "two levels up" is scripts/
# rather than the repository root. Walk up the ancestors until one actually
# contains mcp-server/src; fall back to the historical location otherwise.
_MCP_SRC = next(
    (
        ancestor / "mcp-server" / "src"
        for ancestor in Path(__file__).resolve().parents
        if (ancestor / "mcp-server" / "src").is_dir()
    ),
    Path(__file__).parent.parent / "mcp-server" / "src",
)
sys.path.insert(0, str(_MCP_SRC))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
|
||||
# Party role detection patterns
|
||||
# Header regex per party role. Insertion order is the matching priority:
# the first role whose regex matches a line wins.
_ROLE_HEADER_REGEX = {
    # Appellants
    "appellant": r"טענות\s*העוררי[םן]|טענות\s*העורר\b|טענות\s*המבקש|טענות\s*המערער",
    # Respondent - local committee
    "committee": r"עמדת\s*הוועדה\s*המקומית|עמדת\s*המשיבה|טענות\s*המשיבה|תגובת\s*המשיבה|הוועדה\s*המקומית$",
    # Respondent - general
    "respondent": r"עמדת\s*המשיבי[םן]|עמדת\s*המשיב\b|טענות\s*המשיבי[םן]|טענות\s*המשיב\b",
    # Permit applicant
    "permit_applicant": r"מבקשי\s*ההיתר|עמדת\s*מבקש|עמדת\s*היזם|מגישי\s*התכנית",
    # Appraiser clarifications (betterment-levy cases)
    "appraiser": r"הבהרות\s*השמא|התייחסות\s*הצדדים",
}

PARTY_PATTERNS = [(regex, role) for role, regex in _ROLE_HEADER_REGEX.items()]
|
||||
|
||||
|
||||
def detect_party_role(line: str) -> str | None:
    """Return the party role whose header regex matches *line*, or None.

    PARTY_PATTERNS is checked in order and the first match wins.
    """
    matching_roles = (role for regex, role in PARTY_PATTERNS if re.search(regex, line))
    return next(matching_roles, None)
|
||||
|
||||
|
||||
def is_section_header(line: str) -> bool:
    """Tell whether *line* is a section or sub-section header rather than a claim.

    Only non-empty lines shorter than 50 characters qualify, and only when
    they are a party header or one of the generic header forms below.
    """
    candidate = line.strip()
    if not candidate or len(candidate) >= 50:
        return False

    if detect_party_role(candidate) is not None:
        return True

    generic_headers = (
        r"^תמצית\s*טענות",
        r"^[א-ת][\.\)]\s*טענות",
        r"^[א-ת][\.\)]\s*כללי",
        r"^\d+\.\s*$",  # just a number
    )
    return any(re.match(header, candidate) for header in generic_headers)
|
||||
|
||||
|
||||
def is_numbered_sub_header(line: str) -> bool:
    """Tell whether *line* is a short numbered topic header, e.g. '2. שיעור ההפקעה'."""
    candidate = line.strip()
    return re.match(r"^\d+\.\s+\S.{3,40}$", candidate) is not None
|
||||
|
||||
|
||||
def extract_claims_from_block(text: str) -> list[dict]:
    """Extract individual claims, grouped by party, from block-zayin text.

    Every non-blank line is one candidate claim. Party headers (lines of at
    most 8 words that match a party pattern) switch the current role, and
    generic section headers are dropped. Candidates of 30 characters or fewer
    are discarded.

    Args:
        text: Content of the claims block.

    Returns:
        A list of ``{"party_role", "claim_text", "claim_index"}`` dicts, with
        claim_index numbering the kept claims from 0. Claims that appear
        before any party header are attributed to "appellant".
    """
    claims: list[dict] = []
    current_role = "appellant"  # default if no header found
    pending: list[str] = []

    def flush() -> None:
        """Store the pending claim (if long enough) under the current role."""
        claim_text = "\n".join(pending).strip()
        if len(claim_text) > 30:
            claims.append({
                "party_role": current_role,
                "claim_text": claim_text,
                # Indexes are consecutive over the claims actually kept.
                "claim_index": len(claims),
            })
        pending.clear()

    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue

        # Party header - must be a SHORT line (a header, not claim content)
        role = detect_party_role(stripped) if len(stripped.split()) <= 8 else None
        if role:
            flush()  # the pending claim still belongs to the previous role
            current_role = role
            continue

        # Generic section headers end the pending claim and are skipped
        if is_section_header(stripped):
            flush()
            continue

        # Anything else - a numbered topic header or a regular paragraph -
        # starts a new claim of its own.
        flush()
        pending.append(stripped)

    flush()  # last claim
    return claims
|
||||
|
||||
|
||||
async def main():
    """Extract the claims of every decision and store them in the claims table.

    Reads each non-empty block-zayin, splits it into claims with
    extract_claims_from_block(), replaces the case's existing rows in claims,
    and prints per-case and overall counts by party role.
    """
    # Hebrew display labels for the party roles
    role_labels = {
        "appellant": "עוררים",
        "committee": "ועדה מקומית",
        "respondent": "משיבים",
        "permit_applicant": "מבקשי היתר",
        "appraiser": "שמאי",
    }

    await init_schema()
    pool = await get_pool()

    try:
        async with pool.acquire() as conn:
            # Get all block-zayin with content
            rows = await conn.fetch(
                """SELECT c.id as case_id, c.case_number, c.title,
                          db.content
                   FROM decision_blocks db
                   JOIN decisions d ON d.id = db.decision_id
                   JOIN cases c ON c.id = d.case_id
                   WHERE db.block_id = 'block-zayin' AND db.word_count > 0
                   ORDER BY c.case_number"""
            )

        for row in rows:
            case_id = row["case_id"]
            case_number = row["case_number"]

            claims = extract_claims_from_block(row["content"])

            print(f"\n{'='*50}")
            print(f"תיק: {case_number} — {row['title']}")
            print(f"{'='*50}")

            role_counts = {}
            async with pool.acquire() as conn:
                # Replace the case's claims atomically: a failed insert must
                # not leave the case with its old claims already deleted.
                async with conn.transaction():
                    await conn.execute("DELETE FROM claims WHERE case_id = $1", case_id)

                    for claim in claims:
                        role = claim["party_role"]
                        role_counts[role] = role_counts.get(role, 0) + 1

                        await conn.execute(
                            """INSERT INTO claims (case_id, party_role, claim_text, claim_index, source_document)
                               VALUES ($1, $2, $3, $4, $5)""",
                            case_id,
                            claim["party_role"],
                            claim["claim_text"],
                            claim["claim_index"],
                            "block-zayin",
                        )

            for role, count in sorted(role_counts.items()):
                role_heb = role_labels.get(role, role)
                print(f" {role_heb:20s} — {count} טענות")

            print(f" סה\"כ: {len(claims)} טענות")

        # Summary
        async with pool.acquire() as conn:
            total = await conn.fetchval("SELECT count(*) FROM claims")
            by_role = await conn.fetch(
                "SELECT party_role, count(*) as cnt FROM claims GROUP BY party_role ORDER BY cnt DESC"
            )

        print(f"\n{'='*50}")
        print(f"סיכום כללי — {total} טענות מ-{len(rows)} החלטות")
        for r in by_role:
            print(f" {r['party_role']:20s} — {r['cnt']}")
    finally:
        # Release the pool even when a query above fails.
        await close_pool()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
128
scripts/.archive/extract_all_google_vision.py
Normal file
128
scripts/.archive/extract_all_google_vision.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
|
||||
Forces OCR on all pages (ignoring broken text layers).
|
||||
Then runs voyage-law-2 embedding benchmark comparing old vs new.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / ".env")
|
||||
|
||||
import fitz
|
||||
from google.cloud import vision
|
||||
from legal_mcp import config
|
||||
|
||||
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
|
||||
client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})
|
||||
|
||||
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
|
||||
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
|
||||
|
||||
# Hebrew abbreviation quote fixer
|
||||
import re
|
||||
_ABBREV_FIXES = {
|
||||
'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
|
||||
'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
|
||||
'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
|
||||
'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
|
||||
'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
|
||||
}
|
||||
_ABBREV_PAT = re.compile('|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True)))
|
||||
|
||||
def fix_quotes(text):
    """Restore quoted Hebrew abbreviations in *text*.

    Every substring matched by ``_ABBREV_PAT`` is replaced with its entry in
    ``_ABBREV_FIXES``; everything else is returned unchanged.
    """
    def _replacement(match):
        return _ABBREV_FIXES[match.group(0)]

    return _ABBREV_PAT.sub(_replacement, text)
|
||||
|
||||
|
||||
def ocr_page(image_bytes, page_num):
    """OCR a single page image with Google Cloud Vision, hinting Hebrew.

    ``page_num`` is only used in the error message. When the API response
    carries an error message it is printed and an empty string is returned;
    otherwise the full-text annotation (or "" if absent) is passed through
    ``fix_quotes`` and returned.
    """
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=vision.Image(content=image_bytes),
        image_context=hebrew_hint,
    )

    if response.error.message:
        print(f" ERROR page {page_num}: {response.error.message}")
        return ""

    annotation = response.full_text_annotation
    if annotation:
        raw_text = annotation.text
    else:
        raw_text = ""
    return fix_quotes(raw_text)
|
||||
|
||||
|
||||
def process_pdf(pdf_path):
    """OCR every page of *pdf_path* and return ``(text, page_count, seconds)``.

    Each page is rendered to a 300-dpi PNG and sent through ``ocr_page``; the
    per-page texts are joined with blank lines. Progress is printed per page.
    The document is closed even if rendering or OCR raises.
    """
    doc = fitz.open(str(pdf_path))
    t0 = time.time()
    try:
        page_count = len(doc)
        pages_text = []
        for page_no, page in enumerate(doc, start=1):
            img_bytes = page.get_pixmap(dpi=300).tobytes("png")

            pt = time.time()
            text = ocr_page(img_bytes, page_no)
            elapsed = time.time() - pt

            pages_text.append(text)
            print(f" Page {page_no}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the file handle even when a page fails part-way through.
        doc.close()

    total_time = time.time() - t0
    return "\n\n".join(pages_text), page_count, total_time
|
||||
|
||||
|
||||
def main():
    """OCR every PDF in ``ORIGINALS_DIR`` into ``OUTPUT_DIR`` as Markdown.

    PDFs whose output file already exists (and is larger than 100 bytes) are
    skipped but still counted in the page total. Per-document statistics are
    written as JSON to ``data/google-vision-extraction.json``.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")

    results = []
    total_pages = 0
    total_time = 0.0

    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"

        # Skip already extracted
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            doc = fitz.open(str(pdf))
            try:
                pages = len(doc)
            finally:
                doc.close()
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue

        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")

        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed

        out_file.write_text(text, encoding="utf-8")

        words = len(text.split())
        print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f" Saved: {out_file.name}\n")

        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})

    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # NOTE(review): the estimate is computed over all pages, including skipped
    # documents that were not sent to the API in this run.
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")

    # Save results. The JSON keeps non-ASCII names verbatim (ensure_ascii=False),
    # so the file encoding must be explicit rather than locale-dependent.
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
66
scripts/.archive/extract_originals.py
Normal file
66
scripts/.archive/extract_originals.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Extract text from original PDF files using Claude Opus Vision OCR."""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / ".env")
|
||||
|
||||
from legal_mcp.services import extractor
|
||||
|
||||
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
|
||||
OUTPUT_DIR = ORIGINALS_DIR / "extracted"
|
||||
|
||||
|
||||
async def main():
    """Extract text from every PDF in ``ORIGINALS_DIR`` via ``extractor.extract_text``.

    Each result is saved as ``<stem>.md`` under ``OUTPUT_DIR``. Per-document
    and overall page counts, timings and a rough cost estimate ($0.05 per
    page) are printed.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)

    pdf_paths = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs\n")

    divider = "=" * 60
    grand_cost = 0.0
    grand_pages = 0
    grand_seconds = 0.0

    for pdf in pdf_paths:
        print(divider)
        print(f"Processing: {pdf.name}")
        print(f" Size: {pdf.stat().st_size:,} bytes")

        started = time.time()
        text, page_count = await extractor.extract_text(str(pdf))
        elapsed = time.time() - started

        grand_pages += page_count
        grand_seconds += elapsed

        # Rough estimate: about $0.05 per page for image input plus output.
        est_cost = page_count * 0.05
        grand_cost += est_cost

        # Persist the extracted text next to the originals.
        target = OUTPUT_DIR / f"{pdf.stem}.md"
        target.write_text(text, encoding="utf-8")

        word_count = len(text.split())
        per_page = elapsed / max(page_count, 1)
        print(f" Pages: {page_count}")
        print(f" Extracted: {len(text):,} chars, {word_count:,} words")
        print(f" Time: {elapsed:.1f}s ({per_page:.1f}s/page)")
        print(f" Est. cost: ${est_cost:.3f}")
        print(f" Saved to: {target.name}")
        print()

    print(divider)
    print("TOTAL")
    print(f" Documents: {len(pdf_paths)}")
    print(f" Pages: {grand_pages}")
    print(f" Time: {grand_seconds:.1f}s")
    print(f" Est. cost: ${grand_cost:.3f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
113
scripts/.archive/extract_originals_ocr.py
Normal file
113
scripts/.archive/extract_originals_ocr.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
|
||||
|
||||
Forces Vision OCR regardless of embedded text layer (which may be broken).
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(Path.home() / ".env")
|
||||
|
||||
import anthropic
|
||||
import fitz
|
||||
from legal_mcp import config
|
||||
|
||||
client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
|
||||
MODEL = "claude-opus-4-20250514"
|
||||
|
||||
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
|
||||
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
|
||||
|
||||
|
||||
async def ocr_page(image_bytes: bytes, page_num: int) -> str:
    """OCR one PNG page image with the Anthropic vision model ``MODEL``.

    The image is sent base64-encoded together with a Hebrew instruction to
    return only the extracted text. ``page_num`` is accepted for the caller's
    convenience and is not used in the request.

    The SDK client is synchronous, so the request is run in a worker thread
    rather than blocking the event loop inside this coroutine.

    Returns the text of the first content block of the response.
    """
    b64_image = base64.b64encode(image_bytes).decode("utf-8")
    prompt = (
        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
        "שמור על מבנה הפסקאות המקורי. "
        "אם יש כותרות, סמן אותן. "
        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
    )
    message = await asyncio.to_thread(
        client.messages.create,
        model=MODEL,
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {"type": "base64", "media_type": "image/png", "data": b64_image},
                },
                {
                    "type": "text",
                    "text": prompt,
                },
            ],
        }],
    )
    return message.content[0].text
|
||||
|
||||
|
||||
async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
    """OCR every page of *pdf_path* and return ``(text, page_count, seconds)``.

    Each page is rendered to a 200-dpi PNG and passed to ``ocr_page``; page
    texts are joined with blank lines. Progress is printed per page. The
    document is closed even if rendering or OCR raises.
    """
    doc = fitz.open(str(pdf_path))
    t0 = time.time()
    try:
        page_count = len(doc)
        pages_text = []
        for page_no, page in enumerate(doc, start=1):
            img_bytes = page.get_pixmap(dpi=200).tobytes("png")

            print(f" Page {page_no}/{page_count}...", end=" ", flush=True)
            pt = time.time()
            text = await ocr_page(img_bytes, page_no)
            elapsed = time.time() - pt

            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Release the file handle even when a page fails part-way through.
        doc.close()

    total_time = time.time() - t0
    full_text = "\n\n".join(pages_text)
    return full_text, page_count, total_time
|
||||
|
||||
|
||||
async def main():
    """Run vision OCR over every PDF in ``ORIGINALS_DIR``.

    Output for each PDF goes to ``OUTPUT_DIR/<stem>.md``. Prints per-document
    statistics, then totals and a cost estimate of $0.05 per page.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    pdf_paths = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_paths)} PDFs — extracting ALL pages with {MODEL}\n")

    divider = "=" * 60
    page_total = 0
    seconds_total = 0.0

    for pdf in pdf_paths:
        print(divider)
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")
        print(divider)

        text, pages, elapsed = await process_pdf(pdf)
        page_total += pages
        seconds_total += elapsed

        target = OUTPUT_DIR / f"{pdf.stem}.md"
        target.write_text(text, encoding="utf-8")

        word_count = len(text.split())
        per_page = elapsed / max(pages, 1)
        print(f" Result: {pages} pages, {len(text):,} chars, {word_count:,} words")
        print(f" Time: {elapsed:.1f}s ({per_page:.1f}s/page)")
        print(f" Saved: {target.name}\n")

    print(divider)
    print(f"TOTAL: {len(pdf_paths)} docs, {page_total} pages, {seconds_total:.1f}s")
    est_cost = page_total * 0.05
    print(f"Estimated cost: ${est_cost:.2f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
177
scripts/.archive/generate-embeddings.py
Normal file
177
scripts/.archive/generate-embeddings.py
Normal file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate embeddings for decision blocks and case law.
|
||||
|
||||
Creates:
|
||||
- paragraph_embeddings: for each decision block with content
|
||||
- case_law_embeddings: for each case law summary
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
from legal_mcp.services.embeddings import embed_texts
|
||||
from legal_mcp import config
|
||||
|
||||
|
||||
async def generate_block_embeddings(conn) -> int:
    """Create paragraph rows and embeddings for decision blocks.

    Blocks with more than 10 words (excluding block-alef through block-dalet)
    are loaded. A block of at most 600 words becomes one paragraph; longer
    blocks are cut into 500-word windows taken every 400 words, keeping only
    windows of more than 50 words. Each chunk is inserted into
    ``decision_paragraphs``; chunks for which the insert returned an id are
    embedded with ``embed_texts`` and stored in ``paragraph_embeddings``.

    Returns the number of embeddings stored.
    """
    block_rows = await conn.fetch(
        """SELECT db.id as block_id, db.decision_id, db.block_id as block_type,
                  db.content, db.word_count, c.case_number
           FROM decision_blocks db
           JOIN decisions d ON d.id = db.decision_id
           JOIN cases c ON c.id = d.case_id
           WHERE db.word_count > 10
           AND db.block_id NOT IN ('block-alef', 'block-bet', 'block-gimel', 'block-dalet')
           ORDER BY c.case_number, db.block_index"""
    )

    if not block_rows:
        print(" אין בלוקים ליצירת embeddings")
        return 0

    print(f" מעבד {len(block_rows)} בלוקים...")

    insert_paragraph = """INSERT INTO decision_paragraphs
                          (block_id, paragraph_number, content, word_count)
                          VALUES ($1, $2, $3, $4)
                          ON CONFLICT DO NOTHING
                          RETURNING id"""

    pending = []
    # The paragraph number runs across all blocks; it is never reset per block.
    next_number = 1

    for row in block_rows:
        body = row["content"]
        tokens = body.split()

        if len(tokens) > 600:
            # Overlapping windows: 500 words long, starting every 400 words.
            windows = (tokens[offset:offset + 500] for offset in range(0, len(tokens), 400))
            pieces = [" ".join(window) for window in windows if len(window) > 50]
        else:
            pieces = [body]

        for piece in pieces:
            # "block_id" in the row is the decision_blocks primary key
            # (aliased in the SELECT above), not the block-type label.
            new_id = await conn.fetchval(
                insert_paragraph,
                row["block_id"],
                next_number,
                piece,
                len(piece.split()),
            )
            # The insert yields no id when the row was skipped on conflict.
            if new_id:
                pending.append({
                    "para_id": new_id,
                    "text": piece,
                    "case_number": row["case_number"],
                })
            next_number += 1

    if not pending:
        print(" אין פסקאות חדשות")
        return 0

    print(f" {len(pending)} פסקאות נוצרו, מייצר embeddings...")

    vectors = await embed_texts([entry["text"] for entry in pending], input_type="document")

    stored = 0
    for entry, vector in zip(pending, vectors):
        await conn.execute(
            """INSERT INTO paragraph_embeddings (paragraph_id, embedding)
               VALUES ($1, $2)""",
            entry["para_id"],
            vector,
        )
        stored += 1

    return stored
|
||||
|
||||
|
||||
async def generate_case_law_embeddings(conn) -> int:
    """Generate embeddings for case-law entries that do not have one yet.

    Entries with a non-empty summary or key quote are combined into a single
    searchable text ("<number> <name>: <summary>" plus the key quote when
    present), embedded with ``embed_texts`` and stored in
    ``case_law_embeddings``.

    The stored ``chunk_text`` is exactly the text that was embedded, including
    the key quote, so the row reflects what the vector represents.

    Returns the number of embeddings stored.
    """
    cases = await conn.fetch(
        """SELECT id, case_number, case_name, summary, key_quote
           FROM case_law
           WHERE summary != '' OR key_quote != ''"""
    )

    # Filter out existing
    existing = await conn.fetch("SELECT case_law_id FROM case_law_embeddings")
    existing_ids = {r["case_law_id"] for r in existing}
    to_embed = [c for c in cases if c["id"] not in existing_ids]

    if not to_embed:
        print(" אין פסיקה חדשה ליצירת embeddings")
        return 0

    print(f" מייצר embeddings ל-{len(to_embed)} תקדימים...")

    texts = []
    for c in to_embed:
        # Combine case info into a searchable text
        text = f"{c['case_number']} {c['case_name']}: {c['summary']}"
        if c["key_quote"]:
            text += f" ציטוט: {c['key_quote']}"
        texts.append(text)

    embeddings = await embed_texts(texts, input_type="document")

    count = 0
    for case, text, embedding in zip(to_embed, texts, embeddings):
        await conn.execute(
            """INSERT INTO case_law_embeddings (case_law_id, chunk_text, embedding)
               VALUES ($1, $2, $3)""",
            case["id"],
            text,
            embedding,
        )
        count += 1

    return count
|
||||
|
||||
|
||||
async def main():
    """Generate block and case-law embeddings, then print table totals.

    Both generation steps and the final row counts use one pooled connection;
    the pool is closed before the summary is printed.
    """
    await init_schema()
    pool = await get_pool()

    async with pool.acquire() as conn:
        print("שלב 1: embeddings לבלוקי החלטה")
        blocks_done = await generate_block_embeddings(conn)
        print(f" ✅ {blocks_done} embeddings נוצרו")

        print("\nשלב 2: embeddings לפסיקה")
        case_law_done = await generate_case_law_embeddings(conn)
        print(f" ✅ {case_law_done} embeddings נוצרו")

        # Row counts for the closing summary.
        paragraph_rows = await conn.fetchval("SELECT count(*) FROM paragraph_embeddings")
        case_law_rows = await conn.fetchval("SELECT count(*) FROM case_law_embeddings")

    await close_pool()

    print("\nסיכום:")
    print(f" סה\"כ paragraph_embeddings: {paragraph_rows}")
    print(f" סה\"כ case_law_embeddings: {case_law_rows}")
    print(f" מודל: {config.VOYAGE_MODEL}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
118
scripts/.archive/link-claims-to-discussion.py
Normal file
118
scripts/.archive/link-claims-to-discussion.py
Normal file
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Link claims to discussion paragraphs using semantic similarity.
|
||||
|
||||
For each claim, finds the most similar paragraph in block-yod of the same decision.
|
||||
Updates claims.addressed_in_paragraph with the paragraph number.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
from legal_mcp.services.embeddings import embed_texts
|
||||
|
||||
|
||||
def _vector_norm(vector):
    """Return the Euclidean norm of *vector*."""
    return sum(x * x for x in vector) ** 0.5


def _best_match(query, query_norm, candidates, candidate_norms):
    """Return ``(index, score)`` of the candidate most cosine-similar to *query*.

    Ties keep the earliest candidate. Any pair involving a zero-length vector
    scores 0. Returns ``(-1, -1)`` when *candidates* is empty.
    """
    best_index, best_score = -1, -1
    for index, (candidate, candidate_norm) in enumerate(zip(candidates, candidate_norms)):
        if query_norm > 0 and candidate_norm > 0:
            dot = sum(a * b for a, b in zip(query, candidate))
            score = dot / (query_norm * candidate_norm)
        else:
            score = 0
        if score > best_score:
            best_index, best_score = index, score
    return best_index, best_score


async def main():
    """Link each claim to its most similar discussion paragraph.

    For every case that has claims and a non-empty ``block-yod``, the claim
    texts (first 500 characters) and the discussion paragraphs (lines longer
    than 30 characters) are embedded together. Each claim is matched to the
    paragraph with the highest cosine similarity; when that score exceeds 0.3
    ``claims.addressed_in_paragraph`` is set to the paragraph's 1-based
    position. A per-case and an overall summary are printed.

    The overall percentage is reported as 0 when there are no claims at all,
    and the connection pool is closed even if an error is raised.
    """
    await init_schema()
    pool = await get_pool()
    try:
        async with pool.acquire() as conn:
            # Get all cases with both claims and discussion blocks
            cases = await conn.fetch(
                """SELECT DISTINCT c.id as case_id, c.case_number
                   FROM cases c
                   JOIN claims cl ON cl.case_id = c.id
                   JOIN decisions d ON d.case_id = c.id
                   JOIN decision_blocks db ON db.decision_id = d.id AND db.block_id = 'block-yod' AND db.word_count > 0
                   ORDER BY c.case_number"""
            )

        for case in cases:
            case_id = case["case_id"]
            case_number = case["case_number"]

            async with pool.acquire() as conn:
                # Get claims for this case
                claims = await conn.fetch(
                    "SELECT id, claim_text, party_role, claim_index FROM claims WHERE case_id = $1 ORDER BY claim_index",
                    case_id,
                )
                # Get the discussion block text (split into paragraphs below)
                yod_content = await conn.fetchval(
                    """SELECT db.content FROM decision_blocks db
                       JOIN decisions d ON d.id = db.decision_id
                       WHERE d.case_id = $1 AND db.block_id = 'block-yod'""",
                    case_id,
                )

            if not yod_content or not claims:
                continue

            # Split discussion into paragraphs, dropping blank and short lines.
            disc_paragraphs = [p.strip() for p in yod_content.split("\n") if p.strip() and len(p.strip()) > 30]
            if not disc_paragraphs:
                continue

            print(f"\n{case_number}: {len(claims)} טענות ← {len(disc_paragraphs)} פסקאות דיון")

            # Embed all claims and discussion paragraphs in one call
            claim_texts = [c["claim_text"][:500] for c in claims]
            embeddings = await embed_texts(claim_texts + disc_paragraphs, input_type="document")

            claim_embeddings = embeddings[:len(claims)]
            disc_embeddings = embeddings[len(claims):]
            # Paragraph norms do not depend on the claim, so compute them once.
            disc_norms = [_vector_norm(e) for e in disc_embeddings]

            linked = 0
            async with pool.acquire() as conn:
                for claim, claim_emb in zip(claims, claim_embeddings):
                    best_para_idx, best_score = _best_match(
                        claim_emb, _vector_norm(claim_emb), disc_embeddings, disc_norms
                    )
                    if best_para_idx >= 0 and best_score > 0.3:
                        # NOTE(review): this is the 1-based position within the
                        # filtered paragraph list built above (short lines are
                        # dropped) — confirm that is the numbering consumers
                        # of addressed_in_paragraph expect.
                        para_num = best_para_idx + 1
                        await conn.execute(
                            "UPDATE claims SET addressed_in_paragraph = $1 WHERE id = $2",
                            para_num, claim["id"],
                        )
                        linked += 1

            print(f" קושרו: {linked}/{len(claims)} טענות (ציון מינימלי: 0.3)")

        # Summary
        async with pool.acquire() as conn:
            total_claims = await conn.fetchval("SELECT count(*) FROM claims")
            linked_claims = await conn.fetchval("SELECT count(*) FROM claims WHERE addressed_in_paragraph IS NOT NULL")
    finally:
        await close_pool()

    # Guard against an empty claims table instead of dividing by zero.
    percent = linked_claims / total_claims * 100 if total_claims else 0
    print(f"\n{'='*50}")
    print(f"סיכום: {linked_claims}/{total_claims} טענות קושרו לפסקאות דיון ({percent:.0f}%)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
382
scripts/.archive/proofread_training_corpus.py
Normal file
382
scripts/.archive/proofread_training_corpus.py
Normal file
@@ -0,0 +1,382 @@
|
||||
"""Proofread training corpus: strip Nevo additions from DOCX/PDF, output clean Markdown.
|
||||
|
||||
Nevo DOCX additions:
|
||||
Front: ספרות / חקיקה שאוזכרה / מיני-רציו / topic tags / Nevo summary paragraphs
|
||||
Back: 5129371512937154678313 / "בעניין עריכה ושינויים" link / "54678313-..." / "נוסח מסמך זה כפוף"
|
||||
|
||||
Nevo PDF additions:
|
||||
"עמוד X מתוך Y" header on every page
|
||||
|
||||
PDF text extraction uses Google Cloud Vision OCR — PyMuPDF fragments Hebrew RTL
|
||||
text unusably (words split mid-word, reading order broken). OCR gives clean output.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import fitz
|
||||
from docx import Document
|
||||
|
||||
# Load GOOGLE_CLOUD_VISION_API_KEY from ~/.env if not already set
|
||||
if not os.environ.get("GOOGLE_CLOUD_VISION_API_KEY"):
|
||||
env_path = Path.home() / ".env"
|
||||
if env_path.exists():
|
||||
for line in env_path.read_text().splitlines():
|
||||
if line.startswith("GOOGLE_CLOUD_VISION_API_KEY="):
|
||||
os.environ["GOOGLE_CLOUD_VISION_API_KEY"] = line.split("=", 1)[1].strip().strip('"').strip("'")
|
||||
break
|
||||
|
||||
from google.cloud import vision # noqa: E402
|
||||
|
||||
TRAINING_DIR = Path("/home/chaim/legal-ai/data/training")
|
||||
OUTPUT_DIR = TRAINING_DIR / "proofread"
|
||||
RAW_DIR = TRAINING_DIR / "raw"
|
||||
|
||||
# ── Nevo pattern detection ────────────────────────────────────────
|
||||
|
||||
NEVO_PREAMBLE_HEADERS = (
|
||||
"ספרות:",
|
||||
"חקיקה שאוזכרה:",
|
||||
"מיני-רציו:",
|
||||
)
|
||||
|
||||
# Strong decision-opening patterns — highly distinctive first words of real decision
|
||||
# body. These rarely appear inside Nevo's own summary block, so first match wins.
|
||||
DECISION_OPENING = re.compile(
|
||||
r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)"
|
||||
)
|
||||
|
||||
# Section headers that definitively mark decision body start.
|
||||
DECISION_SECTION_HEADERS = {
|
||||
"רקע",
|
||||
"פתח דבר",
|
||||
"תמצית טענות הצדדים",
|
||||
"העובדות",
|
||||
"הרקע העובדתי",
|
||||
"מבוא",
|
||||
}
|
||||
|
||||
# Nevo postamble markers — everything from first match onwards is stripped.
|
||||
NEVO_POSTAMBLE_MARKERS = (
|
||||
"5129371512937154678313",
|
||||
"בעניין עריכה ושינויים במסמכי פסיקה",
|
||||
"נוסח מסמך זה כפוף לשינויי ניסוח ועריכה",
|
||||
)
|
||||
|
||||
# Nevo inline watermark codes — appear as prefixes embedded in real paragraphs
|
||||
# (e.g. "5129371ניתנה פה אחד" or "054678313האם ההיתר..."). These must be
|
||||
# stripped from paragraph content, not used as postamble boundaries.
|
||||
NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*")
|
||||
|
||||
# Nevo PDF page header: "עמוד X מתוך Y" or "עמוד X בן Y" (Hebrew variants)
|
||||
PDF_PAGE_HEADER_RE = re.compile(
|
||||
r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*"
|
||||
)
|
||||
# Short orphan lines starting with "עמוד" — OCR artifacts from merged footer text
|
||||
# (e.g. "עמודירבי", "עמוד :", "עמודי", "עמוד ר"). Conservative: up to 12 chars.
|
||||
PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$")
|
||||
# "עמוד" followed by number (with optional garbled Nevo URL line after)
|
||||
PDF_PAGE_BLOCK_RE = re.compile(
|
||||
r"(?m)^\s*עמוד\s*\n\s*\d+[·.]?\s*\n[^\n]*\n", re.UNICODE
|
||||
)
|
||||
# Standalone "עמוד N" at line start
|
||||
PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.]?\s*$")
|
||||
# Nevo watermark URL (and common OCR-garbled variants)
|
||||
NEVO_URL_RE = re.compile(
|
||||
r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def find_decision_start(paragraphs: list[str]) -> int:
    """Return the index of the first paragraph that belongs to the decision itself.

    Steps, in order:
      1. If none of the first 10 paragraphs starts with a Nevo preamble
         header, nothing is stripped and 0 is returned.
      2. The first paragraph that (after stripping) is a known section header
         or matches ``DECISION_OPENING`` wins.
      3. Otherwise, after the first paragraph containing the Nevo summary
         lead-in, the first of the next paragraphs (up to index +14) that is
         longer than 80 characters and does not start with "*" wins.
      4. As a last resort the first 10 paragraphs are dropped (or all but the
         last one, for shorter documents).
    """
    if not any(p.startswith(NEVO_PREAMBLE_HEADERS) for p in paragraphs[:10]):
        return 0

    # Strong markers: a section header or a distinctive opening phrase.
    for index, para in enumerate(paragraphs):
        text = para.strip()
        if text in DECISION_SECTION_HEADERS or DECISION_OPENING.match(text):
            return index

    # Fallback: look just past the Nevo summary lead-in. Only the first
    # lead-in paragraph is considered.
    lead_ins = ("קבעה כלהלן", "קבעה את הדברים הבאים")
    anchor = next(
        (i for i, para in enumerate(paragraphs) if any(phrase in para for phrase in lead_ins)),
        None,
    )
    if anchor is not None:
        window_end = min(anchor + 15, len(paragraphs))
        for candidate in range(anchor + 1, window_end):
            para = paragraphs[candidate]
            if len(para) > 80 and not para.strip().startswith("*"):
                return candidate

    # Last resort: strip only the first 10 paragraphs of preamble.
    return min(10, len(paragraphs) - 1)
|
||||
|
||||
|
||||
def find_decision_end(paragraphs: list[str]) -> int:
    """Return the exclusive end index of the decision body.

    This is the index of the first paragraph containing any of
    ``NEVO_POSTAMBLE_MARKERS``, or ``len(paragraphs)`` when none does.
    """
    return next(
        (
            index
            for index, para in enumerate(paragraphs)
            if any(marker in para for marker in NEVO_POSTAMBLE_MARKERS)
        ),
        len(paragraphs),
    )
|
||||
|
||||
|
||||
# ── DOCX proofreading ─────────────────────────────────────────────
|
||||
|
||||
|
||||
def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]:
    """Strip leading Nevo watermark codes from each paragraph.

    Paragraphs that are empty once the code and surrounding whitespace are
    removed are dropped from the result.
    """
    cleaned = (NEVO_INLINE_CODE_RE.sub("", para).strip() for para in paragraphs)
    return [para for para in cleaned if para]
|
||||
|
||||
|
||||
def proofread_docx(path: Path) -> tuple[str, dict]:
    """Return ``(markdown, stats)`` for the decision text inside a Nevo DOCX.

    Non-blank paragraphs are read from the document, the Nevo preamble and
    postamble are cut off using ``find_decision_start`` / ``find_decision_end``,
    inline watermark codes are removed, and the remaining paragraphs are
    joined with blank lines. ``stats`` reports how many paragraphs were read,
    stripped from each end, and kept.
    """
    document = Document(str(path))
    texts = []
    for para in document.paragraphs:
        if para.text.strip():
            texts.append(para.text)

    first = find_decision_start(texts)
    stop = find_decision_end(texts)
    body = _strip_inline_nevo_codes(texts[first:stop])

    stats = {
        "total_paragraphs": len(texts),
        "preamble_stripped": first,
        "postamble_stripped": len(texts) - stop,
        "clean_paragraphs": len(body),
    }
    return "\n\n".join(body), stats
|
||||
|
||||
|
||||
# ── PDF proofreading (Google Vision OCR) ──────────────────────────
|
||||
|
||||
_vision_client: vision.ImageAnnotatorClient | None = None
|
||||
|
||||
|
||||
def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the shared Vision client, creating it on first use.

    The client is built from the ``GOOGLE_CLOUD_VISION_API_KEY`` environment
    variable and cached in the module-level ``_vision_client``.

    Raises:
        RuntimeError: if the environment variable is unset or empty.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client

    key = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY")
    if not key:
        raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set")

    _vision_client = vision.ImageAnnotatorClient(client_options={"api_key": key})
    return _vision_client
|
||||
|
||||
|
||||
# Hebrew abbreviation quote fixes — Google Vision renders ״ as 'יי'
|
||||
_HEBREW_ABBREV_FIXES: dict[str, str] = {
|
||||
"עוהייד": 'עוה"ד',
|
||||
"עוייד": 'עו"ד',
|
||||
"הנייל": 'הנ"ל',
|
||||
"מצייב": 'מצ"ב',
|
||||
"ביהמייש": 'ביהמ"ש',
|
||||
"תייז": 'ת"ז',
|
||||
"עייי": 'ע"י',
|
||||
"אחייכ": 'אח"כ',
|
||||
"סייק": 'ס"ק',
|
||||
"דייר": 'ד"ר',
|
||||
"חווייד": 'חוו"ד',
|
||||
"מייר": 'מ"ר',
|
||||
"יחייד": 'יח"ד',
|
||||
"בייכ": 'ב"כ',
|
||||
"בייה": 'ב"ה',
|
||||
"שייח": 'ש"ח',
|
||||
"יוייר": 'יו"ר',
|
||||
"בליימ": 'בל"מ',
|
||||
"תבייע": 'תב"ע',
|
||||
"תמייא": 'תמ"א',
|
||||
"סייה": 'ס"ה',
|
||||
"שייפ": 'ש"פ',
|
||||
"שצייפ": 'שצ"פ',
|
||||
"שבייצ": 'שב"צ',
|
||||
"עסיים": 'עס"ם',
|
||||
"הייה": 'ה"ה',
|
||||
"פסייד": 'פס"ד',
|
||||
"תיידא": 'תיד"א',
|
||||
"בגייץ": 'בג"ץ',
|
||||
"עתיים": 'עת"ם',
|
||||
"עעיים": 'עע"ם',
|
||||
# Hebrew calendar day prefixes (כ"א .. כ"ט etc.)
|
||||
"כייא": 'כ"א', "כייב": 'כ"ב', "כייג": 'כ"ג', "כייד": 'כ"ד',
|
||||
"כייה": 'כ"ה', "כייו": 'כ"ו', "כייז": 'כ"ז', "כייח": 'כ"ח', "כייט": 'כ"ט',
|
||||
"לייא": 'ל"א',
|
||||
"יייא": 'י"א', "יייב": 'י"ב', "יייג": 'י"ג', "יייד": 'י"ד',
|
||||
"טייו": 'ט"ו', "טייז": 'ט"ז', "יייז": 'י"ז', "יייח": 'י"ח', "יייט": 'י"ט',
|
||||
# Hebrew calendar years (תשפ"ה, תשפ"ד...)
|
||||
"תשפייא": 'תשפ"א', "תשפייב": 'תשפ"ב', "תשפייג": 'תשפ"ג',
|
||||
"תשפייד": 'תשפ"ד', "תשפייה": 'תשפ"ה', "תשפייו": 'תשפ"ו',
|
||||
"תשפיין": 'תשפ"ן',
|
||||
}
|
||||
_ABBREV_PATTERN = re.compile(
|
||||
"|".join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
|
||||
)
|
||||
|
||||
|
||||
def _fix_hebrew_quotes(text: str) -> str:
    """Replace every ``_ABBREV_PATTERN`` match in *text* with its ``_HEBREW_ABBREV_FIXES`` entry."""
    def _replacement(match):
        return _HEBREW_ABBREV_FIXES[match.group(0)]

    return _ABBREV_PATTERN.sub(_replacement, text)
|
||||
|
||||
|
||||
def _ocr_page_image(image_bytes: bytes, page_num: int) -> str:
    """Run Google Vision document OCR on one page image and return its text.

    Args:
        image_bytes: Encoded image content of the page.
        page_num: Page number, used only in the error message.

    Returns:
        The recognised text after ``_fix_hebrew_quotes``; an empty string when
        the response carries no full-text annotation.

    Raises:
        RuntimeError: If the Vision response reports an error message.
    """
    ocr_client = _get_vision_client()
    page_image = vision.Image(content=image_bytes)
    # Hint Hebrew so the recogniser does not guess the script.
    hebrew_context = vision.ImageContext(language_hints=["he"])
    result = ocr_client.document_text_detection(
        image=page_image,
        image_context=hebrew_context,
    )

    failure = result.error.message
    if failure:
        raise RuntimeError(f"Vision error page {page_num}: {failure}")

    annotation = result.full_text_annotation
    recognised = annotation.text if annotation else ""
    return _fix_hebrew_quotes(recognised)
|
||||
|
||||
|
||||
# Alternatives for a whole (already stripped) line that counts as page-footer
# junk.  They are OR-ed together and anchored at both ends below.
_FOOTER_JUNK_ALTERNATIVES = (
    r"\s*",                                               # blank
    r"[-·*.\"\'׳״]+",                                     # stray punctuation
    r"\d{1,3}[\s\-·*.\"\'׳״]*",                           # page number with any stray char
    r"עמוד[\s\d\-·*.\"\'׳״]*",                            # the Hebrew word for "page", optional number/noise
    r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}",    # garbled latin (nevo URL variants)
)

_FOOTER_JUNK_RE = re.compile("^(" + "|".join(_FOOTER_JUNK_ALTERNATIVES) + ")$")
|
||||
|
||||
|
||||
def _clean_page_text(text: str) -> str:
    """Remove Nevo page headers, footers and watermarks from one page of OCR text.

    The footer block that the OCR emits at the end of each page typically
    consists of the Hebrew word for "page", the page number (sometimes with a
    stray "·" or "*"), the site URL (often garbled into latin noise such as
    "new coal") and an optional lone dash.

    Args:
        text: OCR text of a single page.

    Returns:
        The page text without that boilerplate, stripped of surrounding
        whitespace.
    """
    # Step 1: page-header matches become a newline, wherever they occur.
    text = PDF_PAGE_HEADER_RE.sub("\n", text)

    # Step 2: find how many leading lines survive once trailing junk is cut.
    rows = text.split("\n")
    keep = len(rows)
    while keep and _FOOTER_JUNK_RE.match(rows[keep - 1].strip()):
        keep -= 1
    text = "\n".join(rows[:keep])

    # Step 3: delete leftovers in the middle of the page, in this fixed order.
    for leftover_re in (NEVO_URL_RE, PDF_PAGE_NUM_LINE_RE, PDF_PAGE_ORPHAN_RE):
        text = leftover_re.sub("", text)

    return text.strip()
|
||||
|
||||
|
||||
def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Each page is rendered at 300 dpi, OCR-ed, and cleaned of page boilerplate.
    Non-empty pages are joined with a blank line, whitespace is normalised, and
    the text is cut at the first entry of ``NEVO_POSTAMBLE_MARKERS`` (in list
    order) that occurs in the body.

    Args:
        path: Location of the PDF file.

    Returns:
        ``(body, stats)`` where ``stats`` has ``"pages"`` (pages processed,
        including ones that came out empty) and ``"chars"`` (length of body).

    Raises:
        RuntimeError: Propagated from ``_ocr_page_image`` on a Vision error.
    """
    doc = fitz.open(str(path))
    pages: list[str] = []
    try:
        for i, page in enumerate(doc):
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            text = _ocr_page_image(img_bytes, i + 1)
            pages.append(_clean_page_text(text))
            # Small delay between API calls to be safe
            time.sleep(0.1)
    finally:
        # Release the document even when rendering or OCR fails part-way.
        doc.close()

    body = "\n\n".join(p for p in pages if p)
    body = re.sub(r"\n{3,}", "\n\n", body)   # collapse runs of blank lines
    body = re.sub(r"[ \t]+\n", "\n", body)   # drop trailing spaces/tabs

    for marker in NEVO_POSTAMBLE_MARKERS:
        idx = body.find(marker)
        if idx != -1:
            body = body[:idx].rstrip()
            break

    return body, {
        "pages": len(pages),
        "chars": len(body),
    }
|
||||
|
||||
|
||||
# ── Orchestration ─────────────────────────────────────────────────
|
||||
|
||||
|
||||
# File names in the training directory that are never processed.
SKIP_FILES = {
    "README.md",
    "הכנת שאלות מחקר.docx",
    "סוכן_מנתח_ומחקר_משפטי_Paperclip_מדריך.docx",
}
|
||||
|
||||
|
||||
def output_filename(src: Path) -> str:
    """Return the Markdown output name for *src*.

    The source stem is kept verbatim — spaces and Hebrew characters are not
    normalised — and only the final extension is swapped for ``.md``.
    """
    return src.stem + ".md"
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
    """Proofread every eligible file in ``TRAINING_DIR`` into ``OUTPUT_DIR``.

    Eligible files are non-hidden ``.docx`` / ``.pdf`` files that are not in
    ``SKIP_FILES``.  When extra command-line arguments are given, only files
    whose names match one of them are processed.  A failure on one file is
    reported and does not stop the remaining files.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``0`` when every selected file was written, ``1`` when at least one
        file failed, so callers can detect a partially failed run.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    RAW_DIR.mkdir(parents=True, exist_ok=True)

    # Filter files
    only = argv[1:] if len(argv) > 1 else None
    files: list[Path] = []
    for p in sorted(TRAINING_DIR.iterdir()):
        if p.is_dir() or p.name.startswith("."):
            continue
        if p.name in SKIP_FILES:
            continue
        if p.suffix.lower() not in (".docx", ".pdf"):
            continue
        if only and p.name not in only:
            continue
        files.append(p)

    print(f"Processing {len(files)} files...\n")

    failures = 0
    for path in files:
        try:
            if path.suffix.lower() == ".docx":
                md, stats = proofread_docx(path)
            else:
                md, stats = proofread_pdf(path)

            out_path = OUTPUT_DIR / output_filename(path)
            out_path.write_text(md, encoding="utf-8")
            print(f"✓ {path.name}")
            print(f"  → {out_path.name}  ({len(md):,} chars)  {stats}")
        except Exception as e:
            # Batch boundary: report, remember the failure, keep going.
            failures += 1
            print(f"✗ {path.name}: {e}")

    return 1 if failures else 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main(sys.argv))
|
||||
254
scripts/.archive/seed-appeals.py
Normal file
254
scripts/.archive/seed-appeals.py
Normal file
@@ -0,0 +1,254 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed appeals (cases) from legacy vault metadata."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
def _locate_mcp_src(script: Path) -> Path:
    """Return the ``mcp-server/src`` directory closest to *script*.

    The script used to sit directly in ``scripts/`` and reached the package
    two levels up.  From ``scripts/.archive/`` that fixed hop lands on
    ``scripts/mcp-server/src`` instead, so every ancestor is probed, nearest
    first.

    Args:
        script: Path of the running script.

    Returns:
        The first existing ``<ancestor>/mcp-server/src`` directory, or the
        historical two-levels-up location when none exists.
    """
    script = script.resolve()
    for ancestor in script.parents:
        candidate = ancestor / "mcp-server" / "src"
        if candidate.is_dir():
            return candidate
    return script.parent.parent / "mcp-server" / "src"


sys.path.insert(0, str(_locate_mcp_src(Path(__file__))))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
|
||||
APPEALS = [
|
||||
# ── Active (01_Projects) ──
|
||||
{
|
||||
"case_number": "1130/25",
|
||||
"title": "ערר קרית יערים-1 — קובר",
|
||||
"appellants": ["מרק קובר", "יצחק מטמון"],
|
||||
"respondents": ["הוועדה המרחבית הראל", "ליבמן"],
|
||||
"subject": "ערר על אישור תכנית להוספת קומה וזכויות בנייה",
|
||||
"property_address": "רח' אבינדב 23, קריית יערים",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "partial",
|
||||
},
|
||||
{
|
||||
"case_number": "1194/25+1199/25",
|
||||
"title": "ערר קרית יערים-2 — מטמון/קובר",
|
||||
"appellants": ["יצחק מטמון", "מרק קובר"],
|
||||
"respondents": ["הוועדה המקומית"],
|
||||
"subject": "תוספת קומה + הגדלת זכויות בנייה",
|
||||
"property_address": "חלקה 240, גוש 29536, רח' אבינדב",
|
||||
"status": "new",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "8027-25",
|
||||
"title": "ערר היטל השבחה תחכמוני 20",
|
||||
"appellants": ["עובדיה", "מירב", "ווינשטיין ואח'"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "היטל השבחה",
|
||||
"property_address": "רח' תחכמוני, ירושלים, גוש 30069, חלקה 156",
|
||||
"status": "new",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
# ── Archived — completed decisions ──
|
||||
{
|
||||
"case_number": "1180-1181",
|
||||
"title": "ערר הכט",
|
||||
"appellants": [],
|
||||
"respondents": [],
|
||||
"subject": "רישוי ובנייה",
|
||||
"property_address": "",
|
||||
"status": "final",
|
||||
"expected_outcome": "rejected",
|
||||
"notes": "פורסם 05.02.2026. דחייה. שימש כמודל לניתוח סגנון.",
|
||||
},
|
||||
{
|
||||
"case_number": "1126/25+1141/25",
|
||||
"title": "תמ\"א 38/2 בית הכרם",
|
||||
"appellants": ["מרכז קהילתי זיו-מרקס", "12 תושבים"],
|
||||
"respondents": ["הוועדה המקומית", "יזם"],
|
||||
"subject": "תמ\"א 38/2 הריסה ובנייה מחדש",
|
||||
"property_address": "רח' החלוץ 36, בית הכרם, גוש 30159/6",
|
||||
"status": "final",
|
||||
"expected_outcome": "partial",
|
||||
"notes": "גרסה סופית טיוטה 9, מרץ 2026. קבלה חלקית. שימש כמודל לניתוח סגנון.",
|
||||
},
|
||||
{
|
||||
"case_number": "8255-25",
|
||||
"title": "בל\"מ אפרים אבי",
|
||||
"appellants": ["אפרים אברהם"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "היטל השבחה — בקשה להארכת מועד",
|
||||
"property_address": "רח' הורקניה 4, קטמונים, ירושלים",
|
||||
"status": "final",
|
||||
"expected_outcome": "rejected",
|
||||
"notes": "גרסה סופית מאושרת. דחייה.",
|
||||
},
|
||||
# ── Archived — unified decisions ──
|
||||
{
|
||||
"case_number": "8107-25",
|
||||
"title": "אבו זאהריה",
|
||||
"appellants": ["אבו זאהריה מפיד"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "ערר על החלטת שמאי מכריע — היטל השבחה",
|
||||
"property_address": "רח' אום כולתום 26, בית חנינא, גוש 30615, חלקה 69",
|
||||
"status": "final",
|
||||
"expected_outcome": "",
|
||||
"notes": "החלטה מאחדת: ערר גפני.",
|
||||
},
|
||||
{
|
||||
"case_number": "9005-24",
|
||||
"title": "רמת שלמה — פיצויים ס' 197",
|
||||
"appellants": ["קירמאיר אסתר ואח' (63-67 עוררים)"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "פיצויים לפי סעיף 197",
|
||||
"property_address": "רמת שלמה, ירושלים, גוש 30561, חלקות 36, 40",
|
||||
"status": "final",
|
||||
"expected_outcome": "",
|
||||
"notes": "החלטה מאחדת: ערר ורדי 9003-23.",
|
||||
},
|
||||
# ── Archived — in progress ──
|
||||
{
|
||||
"case_number": "1113/25",
|
||||
"title": "אייל מבורך לוי ואברהם עדי",
|
||||
"appellants": ["אייל מבורך לוי", "אברהם עדי"],
|
||||
"respondents": ["הוועדה המקומית הראל"],
|
||||
"subject": "הרחבת דירות + תוספת 2 יח\"ד",
|
||||
"property_address": "רח' השלום 63, מבשרת ציון, גוש 30475, חלקה 5",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "1128/25",
|
||||
"title": "שטרית",
|
||||
"appellants": [],
|
||||
"respondents": [],
|
||||
"subject": "",
|
||||
"property_address": "",
|
||||
"status": "drafted",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "1107/06/25",
|
||||
"title": "בלוי נ' הוועדה המקומית",
|
||||
"appellants": ["בלוי מאיר", "מזיע מאיר", "דזימיטרובסקי הדסה"],
|
||||
"respondents": ["הוועדה המקומית ירושלים", "היזם"],
|
||||
"subject": "תוספת בנייה וחיזוק מפני רעידות (תמ\"א 38/1)",
|
||||
"property_address": "רח' הרב בלוי 16, ירושלים, גוש 30099/115",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "8141-23",
|
||||
"title": "אזורים בנין",
|
||||
"appellants": ["אזורים בנין (1965) בע\"מ"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "היטל השבחה — תכנית 101-0611905",
|
||||
"property_address": "רח' הנביאים 27, ירושלים",
|
||||
"status": "drafted",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "8047-24",
|
||||
"title": "משכן אליהו — היטל השבחה שמאי מכריע",
|
||||
"appellants": ["עומר דרוויש"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "ערר על שמאית מכריעה — היטל השבחה",
|
||||
"property_address": "גוש 30614, חלקה 89, בית חנינא",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "1195-25",
|
||||
"title": "וליד ג'מל",
|
||||
"appellants": ["וליד ג'מל"],
|
||||
"respondents": ["ועדת משנה מטה יהודה", "סמיר מוסא זעאתרה"],
|
||||
"subject": "הסדרת קומה שלישית למשרדים",
|
||||
"property_address": "גוש 30492, חלקה 23, כפר עין נקובא",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "1200/25",
|
||||
"title": "קרית ענבים נופש",
|
||||
"appellants": ["קרית ענבים נופש בע\"מ"],
|
||||
"respondents": ["הוועדה המקומית מטה יהודה", "חברי קיבוץ קרית ענבים"],
|
||||
"subject": "שימוש חורג — סופרמרקט בייעוד ספורט ונופש",
|
||||
"property_address": "קיבוץ קרית ענבים, גוש 29551",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "1184/25",
|
||||
"title": "שטוקהיים — בית נקופה",
|
||||
"appellants": ["אמנון שטוקהיים", "אילנית שטוקהיים"],
|
||||
"respondents": ["הוועדה המקומית מטה יהודה", "יערה טל"],
|
||||
"subject": "אישור בקשה להיתר עם הקלות",
|
||||
"property_address": "מגרש 51, גוש 31399, חלקה 52, בית נקופה",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "8070-25",
|
||||
"title": "היטל השבחה — דירת גג",
|
||||
"appellants": ["חיים ראם"],
|
||||
"respondents": ["הוועדה המקומית ירושלים"],
|
||||
"subject": "היטל השבחה — הקלה להשלמת דירת גג",
|
||||
"property_address": "רח' צ.פ. חיות 2, דירה 31, נווה יעקב",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "8136-24",
|
||||
"title": "ערר השבחה — מרפסות שירות",
|
||||
"appellants": [],
|
||||
"respondents": [],
|
||||
"subject": "היטל השבחה — מרפסות שירות",
|
||||
"property_address": "",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
{
|
||||
"case_number": "8007-24",
|
||||
"title": "עומר דרוויש — שומה מכרעת",
|
||||
"appellants": [],
|
||||
"respondents": [],
|
||||
"subject": "היטל השבחה",
|
||||
"property_address": "",
|
||||
"status": "in_progress",
|
||||
"expected_outcome": "",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
async def main():
    """Insert every entry of ``APPEALS`` whose case number is not yet in ``cases``.

    Existing case numbers are counted as skipped and left untouched.  The
    appellant and respondent lists are stored as JSON strings.  The pool is
    closed even when a query fails, and a one-line summary is printed on
    success.
    """
    await init_schema()
    pool = await get_pool()

    inserted = 0
    skipped = 0
    try:
        async with pool.acquire() as conn:
            for a in APPEALS:
                existing = await conn.fetchval(
                    "SELECT id FROM cases WHERE case_number = $1", a["case_number"]
                )
                if existing:
                    skipped += 1
                    continue
                await conn.execute(
                    """INSERT INTO cases
                       (case_number, title, appellants, respondents, subject,
                        property_address, status, expected_outcome, notes)
                       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
                    a["case_number"],
                    a["title"],
                    json.dumps(a.get("appellants", [])),
                    json.dumps(a.get("respondents", [])),
                    a.get("subject", ""),
                    a.get("property_address", ""),
                    a.get("status", "new"),
                    a.get("expected_outcome", ""),
                    a.get("notes", ""),
                )
                inserted += 1
    finally:
        # Do not leave the pool open when a query raises.
        await close_pool()

    print(f"✓ appeals: {inserted} inserted, {skipped} skipped (already exist)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
449
scripts/.archive/seed-knowledge.py
Normal file
449
scripts/.archive/seed-knowledge.py
Normal file
@@ -0,0 +1,449 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed knowledge tables from legacy vault data.
|
||||
|
||||
Imports: lessons_learned, transition_phrases, case_law, statutory_provisions.
|
||||
Sources: docs/legal-decision-lessons.md, skills/decision/SKILL.md
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add mcp-server to path so we can reuse db module
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Data: Lessons Learned
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
LESSONS = [
|
||||
# --- הכט 1180-1181 (rejected, 02.2026) ---
|
||||
{
|
||||
"lesson_title": "Discussion = continuous essay, no sub-headers",
|
||||
"lesson_text": "הדיון נקרא כחיבור משפטי רציף עם סעיפים ממוספרים, לא כמתווה מובנה עם כותרות משנה. הגרסה המפורסמת של הכט השתמשה באפס כותרות משנה בדיון, בעוד הטיוטה שלנו הכילה 6 כותרות H2.",
|
||||
"category": "structure",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "הכט 1180-1181",
|
||||
"severity": "critical",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Citation through consolidating decision",
|
||||
"lesson_text": "להשתמש בהחלטה מאחדת קודמת (כמו ערר נגאח 1011-03-25) לצטט מספר תקדימים בפסקה אחת ארוכה (~600 מילים), במקום לצטט כל תקדים בפסקה נפרדת.",
|
||||
"category": "style",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "הכט 1180-1181",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Paragraph length variation in discussion",
|
||||
"lesson_text": "לא לפרגמנט טיעונים משפטיים ארוכים לפסקאות זהות וקצרות. לגוון אורך פסקאות מ-20 עד 600+ מילים. פסקאות ציטוט מרכזיות ארוכות מאוד.",
|
||||
"category": "style",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "הכט 1180-1181",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Opening formula promises both conclusion AND elaboration",
|
||||
"lesson_text": 'פתיחת הדיון צריכה להבטיח גם מסקנה וגם הרחבה: "לאחר שבחנו... החלטנו בשלב ראשון כי... אך יחד עם זאת ועל מנת לא לצאת בחסר... מצאנו להוסיף מספר הערות"',
|
||||
"category": "style",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "הכט 1180-1181",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": 'Summary title is "סיכום"',
|
||||
"lesson_text": 'כותרת פרק הסיכום היא "סיכום" בלבד, לא "סיכום והכרעה" ולא "סוף דבר".',
|
||||
"category": "structure",
|
||||
"applies_to": ["block-yod-alef"],
|
||||
"source_case": "הכט 1180-1181",
|
||||
"severity": "nice-to-have",
|
||||
},
|
||||
# --- בית הכרם 1126/25 (partial acceptance, 03.2026) ---
|
||||
{
|
||||
"lesson_title": "Threshold question is STRATEGIC, not mandatory",
|
||||
"lesson_text": "שאלת הסף (זכות ערר לפי ס' 152) היא כלי אסטרטגי, לא חובה. כשלתיק יש שאלות מהותיות חזקות (חניה, קווי בניין, שימור), דפנה מעדיפה להתעמק בתוכן על פני חסימה פרוצדורלית. זה גם מחזק את ההחלטה מפני ביקורת שיפוטית.",
|
||||
"category": "process",
|
||||
"applies_to": ["all"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "critical",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Concentric circles = rejected appeals only",
|
||||
"lesson_text": 'מודל השכבות (עיגולים קונצנטריים, סעיף 6.3 ב-SKILL) הוא כלי אחד מתוך כמה, לא המסגרת הנדרשת. לעררים שמתקבלים חלקית, דפנה משתמשת בניתוח גמיש נושא-נושא.',
|
||||
"category": "process",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "critical",
|
||||
},
|
||||
{
|
||||
"lesson_title": "New opening type: tension mapping",
|
||||
"lesson_text": 'לקבלה חלקית או תיקים עם סוגיות מורכבות מצטלבות, פתיחת "מיפוי מתחים": רשימה של 6+ מתחים ספציפיים בתבליטים לפני הניתוח. דפוס: "בערר דנן עולות שאלות כיצד והאם..." → רשימת מתחים → "כל הנקודות לעיל עומדות לפנינו..."',
|
||||
"category": "structure",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Single building weakens TAMA 38 interest",
|
||||
"lesson_text": 'כשתמ"א 38 חלה על בית בודד (לעומת בניין דירות גדול), אינטרס החיזוק מפני רעידת אדמה חלש יותר. זה מצדיק אישור זהיר יותר של זכויות, במיוחד קווי בניין וחניה.',
|
||||
"category": "content",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Master plan as shield against ad-hoc planning",
|
||||
"lesson_text": 'כשקיימת תכנית אב — לצטט אותה כדי לתת לגיטימציה להיתר בודד. מסקנה: ההיתר "משתלב בחזון כולל קיים" במקום ליצור תקדים אד-הוק.',
|
||||
"category": "content",
|
||||
"applies_to": ["block-yod"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Deep plan provision citations for parking",
|
||||
"lesson_text": "לסוגיות חניה/תשתיות, דפנה נכנסת עמוק להוראות תכנית עם ציטוטים ישירים נרחבים (300+ מילים) וניתוח משולב. כולל מספרי סעיפים ספציפיים (לדוגמה: 6.8(4), 6.8(9), נספח תנועה, 5166b).",
|
||||
"category": "content",
|
||||
"applies_to": ["block-yod", "block-tet"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "important",
|
||||
},
|
||||
{
|
||||
"lesson_title": "Ultra-minimal summary for partial acceptance",
|
||||
"lesson_text": "בקבלה חלקית, כל ההנמקה כבר בדיון. סיכום = הוראות אופרטיביות בלבד (בדרך כלל 3 סעיפים קצרים). ללא דיון בהוצאות. ללא סיום חם.",
|
||||
"category": "structure",
|
||||
"applies_to": ["block-yod-alef"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "important",
|
||||
},
|
||||
# --- קרית יערים-1 (03.2026) ---
|
||||
{
|
||||
"lesson_title": "Neutral background rule",
|
||||
"lesson_text": 'רקע (בלוק ו) = עובדות אובייקטיביות בלבד. מבחן: האם המשפט מכיל ציטוט ישיר מצד, או מילות ערך/שיפוט (חריג, חטא, בעייתי)? אם כן → שייך בטענות (בלוק ז) או דיון (בלוק י), לא ברקע. החלטות קודמות = עובדה יבשה ("ביום X נדחתה תכנית Y"), ללא נימוקים וציטוטים.',
|
||||
"category": "structure",
|
||||
"applies_to": ["block-vav"],
|
||||
"source_case": "קרית יערים-1 (1130/25)",
|
||||
"severity": "critical",
|
||||
},
|
||||
{
|
||||
"lesson_title": "12-block mandatory structure",
|
||||
"lesson_text": 'מבנה 12 בלוקים פורמלי חובה עם שלב "טיוטת טרום-דיון". כולל: פתיחה (ה) → רקע (ו) → טענות (ז) → הליכים (ח) → תכניות (ט) → דיון (י) → סיכום (יא). חידוש מאריאלי: "ההליכים בפני ועדת הערר" כפרק נפרד. כל בלוק נכתב כאילו שופט בית משפט מנהלי קורא בפעם הראשונה.',
|
||||
"category": "structure",
|
||||
"applies_to": ["all"],
|
||||
"source_case": "קרית יערים-1 (1130/25)",
|
||||
"severity": "critical",
|
||||
},
|
||||
# --- Meta-lesson ---
|
||||
{
|
||||
"lesson_title": "Skill was over-indexed on single case type",
|
||||
"lesson_text": "ה-SKILL המקורי היה מבוסס יתר על מקרה אחד (הכט = דחייה). מודל העיגולים, שאלת סף כחובה, וסיום חם — כולם דפוסים מתיק בודד. בית הכרם (קבלה חלקית) חשף שהגישה של דפנה גמישה יותר ממה שתפסנו. צריך להבחין בין דפוסים אוניברסליים לתלויי-תוצאה.",
|
||||
"category": "process",
|
||||
"applies_to": ["all"],
|
||||
"source_case": "בית הכרם 1126/25",
|
||||
"severity": "critical",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Data: Transition Phrases
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
TRANSITION_PHRASES = [
|
||||
# From הכט
|
||||
{"phrase": "ועל מנת לא לצאת בחסר", "usage_context": "פתיחת אוביטר דיקטה / הנמקה נוספת", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "נציין כי טענות אלו נטענו בלשון רפה", "usage_context": "הכרה בטענות חלשות תוך דיון בהן", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "עינינו הרואות", "usage_context": "סיכום אחרי ציטוט ארוך", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "נוסיף.", "usage_context": "מעבר קצר ביותר (מילה אחת) לנקודה הבאה", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "אם כך, לעת הזו", "usage_context": "הסקת מסקנה מציטוטים", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "למעלה מן הצורך", "usage_context": "דיון לא הכרחי להכרעה אך נכתב מטעמים אסטרטגיים", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "למיטב הבנתנו", "usage_context": "עמדה זהירה בשאלה משפטית לא מיושבת", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
{"phrase": "נשלים ונציין", "usage_context": "נקודה אחרונה לפני מעבר לסיכום", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
|
||||
# From בית הכרם
|
||||
{"phrase": "הדברים משליכים על שיקול הדעת ב...", "usage_context": "קישור ממצא למסקנה", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
|
||||
{"phrase": "רוצה לומר כי", "usage_context": "ניסוח חלופי / הסבר", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
|
||||
{"phrase": "נוצר מצב בו", "usage_context": "הצגת מצב עובדתי / בעיה", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
|
||||
{"phrase": "לכך נוסיף כי", "usage_context": "הוספת שכבה נוספת לטיעון", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
|
||||
{"phrase": "יש אולי להצר על כך ש...", "usage_context": "הערה ביקורתית עדינה (כלפי רשות תכנון)", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
|
||||
{"phrase": "עם ההבנה לטענה זו של העוררים, אין בידנו לקבלה", "usage_context": "הכרה רכה בטענה תוך דחייתה", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
|
||||
# General (from SKILL.md)
|
||||
{"phrase": "ברי כי", "usage_context": "מסקנה מובנת מאליה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "נפנה ל...", "usage_context": "פתיחת ניתוח חוק/פסיקה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "מכל האמור לעיל", "usage_context": "מעבר לסיכום", "block_types": ["block-yod", "block-yod-alef"], "source_decision": ""},
|
||||
{"phrase": "נשוב על כך כי", "usage_context": "חזרה מכוונת על עיקרון חשוב", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "דא עקא", "usage_context": "הצגת בעיה מרכזית או סתירה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "ובמילים אחרות", "usage_context": "הבהרה / ניסוח מחדש", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "הגענו לכלל מסקנה כי", "usage_context": "מסקנה מרכזית (פתיחת דיון)", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "לא נוכל לקבל", "usage_context": "דחיית עמדה / טענה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "מקובלת עלינו", "usage_context": "קבלת עמדה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "התרשמנו כי", "usage_context": "מסקנה מדיון / עיון במסמכים", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "נחדד כי", "usage_context": "חידוד נקודה קודמת", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "סיכומם של דברים", "usage_context": "פתיחת סיכום מהותי לפני פרק הסיכום", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "המסקנה מכל האמור היא כי", "usage_context": "מסקנת ביניים מקיפה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "לעמדתנו", "usage_context": "עמדת הוועדה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "בנסיבות אלה", "usage_context": "מעבר מעובדות למסקנה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "נזכיר כי", "usage_context": "תזכורת לעיקרון ידוע", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "מצאנו כי", "usage_context": "קביעה עובדתית", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "שוכנענו כי", "usage_context": "קביעה לאחר בחינה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "על כן ולו רק מסיבה זו", "usage_context": "נטרול טענה חלשה לפני ניתוח עמוק", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "יחד עם זאת, מצאנו לנכון לדון בשאלה העקרונית", "usage_context": "מעבר לדיון עקרוני למרות דחייה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "משכך", "usage_context": "הסקת מסקנה מעמדה שהוצגה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "הדברים מתחדדים שעה ש...", "usage_context": "הבהרה נוספת לאור נסיבות", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "זאת ועוד", "usage_context": "הוספת נימוק", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "יתרה מכך", "usage_context": "חיזוק הנמקה קודמת", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "לאור כל האמור לעיל", "usage_context": "פתיחת סיכום סופי", "block_types": ["block-yod", "block-yod-alef"], "source_decision": ""},
|
||||
{"phrase": "נפתח בכך כי", "usage_context": "פתיחת דיון (לא מסמך)", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "נפנה בעניין זה להחלטת...", "usage_context": "הפניה לתקדים", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "ברי כי משאב הקרקע יקר לבעליו ולציבור", "usage_context": "הצדקת שימוש יעיל בקרקע", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "נסכם כי", "usage_context": "מעבר לסיכום ביניים", "block_types": ["block-yod"], "source_decision": ""},
|
||||
{"phrase": "נחזור על כך כי", "usage_context": "חזרה אמפתית על קביעה חשובה", "block_types": ["block-yod"], "source_decision": ""},
|
||||
]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Data: Case Law
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
CASE_LAW = [
|
||||
{
|
||||
"case_number": "עע\"מ 3975/22",
|
||||
"case_name": "ב. קרן-נכסים",
|
||||
"court": "בית המשפט העליון",
|
||||
"subject_tags": ["proprietary_claims", "feasibility"],
|
||||
"summary": "פסק דין מנחה בנושא בדיקת היתכנות קניינית — מתי ועדה צריכה לבחון זכויות קניין לפני מתן היתר.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר (מרכז) 1011-03-25",
|
||||
"case_name": "נגאח עבד אל קאדר",
|
||||
"court": "ועדת ערר מרכז",
|
||||
"subject_tags": ["proprietary_claims", "consolidating_decision"],
|
||||
"summary": "החלטה מאחדת בנושא טענות קנייניות — ריכזה את כל הפסיקה בנושא.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר 1071/25",
|
||||
"case_name": "מינץ",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["self_reference", "previous_decision"],
|
||||
"summary": "החלטה קודמת של ועדת הערר עצמה — שימוש כתקדים פנימי.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר 1192/18",
|
||||
"case_name": "אילן",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["preservation", "nuisance"],
|
||||
"summary": "שימור ומטרדים — איזון בין שימור מבנים לזכויות שכנים.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר 1009-02-24",
|
||||
"case_name": "מובשוביץ",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["urban_renewal", "tama_38"],
|
||||
"summary": 'התחדשות עירונית — ציטוט נרחב (~400 מילים) בהחלטת בית הכרם.',
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר 1156/18",
|
||||
"case_name": "ארד",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["construction_nuisance"],
|
||||
"summary": "מטרדי בנייה — מתי מטרד בנייה מצדיק התערבות.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר 1169/19",
|
||||
"case_name": "זוהר",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["construction_nuisance"],
|
||||
"summary": "מטרדי בנייה — המשך קו הפסיקה של ערר ארד.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר (ירושלים) 1078+1083/24",
|
||||
"case_name": "אריאלי",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["structure_example", "proceedings_block"],
|
||||
"summary": "שימשה כמודל מבני — פרק הליכים נפרד (31 סעיפים), מבנה מפורט.",
|
||||
"key_quote": "",
|
||||
},
|
||||
{
|
||||
"case_number": "ערר אדלר",
|
||||
"case_name": "אדלר",
|
||||
"court": "ועדת ערר ירושלים",
|
||||
"subject_tags": ["consolidating_decision"],
|
||||
"summary": "החלטה מאחדת שצוטטה בבית הכרם — טכניקת ציטוט דרך החלטה מרכזת.",
|
||||
"key_quote": "",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Data: Statutory Provisions
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
STATUTORY_PROVISIONS = [
|
||||
{
|
||||
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
|
||||
"section_number": "152(א)(2)",
|
||||
"section_title": "זכות ערר על אישור תכנית",
|
||||
"full_text": "",
|
||||
"common_usage": "שאלת סף — האם קיימת זכות ערר. כלי אסטרטגי, לא חובה.",
|
||||
"subject_tags": ["threshold", "right_to_appeal"],
|
||||
},
|
||||
{
|
||||
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
|
||||
"section_number": "149",
|
||||
"section_title": "הקלה",
|
||||
"full_text": "",
|
||||
"common_usage": "בקשות להקלה — סטייה מתכנית בניין עיר.",
|
||||
"subject_tags": ["deviation", "relief"],
|
||||
},
|
||||
{
|
||||
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
|
||||
"section_number": "145",
|
||||
"section_title": "היתר בנייה",
|
||||
"full_text": "",
|
||||
"common_usage": "עררים על סירוב/אישור היתר בנייה.",
|
||||
"subject_tags": ["building_permit"],
|
||||
},
|
||||
{
|
||||
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
|
||||
"section_number": "196-198",
|
||||
"section_title": "היטל השבחה",
|
||||
"full_text": "",
|
||||
"common_usage": "עררי היטל השבחה (8xxx) — חיוב בגין עליית שווי מקרקעין.",
|
||||
"subject_tags": ["betterment_levy"],
|
||||
},
|
||||
{
|
||||
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
|
||||
"section_number": "197",
|
||||
"section_title": "פיצויים בגין ירידת ערך",
|
||||
"full_text": "",
|
||||
"common_usage": "עררי פיצויים (9xxx) — תביעה בגין ירידת ערך מקרקעין בשל תכנית.",
|
||||
"subject_tags": ["compensation", "depreciation"],
|
||||
},
|
||||
{
|
||||
"statute_name": "תמ\"א 38",
|
||||
"section_number": "תיקון 2 + 3",
|
||||
"section_title": "חיזוק מבנים מפני רעידות אדמה",
|
||||
"full_text": "",
|
||||
"common_usage": "חיזוק/הריסה ובנייה מחדש. אינטרס חלש יותר בבית בודד.",
|
||||
"subject_tags": ["tama_38", "seismic_reinforcement"],
|
||||
},
|
||||
{
|
||||
"statute_name": "חוק המקרקעין, תשכ\"ט-1969",
|
||||
"section_number": "71ב(א)(1)",
|
||||
"section_title": "רוב הדרוש לשינוי ברכוש משותף",
|
||||
"full_text": "",
|
||||
"common_usage": "בדיקת היתכנות קניינית — האם יש רוב לשינוי ברכוש משותף.",
|
||||
"subject_tags": ["proprietary_claims", "common_property"],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Import Logic
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
|
||||
async def seed_lessons(conn) -> int:
    """Insert the entries of ``LESSONS`` that are missing from ``lessons_learned``.

    An entry is treated as already present when a row with the same
    ``lesson_title`` exists; such entries are skipped.

    Args:
        conn: Database connection exposing awaitable ``fetchval`` and
            ``execute`` methods that take ``$n`` positional parameters.

    Returns:
        Number of rows inserted by this call.
    """
    lookup_sql = "SELECT id FROM lessons_learned WHERE lesson_title = $1"
    insert_sql = """INSERT INTO lessons_learned (lesson_title, lesson_text, category, applies_to, source_case, severity)
               VALUES ($1, $2, $3, $4, $5, $6)"""

    inserted = 0
    for lesson in LESSONS:
        already_there = await conn.fetchval(lookup_sql, lesson["lesson_title"])
        if not already_there:
            await conn.execute(
                insert_sql,
                lesson["lesson_title"],
                lesson["lesson_text"],
                lesson["category"],
                json.dumps(lesson["applies_to"]),  # stored as a JSON string
                lesson["source_case"],
                lesson["severity"],
            )
            inserted += 1
    return inserted
|
||||
|
||||
|
||||
async def seed_phrases(conn) -> int:
    """Insert the entries of ``TRANSITION_PHRASES`` missing from ``transition_phrases``.

    An entry is skipped when a row with the same ``phrase`` already exists.

    Args:
        conn: Database connection exposing awaitable ``fetchval`` and
            ``execute`` methods that take ``$n`` positional parameters.

    Returns:
        Number of rows inserted by this call.
    """
    lookup_sql = "SELECT id FROM transition_phrases WHERE phrase = $1"
    insert_sql = """INSERT INTO transition_phrases (phrase, usage_context, block_types, source_decision)
               VALUES ($1, $2, $3, $4)"""

    inserted = 0
    for entry in TRANSITION_PHRASES:
        already_there = await conn.fetchval(lookup_sql, entry["phrase"])
        if not already_there:
            await conn.execute(
                insert_sql,
                entry["phrase"],
                entry["usage_context"],
                json.dumps(entry["block_types"]),  # stored as a JSON string
                entry["source_decision"],
            )
            inserted += 1
    return inserted
|
||||
|
||||
|
||||
async def seed_case_law(conn) -> int:
    """Insert the entries of ``CASE_LAW`` that are missing from ``case_law``.

    An entry is skipped when a row with the same ``case_number`` already
    exists. A missing ``key_quote`` is stored as an empty string.

    Args:
        conn: Database connection exposing awaitable ``fetchval`` and
            ``execute`` methods that take ``$n`` positional parameters.

    Returns:
        Number of rows inserted by this call.
    """
    lookup_sql = "SELECT id FROM case_law WHERE case_number = $1"
    insert_sql = """INSERT INTO case_law (case_number, case_name, court, subject_tags, summary, key_quote)
               VALUES ($1, $2, $3, $4, $5, $6)"""

    inserted = 0
    for ruling in CASE_LAW:
        already_there = await conn.fetchval(lookup_sql, ruling["case_number"])
        if not already_there:
            await conn.execute(
                insert_sql,
                ruling["case_number"],
                ruling["case_name"],
                ruling["court"],
                json.dumps(ruling["subject_tags"]),  # stored as a JSON string
                ruling["summary"],
                ruling.get("key_quote", ""),
            )
            inserted += 1
    return inserted
|
||||
|
||||
|
||||
async def seed_statutes(conn) -> int:
    """Insert the entries of ``STATUTORY_PROVISIONS`` missing from ``statutory_provisions``.

    An entry is skipped when a row with the same ``statute_name`` and
    ``section_number`` pair already exists.

    Args:
        conn: Database connection exposing awaitable ``fetchval`` and
            ``execute`` methods that take ``$n`` positional parameters.

    Returns:
        Number of rows inserted by this call.
    """
    lookup_sql = """SELECT id FROM statutory_provisions
               WHERE statute_name = $1 AND section_number = $2"""
    insert_sql = """INSERT INTO statutory_provisions
               (statute_name, section_number, section_title, full_text, common_usage, subject_tags)
               VALUES ($1, $2, $3, $4, $5, $6)"""

    inserted = 0
    for provision in STATUTORY_PROVISIONS:
        already_there = await conn.fetchval(
            lookup_sql,
            provision["statute_name"],
            provision["section_number"],
        )
        if not already_there:
            await conn.execute(
                insert_sql,
                provision["statute_name"],
                provision["section_number"],
                provision["section_title"],
                provision["full_text"],
                provision["common_usage"],
                json.dumps(provision["subject_tags"]),  # stored as a JSON string
            )
            inserted += 1
    return inserted
|
||||
|
||||
|
||||
async def main():
    """Seed the four knowledge tables and print how many rows were inserted.

    Initialises the schema, acquires one pooled connection, runs the four
    ``seed_*`` steps in order on it, and prints a per-table summary followed
    by the total. The pool is closed even when a seeding step raises, so a
    failed run does not leave connections open; the exception still
    propagates to the caller.
    """
    await init_schema()
    pool = await get_pool()

    try:
        async with pool.acquire() as conn:
            n_lessons = await seed_lessons(conn)
            n_phrases = await seed_phrases(conn)
            n_case_law = await seed_case_law(conn)
            n_statutes = await seed_statutes(conn)
    finally:
        # Always release the pool, including on a mid-seed failure.
        await close_pool()

    print(f"✓ lessons_learned: {n_lessons} inserted")
    print(f"✓ transition_phrases: {n_phrases} inserted")
    print(f"✓ case_law: {n_case_law} inserted")
    print(f"✓ statutory_provisions: {n_statutes} inserted")
    print(f" Total: {n_lessons + n_phrases + n_case_law + n_statutes} records")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
257
scripts/.archive/validate-decision.py
Normal file
257
scripts/.archive/validate-decision.py
Normal file
@@ -0,0 +1,257 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate a decision against block-schema rules.
|
||||
|
||||
Usage: python validate-decision.py <case_number>
|
||||
|
||||
Checks:
|
||||
1. Neutral background (block-vav) — no party quotes or value words
|
||||
2. Weight compliance — blocks within expected ranges
|
||||
3. Structural integrity — all required blocks present
|
||||
4. No duplication — sentences from the background (block-vav) are not repeated verbatim in the discussion (block-yod)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# This script lives in scripts/.archive/, so the repository root is not a
# fixed two levels above this file. Walk up the ancestors until a directory
# containing mcp-server/src is found; if none is, fall back to the historical
# "two levels up" location so behaviour is unchanged from before.
_script_path = Path(__file__).resolve()
_mcp_src = next(
    (
        ancestor / "mcp-server" / "src"
        for ancestor in _script_path.parents
        if (ancestor / "mcp-server" / "src").is_dir()
    ),
    _script_path.parent.parent / "mcp-server" / "src",
)
sys.path.insert(0, str(_mcp_src))
|
||||
|
||||
from legal_mcp.services.db import get_pool, init_schema, close_pool
|
||||
|
||||
|
||||
# Value/judgment words that shouldn't appear in neutral background
|
||||
VALUE_WORDS = [
    # Each entry is matched as a plain substring of a background line.
    "חריג",
    "חטא",
    "בעייתי",
    "מזעזע",
    "שערורייתי",
    "מגוחך",
    "נפשע",
    "פגום",
    "חמור",
    "מקומם",
    "בלתי סביר",
    "מופרז",
    "מגונה",
    "פסול",
    "נלוז",
    "מטריד",
]
|
||||
|
||||
# Party quote indicators
|
||||
QUOTE_INDICATORS = [
    # Regular expressions; each is applied with re.search to one line at a time.
    r"לטענת\s+(העוררי|המשיב|מבקשי)",
    r"לדברי\s+(העוררי|המשיב|מבקשי)",
    r"העורר\s+טוען",
    r"המשיבה\s+טוענת",
    r"לשיטת\s+(העוררי|המשיב)",
]
|
||||
|
||||
# Expected weight ranges per block type (for רישוי appeals)
|
||||
WEIGHT_RANGES_LICENSING = {
    # block id: (lowest, highest) allowed share of the decision's words, in percent
    "block-he": (0.5, 5),
    "block-vav": (3, 40),
    "block-zayin": (13, 40),
    "block-chet": (0, 15),
    "block-tet": (0, 15),
    "block-yod": (30, 75),
    "block-yod-alef": (1, 10),
    "block-yod-bet": (0, 2),
}
|
||||
|
||||
# Expected weight ranges for היטל השבחה
|
||||
WEIGHT_RANGES_LEVY = {
    # block id: (lowest, highest) allowed share of the decision's words, in percent
    "block-he": (0, 5),
    "block-vav": (2, 20),
    "block-zayin": (15, 40),
    "block-chet": (0, 25),
    "block-tet": (0, 15),
    "block-yod": (25, 75),
    "block-yod-alef": (1, 10),
    "block-yod-bet": (0, 3),
}
|
||||
|
||||
|
||||
def check_neutral_background(content: str) -> list[str]:
    """Report neutrality violations in the background block (block-vav).

    Each line of ``content`` is checked for every entry of ``VALUE_WORDS``
    (plain substring match) and every pattern in ``QUOTE_INDICATORS``
    (``re.search``). Each hit produces one message carrying the 1-based line
    number, the matched text, and the first 80 characters of the line.

    Args:
        content: Text of the background block; empty or ``None`` yields no
            issues.

    Returns:
        List of human-readable issue messages, in line order; within a line,
        value-word hits precede quote-indicator hits.
    """
    if not content:
        return []

    issues = []
    for line_no, line in enumerate(content.split("\n"), start=1):
        excerpt = line[:80]

        # Judgment words that should not appear in a neutral background.
        for word in VALUE_WORDS:
            if word in line:
                issues.append(f"מילת שיפוט ברקע (שורה {line_no}): \"{word}\" — \"{excerpt}...\"")

        # Phrases attributing a statement to one of the parties.
        for pattern in QUOTE_INDICATORS:
            # Search once and reuse the match instead of re-running the regex.
            found = re.search(pattern, line)
            if found:
                issues.append(f"ציטוט מצד ברקע (שורה {line_no}): \"{found.group()}\" — \"{excerpt}...\"")

    return issues
|
||||
|
||||
|
||||
def check_weight_compliance(blocks: list[dict], appeal_type: str) -> list[str]:
    """Report blocks whose share of the decision's words is out of range.

    The levy table is used when ``appeal_type`` is ``"levy"``; every other
    value uses the licensing table. A block's weight is its ``word_count``
    as a percentage of the summed ``word_count`` of all blocks. Blocks that
    have no entry in the table, or a non-positive word count, are not checked.

    Args:
        blocks: Dicts with at least ``block_id``, ``title`` and ``word_count``.
        appeal_type: Appeal category key.

    Returns:
        One message per out-of-range block, or a single "no content" message
        when the total word count is zero.
    """
    if appeal_type == "levy":
        ranges = WEIGHT_RANGES_LEVY
    else:
        ranges = WEIGHT_RANGES_LICENSING

    total_words = sum(b["word_count"] for b in blocks)
    if total_words == 0:
        return ["אין תוכן בהחלטה"]

    issues = []
    for block in blocks:
        bid = block["block_id"]
        if bid not in ranges:
            continue
        words = block["word_count"]
        if not words > 0:
            continue

        weight = words / total_words * 100
        low, high = ranges[bid]
        if weight < low:
            verdict = "מתחת לטווח"
        elif weight > high:
            verdict = "מעל לטווח"
        else:
            continue
        issues.append(f"בלוק {bid} ({block['title']}): משקל {weight:.1f}% — {verdict} ({low}-{high}%)")

    return issues
|
||||
|
||||
|
||||
def check_structural_integrity(blocks: list[dict]) -> list[str]:
    """Check that required blocks exist and the discussion block is the largest.

    Required blocks are ``block-he``, ``block-zayin`` and ``block-yod``; a
    block counts as present only when its ``word_count`` is positive. When a
    ``block-yod`` entry exists, it is compared against every block other than
    ``block-alef`` through ``block-dalet``, and an issue is reported only if
    some block has strictly more words. A tie is not a violation.

    Args:
        blocks: Dicts with at least ``block_id``, ``title`` and ``word_count``.

    Returns:
        List of issue messages: missing required blocks first (in the order
        he, zayin, yod), then the "discussion is not the largest" message.
    """
    required = ("block-he", "block-zayin", "block-yod")
    excluded_from_size_check = ("block-alef", "block-bet", "block-gimel", "block-dalet")

    present = {b["block_id"] for b in blocks if b["word_count"] > 0}
    issues = [f"בלוק חובה חסר: {req}" for req in required if req not in present]

    # Check discussion is the heaviest block
    yod = next((b for b in blocks if b["block_id"] == "block-yod"), None)
    if yod:
        max_block = max(
            (b for b in blocks if b["block_id"] not in excluded_from_size_check),
            key=lambda b: b["word_count"],
            default=None,
        )
        # max() returns the first of several equally large blocks, so compare
        # word counts rather than ids: an equal-sized earlier block must not
        # be reported as "larger" than the discussion.
        if max_block and max_block["word_count"] > yod["word_count"]:
            issues.append(f"בלוק הדיון (י) אינו הבלוק הגדול ביותר — {max_block['title']} ({max_block['word_count']} מילים) גדול יותר")

    return issues
|
||||
|
||||
|
||||
def check_no_duplication(vav_content: str, yod_content: str) -> list[str]:
    """Find background sentences that recur verbatim in the discussion.

    ``vav_content`` is split on ``.``, ``!`` and ``?``; each stripped fragment
    longer than 30 characters is looked up as a substring of ``yod_content``.

    Args:
        vav_content: Text of the background block (block-vav).
        yod_content: Text of the discussion block (block-yod).

    Returns:
        One message per repeated fragment, quoting its first 60 characters;
        empty when either text is empty.
    """
    if not (vav_content and yod_content):
        return []

    fragments = (piece.strip() for piece in re.split(r'[.!?]', vav_content))
    return [
        f"כפילות: משפט מהרקע חוזר בדיון — \"{fragment[:60]}...\""
        for fragment in fragments
        if len(fragment) > 30 and fragment in yod_content
    ]
|
||||
|
||||
|
||||
def _print_check_result(issues: list[str], failure_label: str, count_noun: str, success_line: str) -> None:
    """Print one validation section: its issue list, or its success line if empty."""
    if issues:
        print(f"\n{failure_label} — {len(issues)} {count_noun}:")
        for issue in issues:
            print(f" • {issue}")
    else:
        print(f"\n{success_line}")


async def main():
    """Validate the decision of the case number given on the command line.

    Loads the case, its decision, the decision's blocks and the number of
    claims from the database, then runs four checks (neutral background,
    weight compliance, structural integrity, no duplication) and prints a
    report with a final summary.

    Exits with status 1 when the case number argument is missing, the case
    is not found, or the case has no decision. The connection pool is closed
    on every path out of the database section, including those early exits
    and any exception.
    """
    if len(sys.argv) < 2:
        print("שימוש: python validate-decision.py <מספר_תיק>")
        sys.exit(1)

    case_number = sys.argv[1]
    await init_schema()
    pool = await get_pool()

    try:
        async with pool.acquire() as conn:
            case = await conn.fetchrow(
                "SELECT * FROM cases WHERE case_number = $1", case_number
            )
            if not case:
                print(f"תיק {case_number} לא נמצא")
                sys.exit(1)

            decision = await conn.fetchrow(
                "SELECT * FROM decisions WHERE case_id = $1",
                case["id"],
            )
            if not decision:
                print(f"אין החלטה לתיק {case_number}")
                sys.exit(1)

            blocks = await conn.fetch(
                """SELECT block_id, title, content, word_count, weight_percent
                   FROM decision_blocks WHERE decision_id = $1
                   ORDER BY block_index""",
                decision["id"],
            )
            blocks = [dict(b) for b in blocks]

            claims_count = await conn.fetchval(
                "SELECT count(*) FROM claims WHERE case_id = $1", case["id"]
            )
    finally:
        # Runs on success, on the sys.exit(1) paths above, and on errors.
        await close_pool()

    # Determine appeal type from the leading digit of the first number in
    # the case identifier (text before any "/", "+" or "-").
    num = case_number.split("/")[0].split("+")[0].split("-")[0]
    if num.startswith("8"):
        appeal_type = "levy"
        appeal_type_heb = "היטל השבחה"
    elif num.startswith("9"):
        appeal_type = "compensation"
        appeal_type_heb = "פיצויים"
    else:
        appeal_type = "licensing"
        appeal_type_heb = "רישוי ובנייה"

    print(f"{'='*60}")
    print(f"ולידציה: {case_number} — {case['title']}")
    print(f"סוג: {appeal_type_heb} | מילים: {decision['total_words']} | טענות: {claims_count}")
    print(f"{'='*60}")

    all_issues = []
    vav = next((b for b in blocks if b["block_id"] == "block-vav"), None)
    yod = next((b for b in blocks if b["block_id"] == "block-yod"), None)

    # 1. Neutral background
    issues = check_neutral_background(vav["content"] if vav else "")
    _print_check_result(issues, "❌ רקע ניטרלי", "בעיות", "✅ רקע ניטרלי — תקין")
    all_issues.extend(issues)

    # 2. Weight compliance
    issues = check_weight_compliance(blocks, appeal_type)
    _print_check_result(issues, "⚠ משקלות", "חריגות", "✅ משקלות — בטווח")
    all_issues.extend(issues)

    # 3. Structural integrity
    issues = check_structural_integrity(blocks)
    _print_check_result(issues, "❌ מבנה", "בעיות", "✅ מבנה — תקין")
    all_issues.extend(issues)

    # 4. No duplication
    issues = check_no_duplication(
        vav["content"] if vav else "",
        yod["content"] if yod else "",
    )
    _print_check_result(issues, "⚠ כפילויות", "נמצאו", "✅ ללא כפילויות — תקין")
    all_issues.extend(issues)

    # Summary
    print(f"\n{'='*60}")
    if all_issues:
        print(f"סה\"כ: {len(all_issues)} בעיות נמצאו")
    else:
        print("✅ ההחלטה עומדת בכל הכללים")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user