Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry

Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh,
notify.py, bidi_table.py

Archived (17): one-time migration/seeding scripts whose functionality
is now in MCP server or web API. Moved to scripts/.archive/

Deleted (5): zero-value scripts (duplicates, hardcoded single-case,
debug scripts)

Added scripts/SCRIPTS.md — registry of all scripts with purpose,
status, and what superseded them. CLAUDE.md updated with rule:
any script change requires SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-14 16:30:19 +00:00
parent 38e79bbf92
commit 5c9a5d702a
24 changed files with 62 additions and 578 deletions

View File

@@ -0,0 +1,163 @@
"""Backfill style_patterns.frequency with real occurrence counts.
The analyzer currently stores frequency=1 for every pattern (it only extracts
unique patterns, doesn't count occurrences). This script scans the full_text
of every decision in style_corpus and updates each pattern's frequency to
the true count of decisions containing the pattern_text as a substring.
Run once after analysis, and again whenever new decisions are added.
"""
from __future__ import annotations
import asyncio
import os
import re
import sys
import unicodedata
from pathlib import Path
# Load KEY=VALUE pairs from ~/.env into the process environment.
# A missing file is tolerated so the script can also run with the variables
# already exported by the shell.
_env_file = Path.home() / ".env"
if _env_file.exists():
    for line in _env_file.read_text().splitlines():
        if "=" in line and not line.startswith("#"):
            k, v = line.split("=", 1)
            # setdefault: a value already present in the environment wins.
            os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))
sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
from legal_mcp.services import db as db_mod # noqa: E402
def _strip_nikud(text: str) -> str:
"""Remove Hebrew combining marks (nikud) for robust matching."""
return "".join(
c for c in unicodedata.normalize("NFD", text)
if not unicodedata.combining(c)
)
def _extract_searchable_variants(pattern_text: str) -> list[str]:
"""Extract searchable substrings from a pattern template.
The analyzer stores patterns as templates with:
- Placeholders in [brackets]: "בפנינו ערר על החלטת [הגוף] מיום [תאריך]"
- Alternatives separated by / : "נפנה ל... / ראה והשווה / נפנה להחלטה"
- Ellipsis ... for variable parts
This function returns a list of concrete substrings to search for.
We pick the longest fixed segment from each alternative (>= 4 chars)
so that matching is specific enough to be meaningful but still flexible.
"""
# Split on " / " or " או " to get alternatives
alternatives = re.split(r"\s*/\s*|\s+או\s+", pattern_text)
variants: list[str] = []
for alt in alternatives:
alt = alt.strip()
if not alt:
continue
# Remove bracket placeholders [X]
alt = re.sub(r"\[[^\]]*\]", "|", alt)
# Replace ellipsis with separator
alt = re.sub(r"\.{2,}", "|", alt)
# Remove ellipsis unicode
alt = alt.replace("", "|")
# Split on the | separator and take fixed segments
segments = [s.strip(" ,.:;\"'") for s in alt.split("|")]
# Keep segments long enough to be meaningful (>= 4 chars, not just common words)
good = [s for s in segments if len(s) >= 4]
if good:
# Use the longest segment as the key variant for this alternative
variants.append(max(good, key=len))
elif alt.strip():
# Fallback: use the whole cleaned alternative
stripped = alt.replace("|", " ").strip()
if len(stripped) >= 4:
variants.append(stripped)
# Deduplicate while preserving order
seen = set()
unique = []
for v in variants:
if v not in seen:
seen.add(v)
unique.append(v)
return unique
def _count_decisions_containing(variants: list[str], normalized_decisions: list) -> int:
"""Count how many decisions contain ANY of the variants."""
count = 0
for _, _, text in normalized_decisions:
if any(v in text for v in variants):
count += 1
return count
async def main() -> int:
    """Recompute ``style_patterns.frequency`` for every pattern and print a summary.

    Loads all non-empty decisions from ``style_corpus``, strips nikud from
    their text once, and for each pattern counts the decisions that contain
    any of its searchable variants. Patterns that are empty, shorter than
    3 characters, or yield no variants are stored with frequency 0.

    Returns:
        0 on completion (used as the process exit status).
    """
    pool = await db_mod.get_pool()
    async with pool.acquire() as conn:
        decisions = await conn.fetch(
            "SELECT id, decision_number, full_text FROM style_corpus "
            "WHERE full_text IS NOT NULL AND length(full_text) > 0"
        )
        patterns = await conn.fetch(
            "SELECT id, pattern_text, pattern_type FROM style_patterns"
        )
        print(f"Scanning {len(patterns)} patterns across {len(decisions)} decisions...")

        # Normalize decisions once, not once per pattern.
        normalized_decisions = [
            (d["id"], d["decision_number"], _strip_nikud(d["full_text"]))
            for d in decisions
        ]

        updates = []
        for p in patterns:
            pattern_text = p["pattern_text"]
            if not pattern_text or len(pattern_text) < 3:
                updates.append((0, p["id"]))
                continue
            variants = _extract_searchable_variants(_strip_nikud(pattern_text))
            if not variants:
                updates.append((0, p["id"]))
                continue
            count = _count_decisions_containing(variants, normalized_decisions)
            updates.append((count, p["id"]))

        await conn.executemany(
            "UPDATE style_patterns SET frequency = $1 WHERE id = $2",
            updates,
        )

        # Show distribution
        rows = await conn.fetch(
            "SELECT pattern_type, pattern_text, frequency "
            "FROM style_patterns "
            "ORDER BY frequency DESC "
            "LIMIT 15"
        )
        print("\nTop 15 patterns by real frequency:")
        for r in rows:
            print(f" {r['frequency']:>3} [{r['pattern_type']:<22}] {r['pattern_text'][:90]}")

        dist = await conn.fetch(
            "SELECT frequency, count(*) FROM style_patterns "
            "GROUP BY frequency ORDER BY frequency DESC"
        )
        print("\nFrequency distribution:")
        for r in dist:
            # Keep a separator between the two numbers so they cannot run
            # together into one ambiguous figure.
            print(f" frequency={r['frequency']:>3}: {r['count']} patterns")
    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))

View File

@@ -0,0 +1,349 @@
"""Batch upload proofread training corpus to style DB.
Two-phase workflow:
--preview Extract metadata from all .md files, print review table, don't upload
--upload Actually upload all files (with optional --only FILE to run one)
Metadata extraction:
* decision_number: from filename (ARAR-YY-NNNN / ערר NNNN-YY) or decision date year
* decision_date: from "ניתנה ... <day> ב<Hebrew month> <YYYY>" near end of text
* categories: keyword heuristics on body text
"""
from __future__ import annotations
import argparse
import asyncio
import os
import re
import sys
from pathlib import Path
PROOFREAD_DIR = Path("/home/chaim/legal-ai/data/training/proofread")
# Manual metadata overrides for files where auto-extraction can't determine values.
METADATA_OVERRIDES: dict[str, dict] = {
"ARAR-25-1067 - יחיעם יפה ואח׳.md": {
"decision_date": "2025-11-27", # no "ניתנה" signature in file; user-provided
},
}
# Files to skip — already in style_corpus from legacy ingestion
# (verified by exact character-count match with existing DB rows).
SKIP_FILES = {
"תמא 38-בית הכרם-1126+1141-החלטה.md", # → corpus: 1126/1141
"היתר בניה-בית שמש-1180+1181-החלטה.md", # → corpus: 1180/1181
"היתר בניה-הראל-1043+1054-החלטה.md", # → corpus: 1043/1054
"היתר בניה-הראל-1071+1077-החלטה.md", # → corpus: 1071/1077
}
# Seed os.environ from ~/.env (KEY=VALUE lines) when the file exists.
# Values already present in the environment are left untouched.
ENV_FILE = Path.home() / ".env"
if ENV_FILE.exists():
    for line in ENV_FILE.read_text().splitlines():
        if line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))
# Make mcp-server package importable
sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
# ── Decision number extraction ───────────────────────────────────
FILENAME_NUMBER_PATTERNS = [
    # ARAR-YY-NNNN[-X] - title.md
    re.compile(r"^ARAR-(\d{2})-(\d{3,4})"),
    # ערר NNNN-YY title.md  or  ערר NNNN-YY title
    re.compile(r"^ערר\s+(\d{3,4})-(\d{2})"),
    # ערר NNNN - title  (no year in filename — needs date lookup)
    re.compile(r"^ערר\s+(\d{3,4})\s*-"),
]
LEGACY_MULTI_PATTERN = re.compile(r"(\d{3,4})\+(\d{3,4})")


def decision_number_from_filename(stem: str) -> tuple[str | None, str | None]:
    """Derive a decision number from a filename stem.

    Returns:
        ``("NNNN/YY", "YY")`` when the stem carries both number and year;
        ``("NNNN/??", None)`` or ``("NNNN+NNNN/??", None)`` when the year is
        absent and has to be completed from the decision date;
        ``(None, None)`` when nothing matches.
    """
    arar_re, appeal_with_year_re, appeal_bare_re = FILENAME_NUMBER_PATTERNS

    hit = arar_re.match(stem)
    if hit is not None:
        # In the ARAR form the year comes first.
        yy, serial = hit.groups()
        return f"{serial}/{yy}", yy

    hit = appeal_with_year_re.match(stem)
    if hit is not None:
        serial, yy = hit.groups()
        return f"{serial}/{yy}", yy

    hit = appeal_bare_re.match(stem)
    if hit is not None:
        return f"{hit.group(1)}/??", None

    # Legacy merged decisions: "NNNN+NNNN" anywhere in the stem.
    hit = LEGACY_MULTI_PATTERN.search(stem)
    if hit is not None:
        first, second = hit.groups()
        return f"{first}+{second}/??", None

    return None, None
# ── Decision date extraction ─────────────────────────────────────
HEBREW_MONTHS = {
    "ינואר": 1, "בינואר": 1,
    "פברואר": 2, "בפברואר": 2,
    "מרץ": 3, "מרס": 3, "במרץ": 3, "במרס": 3,
    "אפריל": 4, "באפריל": 4,
    "מאי": 5, "במאי": 5,
    "יוני": 6, "ביוני": 6,
    "יולי": 7, "ביולי": 7,
    "אוגוסט": 8, "באוגוסט": 8,
    "ספטמבר": 9, "בספטמבר": 9,
    "אוקטובר": 10, "באוקטובר": 10,
    "נובמבר": 11, "בנובמבר": 11,
    "דצמבר": 12, "בדצמבר": 12,
}

# "<day> [ב]<month>[,|.] <year>" — the month may carry the ב prefix.
DATE_RE = re.compile(
    r"(\d{1,2})\s+(ב?(?:ינואר|פברואר|מרץ|מרס|אפריל|מאי|יוני|יולי|אוגוסט|ספטמבר|אוקטובר|נובמבר|דצמבר))\s*[,.]?\s*(\d{4})"
)
NITNA_RE = re.compile(r"ניתנ[הו]?\s+(?:פה\s+אחד|בדעת\s+רוב|היום)?")


def decision_date_from_text(text: str) -> str | None:
    """Return the decision date as YYYY-MM-DD, or None if none can be parsed.

    Only the last 2500 characters are examined. The first date following the
    first NITNA_RE marker in that tail is preferred; without a marker (or
    without a date after it) the first date anywhere in the tail is used.
    An impossible calendar date yields None.
    """
    from datetime import date

    tail = text[-2500:]
    marker = NITNA_RE.search(tail)
    found = DATE_RE.search(tail[marker.start():]) if marker else None
    if found is None:
        found = DATE_RE.search(tail)
    if found is None:
        return None

    day_txt, month_name, year_txt = found.groups()
    month = HEBREW_MONTHS.get(month_name)
    if not month:
        return None
    try:
        return date(int(year_txt), month, int(day_txt)).isoformat()
    except ValueError:
        return None
# ── Subject category extraction ──────────────────────────────────
# Categories as defined in the tool signature.
ALL_CATEGORIES = [
"בנייה", "שימוש חורג", "תכנית", "היתר", "הקלה",
"חלוקה", 'תמ"א 38', "היטל השבחה", "פיצויים 197",
]
def categorize(text: str) -> list[str]:
    """Guess the subject categories of a decision from its text.

    The first 2000 characters are treated as the place where the real
    subject is stated; for some categories a repeated mention anywhere in
    the text also counts. Categories are returned in a fixed detection
    order, and ``["בנייה"]`` is returned when nothing matched.
    """
    opening = text[:2000]
    found: list[str] = []

    def mentions(pattern: str) -> int:
        return len(re.findall(pattern, text))

    def in_opening(pattern: str) -> bool:
        return re.search(pattern, opening) is not None

    # TAMA 38: a single mention anywhere is enough.
    if re.search(r'תמ[״"\']?א\s*38|תמא\s*38', text):
        found.append('תמ"א 38')

    # Betterment levy: in the opening, or at least 3 mentions overall.
    levy_re = r"היטל(?:י)?\s+השבחה"
    if in_opening(levy_re) or mentions(levy_re) >= 3:
        found.append("היטל השבחה")

    # Section 197 compensation: in the opening, or at least 2 mentions.
    p197_re = r"פיצויים\s+לפי\s+(?:ס(?:עיף|')\s*)?197|סעיף\s*197|ס['\"]?\s*197"
    if in_opening(p197_re) or mentions(p197_re) >= 2:
        found.append("פיצויים 197")

    # Non-conforming use: in the opening, or at least 3 mentions.
    if "שימוש חורג" in opening or text.count("שימוש חורג") >= 3:
        found.append("שימוש חורג")

    # Relaxation: needs BOTH an opening mention and at least 3 mentions.
    relief_re = r"\bהקלה\b|\bהקלות\b"
    if in_opening(relief_re) and mentions(relief_re) >= 3:
        found.append("הקלה")

    # Parcellation: specific phrases anywhere in the text.
    if re.search(r"איחוד\s+וחלוקה|חלוקה\s+חדשה|תכנית\s+לחלוקה", text):
        found.append("חלוקה")

    # Plan-level appeal: plan deposit/approval phrasing in the opening.
    plan_re = (
        r"הפקדת\s+ה?תכנית|"
        r"אישור\s+ה?תכנית|"
        r"המלצה\s+להפקיד|"
        r"להפקיד\s+את\s+ה?תכנית|"
        r"לדון\s+בתכנית|"
        r"דנה\s+בתכנית|"
        r"החלטה\s+לאשר\s+ה?תכנית"
    )
    if in_opening(plan_re):
        found.append("תכנית")

    # Permit: permit request / building permit named in the opening.
    if in_opening(r"בקשה\s+להיתר|היתר\s+בני(?:י)?ה"):
        found.append("היתר")

    # Any permit-type subject also implies the general building category.
    if any(tag in found for tag in ("היתר", "הקלה", 'תמ"א 38')):
        found.append("בנייה")

    return found if found else ["בנייה"]
# ── Year fallback from date ──────────────────────────────────────
def finalize_decision_number(number: str | None, date_iso: str | None) -> str:
    """Complete a decision number whose year is unknown, using the decision date.

    The two-digit year is characters 2-3 of the ISO date (``YYYY-MM-DD``).
    With no number at all the result is ``"??/YY"``, or ``""`` when the date
    is missing too. A trailing ``"/??"`` becomes ``"/YY"``, or is removed
    when there is no date. Any other number is returned unchanged.
    """
    yy = date_iso[2:4] if date_iso else None

    if not number:
        return f"??/{yy}" if yy is not None else ""
    if not number.endswith("/??"):
        return number

    suffix = f"/{yy}" if yy is not None else ""
    return number.replace("/??", suffix)
# ── Main metadata extraction ─────────────────────────────────────
def extract_metadata(path: Path) -> dict:
    """Build the upload metadata record for one proofread decision file.

    The decision number comes from the filename, the date from the text, and
    the categories from keyword heuristics. Entries in METADATA_OVERRIDES
    win over every auto-extracted value. An overridden ``decision_date`` is
    resolved before the number is finalized, so a filename without a year
    still gets its year suffix from the override.

    Returns:
        dict with keys ``file``, ``decision_number``, ``decision_date``
        ("??" when unknown), ``categories`` and ``chars``.
    """
    text = path.read_text(encoding="utf-8")
    overrides = METADATA_OVERRIDES.get(path.name, {})

    num_from_name, _ = decision_number_from_filename(path.stem)
    # Resolve the date first: the number's year fallback depends on it.
    date_iso = overrides.get("decision_date") or decision_date_from_text(text)
    decision_number = finalize_decision_number(num_from_name, date_iso)

    meta = {
        "file": path.name,
        "decision_number": decision_number,
        "decision_date": date_iso or "??",
        "categories": categorize(text),
        "chars": len(text),
    }
    # Apply manual overrides last so explicitly supplied fields always win.
    meta.update(overrides)
    return meta
def print_preview(results: list[dict]) -> None:
    """Print one table row per file, then list the files with incomplete metadata.

    A record counts as incomplete when its date is "??", its decision number
    is empty, or its decision number still contains "??".
    """
    print(f"\n{'#':<3} {'FILE':<55} {'NUMBER':<15} {'DATE':<12} {'CATEGORIES'}")
    print("-" * 130)

    incomplete = []
    for row_no, rec in enumerate(results, 1):
        name = rec["file"]
        # Long names are cut to 50 characters plus an ellipsis.
        file_short = name if len(name) <= 53 else name[:50] + "..."
        cats = ", ".join(rec["categories"])
        print(f"{row_no:<3} {file_short:<55} {rec['decision_number']:<15} {rec['decision_date']:<12} {cats}")

        number = rec["decision_number"]
        if rec["decision_date"] == "??" or not number or "??" in number:
            incomplete.append(rec)
    print()

    if not incomplete:
        return
    print(f"⚠️ {len(incomplete)} files with incomplete metadata:")
    for rec in incomplete:
        print(f" - {rec['file']} → number={rec['decision_number']!r} date={rec['decision_date']!r}")
# ── Upload ───────────────────────────────────────────────────────
async def upload_one(meta: dict) -> dict:
    """Upload the file described by *meta* through ``document_upload_training``.

    An unknown date ("??") is passed to the tool as an empty string, and the
    filename stem is used as the title.

    Returns:
        ``{"file": <name>, "result": <value returned by the tool>}``.
    """
    from legal_mcp.tools.documents import document_upload_training

    source = PROOFREAD_DIR / meta["file"]
    date_arg = "" if meta["decision_date"] == "??" else meta["decision_date"]
    outcome = await document_upload_training(
        file_path=str(source),
        decision_number=meta["decision_number"],
        decision_date=date_arg,
        subject_categories=meta["categories"],
        title=source.stem,
    )
    return {"file": meta["file"], "result": outcome}
async def upload_all(results: list[dict]) -> None:
    """Upload every record in order, printing a ✓ or ✗ line for each.

    Best-effort: any exception raised for one file is printed and the loop
    moves on to the next file.
    """
    total = len(results)
    for position, meta in enumerate(results, start=1):
        tag = f"[{position}/{total}]"
        try:
            outcome = await upload_one(meta)
            print(f"{tag} ✓ {meta['file']}")
            print(f" {outcome['result'][:200]}")
        except Exception as exc:
            print(f"{tag} ✗ {meta['file']}: {exc}")
# ── CLI ──────────────────────────────────────────────────────────
def main() -> int:
    """Command-line entry point: preview and/or upload the proofread corpus.

    Without ``--upload`` the metadata table is printed and nothing is
    uploaded. With ``--upload`` the files are uploaded, preceded by the
    table only when ``--preview`` is also given. ``--only NAME`` restricts
    the run to one file.

    Returns:
        1 when ``--only`` names a file that is not among the candidates,
        otherwise 0.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--preview", action="store_true", help="Show metadata table without uploading")
    parser.add_argument("--upload", action="store_true", help="Upload all files to style corpus")
    parser.add_argument("--only", help="Only process this specific filename")
    opts = parser.parse_args()

    candidates = [
        p for p in sorted(PROOFREAD_DIR.glob("*.md")) if p.name not in SKIP_FILES
    ]
    if opts.only:
        candidates = [p for p in candidates if p.name == opts.only]
        if not candidates:
            print(f"File not found: {opts.only}")
            return 1

    results = [extract_metadata(p) for p in candidates]

    if not opts.upload:
        print_preview(results)
        return 0

    if opts.preview:
        print_preview(results)
    print(f"\n>>> Uploading {len(results)} files to style corpus...\n")
    asyncio.run(upload_all(results))
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -0,0 +1,232 @@
"""Benchmark embedding models on case 1130-25 documents.
Compares voyage-3-large (current), voyage-4-large, and voyage-law-2
on Hebrew legal text retrieval quality, timing, and cost.
"""
import json
import os
import time
import sys
from pathlib import Path
import voyageai
# Read the key from the environment only: there is no in-source fallback, so
# a credential never ends up in version control.
API_KEY = os.environ.get("VOYAGE_API_KEY")
if not API_KEY:
    sys.exit("VOYAGE_API_KEY is not set; export it before running this benchmark.")
client = voyageai.Client(api_key=API_KEY)
MODELS = [
"voyage-3-large", # current
"voyage-4-large", # upgrade candidate
"voyage-law-2", # legal specialist
]
# Pricing per 1M tokens (from Voyage AI docs)
PRICING = {
"voyage-3-large": 0.06,
"voyage-4-large": 0.12,
"voyage-law-2": 0.12,
}
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
DOCUMENTS = {
"כתב ערר קובר": DOCS_DIR / "2025-08-14-כתב-ערר-קובר.md",
"כתב ערר מטמון": DOCS_DIR / "2025-10-22-כתב-ערר-מטמון.md",
"תשובת ועדת הראל": DOCS_DIR / "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md",
"תשובת ליבמן": DOCS_DIR / "2025-09-01-כתב-תשובה-ליבמן-לערר.md",
}
# Test queries — real questions a judge would ask about this case
QUERIES = [
"מהי הטענה המרכזית של העוררים בנוגע לחניה?",
"מה עמדת הוועדה המקומית לגבי התכנית?",
"האם יש פגיעה בזכויות הבנייה של השכנים?",
"מהם התנאים שנקבעו בהיתר הבנייה?",
"האם התכנית עומדת בתקן החניה?",
"מה טענות המשיבים לגבי הגובה והצפיפות?",
"האם נערך שימוע כדין לפני מתן ההחלטה?",
"מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
def chunk_text(text: str, chunk_size: int = 600, overlap: int = 100) -> list[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        The chunks in order, each joined with single spaces. Empty text
        yields an empty list.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``; the
            window could then never advance.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def cosine_sim(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of *a* and *b*.

    Returns 0.0 when either vector has zero length. Components are paired
    with ``zip``, so extra trailing components of the longer vector do not
    enter the dot product.
    """
    len_a = sum(p * p for p in a) ** 0.5
    len_b = sum(q * q for q in b) ** 0.5
    if not (len_a and len_b):
        return 0.0
    return sum(p * q for p, q in zip(a, b)) / (len_a * len_b)
def main():
    """Run the embedding benchmark end to end and save the results as JSON.

    Steps: chunk every document in DOCUMENTS; embed the chunks and QUERIES
    with each model in MODELS; rank chunks per query by cosine similarity;
    print per-model results and comparison tables; write the full results
    to ``benchmark-embeddings.json`` as UTF-8.
    """
    banner = "=" * 70

    # Load and chunk documents
    print(banner)
    print("Loading and chunking documents...")
    print(banner)
    all_chunks = []  # (doc_name, chunk_index, text)
    for doc_name, doc_path in DOCUMENTS.items():
        text = doc_path.read_text(encoding="utf-8")
        chunks = chunk_text(text)
        all_chunks.extend((doc_name, i, chunk) for i, chunk in enumerate(chunks))
        print(f" {doc_name}: {len(text):,} chars, {len(text.split()):,} words -> {len(chunks)} chunks")
    chunk_texts = [c[2] for c in all_chunks]
    total_chunks = len(chunk_texts)
    print(f"\nTotal: {total_chunks} chunks")

    # Estimate tokens (rough guess: 1 Hebrew word ~ 2-3 tokens)
    total_words = sum(len(t.split()) for t in chunk_texts)
    est_tokens_docs = int(total_words * 2.5)
    total_query_words = sum(len(q.split()) for q in QUERIES)
    est_tokens_queries = int(total_query_words * 2.5)
    print(f"Estimated tokens per model: ~{est_tokens_docs:,} (docs) + ~{est_tokens_queries:,} (queries)")

    results = {}
    for model in MODELS:
        print(f"\n{banner}")
        print(f"Model: {model}")
        print(banner)

        # Embed documents
        print(f" Embedding {total_chunks} chunks...")
        t0 = time.time()
        doc_embeddings = client.embed(
            chunk_texts,
            model=model,
            input_type="document",
        )
        doc_time = time.time() - t0
        doc_usage = doc_embeddings.total_tokens
        doc_embs = doc_embeddings.embeddings
        print(f" Done in {doc_time:.1f}s — {doc_usage:,} tokens used")

        # Embed queries
        print(f" Embedding {len(QUERIES)} queries...")
        t0 = time.time()
        query_embeddings = client.embed(
            QUERIES,
            model=model,
            input_type="query",
        )
        query_time = time.time() - t0
        query_usage = query_embeddings.total_tokens
        query_embs = query_embeddings.embeddings
        print(f" Done in {query_time:.1f}s — {query_usage:,} tokens used")

        total_tokens = doc_usage + query_usage
        cost = total_tokens / 1_000_000 * PRICING[model]

        # Search: for each query, rank chunks by similarity
        print("\n Search results:")
        query_results = []
        for qi, query in enumerate(QUERIES):
            scores = [
                (cosine_sim(query_embs[qi], doc_emb), name, idx, body[:80])
                for doc_emb, (name, idx, body) in zip(doc_embs, all_chunks)
            ]
            scores.sort(reverse=True)
            top5 = scores[:5]
            query_results.append({"query": query, "top5": top5})
            print(f"\n Q{qi+1}: {query}")
            for rank, (score, doc_name, chunk_idx, preview) in enumerate(top5):
                print(f" #{rank+1} [{score:.4f}] {doc_name} (chunk {chunk_idx}): {preview}...")

        results[model] = {
            "doc_time": doc_time,
            "query_time": query_time,
            "doc_tokens": doc_usage,
            "query_tokens": query_usage,
            "total_tokens": total_tokens,
            "cost_usd": cost,
            "dimensions": len(doc_embs[0]),
            "query_results": query_results,
        }

    # Summary comparison
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    print(f"\n{'Model':<25} {'Tokens':>10} {'Time':>8} {'Cost':>10} {'Dims':>6}")
    print("-" * 65)
    for model in MODELS:
        r = results[model]
        print(f"{model:<25} {r['total_tokens']:>10,} {r['doc_time']+r['query_time']:>7.1f}s ${r['cost_usd']:>8.5f} {r['dimensions']:>6}")

    # Compare top-1 agreement between models
    print(f"\n{banner}")
    print("TOP-1 AGREEMENT (which doc is ranked #1 for each query)")
    print(banner)
    print(f"\n{'Query':<50}", end="")
    for model in MODELS:
        print(f" {model.split('-')[-1]:>10}", end="")
    print()
    print("-" * 85)
    for qi, query in enumerate(QUERIES):
        short_q = query[:48]
        print(f"{short_q:<50}", end="")
        for model in MODELS:
            top1_doc = results[model]["query_results"][qi]["top5"][0][1]
            # Shorten doc name so it fits the column.
            short_doc = top1_doc[:10]
            print(f" {short_doc:>10}", end="")
        print()

    # Score distribution comparison
    print(f"\n{banner}")
    print("AVERAGE TOP-5 SCORES PER MODEL")
    print(banner)
    for model in MODELS:
        per_query = results[model]["query_results"]
        all_top5_scores = [score for qr in per_query for score, _, _, _ in qr["top5"]]
        avg = sum(all_top5_scores) / len(all_top5_scores)
        top1_scores = [qr["top5"][0][0] for qr in per_query]
        avg_top1 = sum(top1_scores) / len(top1_scores)
        print(f" {model:<25} avg top-1: {avg_top1:.4f} avg top-5: {avg:.4f}")

    # Save full results
    output_path = Path("/home/chaim/legal-ai/data/benchmark-embeddings.json")
    serializable = {}
    for model, r in results.items():
        serializable[model] = {
            "doc_time": r["doc_time"],
            "query_time": r["query_time"],
            "doc_tokens": r["doc_tokens"],
            "query_tokens": r["query_tokens"],
            "total_tokens": r["total_tokens"],
            "cost_usd": r["cost_usd"],
            "dimensions": r["dimensions"],
            "queries": [
                {
                    "query": qr["query"],
                    "top5": [{"score": s, "doc": d, "chunk": c, "preview": p} for s, d, c, p in qr["top5"]],
                }
                for qr in r["query_results"]
            ],
        }
    # ensure_ascii=False emits raw Hebrew, so the file encoding must be
    # explicit instead of depending on the locale default.
    output_path.write_text(
        json.dumps(serializable, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
    print(f"\nFull results saved to {output_path}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,203 @@
"""Compare Google Vision extractions vs existing MDs, then benchmark voyage-law-2."""
import json
import time
from pathlib import Path
import voyageai
import os  # imported here because this script's import header does not include it

# Read the key from the environment only: there is no in-source value, so a
# credential never ends up in version control.
API_KEY = os.environ.get("VOYAGE_API_KEY")
if not API_KEY:
    raise SystemExit("VOYAGE_API_KEY is not set; export it before running this script.")
client = voyageai.Client(api_key=API_KEY)
MODEL = "voyage-law-2"
DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents")
GOOGLE_DIR = DOCS_DIR / "extracted"
# Map new (Google Vision) files to existing MDs
PAIRS = [
("מרק קובר-כתב ערר.md", "2025-08-14-כתב-ערר-קובר.md"),
("תשובה לערר מטעם המשיבים.md", "2025-09-01-כתב-תשובה-ליבמן-לערר.md"),
("תשובת הועדה המרחבית לערר.md", "2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md"),
("תשובת המשיב-יצחק מטמון.md", "2025-10-22-כתב-ערר-מטמון.md"),
("השלמת טיעון מטעם משיבים 2-3.md", "2025-12-23-השלמת-טיעון-ליבמן.md"),
("תשובה מטעם העורר להשלמת טיעון.md", "2025-12-08-תגובת-קובר-לבקשת-השלמת-טיעון.md"),
("בקשה להשלמת טיעון ממשיבים 2-3.md", "2025-12-03-בקשה-להשלמת-טיעון-ליבמן.md"),
("השלמת טיעון מטעם הוועדה המקומית.md", "2026-02-04-השלמת-טיעון-ועדת-הראל.md"),
("תגובת העורר לתשובת ועדת הראל להשלמת הטיעון ערר.md", "2026-02-10-תגובת-קובר-להשלמת-טיעון-הראל.md"),
("כתב תשובה-השלמת טיעון מטעם המשיב יצחק מטמון.md", "2026-02-12-כתב-תשובה-השלמת-טיעון-מטמון.md"),
("בקשת העורר לדחיית השלמת הטיעון במלואה.md", "2026-01-13-תגובת-קובר-לדחיית-השלמת-טיעון.md"),
("1130-25-החלטה לתיקון פרוטוקול.md", "2025-11-27-החלטה-לתיקון-פרוטוקול.md"),
("החלטת ביניים 1130-25.md", "2025-12-31-החלטת-ביניים.md"),
("1130-25-פרוטוקול ועדת ערר והחלטה.md", "2025-10-27-פרוטוקול-דיון-ועדת-ערר.md"),
("פרוטוקול ועדה מקומית לדיון בתכנית 152-1257682.md", "2025-07-23-פרוטוקול-ועדה-מקומית-הראל.md"),
]
QUERIES = [
"מהי הטענה המרכזית של העוררים בנוגע לחניה?",
"מה עמדת הוועדה המקומית לגבי התכנית?",
"האם יש פגיעה בזכויות הבנייה של השכנים?",
"מהם התנאים שנקבעו בהיתר הבנייה?",
"האם התכנית עומדת בתקן החניה?",
"מה טענות המשיבים לגבי הגובה והצפיפות?",
"האם נערך שימוע כדין לפני מתן ההחלטה?",
"מהם הנימוקים לאישור התכנית על ידי הוועדה המקומית?",
]
def cosine_sim(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero length. Components are paired
    with ``zip``, so extra trailing components of the longer vector do not
    enter the dot product.
    """
    mag_a = sum(u * u for u in a) ** 0.5
    mag_b = sum(w * w for w in b) ** 0.5
    if not (mag_a and mag_b):
        return 0.0
    return sum(u * w for u, w in zip(a, b)) / (mag_a * mag_b)
def chunk_text(text, chunk_size=600, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split; words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        The chunks in order, each joined with single spaces. Empty text
        yields an empty list.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``chunk_size``; the
            window could then never advance.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def word_overlap(a, b):
    """Return the shared-vocabulary ratio of two texts.

    The ratio is the number of distinct whitespace-separated words common to
    both texts divided by the size of the larger vocabulary. It is 0.0 when
    either text has no words.
    """
    vocab_a = set(a.split())
    vocab_b = set(b.split())
    if vocab_a and vocab_b:
        shared = vocab_a.intersection(vocab_b)
        return len(shared) / max(len(vocab_a), len(vocab_b))
    return 0.0
def main():
    """Compare Google Vision extractions with the existing MDs and benchmark retrieval.

    Part 1 prints word counts and vocabulary overlap for every pair in PAIRS
    whose two files exist; a pair with a missing file is reported and
    skipped. Part 2 embeds the chunks of both versions plus QUERIES with
    MODEL. Part 3 prints, per query, the best-scoring chunk of each version,
    followed by a summary table and an estimated cost.
    """
    banner = "=" * 70

    # ── Part 1: Document comparison ──
    print(banner)
    print("PART 1: DOCUMENT COMPARISON (Google Vision vs Existing)")
    print(banner)
    comparison_results = []
    all_new_chunks = []
    all_old_chunks = []
    for new_name, old_name in PAIRS:
        new_path = GOOGLE_DIR / new_name
        old_path = DOCS_DIR / old_name
        if not new_path.exists():
            # Report it, so a missing extraction is not mistaken for coverage.
            print(f" SKIP (no extraction): {new_name}")
            continue
        if not old_path.exists():
            print(f" SKIP (no existing): {old_name}")
            continue
        new_text = new_path.read_text(encoding="utf-8")
        old_text = old_path.read_text(encoding="utf-8")
        new_words = len(new_text.split())
        old_words = len(old_text.split())
        overlap = word_overlap(new_text, old_text)
        short_name = old_name[:40]
        diff = new_words - old_words
        diff_pct = (diff / old_words * 100) if old_words else 0
        comparison_results.append({
            "name": short_name,
            "old_words": old_words,
            "new_words": new_words,
            "diff": diff,
            "diff_pct": diff_pct,
            "overlap": overlap,
        })
        # Chunk for embedding
        all_new_chunks.extend((short_name, i, c) for i, c in enumerate(chunk_text(new_text)))
        all_old_chunks.extend((short_name, i, c) for i, c in enumerate(chunk_text(old_text)))

    print(f"\n{'Document':<42} {'Old':>6} {'New':>6} {'Diff':>8} {'Overlap':>8}")
    print("-" * 72)
    for r in comparison_results:
        print(f" {r['name']:<40} {r['old_words']:>6} {r['new_words']:>6} {r['diff']:>+7} ({r['diff_pct']:>+.0f}%) {r['overlap']:>7.0%}")

    # Nothing to rank means nothing to benchmark; stop before the API calls
    # and before indexing into empty score lists.
    if not all_new_chunks or not all_old_chunks:
        print("\nNo document pairs with text on both sides; nothing to embed.")
        return

    # ── Part 2: Embedding benchmark ──
    print(f"\n{banner}")
    print("PART 2: VOYAGE-LAW-2 EMBEDDING BENCHMARK")
    print(banner)
    new_texts = [c[2] for c in all_new_chunks]
    old_texts = [c[2] for c in all_old_chunks]
    print(f"\nNew chunks: {len(new_texts)}, Old chunks: {len(old_texts)}")

    def embed_batched(texts, label):
        """Embed *texts* in batches of 20; return (embeddings, tokens, seconds)."""
        BATCH = 20
        all_embs = []
        total_tokens = 0
        t0 = time.time()
        for i in range(0, len(texts), BATCH):
            batch = texts[i:i + BATCH]
            result = client.embed(batch, model=MODEL, input_type="document")
            all_embs.extend(result.embeddings)
            total_tokens += result.total_tokens
        elapsed = time.time() - t0
        print(f" {label}: {len(texts)} chunks, {total_tokens:,} tokens, {elapsed:.1f}s")
        return all_embs, total_tokens, elapsed

    # Embed new
    print("Embedding NEW (Google Vision) chunks...")
    new_embs, new_tokens, new_time = embed_batched(new_texts, "NEW")
    # Embed old
    print("Embedding OLD (existing) chunks...")
    old_embs, old_tokens, old_time = embed_batched(old_texts, "OLD")
    # Embed queries
    print(f"Embedding {len(QUERIES)} queries...")
    q_result = client.embed(QUERIES, model=MODEL, input_type="query")
    q_embs = q_result.embeddings

    # ── Part 3: Search and compare ──
    print(f"\n{banner}")
    print("PART 3: SEARCH QUALITY COMPARISON")
    print(banner)
    new_top1 = []
    old_top1 = []
    for qi, query in enumerate(QUERIES):
        # Best (score, document) per version; kept for the summary so the
        # similarities are computed only once.
        new_best = max(
            (cosine_sim(q_embs[qi], e), all_new_chunks[i][0]) for i, e in enumerate(new_embs)
        )
        old_best = max(
            (cosine_sim(q_embs[qi], e), all_old_chunks[i][0]) for i, e in enumerate(old_embs)
        )
        new_top1.append(new_best[0])
        old_top1.append(old_best[0])
        print(f"\nQ{qi+1}: {query}")
        print(f" {'NEW top-1':>10}: [{new_best[0]:.4f}] {new_best[1]}")
        print(f" {'OLD top-1':>10}: [{old_best[0]:.4f}] {old_best[1]}")
        if new_best[0] > old_best[0]:
            print(f" >> NEW better by {new_best[0] - old_best[0]:.4f}")
        else:
            print(f" >> OLD better by {old_best[0] - new_best[0]:.4f}")

    # Summary
    new_avg = sum(new_top1) / len(QUERIES)
    old_avg = sum(old_top1) / len(QUERIES)
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    print(f" {'Metric':<30} {'Old (existing)':>15} {'New (Google Vision)':>20}")
    print(f" {'-' * 65}")
    print(f" {'Total chunks':<30} {len(old_texts):>15} {len(new_texts):>20}")
    print(f" {'Total tokens':<30} {old_tokens:>15,} {new_tokens:>20,}")
    print(f" {'Embed time':<30} {old_time:>14.1f}s {new_time:>19.1f}s")
    print(f" {'Avg top-1 score':<30} {old_avg:>15.4f} {new_avg:>20.4f}")
    print(f" {'Score difference':<30} {'':>15} {new_avg - old_avg:>+20.4f}")
    # 0.12 is the per-1M-token rate this script assumes for MODEL.
    est_cost = (new_tokens + old_tokens) / 1_000_000 * 0.12
    print(f"\n Embedding cost: ${est_cost:.3f}")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,323 @@
#!/usr/bin/env python3
"""Decompose 6 final decisions into 12-block structure.
Uses heuristic parsing based on known section headers in Dafna's decisions.
"""
import asyncio
import json
import re
import sys
from pathlib import Path
from uuid import UUID
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
# ═══════════════════════════════════════════════════════════════════
# Block definitions with detection patterns
# ═══════════════════════════════════════════════════════════════════
BLOCKS = [
{
"block_id": "block-alef",
"block_index": 1,
"title": "כותרת מוסדית",
"generation_type": "template-fill",
},
{
"block_id": "block-bet",
"block_index": 2,
"title": "הרכב הוועדה",
"generation_type": "template-fill",
},
{
"block_id": "block-gimel",
"block_index": 3,
"title": "צדדים",
"generation_type": "template-fill",
},
{
"block_id": "block-dalet",
"block_index": 4,
"title": "כותרת החלטה",
"generation_type": "template-fill",
},
{
"block_id": "block-he",
"block_index": 5,
"title": "פתיחה",
"generation_type": "paraphrase",
},
{
"block_id": "block-vav",
"block_index": 6,
"title": "רקע עובדתי",
"generation_type": "reproduction",
},
{
"block_id": "block-zayin",
"block_index": 7,
"title": "טענות הצדדים",
"generation_type": "paraphrase",
},
{
"block_id": "block-chet",
"block_index": 8,
"title": "הליכים בפני ועדת הערר",
"generation_type": "reproduction",
},
{
"block_id": "block-tet",
"block_index": 9,
"title": "תכניות חלות",
"generation_type": "guided-synthesis",
},
{
"block_id": "block-yod",
"block_index": 10,
"title": "דיון והכרעה",
"generation_type": "rhetorical-construction",
},
{
"block_id": "block-yod-alef",
"block_index": 11,
"title": "סיכום",
"generation_type": "paraphrase",
},
{
"block_id": "block-yod-bet",
"block_index": 12,
"title": "חתימות",
"generation_type": "template-fill",
},
]
# Section header patterns (Hebrew), keyed by the section they detect.
# Patterns anchored with ^ or $ are compiled with re.MULTILINE so they match
# at line boundaries anywhere in the text, not only at the start or end of
# the whole string.
SECTION_PATTERNS = {
    "claims": re.compile(r"תמצית\s*טענות\s*הצדדים|טענות\s*הצדדים|טענות\s*העוררי"),
    "proceedings": re.compile(r"ההליכים\s*בפני\s*ועדת\s*הערר|הליכים\s*בפני\s*הוועדה|הדיון\s*בפני\s*ועדת\s*הערר"),
    "plans": re.compile(r"תכניות\s*חלות|המסגרת\s*התכנונית|הוראות\s*התכנית"),
    "discussion": re.compile(r"דיון\s*והכרעה|דיון|הכרעה"),
    "summary": re.compile(r"^סיכום$|^סוף\s*דבר$", re.MULTILINE),
    "appellant_claims": re.compile(r"טענות\s*העוררי|טענות\s*העורר"),
    "respondent_claims": re.compile(r"עמדת\s*הוועדה\s*המקומית|תגובת\s*המשיבה|עמדת\s*המשיב"),
    "permit_applicant": re.compile(r"עמדת\s*מבקש|עמדת\s*מגיש|עמדת\s*היזם"),
    "panel": re.compile(r"בפני[:\s]|יו\"ר"),
    "parties_vs": re.compile(r"\s*נגד\s*"),
    "decision_title": re.compile(r"^החלטה$", re.MULTILINE),
    # The opening sentence starts a line somewhere after the header, so it
    # needs MULTILINE; without it ^ could only match at offset 0.
    "opening": re.compile(r"^לפנינו\s|^בפנינו\s", re.MULTILINE),
    "signature": re.compile(r"ניתנה?\s*(היום|פה\s*אחד|ביום)|חתימ"),
}
def find_section_start(text: str, pattern: re.Pattern) -> int:
    """Return the offset of the first match of ``pattern`` in ``text``.

    Returns -1 when the pattern does not occur anywhere in the text.
    """
    hit = pattern.search(text)
    if hit is None:
        return -1
    return hit.start()
def decompose_decision(text: str) -> list[dict]:
    """Split a decision's text into its twelve structural blocks.

    Section boundaries are located by searching ``text`` with the regexes in
    ``SECTION_PATTERNS`` (via ``find_section_start``). Each block runs from
    its own header to the next header that was found after it.

    Args:
        text: Full text of the decision.

    Returns:
        Twelve ``{"block_id": ..., "content": ...}`` dicts, ordered
        block-alef .. block-yod-bet. A block whose section was not found has
        empty content; block-dalet always carries the literal title "החלטה".
    """
    total_len = len(text)

    def found(pos: int) -> bool:
        # find_section_start reports "no match" as -1; offset 0 is a real hit
        # and must not be confused with "missing".
        return pos >= 0

    def first_after(start: int, *candidates: int) -> int:
        """Return the first candidate located after ``start``, else end of text."""
        for candidate in candidates:
            if candidate > start:
                return candidate
        return total_len

    # Find key section boundaries
    pos_claims = find_section_start(text, SECTION_PATTERNS["claims"])
    pos_proceedings = find_section_start(text, SECTION_PATTERNS["proceedings"])
    pos_plans = find_section_start(text, SECTION_PATTERNS["plans"])
    pos_discussion = find_section_start(text, SECTION_PATTERNS["discussion"])
    pos_summary = find_section_start(text, SECTION_PATTERNS["summary"])
    pos_signature = find_section_start(text, SECTION_PATTERNS["signature"])
    pos_opening = find_section_start(text, SECTION_PATTERNS["opening"])
    pos_decision_title = find_section_start(text, SECTION_PATTERNS["decision_title"])
    pos_panel = find_section_start(text, SECTION_PATTERNS["panel"])
    pos_parties = find_section_start(text, SECTION_PATTERNS["parties_vs"])

    blocks = []

    # Blocks alef-dalet: header area, i.e. everything before the opening
    # phrase (or before the claims section, or the first 500 characters when
    # neither was found).
    header_end = pos_opening if found(pos_opening) else pos_claims if found(pos_claims) else 500
    header_text = text[:header_end].strip()

    # Try to split the header into institutional header, panel, parties, title
    if found(pos_panel) and pos_panel < header_end:
        blocks.append({"block_id": "block-alef", "content": text[:pos_panel].strip()})
        if found(pos_parties) and pos_parties < header_end:
            blocks.append({"block_id": "block-bet", "content": text[pos_panel:pos_parties].strip()})
            if found(pos_decision_title) and pos_decision_title < header_end:
                blocks.append({"block_id": "block-gimel", "content": text[pos_parties:pos_decision_title].strip()})
                blocks.append({"block_id": "block-dalet", "content": "החלטה"})
            else:
                blocks.append({"block_id": "block-gimel", "content": text[pos_parties:header_end].strip()})
                blocks.append({"block_id": "block-dalet", "content": "החלטה"})
        else:
            blocks.append({"block_id": "block-bet", "content": text[pos_panel:header_end].strip()})
            blocks.append({"block_id": "block-gimel", "content": ""})
            blocks.append({"block_id": "block-dalet", "content": "החלטה"})
    else:
        # Can't split: put the whole header in alef
        blocks.append({"block_id": "block-alef", "content": header_text})
        blocks.append({"block_id": "block-bet", "content": ""})
        blocks.append({"block_id": "block-gimel", "content": ""})
        blocks.append({"block_id": "block-dalet", "content": "החלטה"})

    # Block he: opening, from the opening phrase towards the claims section
    if found(pos_opening):
        opening_end = first_after(pos_opening, pos_claims, pos_discussion)
        # The opening is usually just 1-3 paragraphs: cap it at 1000 chars...
        opening_text = text[pos_opening:min(pos_opening + 1000, opening_end)].strip()
        # ...and cut at the second line break that lies past the first 50 chars.
        para_breaks = [i for i, c in enumerate(opening_text) if c == '\n' and i > 50]
        if len(para_breaks) >= 2:
            opening_text = opening_text[:para_breaks[1]].strip()
        blocks.append({"block_id": "block-he", "content": opening_text})

        # Block vav: background, from the end of the opening to the claims
        if pos_claims > pos_opening:
            bg_start = pos_opening + len(opening_text)
            blocks.append({"block_id": "block-vav", "content": text[bg_start:pos_claims].strip()})
        else:
            blocks.append({"block_id": "block-vav", "content": ""})
    else:
        blocks.append({"block_id": "block-he", "content": ""})
        blocks.append({"block_id": "block-vav", "content": ""})

    # Block zayin: claims
    if found(pos_claims):
        claims_end = first_after(pos_claims, pos_proceedings, pos_discussion, pos_summary)
        blocks.append({"block_id": "block-zayin", "content": text[pos_claims:claims_end].strip()})
    else:
        blocks.append({"block_id": "block-zayin", "content": ""})

    # Block chet: proceedings (optional)
    if found(pos_proceedings):
        proc_end = first_after(pos_proceedings, pos_plans, pos_discussion, pos_summary)
        blocks.append({"block_id": "block-chet", "content": text[pos_proceedings:proc_end].strip()})
    else:
        blocks.append({"block_id": "block-chet", "content": ""})

    # Block tet: plans (optional); only counted when it precedes the discussion
    plans_limit = pos_discussion if found(pos_discussion) else total_len
    if found(pos_plans) and pos_plans < plans_limit:
        plans_end = first_after(pos_plans, pos_discussion, pos_summary)
        blocks.append({"block_id": "block-tet", "content": text[pos_plans:plans_end].strip()})
    else:
        blocks.append({"block_id": "block-tet", "content": ""})

    # Block yod: discussion
    if found(pos_discussion):
        disc_end = first_after(pos_discussion, pos_summary, pos_signature)
        blocks.append({"block_id": "block-yod", "content": text[pos_discussion:disc_end].strip()})
    else:
        blocks.append({"block_id": "block-yod", "content": ""})

    # Block yod-alef: summary
    if found(pos_summary):
        summ_end = first_after(pos_summary, pos_signature)
        blocks.append({"block_id": "block-yod-alef", "content": text[pos_summary:summ_end].strip()})
    else:
        blocks.append({"block_id": "block-yod-alef", "content": ""})

    # Block yod-bet: signatures, from the signature marker to the end of text
    if found(pos_signature):
        blocks.append({"block_id": "block-yod-bet", "content": text[pos_signature:].strip()})
    else:
        blocks.append({"block_id": "block-yod-bet", "content": ""})

    return blocks
async def main():
    """Decompose every final decision into blocks and store them in the DB.

    For each final decision that has a 'decision' document, the extracted
    text is split with ``decompose_decision``, merged with the static
    ``BLOCKS`` metadata, printed as a per-block summary, and written to
    ``decision_blocks`` (replacing any rows already stored for that decision).
    ``decisions.total_paragraphs`` is updated from the discussion block.
    """
    await init_schema()
    pool = await get_pool()

    async with pool.acquire() as conn:
        decisions = await conn.fetch(
            """SELECT d.id as decision_id, c.case_number, c.title, d.total_words,
doc.extracted_text
FROM decisions d
JOIN cases c ON c.id = d.case_id
JOIN documents doc ON doc.case_id = d.case_id AND doc.doc_type = 'decision'
WHERE d.status = 'final'
ORDER BY c.case_number"""
        )

    for dec in decisions:
        decision_id = dec["decision_id"]
        case_number = dec["case_number"]
        text = dec["extracted_text"]
        if not text:
            # A NULL/empty extracted_text would crash on .split() below, and
            # decomposing nothing would only wipe the blocks already stored.
            print(f"מדלג: {case_number} — אין טקסט מחולץ")
            continue
        total_words = len(text.split())

        print(f"\n{'='*60}")
        print(f"מפרק: {case_number}{dec['title']}")
        print(f"{'='*60}")

        # Decompose
        blocks = decompose_decision(text)
        content_by_id = {}
        for b in blocks:
            # Keep the first occurrence of each block_id, like a first-match lookup.
            content_by_id.setdefault(b["block_id"], b["content"])

        # Merge with block metadata
        block_data = []
        for block_def in BLOCKS:
            content = content_by_id.get(block_def["block_id"], "")
            word_count = len(content.split()) if content else 0
            weight = round((word_count / total_words * 100), 2) if total_words > 0 and word_count > 0 else 0
            block_data.append({
                **block_def,
                "content": content,
                "word_count": word_count,
                "weight_percent": weight,
                "status": "final" if content else "empty",
            })

        # Print summary
        for b in block_data:
            # NOTE(review): both branches are the empty string in this view;
            # the original markers may have been lost. Confirm against the file.
            status = "" if b["word_count"] > 0 else ""
            print(f" {status} {b['block_id']:18s} | {b['title']:25s} | {b['word_count']:5d} מילים | {b['weight_percent']:5.1f}%")

        # Store in DB
        async with pool.acquire() as conn:
            # Replace the decision's blocks atomically: a failure part-way
            # through must not leave the old rows deleted and only some of
            # the new ones inserted.
            async with conn.transaction():
                # Delete existing blocks for this decision
                await conn.execute(
                    "DELETE FROM decision_blocks WHERE decision_id = $1", decision_id
                )
                for b in block_data:
                    await conn.execute(
                        """INSERT INTO decision_blocks
(decision_id, block_id, block_index, title, content,
word_count, weight_percent, generation_type, status)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
                        decision_id,
                        b["block_id"], b["block_index"], b["title"],
                        b["content"], b["word_count"], b["weight_percent"],
                        b["generation_type"], b["status"],
                    )

                # Count paragraphs in the discussion block: non-blank lines
                # longer than 20 characters.
                discussion = [b for b in block_data if b["block_id"] == "block-yod"][0]
                if discussion["content"]:
                    paragraphs = [p.strip() for p in discussion["content"].split("\n") if p.strip() and len(p.strip()) > 20]
                    await conn.execute(
                        "UPDATE decisions SET total_paragraphs = $1 WHERE id = $2",
                        len(paragraphs), decision_id,
                    )

    # Final summary
    async with pool.acquire() as conn:
        block_count = await conn.fetchval("SELECT count(*) FROM decision_blocks")
        non_empty = await conn.fetchval("SELECT count(*) FROM decision_blocks WHERE status = 'final'")

    await close_pool()
    print(f"\n{'='*60}")
    print(f"✅ סה\"כ בלוקים: {block_count} ({non_empty} עם תוכן)")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""Export a decision from DB to DOCX using the CJS template generator.
Usage: python export-decision-docx.py <case_number> [output.docx]
Pulls decision blocks from DB, generates structure JSON,
invokes create-decision-structure.cjs to produce DOCX.
"""
import asyncio
import json
import subprocess
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
CJS_SCRIPT = Path(__file__).parent.parent / "skills" / "decision" / "scripts" / "create-decision-structure.cjs"
def block_id_to_hebrew(block_id: str) -> str:
    """Return the Hebrew letter label for ``block_id``, or "" if it is unknown."""
    letters = {
        "block-alef": "א",
        "block-bet": "ב",
        "block-gimel": "ג",
        "block-dalet": "ד",
        "block-he": "ה",
        "block-vav": "ו",
        "block-zayin": "ז",
        "block-chet": "ח",
        "block-tet": "ט",
        "block-yod": "י",
        "block-yod-alef": "יא",
        "block-yod-bet": "יב",
    }
    if block_id in letters:
        return letters[block_id]
    return ""
async def main():
    """Export the final decision of one case to DOCX via the CJS generator.

    Command line: ``<case_number> [output.docx]``. The case, its final
    decision and the decision's blocks are read from the DB, written as a
    structure JSON next to the output path, and passed to the Node script at
    ``CJS_SCRIPT``.

    Exits with status 1 on bad usage, an unknown case, or a case without a
    final decision, and with a non-zero status when the generator fails.
    """
    if len(sys.argv) < 2:
        print("שימוש: python export-decision-docx.py <מספר_תיק> [output.docx]")
        sys.exit(1)

    case_number = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else f"החלטה-{case_number}.docx"

    await init_schema()
    pool = await get_pool()

    async with pool.acquire() as conn:
        # Get case info
        case = await conn.fetchrow(
            "SELECT * FROM cases WHERE case_number = $1", case_number
        )
        if not case:
            print(f"תיק {case_number} לא נמצא")
            sys.exit(1)

        # Get decision
        decision = await conn.fetchrow(
            "SELECT * FROM decisions WHERE case_id = $1 AND status = 'final'",
            case["id"],
        )
        if not decision:
            print(f"אין החלטה סופית לתיק {case_number}")
            sys.exit(1)

        # Get blocks
        blocks = await conn.fetch(
            """SELECT block_id, block_index, title, content, word_count
FROM decision_blocks
WHERE decision_id = $1
ORDER BY block_index""",
            decision["id"],
        )

    await close_pool()

    def _party_names(raw):
        """Decode a parties column that may be a JSON string, a list, or NULL."""
        if isinstance(raw, str):
            raw = json.loads(raw)
        # A NULL column (or JSON null) would otherwise break the iteration below.
        return raw or []

    # Build structure JSON for CJS script
    appellants = _party_names(case["appellants"])
    respondents = _party_names(case["respondents"])

    structure = {
        "metadata": {
            "case_number": case["case_number"],
            "title": case["title"],
            "subject": case["subject"],
            "property_address": case["property_address"],
            "committee": case["committee_type"],
            "outcome": decision["outcome"] or "",
            "decision_date": str(decision["decision_date"]) if decision["decision_date"] else "",
            "author": decision["author"],
        },
        "parties": {
            "appellants": [{"name": a} for a in appellants],
            "respondents": [{"name": r} for r in respondents],
        },
        "blocks": [],
    }

    for block in blocks:
        content = block["content"] or ""
        # Skip empty header blocks
        if block["block_id"] in ("block-alef", "block-bet", "block-gimel", "block-dalet") and not content:
            continue

        # One paragraph per non-blank line of the block
        paragraphs = [p.strip() for p in content.split("\n") if p.strip()]
        structure["blocks"].append({
            "id": block["block_id"],
            "index": block["block_index"],
            "title": block["title"],
            "hebrew_letter": block_id_to_hebrew(block["block_id"]),
            "word_count": block["word_count"],
            "paragraphs": paragraphs,
        })

    # Write JSON (absolute paths, because the generator runs with another cwd)
    output_abs = Path(output_path).resolve()
    json_path = output_abs.with_suffix(".json")
    json_path.parent.mkdir(parents=True, exist_ok=True)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(structure, f, ensure_ascii=False, indent=2)
    print(f"JSON נוצר: {json_path}")

    # Run CJS script with absolute paths
    result = subprocess.run(
        ["node", str(CJS_SCRIPT), str(json_path), str(output_abs)],
        capture_output=True, text=True,
        cwd=str(CJS_SCRIPT.parent),
    )

    if result.returncode == 0:
        print(f"✅ DOCX נוצר: {output_path}")
    else:
        print(f"❌ שגיאה ביצירת DOCX:")
        print(result.stderr)
        # JSON is still available for manual processing
        print(f"ה-JSON זמין: {json_path}")
        # Report the failure to the caller instead of exiting with status 0.
        sys.exit(result.returncode or 1)
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""Extract case law citations from block-yod and link to case_law table."""
import asyncio
import re
import sys
from pathlib import Path
from uuid import UUID
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
# Patterns for Israeli case law citations.
# Patterns are tried in this order; a citation matched by more than one of
# them yields the same matched text each time.
CITATION_PATTERNS = [
    # Court-type abbreviation (עע"מ, בג"ץ, ע"א, etc.) followed by a docket
    # number made of digits, "/" and "-".
    re.compile(r'(עע"מ|בג"ץ|ע"א|בר"ם|עת"מ|עמ"נ|ע"ע|רע"א|דנ"א|בש"א)\s*(\d[\d/\-]+)'),
    # "ערר" with an optional, optionally parenthesized, district name and a number
    re.compile(r'ערר\s*\(?\s*(?:מרכז|ירושלים|חי\'?|ת"א|דרום|צפון)?\s*\)?\s*(\d[\d/\-]+)'),
    # "ערר" without district: 3-5 digits, "-" or "/", then 2-4 digits
    re.compile(r'ערר\s+(\d{3,5}[\-/]\d{2,4})'),
]
def extract_citations_from_text(text: str) -> list[dict]:
    """Collect the distinct case-law citations that appear in ``text``.

    Every regex in ``CITATION_PATTERNS`` is scanned over the text in turn.
    A citation is identified by its exact matched string, and only its first
    occurrence is kept.

    Returns:
        A list of ``{"citation_text", "context"}`` dicts in discovery order.
        ``context`` is the text from 50 characters before the match to 100
        characters after it, with newlines replaced by spaces.
    """
    by_citation: dict[str, dict] = {}
    text_len = len(text)
    for regex in CITATION_PATTERNS:
        for hit in regex.finditer(text):
            cited = hit.group(0)
            if cited in by_citation:
                continue
            # Surrounding context: 50 chars before, 100 chars after
            window_start = max(0, hit.start() - 50)
            window_end = min(text_len, hit.end() + 100)
            by_citation[cited] = {
                "citation_text": cited,
                "context": text[window_start:window_end].replace("\n", " "),
            }
    return list(by_citation.values())
async def main():
    """Extract citations from every discussion block and link them to case_law.

    Reads all non-empty 'block-yod' rows, extracts citations with
    ``extract_citations_from_text``, tries to match each citation to a
    ``case_law`` row by substring comparison of case numbers, and inserts a
    ``case_law_citations`` row (type 'support') for each matched pair of
    case law and decision that is not linked yet. Prints per-case results
    and overall totals.
    """
    await init_schema()
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Get all block-yod content with decision info
        blocks = await conn.fetch(
            """SELECT db.content, d.id as decision_id, c.case_number
FROM decision_blocks db
JOIN decisions d ON d.id = db.decision_id
JOIN cases c ON c.id = d.case_id
WHERE db.block_id = 'block-yod' AND db.word_count > 0
ORDER BY c.case_number"""
        )
        # Get existing case_law for matching
        case_laws = await conn.fetch("SELECT id, case_number, case_name FROM case_law")
    # Lookup table: case-number string (full or short form) -> case_law id
    case_law_map = {}
    for cl in case_laws:
        # Index by various forms of the case number
        case_law_map[cl["case_number"]] = cl["id"]
        # Also index by short number (e.g., "3975/22" from "עע"מ 3975/22")
        parts = cl["case_number"].split()
        if len(parts) > 1:
            case_law_map[parts[-1]] = cl["id"]
    total_citations = 0
    total_linked = 0
    for block in blocks:
        case_number = block["case_number"]
        decision_id = block["decision_id"]
        text = block["content"]
        citations = extract_citations_from_text(text)
        if not citations:
            continue
        print(f"\n{case_number}: {len(citations)} ציטוטים נמצאו")
        async with pool.acquire() as conn:
            for cit in citations:
                total_citations += 1
                # Try to match to case_law table: the first key that contains,
                # or is contained in, the citation text wins.
                # NOTE(review): substring matching means a short key could match
                # an unrelated citation; confirm this is acceptable.
                case_law_id = None
                for key, cl_id in case_law_map.items():
                    if key in cit["citation_text"] or cit["citation_text"] in key:
                        case_law_id = cl_id
                        break
                if case_law_id:
                    # Check if already exists
                    existing = await conn.fetchval(
                        """SELECT id FROM case_law_citations
WHERE case_law_id = $1 AND decision_id = $2""",
                        case_law_id, decision_id,
                    )
                    if not existing:
                        await conn.execute(
                            """INSERT INTO case_law_citations
(case_law_id, decision_id, citation_type, context_text)
VALUES ($1, $2, 'support', $3)""",
                            case_law_id, decision_id, cit["context"],
                        )
                    total_linked += 1
                    print(f"{cit['citation_text'][:40]} → קושר לפסיקה")
                else:
                    print(f"{cit['citation_text'][:40]} — לא נמצא ב-DB")
    # Summary
    async with pool.acquire() as conn:
        total_in_db = await conn.fetchval("SELECT count(*) FROM case_law_citations")
    await close_pool()
    print(f"\n{'='*50}")
    print(f"סה\"כ ציטוטים שנמצאו: {total_citations}")
    print(f"סה\"כ קושרו לפסיקה ב-DB: {total_linked}")
    print(f"סה\"כ ב-case_law_citations: {total_in_db}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""Extract individual claims from block-zayin of each decision.
Identifies party sub-sections and individual claims (paragraphs).
Stores in the claims table with party_role classification.
"""
import asyncio
import json
import re
import sys
from pathlib import Path
from uuid import UUID
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
# Party role detection patterns: (regex source, role) pairs.
# Order matters: detect_party_role returns the role of the first pattern
# that matches, so the committee-specific entry precedes the general
# respondent entry.
PARTY_PATTERNS = [
    # Appellants
    (r"טענות\s*העוררי[םן]|טענות\s*העורר\b|טענות\s*המבקש|טענות\s*המערער", "appellant"),
    # Respondent - local committee
    (r"עמדת\s*הוועדה\s*המקומית|עמדת\s*המשיבה|טענות\s*המשיבה|תגובת\s*המשיבה|הוועדה\s*המקומית$", "committee"),
    # Respondent - general
    (r"עמדת\s*המשיבי[םן]|עמדת\s*המשיב\b|טענות\s*המשיבי[םן]|טענות\s*המשיב\b", "respondent"),
    # Permit applicant
    (r"מבקשי\s*ההיתר|עמדת\s*מבקש|עמדת\s*היזם|מגישי\s*התכנית", "permit_applicant"),
    # Appraiser clarifications (betterment-levy cases)
    (r"הבהרות\s*השמא|התייחסות\s*הצדדים", "appraiser"),
]
def detect_party_role(line: str) -> str | None:
    """Return the party role whose header pattern occurs in ``line``.

    ``PARTY_PATTERNS`` is scanned in order and the role of the first pattern
    found anywhere in the line is returned; ``None`` means no pattern matched.
    """
    return next(
        (role for regex, role in PARTY_PATTERNS if re.search(regex, line)),
        None,
    )
def is_section_header(line: str) -> bool:
    """Tell whether ``line`` is a section/sub-section header rather than a claim.

    Only non-empty lines shorter than 50 characters (after stripping) can be
    headers. Such a line counts as a header when it names a party (see
    ``detect_party_role``) or starts with one of the generic header forms.
    """
    candidate = line.strip()
    if not candidate or len(candidate) >= 50:
        return False
    if detect_party_role(candidate) is not None:
        return True
    generic_headers = (
        r"^תמצית\s*טענות",
        r"^[א-ת][\.\)]\s*טענות",
        r"^[א-ת][\.\)]\s*כללי",
        r"^\d+\.\s*$",  # just a number
    )
    return any(re.match(regex, candidate) for regex in generic_headers)
def is_numbered_sub_header(line: str) -> bool:
    """Tell whether ``line`` is a numbered topic header inside the claims.

    A header is a number, a dot, whitespace and then a short title of 4 to 41
    characters, e.g. '2. שיעור ההפקעה'.
    """
    candidate = line.strip()
    return re.match(r"^\d+\.\s+\S.{3,40}$", candidate) is not None
def extract_claims_from_block(text: str) -> list[dict]:
    """Extract the individual claims, grouped by party, from block-zayin text.

    The text is processed line by line; blank lines are ignored. A short line
    (at most 8 words) that names a party switches the current role. Generic
    section headers are dropped. Every other line, including a numbered
    sub-header, starts a new claim. A claim is kept only when its text is
    longer than 30 characters.

    Returns:
        A list of ``{"party_role", "claim_text", "claim_index"}`` dicts, with
        ``claim_index`` counting the kept claims from 0. Claims found before
        any party header are attributed to "appellant".
    """
    collected = []
    active_role = "appellant"  # default if no header found
    pending = []

    def flush():
        # Emit the pending claim (if it is long enough) under the role that is
        # active right now, then start over with an empty buffer.
        if not pending:
            return
        body = "\n".join(pending).strip()
        if len(body) > 30:
            collected.append({
                "party_role": active_role,
                "claim_text": body,
                "claim_index": len(collected),
            })
        pending.clear()

    for raw_line in text.split("\n"):
        entry = raw_line.strip()
        if not entry:
            continue

        # A party header must be a SHORT line (header, not claim content)
        new_role = detect_party_role(entry) if len(entry.split()) <= 8 else None
        if new_role:
            flush()
            active_role = new_role
            continue

        # Generic section headers end the pending claim and are skipped
        if is_section_header(entry):
            flush()
            continue

        # Numbered sub-headers (betterment-levy style, e.g. "2. שיעור ההפקעה")
        # and ordinary paragraphs both start a new claim.
        flush()
        pending.append(entry)

    # Save last claim
    flush()
    return collected
async def main():
    """Extract claims from every non-empty block-zayin and store them per case.

    For each row, the case's existing ``claims`` rows are deleted and the
    claims returned by ``extract_claims_from_block`` are inserted with
    ``source_document`` set to "block-zayin". Per-role counts are printed for
    each case, followed by overall totals read back from the table.
    """
    await init_schema()
    pool = await get_pool()
    async with pool.acquire() as conn:
        # Get all block-zayin with content
        rows = await conn.fetch(
            """SELECT c.id as case_id, c.case_number, c.title,
db.content
FROM decision_blocks db
JOIN decisions d ON d.id = db.decision_id
JOIN cases c ON c.id = d.case_id
WHERE db.block_id = 'block-zayin' AND db.word_count > 0
ORDER BY c.case_number"""
        )
    total_claims = 0
    for row in rows:
        case_id = row["case_id"]
        case_number = row["case_number"]
        text = row["content"]
        claims = extract_claims_from_block(text)
        print(f"\n{'='*50}")
        print(f"תיק: {case_number}{row['title']}")
        print(f"{'='*50}")
        async with pool.acquire() as conn:
            # Delete existing claims for this case
            # NOTE(review): the delete is keyed by case, not by decision, so a
            # case appearing in more than one row keeps only the last row's claims.
            await conn.execute("DELETE FROM claims WHERE case_id = $1", case_id)
            # Tally of inserted claims per party role, for the printout below
            role_counts = {}
            for claim in claims:
                role = claim["party_role"]
                role_counts[role] = role_counts.get(role, 0) + 1
                await conn.execute(
                    """INSERT INTO claims (case_id, party_role, claim_text, claim_index, source_document)
VALUES ($1, $2, $3, $4, $5)""",
                    case_id,
                    claim["party_role"],
                    claim["claim_text"],
                    claim["claim_index"],
                    "block-zayin",
                )
        for role, count in sorted(role_counts.items()):
            # Hebrew display name of the role; unknown roles print as-is
            role_heb = {
                "appellant": "עוררים",
                "committee": "ועדה מקומית",
                "respondent": "משיבים",
                "permit_applicant": "מבקשי היתר",
                "appraiser": "שמאי",
            }.get(role, role)
            print(f" {role_heb:20s}{count} טענות")
        total_claims += len(claims)
        print(f" סה\"כ: {len(claims)} טענות")
    # Summary
    async with pool.acquire() as conn:
        total = await conn.fetchval("SELECT count(*) FROM claims")
        by_role = await conn.fetch(
            "SELECT party_role, count(*) as cnt FROM claims GROUP BY party_role ORDER BY cnt DESC"
        )
    print(f"\n{'='*50}")
    print(f"סיכום כללי — {total} טענות מ-{len(rows)} החלטות")
    for r in by_role:
        print(f" {r['party_role']:20s}{r['cnt']}")
    await close_pool()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,128 @@
"""Extract ALL PDFs from originals using Google Cloud Vision OCR.
Forces OCR on all pages (ignoring broken text layers).
Then runs voyage-law-2 embedding benchmark comparing old vs new.
"""
import asyncio
import json
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
import fitz
from google.cloud import vision
from legal_mcp import config
API_KEY = config.GOOGLE_CLOUD_VISION_API_KEY
client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY})
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
# Hebrew abbreviation quote fixer.
# Maps abbreviations in which the quote mark came out as a double yod ("יי")
# to the correct spelling with a '"'.
import re
_ABBREV_FIXES = {
    'עוהייד': 'עוה"ד', 'עוייד': 'עו"ד', 'הנייל': 'הנ"ל',
    'מצייב': 'מצ"ב', 'ביהמייש': 'ביהמ"ש', 'תייז': 'ת"ז',
    'עייי': 'ע"י', 'אחייכ': 'אח"כ', 'סייק': 'ס"ק',
    'דייר': 'ד"ר', 'כדוייח': 'כדו"ח', 'חווייד': 'חוו"ד',
    'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ',
}
# Single alternation over all broken forms, longest first so that a longer
# form is preferred over a shorter one starting at the same position.
_ABBREV_PAT = re.compile('|'.join(re.escape(k) for k in sorted(_ABBREV_FIXES, key=len, reverse=True)))
def fix_quotes(text):
    """Return ``text`` with every known broken abbreviation replaced by its fixed form."""
    def _restore(match):
        # The matched text is always one of the keys the pattern was built from.
        return _ABBREV_FIXES[match.group()]

    return _ABBREV_PAT.sub(_restore, text)
def ocr_page(image_bytes, page_num):
    """OCR one page image with Google Cloud Vision and return its text.

    The request carries a Hebrew language hint. If the response has an error
    message, it is printed with the page number and "" is returned. Otherwise
    the full-text annotation (or "" if absent) is passed through ``fix_quotes``.
    """
    page_image = vision.Image(content=image_bytes)
    hebrew_context = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=page_image,
        image_context=hebrew_context,
    )

    if response.error.message:
        print(f" ERROR page {page_num}: {response.error.message}")
        return ""

    annotation = response.full_text_annotation
    if annotation:
        raw_text = annotation.text
    else:
        raw_text = ""
    return fix_quotes(raw_text)
def process_pdf(pdf_path):
    """OCR every page of a PDF and return the combined text with statistics.

    Each page is rendered to PNG at 300 dpi and sent to ``ocr_page``.
    Progress is printed per page.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        ``(text, page_count, total_time)``: the page texts joined by blank
        lines, the number of pages, and the elapsed wall-clock seconds.
    """
    doc = fitz.open(str(pdf_path))
    try:
        page_count = len(doc)
        pages_text = []
        t0 = time.time()
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=300)
            img_bytes = pix.tobytes("png")
            pt = time.time()
            text = ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f" Page {i+1}/{page_count}: {len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Close the document even when rendering or OCR raises.
        doc.close()
    total_time = time.time() - t0
    return "\n\n".join(pages_text), page_count, total_time
def main():
    """OCR every PDF in ``ORIGINALS_DIR`` and save the text under ``OUTPUT_DIR``.

    A PDF whose ``<stem>.md`` output already exists with more than 100 bytes
    is skipped (only its page count is read). Per-document statistics are
    printed and finally written as JSON to
    ``/home/chaim/legal-ai/data/google-vision-extraction.json``.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    pdfs = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdfs)} PDFs\n")
    results = []
    total_pages = 0
    total_time = 0.0
    for pdf in pdfs:
        out_file = OUTPUT_DIR / f"{pdf.stem}.md"
        # Skip already extracted
        if out_file.exists() and out_file.stat().st_size > 100:
            text = out_file.read_text(encoding="utf-8")
            doc = fitz.open(str(pdf))
            try:
                pages = len(doc)
            finally:
                doc.close()
            print(f"SKIP (exists): {pdf.name} ({pages} pages, {len(text):,} chars)")
            results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": len(text.split()), "time": 0, "skipped": True})
            total_pages += pages
            continue
        print(f"{'=' * 60}")
        print(f" {pdf.name} ({pdf.stat().st_size:,} bytes)")
        text, pages, elapsed = process_pdf(pdf)
        total_pages += pages
        total_time += elapsed
        out_file.write_text(text, encoding="utf-8")
        words = len(text.split())
        print(f" Result: {pages} pages, {len(text):,} chars, {words:,} words, {elapsed:.1f}s")
        print(f" Saved: {out_file.name}\n")
        results.append({"name": pdf.stem, "pages": pages, "chars": len(text), "words": words, "time": elapsed, "skipped": False})
    print(f"\n{'=' * 60}")
    print(f"TOTAL: {len(pdfs)} docs, {total_pages} pages, {total_time:.1f}s")
    # Skipped documents are included in the page total used for the estimate.
    est_cost = total_pages * 0.0015
    print(f"Estimated cost: ${est_cost:.2f}")
    # Save results. The JSON keeps non-ASCII file names verbatim
    # (ensure_ascii=False), so the encoding must be explicit rather than
    # whatever the locale default happens to be.
    Path("/home/chaim/legal-ai/data/google-vision-extraction.json").write_text(
        json.dumps(results, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,66 @@
"""Extract text from original PDF files using Claude Opus Vision OCR."""
import asyncio
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
from legal_mcp.services import extractor
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR / "extracted"
async def main():
    """Extract text from every PDF in ``ORIGINALS_DIR`` and report statistics.

    Each PDF is passed to ``extractor.extract_text``. The returned text is
    saved as ``<stem>.md`` in ``OUTPUT_DIR``, and size, timing and a rough
    cost estimate ($0.05 per page) are printed per document and in total.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    pdf_files = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDFs\n")

    ruler = "=" * 60
    cost_sum = 0.0
    page_sum = 0
    seconds_sum = 0.0

    for pdf_file in pdf_files:
        print(ruler)
        print(f"Processing: {pdf_file.name}")
        print(f" Size: {pdf_file.stat().st_size:,} bytes")

        started = time.time()
        extracted, n_pages = await extractor.extract_text(str(pdf_file))
        took = time.time() - started

        page_sum += n_pages
        seconds_sum += took

        # Estimate cost (Opus: $15/M input, $75/M output, ~1000 tokens per image)
        # Rough: ~$0.05 per page for image input + output
        page_cost = n_pages * 0.05
        cost_sum += page_cost

        # Save extracted text
        target = OUTPUT_DIR / f"{pdf_file.stem}.md"
        target.write_text(extracted, encoding="utf-8")

        per_page = took / max(n_pages, 1)
        print(f" Pages: {n_pages}")
        print(f" Extracted: {len(extracted):,} chars, {len(extracted.split()):,} words")
        print(f" Time: {took:.1f}s ({per_page:.1f}s/page)")
        print(f" Est. cost: ${page_cost:.3f}")
        print(f" Saved to: {target.name}")
        print()

    print(ruler)
    print("TOTAL")
    print(f" Documents: {len(pdf_files)}")
    print(f" Pages: {page_sum}")
    print(f" Time: {seconds_sum:.1f}s")
    print(f" Est. cost: ${cost_sum:.3f}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,113 @@
"""Extract text from original PDF files using Claude Opus Vision OCR on ALL pages.
Forces Vision OCR regardless of embedded text layer (which may be broken).
"""
import asyncio
import base64
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "mcp-server" / "src"))
from dotenv import load_dotenv
load_dotenv(Path.home() / ".env")
import anthropic
import fitz
from legal_mcp import config
client = anthropic.Anthropic(api_key=config.ANTHROPIC_API_KEY)
MODEL = "claude-opus-4-20250514"
ORIGINALS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals")
OUTPUT_DIR = ORIGINALS_DIR.parent / "extracted"
async def ocr_page(image_bytes: bytes, page_num: int) -> str:
    """Send one PNG page image to the model and return the extracted text.

    The image is base64-encoded and sent together with a Hebrew instruction
    to extract the text of a Hebrew legal document. The text of the first
    content item of the reply is returned. ``page_num`` is not used.
    """
    encoded = base64.b64encode(image_bytes).decode("utf-8")
    instruction = (
        "חלץ את כל הטקסט מהתמונה הזו. זהו מסמך משפטי בעברית. "
        "שמור על מבנה הפסקאות המקורי. "
        "אם יש כותרות, סמן אותן. "
        "החזר רק את הטקסט המחולץ, ללא הערות נוספות."
    )
    image_part = {
        "type": "image",
        "source": {"type": "base64", "media_type": "image/png", "data": encoded},
    }
    text_part = {"type": "text", "text": instruction}
    reply = client.messages.create(
        model=MODEL,
        max_tokens=4096,
        messages=[{"role": "user", "content": [image_part, text_part]}],
    )
    first_item = reply.content[0]
    return first_item.text
async def process_pdf(pdf_path: Path) -> tuple[str, int, float]:
    """OCR every page of a PDF and return the combined text with statistics.

    Each page is rendered to PNG at 200 dpi and sent to ``ocr_page``.
    Progress is printed per page.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        ``(full_text, page_count, total_time)``: the page texts joined by
        blank lines, the number of pages, and the elapsed wall-clock seconds.
    """
    doc = fitz.open(str(pdf_path))
    try:
        page_count = len(doc)
        pages_text = []
        t0 = time.time()
        for i in range(page_count):
            page = doc[i]
            pix = page.get_pixmap(dpi=200)
            img_bytes = pix.tobytes("png")
            # Print the page label first, without a newline, so the result can
            # be appended to the same line once the OCR call returns.
            print(f" Page {i+1}/{page_count}...", end=" ", flush=True)
            pt = time.time()
            text = await ocr_page(img_bytes, i + 1)
            elapsed = time.time() - pt
            pages_text.append(text)
            print(f"{len(text):,} chars, {elapsed:.1f}s")
    finally:
        # Close the document even when rendering or OCR raises.
        doc.close()
    total_time = time.time() - t0
    full_text = "\n\n".join(pages_text)
    return full_text, page_count, total_time
async def main():
    """Run forced vision OCR over every PDF in ``ORIGINALS_DIR``.

    Each PDF goes through ``process_pdf``. Its text is saved as ``<stem>.md``
    in ``OUTPUT_DIR``, and per-document and overall statistics are printed,
    with a cost estimate of $0.05 per page.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    pdf_files = sorted(ORIGINALS_DIR.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDFs — extracting ALL pages with {MODEL}\n")

    ruler = "=" * 60
    page_sum = 0
    seconds_sum = 0.0

    for pdf_file in pdf_files:
        print(ruler)
        print(f" {pdf_file.name} ({pdf_file.stat().st_size:,} bytes)")
        print(ruler)

        extracted, n_pages, took = await process_pdf(pdf_file)
        page_sum += n_pages
        seconds_sum += took

        target = OUTPUT_DIR / f"{pdf_file.stem}.md"
        target.write_text(extracted, encoding="utf-8")

        per_page = took / max(n_pages, 1)
        print(f" Result: {n_pages} pages, {len(extracted):,} chars, {len(extracted.split()):,} words")
        print(f" Time: {took:.1f}s ({per_page:.1f}s/page)")
        print(f" Saved: {target.name}\n")

    print(ruler)
    print(f"TOTAL: {len(pdf_files)} docs, {page_sum} pages, {seconds_sum:.1f}s")
    estimated = page_sum * 0.05
    print(f"Estimated cost: ${estimated:.2f}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""Generate embeddings for decision blocks and case law.
Creates:
- paragraph_embeddings: for each decision block with content
- case_law_embeddings: for each case law summary
"""
import asyncio
import sys
from pathlib import Path
from uuid import UUID
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
from legal_mcp.services.embeddings import embed_texts
from legal_mcp import config
async def generate_block_embeddings(conn) -> int:
    """Create paragraph records and embeddings for decision blocks.

    Selects the decision blocks with more than 10 words, excluding the four
    header blocks (alef-dalet). Each block's content becomes one or more
    ``decision_paragraphs`` rows, and every newly inserted row is embedded
    with ``embed_texts`` and stored in ``paragraph_embeddings``.

    Args:
        conn: Database connection used for all queries.

    Returns:
        The number of embeddings stored (0 if there was nothing to do).
    """
    blocks = await conn.fetch(
        """SELECT db.id as block_id, db.decision_id, db.block_id as block_type,
db.content, db.word_count, c.case_number
FROM decision_blocks db
JOIN decisions d ON d.id = db.decision_id
JOIN cases c ON c.id = d.case_id
WHERE db.word_count > 10
AND db.block_id NOT IN ('block-alef', 'block-bet', 'block-gimel', 'block-dalet')
ORDER BY c.case_number, db.block_index"""
    )
    if not blocks:
        print(" אין בלוקים ליצירת embeddings")
        return 0
    print(f" מעבד {len(blocks)} בלוקים...")
    # Create paragraphs and collect texts for embedding
    para_records = []
    # NOTE(review): this counter runs across all blocks rather than restarting
    # per block; confirm that is the intended paragraph_number semantics.
    para_number = 1
    for block in blocks:
        content = block["content"]
        words = content.split()
        # Split into chunks for embedding: blocks of up to 600 words stay whole;
        # longer ones are cut into 500-word windows starting every 400 words
        # (100 words of overlap), and windows of 50 words or fewer are dropped.
        if len(words) <= 600:
            chunk_texts = [content]
        else:
            chunk_texts = []
            for start in range(0, len(words), 400):
                chunk_words = words[start:start + 500]
                if len(chunk_words) > 50:
                    chunk_texts.append(" ".join(chunk_words))
        for chunk_text in chunk_texts:
            # Create decision_paragraph record; on a conflict nothing is
            # inserted and no id is returned.
            para_id = await conn.fetchval(
                """INSERT INTO decision_paragraphs
(block_id, paragraph_number, content, word_count)
VALUES ($1, $2, $3, $4)
ON CONFLICT DO NOTHING
RETURNING id""",
                block["block_id"],
                para_number,
                chunk_text,
                len(chunk_text.split()),
            )
            # Only rows that were actually inserted are queued for embedding
            if para_id:
                para_records.append({
                    "para_id": para_id,
                    "text": chunk_text,
                    "case_number": block["case_number"],
                })
            para_number += 1
    if not para_records:
        print(" אין פסקאות חדשות")
        return 0
    print(f" {len(para_records)} פסקאות נוצרו, מייצר embeddings...")
    # Generate embeddings for all collected texts in one call
    texts = [p["text"] for p in para_records]
    embeddings = await embed_texts(texts, input_type="document")
    # Store embeddings, pairing each paragraph with its embedding by position
    count = 0
    for para, embedding in zip(para_records, embeddings):
        await conn.execute(
            """INSERT INTO paragraph_embeddings (paragraph_id, embedding)
VALUES ($1, $2)""",
            para["para_id"],
            embedding,
        )
        count += 1
    return count
async def generate_case_law_embeddings(conn) -> int:
    """Embed case-law rows that do not have an embedding yet.

    Rows are read from ``case_law`` (non-empty ``summary`` or ``key_quote``)
    and any id already present in ``case_law_embeddings`` is skipped.

    Args:
        conn: Database connection exposing awaitable ``fetch`` and ``execute``.

    Returns:
        Number of rows inserted into ``case_law_embeddings``.
    """
    candidates = await conn.fetch(
        """SELECT id, case_number, case_name, summary, key_quote
FROM case_law
WHERE summary != '' OR key_quote != ''"""
    )
    embedded_rows = await conn.fetch("SELECT case_law_id FROM case_law_embeddings")
    known_ids = {row["case_law_id"] for row in embedded_rows}
    pending = [row for row in candidates if row["id"] not in known_ids]

    if not pending:
        print(" אין פסיקה חדשה ליצירת embeddings")
        return 0
    print(f" מייצר embeddings ל-{len(pending)} תקדימים...")

    def _headline(row) -> str:
        # "<case number> <case name>: <summary>" -- also the stored chunk_text.
        return f"{row['case_number']} {row['case_name']}: {row['summary']}"

    # The embedded text appends the key quote when there is one; the stored
    # chunk_text below deliberately mirrors the original and omits it.
    texts = [
        _headline(row) + (f" ציטוט: {row['key_quote']}" if row["key_quote"] else "")
        for row in pending
    ]
    vectors = await embed_texts(texts, input_type="document")

    stored = 0
    for row, vector in zip(pending, vectors):
        await conn.execute(
            """INSERT INTO case_law_embeddings (case_law_id, chunk_text, embedding)
VALUES ($1, $2, $3)""",
            row["id"],
            _headline(row),
            vector,
        )
        stored += 1
    return stored
async def main():
    """Run both embedding passes and print a summary of the resulting tables.

    Step 1 embeds decision blocks, step 2 embeds case-law summaries. The
    connection pool is closed even when a step raises, so a failed run does
    not leave the pool open.
    """
    await init_schema()
    pool = await get_pool()
    try:
        async with pool.acquire() as conn:
            print("שלב 1: embeddings לבלוקי החלטה")
            block_count = await generate_block_embeddings(conn)
            print(f"{block_count} embeddings נוצרו")
            print("\nשלב 2: embeddings לפסיקה")
            cl_count = await generate_case_law_embeddings(conn)
            print(f"{cl_count} embeddings נוצרו")
            # Totals for the summary are read while the connection is still held.
            para_total = await conn.fetchval("SELECT count(*) FROM paragraph_embeddings")
            cl_total = await conn.fetchval("SELECT count(*) FROM case_law_embeddings")
    finally:
        await close_pool()
    print("\nסיכום:")
    print(f" סה\"כ paragraph_embeddings: {para_total}")
    print(f" סה\"כ case_law_embeddings: {cl_total}")
    print(f" מודל: {config.VOYAGE_MODEL}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""Link claims to discussion paragraphs using semantic similarity.
For each claim, finds the most similar paragraph in block-yod of the same decision.
Updates claims.addressed_in_paragraph with the paragraph number.
"""
import asyncio
import sys
from pathlib import Path
from uuid import UUID
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
from legal_mcp.services.embeddings import embed_texts
# Minimum cosine similarity for a claim to be linked to a discussion paragraph.
_MIN_SIMILARITY = 0.3


def _vector_norm(vec) -> float:
    """Return the Euclidean norm of *vec*."""
    return sum(x * x for x in vec) ** 0.5


def _best_match(query, query_norm, candidates, candidate_norms):
    """Return ``(index, score)`` of the candidate most cosine-similar to *query*.

    Ties keep the earliest candidate. A pair involving a zero-norm vector
    scores 0. With no candidates the result is ``(-1, -1)``.
    """
    best_idx, best_score = -1, -1
    for idx, (cand, cand_norm) in enumerate(zip(candidates, candidate_norms)):
        if query_norm > 0 and cand_norm > 0:
            dot = sum(a * b for a, b in zip(query, cand))
            score = dot / (query_norm * cand_norm)
        else:
            score = 0
        if score > best_score:
            best_idx, best_score = idx, score
    return best_idx, best_score


async def main():
    """Link every claim to its most similar block-yod paragraph.

    For each case that has claims and a non-empty ``block-yod`` block, the
    claims (first 500 characters) and the discussion paragraphs are embedded
    together; each claim whose best cosine score exceeds ``_MIN_SIMILARITY``
    gets ``claims.addressed_in_paragraph`` set. A summary is printed at the
    end. The pool is closed even if a step raises.
    """
    await init_schema()
    pool = await get_pool()
    try:
        async with pool.acquire() as conn:
            # Get all cases with both claims and discussion blocks
            cases = await conn.fetch(
                """SELECT DISTINCT c.id as case_id, c.case_number
FROM cases c
JOIN claims cl ON cl.case_id = c.id
JOIN decisions d ON d.case_id = c.id
JOIN decision_blocks db ON db.decision_id = d.id AND db.block_id = 'block-yod' AND db.word_count > 0
ORDER BY c.case_number"""
            )

        total_linked = 0
        for case in cases:
            case_id = case["case_id"]
            case_number = case["case_number"]

            async with pool.acquire() as conn:
                claims = await conn.fetch(
                    "SELECT id, claim_text, party_role, claim_index FROM claims WHERE case_id = $1 ORDER BY claim_index",
                    case_id,
                )
                # NOTE(review): fetchval yields a single value, so if a case has
                # several block-yod rows only one is used -- confirm intended.
                yod_content = await conn.fetchval(
                    """SELECT db.content FROM decision_blocks db
JOIN decisions d ON d.id = db.decision_id
WHERE d.case_id = $1 AND db.block_id = 'block-yod'""",
                    case_id,
                )
            if not yod_content or not claims:
                continue

            # Split the discussion on newlines; fragments of 30 chars or fewer are dropped.
            disc_paragraphs = [p.strip() for p in yod_content.split("\n") if p.strip() and len(p.strip()) > 30]
            if not disc_paragraphs:
                continue
            print(f"\n{case_number}: {len(claims)} טענות ← {len(disc_paragraphs)} פסקאות דיון")

            # One embedding call per case: claims first, then discussion paragraphs.
            claim_texts = [c["claim_text"][:500] for c in claims]
            embeddings = await embed_texts(claim_texts + disc_paragraphs, input_type="document")
            claim_embeddings = embeddings[:len(claims)]
            disc_embeddings = embeddings[len(claims):]
            # Paragraph norms are the same for every claim, so compute them once.
            disc_norms = [_vector_norm(emb) for emb in disc_embeddings]

            linked = 0
            async with pool.acquire() as conn:
                for i, claim in enumerate(claims):
                    claim_emb = claim_embeddings[i]
                    best_para_idx, best_score = _best_match(
                        claim_emb, _vector_norm(claim_emb), disc_embeddings, disc_norms
                    )
                    if best_para_idx >= 0 and best_score > _MIN_SIMILARITY:
                        # NOTE(review): 1-based position within the *filtered*
                        # paragraph list above, not necessarily the paragraph
                        # numbering used elsewhere -- verify against consumers.
                        para_num = best_para_idx + 1
                        await conn.execute(
                            "UPDATE claims SET addressed_in_paragraph = $1 WHERE id = $2",
                            para_num, claim["id"],
                        )
                        linked += 1
            total_linked += linked
            print(f" קושרו: {linked}/{len(claims)} טענות (ציון מינימלי: {_MIN_SIMILARITY})")

        # Summary
        async with pool.acquire() as conn:
            total_claims = await conn.fetchval("SELECT count(*) FROM claims")
            linked_claims = await conn.fetchval("SELECT count(*) FROM claims WHERE addressed_in_paragraph IS NOT NULL")
    finally:
        await close_pool()

    # An empty claims table must not crash the summary with a division by zero.
    percent = linked_claims / total_claims * 100 if total_claims else 0
    print(f"\n{'='*50}")
    print(f"סיכום: {linked_claims}/{total_claims} טענות קושרו לפסקאות דיון ({percent:.0f}%)")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,382 @@
"""Proofread training corpus: strip Nevo additions from DOCX/PDF, output clean Markdown.
Nevo DOCX additions:
Front: ספרות / חקיקה שאוזכרה / מיני-רציו / topic tags / Nevo summary paragraphs
Back: 5129371512937154678313 / "בעניין עריכה ושינויים" link / "54678313-..." / "נוסח מסמך זה כפוף"
Nevo PDF additions:
"עמוד X מתוך Y" header on every page
PDF text extraction uses Google Cloud Vision OCR — PyMuPDF fragments Hebrew RTL
text unusably (words split mid-word, reading order broken). OCR gives clean output.
"""
from __future__ import annotations
import io
import os
import re
import sys
import time
from pathlib import Path
import fitz
from docx import Document
# Load GOOGLE_CLOUD_VISION_API_KEY from ~/.env if not already set
if not os.environ.get("GOOGLE_CLOUD_VISION_API_KEY"):
env_path = Path.home() / ".env"
if env_path.exists():
for line in env_path.read_text().splitlines():
if line.startswith("GOOGLE_CLOUD_VISION_API_KEY="):
os.environ["GOOGLE_CLOUD_VISION_API_KEY"] = line.split("=", 1)[1].strip().strip('"').strip("'")
break
from google.cloud import vision # noqa: E402
TRAINING_DIR = Path("/home/chaim/legal-ai/data/training")
OUTPUT_DIR = TRAINING_DIR / "proofread"
RAW_DIR = TRAINING_DIR / "raw"
# ── Nevo pattern detection ────────────────────────────────────────
# Paragraph prefixes that reveal a Nevo-added preamble. find_decision_start()
# tests the first ten paragraphs against these with str.startswith.
NEVO_PREAMBLE_HEADERS = (
    "ספרות:",
    "חקיקה שאוזכרה:",
    "מיני-רציו:",
)
# Strong decision-opening patterns — highly distinctive first words of real decision
# body. These rarely appear inside Nevo's own summary block, so first match wins.
# Matched (re.match) against the stripped paragraph text.
DECISION_OPENING = re.compile(
    r"^(עניינו\s|ענייננו\s|עסקינן\s|בפנינו\s|לפנינו\s|בערר\s+שלפנינו|זהו\s+ערר)"
)
# Section headers that definitively mark decision body start.
# Compared for exact equality with the stripped paragraph text.
DECISION_SECTION_HEADERS = {
    "רקע",
    "פתח דבר",
    "תמצית טענות הצדדים",
    "העובדות",
    "הרקע העובדתי",
    "מבוא",
}
# Nevo postamble markers — everything from first match onwards is stripped.
# Tested as plain substrings (no regex) by the DOCX and PDF code paths.
NEVO_POSTAMBLE_MARKERS = (
    "5129371512937154678313",
    "בעניין עריכה ושינויים במסמכי פסיקה",
    "נוסח מסמך זה כפוף לשינויי ניסוח ועריכה",
)
# Nevo inline watermark codes — appear as prefixes embedded in real paragraphs
# (e.g. "5129371ניתנה פה אחד" or "054678313האם ההיתר..."). These must be
# stripped from paragraph content, not used as postamble boundaries.
# The trailing \d* also consumes any digits that directly follow the code.
NEVO_INLINE_CODE_RE = re.compile(r"^0?(5129371|54678313)\d*")
# Nevo PDF page header: "עמוד X מתוך Y" or "עמוד X בן Y" (Hebrew variants)
# Line breaks are tolerated between the words and the numbers.
PDF_PAGE_HEADER_RE = re.compile(
    r"\s*עמוד\s*\n?\s*\d+\s*\n?\s*(?:מתוך|בן)\s*\n?\s*\d+\s*"
)
# Short orphan lines starting with "עמוד" — OCR artifacts from merged footer text
# (e.g. "עמודירבי", "עמוד :", "עמודי", "עמוד ר"). Conservative: up to 12 chars.
PDF_PAGE_ORPHAN_RE = re.compile(r"(?m)^עמוד[^\n]{0,12}$")
# "עמוד" followed by number (with optional garbled Nevo URL line after)
# NOTE(review): not referenced by any function visible in this part of the
# file — TODO confirm whether it is still needed. re.UNICODE is already the
# default for str patterns in Python 3, so the flag has no effect.
PDF_PAGE_BLOCK_RE = re.compile(
    r"(?m)^\s*עמוד\s*\n\s*\d+[·.]?\s*\n[^\n]*\n", re.UNICODE
)
# Standalone "עמוד N" at line start
PDF_PAGE_NUM_LINE_RE = re.compile(r"(?m)^\s*עמוד\s*\n?\s*\d+[·.]?\s*$")
# Nevo watermark URL (and common OCR-garbled variants)
NEVO_URL_RE = re.compile(
    r"(nevo\.co\.il|neto\.co\.il|netocoal|neetocoal|nevocoal|nevo\.co|rawo\.co\.il)",
    re.IGNORECASE,
)
def find_decision_start(paragraphs: list[str]) -> int:
    """Return the index of the first paragraph that belongs to the decision body.

    Steps, in order:
      1. No Nevo preamble header among the first ten paragraphs -> 0.
      2. First paragraph (anywhere) that equals a known section header or
         matches the decision-opening regex.
      3. After the first paragraph announcing the committee's findings
         ("קבעה כלהלן" / "קבעה את הדברים הבאים"), the first of the next 14
         paragraphs that is longer than 80 characters and is not a "*" bullet.
      4. Otherwise skip ten paragraphs, capped at the last index.
    """
    if not any(head.startswith(NEVO_PREAMBLE_HEADERS) for head in paragraphs[:10]):
        return 0

    # Strong markers: an exact section header or a distinctive opening phrase.
    for idx, para in enumerate(paragraphs):
        text = para.strip()
        if text in DECISION_SECTION_HEADERS or DECISION_OPENING.match(text):
            return idx

    # Weaker heuristic: step over Nevo's bullet summary that follows the
    # "the committee held as follows" line. Only the first such line is tried.
    for idx, para in enumerate(paragraphs):
        if "קבעה כלהלן" not in para and "קבעה את הדברים הבאים" not in para:
            continue
        window_end = min(idx + 15, len(paragraphs))
        for later in range(idx + 1, window_end):
            candidate = paragraphs[later]
            if len(candidate) > 80 and not candidate.strip().startswith("*"):
                return later
        break

    # Nothing recognisable: drop at most the first ten paragraphs.
    return min(10, len(paragraphs) - 1)
def find_decision_end(paragraphs: list[str]) -> int:
    """Return the exclusive end index of the decision body.

    That is the index of the first paragraph containing any Nevo postamble
    marker, or ``len(paragraphs)`` when no paragraph contains one.
    """
    marker_hits = (
        idx
        for idx, para in enumerate(paragraphs)
        if any(marker in para for marker in NEVO_POSTAMBLE_MARKERS)
    )
    return next(marker_hits, len(paragraphs))
# ── DOCX proofreading ─────────────────────────────────────────────
def _strip_inline_nevo_codes(paragraphs: list[str]) -> list[str]:
    """Strip a leading Nevo watermark code from each paragraph.

    Each paragraph is also whitespace-trimmed; paragraphs that end up empty
    (i.e. consisted only of the code) are dropped from the result.
    """
    cleaned = (NEVO_INLINE_CODE_RE.sub("", para).strip() for para in paragraphs)
    return [text for text in cleaned if text]
def proofread_docx(path: Path) -> tuple[str, dict]:
    """Return the decision text of a Nevo DOCX as Markdown, plus statistics.

    Non-blank paragraphs are read from the document, the Nevo preamble and
    postamble are cut off, inline watermark codes are removed, and the
    remaining paragraphs are joined with blank lines.

    Returns:
        ``(markdown, stats)`` where ``stats`` holds the paragraph counts
        (total, stripped at each end, and kept).
    """
    document = Document(str(path))
    paragraphs = []
    for para in document.paragraphs:
        if para.text.strip():
            paragraphs.append(para.text)

    start = find_decision_start(paragraphs)
    end = find_decision_end(paragraphs)
    clean = _strip_inline_nevo_codes(paragraphs[start:end])

    stats = {
        "total_paragraphs": len(paragraphs),
        "preamble_stripped": start,
        "postamble_stripped": len(paragraphs) - end,
        "clean_paragraphs": len(clean),
    }
    return "\n\n".join(clean), stats
# ── PDF proofreading (Google Vision OCR) ──────────────────────────
# Lazily created Vision client shared by every OCR call in this process.
_vision_client: vision.ImageAnnotatorClient | None = None


def _get_vision_client() -> vision.ImageAnnotatorClient:
    """Return the shared Vision client, creating it on first use.

    Raises:
        RuntimeError: if ``GOOGLE_CLOUD_VISION_API_KEY`` is unset or empty
            when the client has to be created.
    """
    global _vision_client
    if _vision_client is not None:
        return _vision_client

    api_key = os.environ.get("GOOGLE_CLOUD_VISION_API_KEY")
    if not api_key:
        raise RuntimeError("GOOGLE_CLOUD_VISION_API_KEY not set")

    options = {"api_key": api_key}
    _vision_client = vision.ImageAnnotatorClient(client_options=options)
    return _vision_client
# OCR repair table: Google Vision renders the gershayim (״) inside Hebrew
# abbreviations as a doubled yod ('יי'). Keys are the garbled spellings,
# values the intended abbreviations written with an ASCII double quote.
_HEBREW_ABBREV_FIXES: dict[str, str] = {
    # Legal / planning abbreviations
    "עוהייד": 'עוה"ד',
    "עוייד": 'עו"ד',
    "הנייל": 'הנ"ל',
    "מצייב": 'מצ"ב',
    "ביהמייש": 'ביהמ"ש',
    "תייז": 'ת"ז',
    "עייי": 'ע"י',
    "אחייכ": 'אח"כ',
    "סייק": 'ס"ק',
    "דייר": 'ד"ר',
    "חווייד": 'חוו"ד',
    "מייר": 'מ"ר',
    "יחייד": 'יח"ד',
    "בייכ": 'ב"כ',
    "בייה": 'ב"ה',
    "שייח": 'ש"ח',
    "יוייר": 'יו"ר',
    "בליימ": 'בל"מ',
    "תבייע": 'תב"ע',
    "תמייא": 'תמ"א',
    "סייה": 'ס"ה',
    "שייפ": 'ש"פ',
    "שצייפ": 'שצ"פ',
    "שבייצ": 'שב"צ',
    "עסיים": 'עס"ם',
    "הייה": 'ה"ה',
    "פסייד": 'פס"ד',
    "תיידא": 'תיד"א',
    "בגייץ": 'בג"ץ',
    "עתיים": 'עת"ם',
    "עעיים": 'עע"ם',
    # Hebrew calendar day prefixes (כ"א .. כ"ט etc.)
    "כייא": 'כ"א',
    "כייב": 'כ"ב',
    "כייג": 'כ"ג',
    "כייד": 'כ"ד',
    "כייה": 'כ"ה',
    "כייו": 'כ"ו',
    "כייז": 'כ"ז',
    "כייח": 'כ"ח',
    "כייט": 'כ"ט',
    "לייא": 'ל"א',
    "יייא": 'י"א',
    "יייב": 'י"ב',
    "יייג": 'י"ג',
    "יייד": 'י"ד',
    "טייו": 'ט"ו',
    "טייז": 'ט"ז',
    "יייז": 'י"ז',
    "יייח": 'י"ח',
    "יייט": 'י"ט',
    # Hebrew calendar years (תשפ"ה, תשפ"ד...)
    "תשפייא": 'תשפ"א',
    "תשפייב": 'תשפ"ב',
    "תשפייג": 'תשפ"ג',
    "תשפייד": 'תשפ"ד',
    "תשפייה": 'תשפ"ה',
    "תשפייו": 'תשפ"ו',
    "תשפיין": 'תשפ"ן',
}

# One alternation over every garbled spelling, longest first so that a longer
# key is preferred over any shorter key it might contain.
_ABBREV_PATTERN = re.compile(
    "|".join(map(re.escape, sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True)))
)


def _fix_hebrew_quotes(text: str) -> str:
    """Replace every garbled abbreviation in *text* with its quoted form.

    NOTE(review): matching is by plain substring, so a key embedded in a
    longer ordinary word (e.g. "דייר" inside "דיירים") is rewritten too.
    """

    def _restore(match: re.Match) -> str:
        return _HEBREW_ABBREV_FIXES[match.group()]

    return _ABBREV_PATTERN.sub(_restore, text)
def _ocr_page_image(image_bytes: bytes, page_num: int) -> str:
    """OCR one page image with Google Vision and repair Hebrew abbreviations.

    Args:
        image_bytes: Encoded image content passed to ``vision.Image``.
        page_num: Page number, used only in the error message.

    Returns:
        The full text annotation (empty string when there is none) after
        ``_fix_hebrew_quotes``.

    Raises:
        RuntimeError: if the Vision response carries an error message.
    """
    client = _get_vision_client()
    request_image = vision.Image(content=image_bytes)
    hebrew_hint = vision.ImageContext(language_hints=["he"])
    response = client.document_text_detection(
        image=request_image,
        image_context=hebrew_hint,
    )
    if response.error.message:
        raise RuntimeError(f"Vision error page {page_num}: {response.error.message}")

    annotation = response.full_text_annotation
    raw_text = annotation.text if annotation else ""
    return _fix_hebrew_quotes(raw_text)
# A whole (stripped) line that is safe to drop from the bottom of a page.
_FOOTER_JUNK_RE = re.compile(
    r"^("
    r"\s*|"  # blank
    r"[-·*.\"\'׳״]+|"  # stray punctuation
    r"\d{1,3}[\s\-·*.\"\'׳״]*|"  # page number with any stray char
    r"עמוד[\s\d\-·*.\"\'׳״]*|"  # "עמוד" / "עמוד N" w/ trailing noise
    r"[-·*\s\"\'׳״]*[a-zA-Z][a-zA-Z0-9 .\-·*_]{0,30}"  # garbled latin (nevo URL variants)
    r")$"
)


def _clean_page_text(text: str) -> str:
    """Remove Nevo headers, footers and watermarks from one page of OCR text.

    The Nevo footer is OCRed at the end of each page and looks like:
        עמוד
        N              (possibly empty or "N*")
        nevo.co.il     (or a garbled variant such as "neto coal")
        -              (optional stray dash)
    """
    # 1. The "page X of Y" header is replaced by a newline wherever it occurs.
    text = PDF_PAGE_HEADER_RE.sub("\n", text)

    # 2. Trim footer junk: find how many leading lines survive when junk lines
    #    are peeled off the end one by one.
    lines = text.split("\n")
    keep = len(lines)
    while keep > 0 and _FOOTER_JUNK_RE.match(lines[keep - 1].strip()):
        keep -= 1
    text = "\n".join(lines[:keep])

    # 3. Leftovers in the middle of the text: watermark URLs, standalone page
    #    number lines, and short orphan page-label lines -- in that order.
    for leftover in (NEVO_URL_RE, PDF_PAGE_NUM_LINE_RE, PDF_PAGE_ORPHAN_RE):
        text = leftover.sub("", text)
    return text.strip()
def proofread_pdf(path: Path) -> tuple[str, dict]:
    """Extract clean decision text from a Nevo PDF via Google Vision OCR.

    Every page is rendered at 300 dpi, OCRed, and cleaned of page headers,
    footers and watermarks. The non-empty pages are joined with blank lines,
    whitespace is normalised, and everything from the earliest Nevo postamble
    marker onwards is removed.

    Returns:
        ``(body, stats)`` where ``stats`` has ``pages`` (pages processed,
        including ones that came out empty) and ``chars`` (length of body).

    Raises:
        RuntimeError: propagated from the OCR step on a Vision error.
    """
    pages: list[str] = []
    doc = fitz.open(str(path))
    try:
        for page_num, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)
            text = _ocr_page_image(pix.tobytes("png"), page_num)
            pages.append(_clean_page_text(text))
            # Small delay between API calls to be safe
            time.sleep(0.1)
    finally:
        # Release the PDF handle even when OCR fails part-way through.
        doc.close()

    body = "\n\n".join(p for p in pages if p)
    # Strip trailing blanks first: whitespace-only lines then become empty
    # lines, which the blank-line collapse below can fold away.
    body = re.sub(r"[ \t]+\n", "\n", body)
    body = re.sub(r"\n{3,}", "\n\n", body)

    # Cut at whichever marker appears earliest in the text, not at the first
    # one in tuple order, so nothing after the start of the postamble survives.
    marker_positions = [
        pos for pos in (body.find(marker) for marker in NEVO_POSTAMBLE_MARKERS) if pos != -1
    ]
    if marker_positions:
        body = body[:min(marker_positions)].rstrip()

    return body, {
        "pages": len(pages),
        "chars": len(body),
    }
# ── Orchestration ─────────────────────────────────────────────────
# File names (not paths) in TRAINING_DIR that main() never processes.
# "README.md" would be excluded by main()'s .docx/.pdf suffix filter anyway.
SKIP_FILES = {
    "הכנת שאלות מחקר.docx",
    "סוכן_מנתח_ומחקר_משפטי_Paperclip_מדריך.docx",
    "README.md",
}
def output_filename(src: Path) -> str:
    """Return the Markdown file name for *src*: its stem with a ``.md`` suffix.

    The stem is used verbatim (spaces and Hebrew are kept as they are).
    """
    return src.stem + ".md"
def main(argv: list[str]) -> int:
    """Proofread the training files and write one Markdown file per input.

    Args:
        argv: Command-line vector; any arguments after the program name are
            file names that restrict processing to exactly those files.

    Returns:
        Always 0. NOTE(review): per-file failures are printed but do not
        affect the exit status.
    """
    OUTPUT_DIR.mkdir(exist_ok=True)
    RAW_DIR.mkdir(exist_ok=True)

    # An empty selection means "process everything".
    only = argv[1:] or None

    def _wanted(candidate) -> bool:
        # Directories, dot-files, skip-listed names, other suffixes and names
        # outside the explicit selection are all ignored.
        if candidate.is_dir() or candidate.name.startswith("."):
            return False
        if candidate.name in SKIP_FILES:
            return False
        if candidate.suffix.lower() not in (".docx", ".pdf"):
            return False
        return not (only and candidate.name not in only)

    files: list[Path] = [p for p in sorted(TRAINING_DIR.iterdir()) if _wanted(p)]
    print(f"Processing {len(files)} files...\n")

    for path in files:
        try:
            is_docx = path.suffix.lower() == ".docx"
            md, stats = proofread_docx(path) if is_docx else proofread_pdf(path)
            out_path = OUTPUT_DIR / output_filename(path)
            out_path.write_text(md, encoding="utf-8")
            print(f"{path.name}")
            print(f"{out_path.name} ({len(md):,} chars) {stats}")
        except Exception as e:
            # Best effort: report the failure and carry on with the next file.
            print(f"{path.name}: {e}")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

View File

@@ -0,0 +1,254 @@
#!/usr/bin/env python3
"""Seed appeals (cases) from legacy vault metadata."""
import asyncio
import json
import sys
from datetime import date
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
APPEALS = [
# ── Active (01_Projects) ──
{
"case_number": "1130/25",
"title": "ערר קרית יערים-1 — קובר",
"appellants": ["מרק קובר", "יצחק מטמון"],
"respondents": ["הוועדה המרחבית הראל", "ליבמן"],
"subject": "ערר על אישור תכנית להוספת קומה וזכויות בנייה",
"property_address": "רח' אבינדב 23, קריית יערים",
"status": "in_progress",
"expected_outcome": "partial",
},
{
"case_number": "1194/25+1199/25",
"title": "ערר קרית יערים-2 — מטמון/קובר",
"appellants": ["יצחק מטמון", "מרק קובר"],
"respondents": ["הוועדה המקומית"],
"subject": "תוספת קומה + הגדלת זכויות בנייה",
"property_address": "חלקה 240, גוש 29536, רח' אבינדב",
"status": "new",
"expected_outcome": "",
},
{
"case_number": "8027-25",
"title": "ערר היטל השבחה תחכמוני 20",
"appellants": ["עובדיה", "מירב", "ווינשטיין ואח'"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "היטל השבחה",
"property_address": "רח' תחכמוני, ירושלים, גוש 30069, חלקה 156",
"status": "new",
"expected_outcome": "",
},
# ── Archived — completed decisions ──
{
"case_number": "1180-1181",
"title": "ערר הכט",
"appellants": [],
"respondents": [],
"subject": "רישוי ובנייה",
"property_address": "",
"status": "final",
"expected_outcome": "rejected",
"notes": "פורסם 05.02.2026. דחייה. שימש כמודל לניתוח סגנון.",
},
{
"case_number": "1126/25+1141/25",
"title": "תמ\"א 38/2 בית הכרם",
"appellants": ["מרכז קהילתי זיו-מרקס", "12 תושבים"],
"respondents": ["הוועדה המקומית", "יזם"],
"subject": "תמ\"א 38/2 הריסה ובנייה מחדש",
"property_address": "רח' החלוץ 36, בית הכרם, גוש 30159/6",
"status": "final",
"expected_outcome": "partial",
"notes": "גרסה סופית טיוטה 9, מרץ 2026. קבלה חלקית. שימש כמודל לניתוח סגנון.",
},
{
"case_number": "8255-25",
"title": "בל\"מ אפרים אבי",
"appellants": ["אפרים אברהם"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "היטל השבחה — בקשה להארכת מועד",
"property_address": "רח' הורקניה 4, קטמונים, ירושלים",
"status": "final",
"expected_outcome": "rejected",
"notes": "גרסה סופית מאושרת. דחייה.",
},
# ── Archived — unified decisions ──
{
"case_number": "8107-25",
"title": "אבו זאהריה",
"appellants": ["אבו זאהריה מפיד"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "ערר על החלטת שמאי מכריע — היטל השבחה",
"property_address": "רח' אום כולתום 26, בית חנינא, גוש 30615, חלקה 69",
"status": "final",
"expected_outcome": "",
"notes": "החלטה מאחדת: ערר גפני.",
},
{
"case_number": "9005-24",
"title": "רמת שלמה — פיצויים ס' 197",
"appellants": ["קירמאיר אסתר ואח' (63-67 עוררים)"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "פיצויים לפי סעיף 197",
"property_address": "רמת שלמה, ירושלים, גוש 30561, חלקות 36, 40",
"status": "final",
"expected_outcome": "",
"notes": "החלטה מאחדת: ערר ורדי 9003-23.",
},
# ── Archived — in progress ──
{
"case_number": "1113/25",
"title": "אייל מבורך לוי ואברהם עדי",
"appellants": ["אייל מבורך לוי", "אברהם עדי"],
"respondents": ["הוועדה המקומית הראל"],
"subject": "הרחבת דירות + תוספת 2 יח\"ד",
"property_address": "רח' השלום 63, מבשרת ציון, גוש 30475, חלקה 5",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "1128/25",
"title": "שטרית",
"appellants": [],
"respondents": [],
"subject": "",
"property_address": "",
"status": "drafted",
"expected_outcome": "",
},
{
"case_number": "1107/06/25",
"title": "בלוי נ' הוועדה המקומית",
"appellants": ["בלוי מאיר", "מזיע מאיר", "דזימיטרובסקי הדסה"],
"respondents": ["הוועדה המקומית ירושלים", "היזם"],
"subject": "תוספת בנייה וחיזוק מפני רעידות (תמ\"א 38/1)",
"property_address": "רח' הרב בלוי 16, ירושלים, גוש 30099/115",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "8141-23",
"title": "אזורים בנין",
"appellants": ["אזורים בנין (1965) בע\"מ"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "היטל השבחה — תכנית 101-0611905",
"property_address": "רח' הנביאים 27, ירושלים",
"status": "drafted",
"expected_outcome": "",
},
{
"case_number": "8047-24",
"title": "משכן אליהו — היטל השבחה שמאי מכריע",
"appellants": ["עומר דרוויש"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "ערר על שמאית מכריעה — היטל השבחה",
"property_address": "גוש 30614, חלקה 89, בית חנינא",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "1195-25",
"title": "וליד ג'מל",
"appellants": ["וליד ג'מל"],
"respondents": ["ועדת משנה מטה יהודה", "סמיר מוסא זעאתרה"],
"subject": "הסדרת קומה שלישית למשרדים",
"property_address": "גוש 30492, חלקה 23, כפר עין נקובא",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "1200/25",
"title": "קרית ענבים נופש",
"appellants": ["קרית ענבים נופש בע\"מ"],
"respondents": ["הוועדה המקומית מטה יהודה", "חברי קיבוץ קרית ענבים"],
"subject": "שימוש חורג — סופרמרקט בייעוד ספורט ונופש",
"property_address": "קיבוץ קרית ענבים, גוש 29551",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "1184/25",
"title": "שטוקהיים — בית נקופה",
"appellants": ["אמנון שטוקהיים", "אילנית שטוקהיים"],
"respondents": ["הוועדה המקומית מטה יהודה", "יערה טל"],
"subject": "אישור בקשה להיתר עם הקלות",
"property_address": "מגרש 51, גוש 31399, חלקה 52, בית נקופה",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "8070-25",
"title": "היטל השבחה — דירת גג",
"appellants": ["חיים ראם"],
"respondents": ["הוועדה המקומית ירושלים"],
"subject": "היטל השבחה — הקלה להשלמת דירת גג",
"property_address": "רח' צ.פ. חיות 2, דירה 31, נווה יעקב",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "8136-24",
"title": "ערר השבחה — מרפסות שירות",
"appellants": [],
"respondents": [],
"subject": "היטל השבחה — מרפסות שירות",
"property_address": "",
"status": "in_progress",
"expected_outcome": "",
},
{
"case_number": "8007-24",
"title": "עומר דרוויש — שומה מכרעת",
"appellants": [],
"respondents": [],
"subject": "היטל השבחה",
"property_address": "",
"status": "in_progress",
"expected_outcome": "",
},
]
async def main():
    """Insert every entry of ``APPEALS`` that is not yet in ``cases``.

    A case counts as existing when a row with the same ``case_number`` is
    found; such entries are skipped. The party lists are stored as JSON text.
    The pool is closed even when an insert fails.
    """
    await init_schema()
    pool = await get_pool()
    inserted = 0
    skipped = 0
    try:
        async with pool.acquire() as conn:
            for a in APPEALS:
                # NOTE(review): check-then-insert is not atomic; fine for a
                # one-off seed run, but not safe against concurrent writers.
                existing = await conn.fetchval(
                    "SELECT id FROM cases WHERE case_number = $1", a["case_number"]
                )
                if existing:
                    skipped += 1
                    continue
                await conn.execute(
                    """INSERT INTO cases
(case_number, title, appellants, respondents, subject,
property_address, status, expected_outcome, notes)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""",
                    a["case_number"],
                    a["title"],
                    json.dumps(a.get("appellants", [])),
                    json.dumps(a.get("respondents", [])),
                    a.get("subject", ""),
                    a.get("property_address", ""),
                    a.get("status", "new"),
                    a.get("expected_outcome", ""),
                    a.get("notes", ""),
                )
                inserted += 1
    finally:
        await close_pool()
    print(f"✓ appeals: {inserted} inserted, {skipped} skipped (already exist)")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,449 @@
#!/usr/bin/env python3
"""Seed knowledge tables from legacy vault data.
Imports: lessons_learned, transition_phrases, case_law, statutory_provisions.
Sources: docs/legal-decision-lessons.md, skills/decision/SKILL.md
"""
import asyncio
import json
import sys
from pathlib import Path
# Add mcp-server to path so we can reuse db module
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
# ═══════════════════════════════════════════════════════════════════
# Data: Lessons Learned
# ═══════════════════════════════════════════════════════════════════
LESSONS = [
# --- הכט 1180-1181 (rejected, 02.2026) ---
{
"lesson_title": "Discussion = continuous essay, no sub-headers",
"lesson_text": "הדיון נקרא כחיבור משפטי רציף עם סעיפים ממוספרים, לא כמתווה מובנה עם כותרות משנה. הגרסה המפורסמת של הכט השתמשה באפס כותרות משנה בדיון, בעוד הטיוטה שלנו הכילה 6 כותרות H2.",
"category": "structure",
"applies_to": ["block-yod"],
"source_case": "הכט 1180-1181",
"severity": "critical",
},
{
"lesson_title": "Citation through consolidating decision",
"lesson_text": "להשתמש בהחלטה מאחדת קודמת (כמו ערר נגאח 1011-03-25) לצטט מספר תקדימים בפסקה אחת ארוכה (~600 מילים), במקום לצטט כל תקדים בפסקה נפרדת.",
"category": "style",
"applies_to": ["block-yod"],
"source_case": "הכט 1180-1181",
"severity": "important",
},
{
"lesson_title": "Paragraph length variation in discussion",
"lesson_text": "לא לפרגמנט טיעונים משפטיים ארוכים לפסקאות זהות וקצרות. לגוון אורך פסקאות מ-20 עד 600+ מילים. פסקאות ציטוט מרכזיות ארוכות מאוד.",
"category": "style",
"applies_to": ["block-yod"],
"source_case": "הכט 1180-1181",
"severity": "important",
},
{
"lesson_title": "Opening formula promises both conclusion AND elaboration",
"lesson_text": 'פתיחת הדיון צריכה להבטיח גם מסקנה וגם הרחבה: "לאחר שבחנו... החלטנו בשלב ראשון כי... אך יחד עם זאת ועל מנת לא לצאת בחסר... מצאנו להוסיף מספר הערות"',
"category": "style",
"applies_to": ["block-yod"],
"source_case": "הכט 1180-1181",
"severity": "important",
},
{
"lesson_title": 'Summary title is "סיכום"',
"lesson_text": 'כותרת פרק הסיכום היא "סיכום" בלבד, לא "סיכום והכרעה" ולא "סוף דבר".',
"category": "structure",
"applies_to": ["block-yod-alef"],
"source_case": "הכט 1180-1181",
"severity": "nice-to-have",
},
# --- בית הכרם 1126/25 (partial acceptance, 03.2026) ---
{
"lesson_title": "Threshold question is STRATEGIC, not mandatory",
"lesson_text": "שאלת הסף (זכות ערר לפי ס' 152) היא כלי אסטרטגי, לא חובה. כשלתיק יש שאלות מהותיות חזקות (חניה, קווי בניין, שימור), דפנה מעדיפה להתעמק בתוכן על פני חסימה פרוצדורלית. זה גם מחזק את ההחלטה מפני ביקורת שיפוטית.",
"category": "process",
"applies_to": ["all"],
"source_case": "בית הכרם 1126/25",
"severity": "critical",
},
{
"lesson_title": "Concentric circles = rejected appeals only",
"lesson_text": 'מודל השכבות (עיגולים קונצנטריים, סעיף 6.3 ב-SKILL) הוא כלי אחד מתוך כמה, לא המסגרת הנדרשת. לעררים שמתקבלים חלקית, דפנה משתמשת בניתוח גמיש נושא-נושא.',
"category": "process",
"applies_to": ["block-yod"],
"source_case": "בית הכרם 1126/25",
"severity": "critical",
},
{
"lesson_title": "New opening type: tension mapping",
"lesson_text": 'לקבלה חלקית או תיקים עם סוגיות מורכבות מצטלבות, פתיחת "מיפוי מתחים": רשימה של 6+ מתחים ספציפיים בתבליטים לפני הניתוח. דפוס: "בערר דנן עולות שאלות כיצד והאם..." → רשימת מתחים → "כל הנקודות לעיל עומדות לפנינו..."',
"category": "structure",
"applies_to": ["block-yod"],
"source_case": "בית הכרם 1126/25",
"severity": "important",
},
{
"lesson_title": "Single building weakens TAMA 38 interest",
"lesson_text": 'כשתמ"א 38 חלה על בית בודד (לעומת בניין דירות גדול), אינטרס החיזוק מפני רעידת אדמה חלש יותר. זה מצדיק אישור זהיר יותר של זכויות, במיוחד קווי בניין וחניה.',
"category": "content",
"applies_to": ["block-yod"],
"source_case": "בית הכרם 1126/25",
"severity": "important",
},
{
"lesson_title": "Master plan as shield against ad-hoc planning",
"lesson_text": 'כשקיימת תכנית אב — לצטט אותה כדי לתת לגיטימציה להיתר בודד. מסקנה: ההיתר "משתלב בחזון כולל קיים" במקום ליצור תקדים אד-הוק.',
"category": "content",
"applies_to": ["block-yod"],
"source_case": "בית הכרם 1126/25",
"severity": "important",
},
{
"lesson_title": "Deep plan provision citations for parking",
"lesson_text": "לסוגיות חניה/תשתיות, דפנה נכנסת עמוק להוראות תכנית עם ציטוטים ישירים נרחבים (300+ מילים) וניתוח משולב. כולל מספרי סעיפים ספציפיים (לדוגמה: 6.8(4), 6.8(9), נספח תנועה, 5166b).",
"category": "content",
"applies_to": ["block-yod", "block-tet"],
"source_case": "בית הכרם 1126/25",
"severity": "important",
},
{
"lesson_title": "Ultra-minimal summary for partial acceptance",
"lesson_text": "בקבלה חלקית, כל ההנמקה כבר בדיון. סיכום = הוראות אופרטיביות בלבד (בדרך כלל 3 סעיפים קצרים). ללא דיון בהוצאות. ללא סיום חם.",
"category": "structure",
"applies_to": ["block-yod-alef"],
"source_case": "בית הכרם 1126/25",
"severity": "important",
},
# --- קרית יערים-1 (03.2026) ---
{
"lesson_title": "Neutral background rule",
"lesson_text": 'רקע (בלוק ו) = עובדות אובייקטיביות בלבד. מבחן: האם המשפט מכיל ציטוט ישיר מצד, או מילות ערך/שיפוט (חריג, חטא, בעייתי)? אם כן → שייך בטענות (בלוק ז) או דיון (בלוק י), לא ברקע. החלטות קודמות = עובדה יבשה ("ביום X נדחתה תכנית Y"), ללא נימוקים וציטוטים.',
"category": "structure",
"applies_to": ["block-vav"],
"source_case": "קרית יערים-1 (1130/25)",
"severity": "critical",
},
{
"lesson_title": "12-block mandatory structure",
"lesson_text": 'מבנה 12 בלוקים פורמלי חובה עם שלב "טיוטת טרום-דיון". כולל: פתיחה (ה) → רקע (ו) → טענות (ז) → הליכים (ח) → תכניות (ט) → דיון (י) → סיכום (יא). חידוש מאריאלי: "ההליכים בפני ועדת הערר" כפרק נפרד. כל בלוק נכתב כאילו שופט בית משפט מנהלי קורא בפעם הראשונה.',
"category": "structure",
"applies_to": ["all"],
"source_case": "קרית יערים-1 (1130/25)",
"severity": "critical",
},
# --- Meta-lesson ---
{
"lesson_title": "Skill was over-indexed on single case type",
"lesson_text": "ה-SKILL המקורי היה מבוסס יתר על מקרה אחד (הכט = דחייה). מודל העיגולים, שאלת סף כחובה, וסיום חם — כולם דפוסים מתיק בודד. בית הכרם (קבלה חלקית) חשף שהגישה של דפנה גמישה יותר ממה שתפסנו. צריך להבחין בין דפוסים אוניברסליים לתלויי-תוצאה.",
"category": "process",
"applies_to": ["all"],
"source_case": "בית הכרם 1126/25",
"severity": "critical",
},
]
# ═══════════════════════════════════════════════════════════════════
# Data: Transition Phrases
# ═══════════════════════════════════════════════════════════════════
TRANSITION_PHRASES = [
# From הכט
{"phrase": "ועל מנת לא לצאת בחסר", "usage_context": "פתיחת אוביטר דיקטה / הנמקה נוספת", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "נציין כי טענות אלו נטענו בלשון רפה", "usage_context": "הכרה בטענות חלשות תוך דיון בהן", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "עינינו הרואות", "usage_context": "סיכום אחרי ציטוט ארוך", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "נוסיף.", "usage_context": "מעבר קצר ביותר (מילה אחת) לנקודה הבאה", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "אם כך, לעת הזו", "usage_context": "הסקת מסקנה מציטוטים", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "למעלה מן הצורך", "usage_context": "דיון לא הכרחי להכרעה אך נכתב מטעמים אסטרטגיים", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "למיטב הבנתנו", "usage_context": "עמדה זהירה בשאלה משפטית לא מיושבת", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
{"phrase": "נשלים ונציין", "usage_context": "נקודה אחרונה לפני מעבר לסיכום", "block_types": ["block-yod"], "source_decision": "הכט 1180-1181"},
# From בית הכרם
{"phrase": "הדברים משליכים על שיקול הדעת ב...", "usage_context": "קישור ממצא למסקנה", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
{"phrase": "רוצה לומר כי", "usage_context": "ניסוח חלופי / הסבר", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
{"phrase": "נוצר מצב בו", "usage_context": "הצגת מצב עובדתי / בעיה", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
{"phrase": "לכך נוסיף כי", "usage_context": "הוספת שכבה נוספת לטיעון", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
{"phrase": "יש אולי להצר על כך ש...", "usage_context": "הערה ביקורתית עדינה (כלפי רשות תכנון)", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
{"phrase": "עם ההבנה לטענה זו של העוררים, אין בידנו לקבלה", "usage_context": "הכרה רכה בטענה תוך דחייתה", "block_types": ["block-yod"], "source_decision": "בית הכרם 1126/25"},
# General (from SKILL.md)
{"phrase": "ברי כי", "usage_context": "מסקנה מובנת מאליה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "נפנה ל...", "usage_context": "פתיחת ניתוח חוק/פסיקה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "מכל האמור לעיל", "usage_context": "מעבר לסיכום", "block_types": ["block-yod", "block-yod-alef"], "source_decision": ""},
{"phrase": "נשוב על כך כי", "usage_context": "חזרה מכוונת על עיקרון חשוב", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "דא עקא", "usage_context": "הצגת בעיה מרכזית או סתירה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "ובמילים אחרות", "usage_context": "הבהרה / ניסוח מחדש", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "הגענו לכלל מסקנה כי", "usage_context": "מסקנה מרכזית (פתיחת דיון)", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "לא נוכל לקבל", "usage_context": "דחיית עמדה / טענה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "מקובלת עלינו", "usage_context": "קבלת עמדה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "התרשמנו כי", "usage_context": "מסקנה מדיון / עיון במסמכים", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "נחדד כי", "usage_context": "חידוד נקודה קודמת", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "סיכומם של דברים", "usage_context": "פתיחת סיכום מהותי לפני פרק הסיכום", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "המסקנה מכל האמור היא כי", "usage_context": "מסקנת ביניים מקיפה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "לעמדתנו", "usage_context": "עמדת הוועדה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "בנסיבות אלה", "usage_context": "מעבר מעובדות למסקנה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "נזכיר כי", "usage_context": "תזכורת לעיקרון ידוע", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "מצאנו כי", "usage_context": "קביעה עובדתית", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "שוכנענו כי", "usage_context": "קביעה לאחר בחינה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "על כן ולו רק מסיבה זו", "usage_context": "נטרול טענה חלשה לפני ניתוח עמוק", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "יחד עם זאת, מצאנו לנכון לדון בשאלה העקרונית", "usage_context": "מעבר לדיון עקרוני למרות דחייה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "משכך", "usage_context": "הסקת מסקנה מעמדה שהוצגה", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "הדברים מתחדדים שעה ש...", "usage_context": "הבהרה נוספת לאור נסיבות", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "זאת ועוד", "usage_context": "הוספת נימוק", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "יתרה מכך", "usage_context": "חיזוק הנמקה קודמת", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "לאור כל האמור לעיל", "usage_context": "פתיחת סיכום סופי", "block_types": ["block-yod", "block-yod-alef"], "source_decision": ""},
{"phrase": "נפתח בכך כי", "usage_context": "פתיחת דיון (לא מסמך)", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "נפנה בעניין זה להחלטת...", "usage_context": "הפניה לתקדים", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "ברי כי משאב הקרקע יקר לבעליו ולציבור", "usage_context": "הצדקת שימוש יעיל בקרקע", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "נסכם כי", "usage_context": "מעבר לסיכום ביניים", "block_types": ["block-yod"], "source_decision": ""},
{"phrase": "נחזור על כך כי", "usage_context": "חזרה אמפתית על קביעה חשובה", "block_types": ["block-yod"], "source_decision": ""},
]
# ═══════════════════════════════════════════════════════════════════
# Data: Case Law
# ═══════════════════════════════════════════════════════════════════
CASE_LAW = [
{
"case_number": "עע\"מ 3975/22",
"case_name": "ב. קרן-נכסים",
"court": "בית המשפט העליון",
"subject_tags": ["proprietary_claims", "feasibility"],
"summary": "פסק דין מנחה בנושא בדיקת היתכנות קניינית — מתי ועדה צריכה לבחון זכויות קניין לפני מתן היתר.",
"key_quote": "",
},
{
"case_number": "ערר (מרכז) 1011-03-25",
"case_name": "נגאח עבד אל קאדר",
"court": "ועדת ערר מרכז",
"subject_tags": ["proprietary_claims", "consolidating_decision"],
"summary": "החלטה מאחדת בנושא טענות קנייניות — ריכזה את כל הפסיקה בנושא.",
"key_quote": "",
},
{
"case_number": "ערר 1071/25",
"case_name": "מינץ",
"court": "ועדת ערר ירושלים",
"subject_tags": ["self_reference", "previous_decision"],
"summary": "החלטה קודמת של ועדת הערר עצמה — שימוש כתקדים פנימי.",
"key_quote": "",
},
{
"case_number": "ערר 1192/18",
"case_name": "אילן",
"court": "ועדת ערר ירושלים",
"subject_tags": ["preservation", "nuisance"],
"summary": "שימור ומטרדים — איזון בין שימור מבנים לזכויות שכנים.",
"key_quote": "",
},
{
"case_number": "ערר 1009-02-24",
"case_name": "מובשוביץ",
"court": "ועדת ערר ירושלים",
"subject_tags": ["urban_renewal", "tama_38"],
"summary": 'התחדשות עירונית — ציטוט נרחב (~400 מילים) בהחלטת בית הכרם.',
"key_quote": "",
},
{
"case_number": "ערר 1156/18",
"case_name": "ארד",
"court": "ועדת ערר ירושלים",
"subject_tags": ["construction_nuisance"],
"summary": "מטרדי בנייה — מתי מטרד בנייה מצדיק התערבות.",
"key_quote": "",
},
{
"case_number": "ערר 1169/19",
"case_name": "זוהר",
"court": "ועדת ערר ירושלים",
"subject_tags": ["construction_nuisance"],
"summary": "מטרדי בנייה — המשך קו הפסיקה של ערר ארד.",
"key_quote": "",
},
{
"case_number": "ערר (ירושלים) 1078+1083/24",
"case_name": "אריאלי",
"court": "ועדת ערר ירושלים",
"subject_tags": ["structure_example", "proceedings_block"],
"summary": "שימשה כמודל מבני — פרק הליכים נפרד (31 סעיפים), מבנה מפורט.",
"key_quote": "",
},
{
"case_number": "ערר אדלר",
"case_name": "אדלר",
"court": "ועדת ערר ירושלים",
"subject_tags": ["consolidating_decision"],
"summary": "החלטה מאחדת שצוטטה בבית הכרם — טכניקת ציטוט דרך החלטה מרכזת.",
"key_quote": "",
},
]
# ═══════════════════════════════════════════════════════════════════
# Data: Statutory Provisions
# ═══════════════════════════════════════════════════════════════════
STATUTORY_PROVISIONS = [
{
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
"section_number": "152(א)(2)",
"section_title": "זכות ערר על אישור תכנית",
"full_text": "",
"common_usage": "שאלת סף — האם קיימת זכות ערר. כלי אסטרטגי, לא חובה.",
"subject_tags": ["threshold", "right_to_appeal"],
},
{
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
"section_number": "149",
"section_title": "הקלה",
"full_text": "",
"common_usage": "בקשות להקלה — סטייה מתכנית בניין עיר.",
"subject_tags": ["deviation", "relief"],
},
{
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
"section_number": "145",
"section_title": "היתר בנייה",
"full_text": "",
"common_usage": "עררים על סירוב/אישור היתר בנייה.",
"subject_tags": ["building_permit"],
},
{
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
"section_number": "196-198",
"section_title": "היטל השבחה",
"full_text": "",
"common_usage": "עררי היטל השבחה (8xxx) — חיוב בגין עליית שווי מקרקעין.",
"subject_tags": ["betterment_levy"],
},
{
"statute_name": "חוק התכנון והבנייה, תשכ\"ה-1965",
"section_number": "197",
"section_title": "פיצויים בגין ירידת ערך",
"full_text": "",
"common_usage": "עררי פיצויים (9xxx) — תביעה בגין ירידת ערך מקרקעין בשל תכנית.",
"subject_tags": ["compensation", "depreciation"],
},
{
"statute_name": "תמ\"א 38",
"section_number": "תיקון 2 + 3",
"section_title": "חיזוק מבנים מפני רעידות אדמה",
"full_text": "",
"common_usage": "חיזוק/הריסה ובנייה מחדש. אינטרס חלש יותר בבית בודד.",
"subject_tags": ["tama_38", "seismic_reinforcement"],
},
{
"statute_name": "חוק המקרקעין, תשכ\"ט-1969",
"section_number": "71ב(א)(1)",
"section_title": "רוב הדרוש לשינוי ברכוש משותף",
"full_text": "",
"common_usage": "בדיקת היתכנות קניינית — האם יש רוב לשינוי ברכוש משותף.",
"subject_tags": ["proprietary_claims", "common_property"],
},
]
# ═══════════════════════════════════════════════════════════════════
# Import Logic
# ═══════════════════════════════════════════════════════════════════
async def seed_lessons(conn) -> int:
    """Insert the entries of ``LESSONS`` that are missing from ``lessons_learned``.

    A lesson is treated as already present when a row with the same
    ``lesson_title`` exists. ``applies_to`` is stored JSON-encoded.

    Returns the number of rows inserted by this call.
    """
    lookup_sql = "SELECT id FROM lessons_learned WHERE lesson_title = $1"
    insert_sql = """INSERT INTO lessons_learned (lesson_title, lesson_text, category, applies_to, source_case, severity)
    VALUES ($1, $2, $3, $4, $5, $6)"""
    added = 0
    for lesson in LESSONS:
        already_there = await conn.fetchval(lookup_sql, lesson["lesson_title"])
        if not already_there:
            await conn.execute(
                insert_sql,
                lesson["lesson_title"],
                lesson["lesson_text"],
                lesson["category"],
                json.dumps(lesson["applies_to"]),
                lesson["source_case"],
                lesson["severity"],
            )
            added += 1
    return added
async def seed_phrases(conn) -> int:
    """Insert the entries of ``TRANSITION_PHRASES`` missing from ``transition_phrases``.

    A phrase is treated as already present when a row with the same
    ``phrase`` text exists. ``block_types`` is stored JSON-encoded.

    Returns the number of rows inserted by this call.
    """
    lookup_sql = "SELECT id FROM transition_phrases WHERE phrase = $1"
    insert_sql = """INSERT INTO transition_phrases (phrase, usage_context, block_types, source_decision)
    VALUES ($1, $2, $3, $4)"""
    added = 0
    for entry in TRANSITION_PHRASES:
        already_there = await conn.fetchval(lookup_sql, entry["phrase"])
        if not already_there:
            await conn.execute(
                insert_sql,
                entry["phrase"],
                entry["usage_context"],
                json.dumps(entry["block_types"]),
                entry["source_decision"],
            )
            added += 1
    return added
async def seed_case_law(conn) -> int:
    """Insert the entries of ``CASE_LAW`` that are missing from ``case_law``.

    A precedent is treated as already present when a row with the same
    ``case_number`` exists. ``subject_tags`` is stored JSON-encoded and a
    missing ``key_quote`` falls back to the empty string.

    Returns the number of rows inserted by this call.
    """
    lookup_sql = "SELECT id FROM case_law WHERE case_number = $1"
    insert_sql = """INSERT INTO case_law (case_number, case_name, court, subject_tags, summary, key_quote)
    VALUES ($1, $2, $3, $4, $5, $6)"""
    added = 0
    for ruling in CASE_LAW:
        already_there = await conn.fetchval(lookup_sql, ruling["case_number"])
        if not already_there:
            await conn.execute(
                insert_sql,
                ruling["case_number"],
                ruling["case_name"],
                ruling["court"],
                json.dumps(ruling["subject_tags"]),
                ruling["summary"],
                ruling.get("key_quote", ""),
            )
            added += 1
    return added
async def seed_statutes(conn) -> int:
    """Insert the entries of ``STATUTORY_PROVISIONS`` missing from ``statutory_provisions``.

    A provision is treated as already present when a row with the same
    (``statute_name``, ``section_number``) pair exists. ``subject_tags`` is
    stored JSON-encoded.

    Returns the number of rows inserted by this call.
    """
    lookup_sql = """SELECT id FROM statutory_provisions
    WHERE statute_name = $1 AND section_number = $2"""
    insert_sql = """INSERT INTO statutory_provisions
    (statute_name, section_number, section_title, full_text, common_usage, subject_tags)
    VALUES ($1, $2, $3, $4, $5, $6)"""
    added = 0
    for provision in STATUTORY_PROVISIONS:
        already_there = await conn.fetchval(
            lookup_sql, provision["statute_name"], provision["section_number"]
        )
        if not already_there:
            await conn.execute(
                insert_sql,
                provision["statute_name"],
                provision["section_number"],
                provision["section_title"],
                provision["full_text"],
                provision["common_usage"],
                json.dumps(provision["subject_tags"]),
            )
            added += 1
    return added
async def main():
    """Seed all four knowledge tables and print per-table insert counts.

    Runs ``seed_lessons``, ``seed_phrases``, ``seed_case_law`` and
    ``seed_statutes`` in that order on a single pooled connection. Each
    seeder skips rows that already exist, so the printed counts cover only
    rows inserted by this run.
    """
    await init_schema()
    pool = await get_pool()
    try:
        async with pool.acquire() as conn:
            n_lessons = await seed_lessons(conn)
            n_phrases = await seed_phrases(conn)
            n_case_law = await seed_case_law(conn)
            n_statutes = await seed_statutes(conn)
    finally:
        # Close the pool even when one of the seeders raises, so a failed run
        # does not leave connections open behind it.
        await close_pool()
    print(f"✓ lessons_learned: {n_lessons} inserted")
    print(f"✓ transition_phrases: {n_phrases} inserted")
    print(f"✓ case_law: {n_case_law} inserted")
    print(f"✓ statutory_provisions: {n_statutes} inserted")
    print(f" Total: {n_lessons + n_phrases + n_case_law + n_statutes} records")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,257 @@
#!/usr/bin/env python3
"""Validate a decision against block-schema rules.
Usage: python validate-decision.py <case_number>
Checks:
1. Neutral background (block-vav) — no party quotes or value words
2. Weight compliance — blocks within expected ranges
3. Structural integrity — all required blocks present
4. No duplication — sentences from block-vav are not repeated verbatim in block-yod
(Claims coverage is not checked yet; only the number of claims is reported.)
"""
import asyncio
import json
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src"))
from legal_mcp.services.db import get_pool, init_schema, close_pool
# Value/judgment words that shouldn't appear in the neutral background
# (block-vav). check_neutral_background() flags every line that contains one
# of these as a plain substring, so longer words containing an entry are
# flagged too.
VALUE_WORDS = [
    "חריג", "חטא", "בעייתי", "מזעזע", "שערורייתי", "מגוחך",
    "נפשע", "פגום", "חמור", "מקומם", "בלתי סביר", "מופרז",
    "מגונה", "פסול", "נלוז", "מטריד",
]
# Party quote indicators: regular expressions for phrases that attribute a
# position to a party ("according to the appellants/respondents/applicants",
# "the appellant claims", "the respondent claims", "in the view of ...").
# The alternatives are word stems, so they also match inflected endings.
# Used with re.search() on each background line by check_neutral_background().
QUOTE_INDICATORS = [
    r"לטענת\s+(העוררי|המשיב|מבקשי)",
    r"לדברי\s+(העוררי|המשיב|מבקשי)",
    r"העורר\s+טוען",
    r"המשיבה\s+טוענת",
    r"לשיטת\s+(העוררי|המשיב)",
]
# Expected weight ranges per block type for licensing appeals.
# Maps block_id -> (low, high), both in percent of the decision's total word
# count. check_weight_compliance() uses this table for every appeal type
# other than "levy".
WEIGHT_RANGES_LICENSING = {
    "block-he": (0.5, 5),
    "block-vav": (3, 40),
    "block-zayin": (13, 40),
    "block-chet": (0, 15),
    "block-tet": (0, 15),
    "block-yod": (30, 75),
    "block-yod-alef": (1, 10),
    "block-yod-bet": (0, 2),
}
# Expected weight ranges for betterment-levy appeals.
# Maps block_id -> (low, high), both in percent of the decision's total word
# count. check_weight_compliance() uses this table when appeal_type == "levy".
WEIGHT_RANGES_LEVY = {
    "block-he": (0, 5),
    "block-vav": (2, 20),
    "block-zayin": (15, 40),
    "block-chet": (0, 25),
    "block-tet": (0, 15),
    "block-yod": (25, 75),
    "block-yod-alef": (1, 10),
    "block-yod-bet": (0, 3),
}
def check_neutral_background(content: str) -> list[str]:
    """Scan the background block (block-vav) for neutrality violations.

    Every line is checked for each entry of ``VALUE_WORDS`` (plain substring
    match) and then for each regex in ``QUOTE_INDICATORS``. Each hit yields
    one message carrying the 1-based line number, the matched text and the
    first 80 characters of the line.

    Returns the messages in line order; an empty list when ``content`` is
    empty or clean.
    """
    if not content:
        return []
    found = []
    for line_no, text in enumerate(content.split("\n"), start=1):
        # Value words are reported before quote indicators within a line.
        found.extend(
            f"מילת שיפוט ברקע (שורה {line_no}): \"{word}\"\"{text[:80]}...\""
            for word in VALUE_WORDS
            if word in text
        )
        for pattern in QUOTE_INDICATORS:
            hit = re.search(pattern, text)
            if hit:
                found.append(f"ציטוט מצד ברקע (שורה {line_no}): \"{hit.group()}\"\"{text[:80]}...\"")
    return found
def check_weight_compliance(blocks: list[dict], appeal_type: str) -> list[str]:
    """Compare each block's share of the total word count with its expected range.

    ``appeal_type == "levy"`` selects ``WEIGHT_RANGES_LEVY``; any other value
    selects ``WEIGHT_RANGES_LICENSING``. Blocks that have no configured range
    or a zero word count are not evaluated.

    Returns one message per block whose percentage falls below or above its
    range, or a single "no content" message when the total word count is 0.
    """
    if appeal_type == "levy":
        ranges = WEIGHT_RANGES_LEVY
    else:
        ranges = WEIGHT_RANGES_LICENSING
    total_words = sum(entry["word_count"] for entry in blocks)
    if total_words == 0:
        return ["אין תוכן בהחלטה"]
    problems = []
    for entry in blocks:
        block_id = entry["block_id"]
        if block_id not in ranges or entry["word_count"] <= 0:
            continue
        share = entry["word_count"] / total_words * 100
        low, high = ranges[block_id]
        if share < low:
            problems.append(f"בלוק {block_id} ({entry['title']}): משקל {share:.1f}% — מתחת לטווח ({low}-{high}%)")
        elif share > high:
            problems.append(f"בלוק {block_id} ({entry['title']}): משקל {share:.1f}% — מעל לטווח ({low}-{high}%)")
    return problems
def check_structural_integrity(blocks: list[dict]) -> list[str]:
    """Check that the required blocks exist and that the discussion dominates.

    Two rules are applied:

    * ``block-he``, ``block-zayin`` and ``block-yod`` must each be present
      with a non-zero word count.
    * When ``block-yod`` exists, no other block (ignoring ``block-alef``
      through ``block-dalet``) may have a strictly larger word count.

    Returns a list of issue messages, empty when both rules hold.
    """
    issues = []
    required = ["block-he", "block-zayin", "block-yod"]
    block_ids = {b["block_id"] for b in blocks if b["word_count"] > 0}
    issues.extend(f"בלוק חובה חסר: {req}" for req in required if req not in block_ids)

    # Check discussion is the heaviest block
    yod = next((b for b in blocks if b["block_id"] == "block-yod"), None)
    if yod is not None:
        excluded = ("block-alef", "block-bet", "block-gimel", "block-dalet")
        max_block = max(
            (b for b in blocks if b["block_id"] not in excluded),
            key=lambda x: x["word_count"],
            default=None,
        )
        # max() returns the first maximal item, so a block that only ties
        # with block-yod must not be reported as "larger": require a strictly
        # greater word count.
        if (
            max_block is not None
            and max_block["block_id"] != "block-yod"
            and max_block["word_count"] > yod["word_count"]
        ):
            issues.append(f"בלוק הדיון (י) אינו הבלוק הגדול ביותר — {max_block['title']} ({max_block['word_count']} מילים) גדול יותר")
    return issues
def check_no_duplication(vav_content: str, yod_content: str) -> list[str]:
    """Report background sentences that reappear verbatim in the discussion.

    ``vav_content`` is split on ``.``, ``!`` and ``?``; each stripped fragment
    longer than 30 characters is looked up as a substring of ``yod_content``.

    Returns one message per repeated fragment, quoting its first 60
    characters; an empty list when either text is empty.
    """
    if not (vav_content and yod_content):
        return []
    fragments = (piece.strip() for piece in re.split(r'[.!?]', vav_content))
    return [
        f"כפילות: משפט מהרקע חוזר בדיון — \"{fragment[:60]}...\""
        for fragment in fragments
        if len(fragment) > 30 and fragment in yod_content
    ]
def _print_section(issues: list[str], failure_header: str, success_line: str) -> None:
    """Print one report section: the header plus every issue, or the success line."""
    if not issues:
        print(success_line)
        return
    print(failure_header)
    for i in issues:
        print(f"{i}")


async def main():
    """Validate the decision of the case named in ``sys.argv[1]`` and print a report.

    Loads the case, its decision, the decision blocks and the claim count
    from the database, then runs the neutral-background, weight, structure
    and duplication checks and prints one section per check plus a total.

    Exits with status 1 when the case number argument is missing, the case
    is not found, or the case has no decision.
    """
    if len(sys.argv) < 2:
        print("שימוש: python validate-decision.py <מספר_תיק>")
        sys.exit(1)
    case_number = sys.argv[1]
    await init_schema()
    pool = await get_pool()
    try:
        async with pool.acquire() as conn:
            case = await conn.fetchrow(
                "SELECT * FROM cases WHERE case_number = $1", case_number
            )
            if not case:
                print(f"תיק {case_number} לא נמצא")
                sys.exit(1)
            decision = await conn.fetchrow(
                "SELECT * FROM decisions WHERE case_id = $1",
                case["id"],
            )
            if not decision:
                print(f"אין החלטה לתיק {case_number}")
                sys.exit(1)
            blocks = await conn.fetch(
                """SELECT block_id, title, content, word_count, weight_percent
                FROM decision_blocks WHERE decision_id = $1
                ORDER BY block_index""",
                decision["id"],
            )
            blocks = [dict(b) for b in blocks]
            claims_count = await conn.fetchval(
                "SELECT count(*) FROM claims WHERE case_id = $1", case["id"]
            )
    finally:
        # Runs on the early sys.exit() paths and on query errors as well, so
        # the pool is always closed before the process ends.
        await close_pool()

    # Determine appeal type from the leading number of the case number
    # (the part before any "/", "+" or "-"): 8xxx = levy, 9xxx = compensation,
    # anything else = licensing.
    num = case_number.split("/")[0].split("+")[0].split("-")[0]
    if num.startswith("8"):
        appeal_type = "levy"
        appeal_type_heb = "היטל השבחה"
    elif num.startswith("9"):
        appeal_type = "compensation"
        appeal_type_heb = "פיצויים"
    else:
        appeal_type = "licensing"
        appeal_type_heb = "רישוי ובנייה"
    print(f"{'='*60}")
    print(f"ולידציה: {case_number}{case['title']}")
    print(f"סוג: {appeal_type_heb} | מילים: {decision['total_words']} | טענות: {claims_count}")
    print(f"{'='*60}")
    all_issues = []

    # 1. Neutral background
    vav = next((b for b in blocks if b["block_id"] == "block-vav"), None)
    issues = check_neutral_background(vav["content"] if vav else "")
    _print_section(issues, f"\n❌ רקע ניטרלי — {len(issues)} בעיות:", "\n✅ רקע ניטרלי — תקין")
    all_issues.extend(issues)

    # 2. Weight compliance
    issues = check_weight_compliance(blocks, appeal_type)
    _print_section(issues, f"\n⚠ משקלות — {len(issues)} חריגות:", "\n✅ משקלות — בטווח")
    all_issues.extend(issues)

    # 3. Structural integrity
    issues = check_structural_integrity(blocks)
    _print_section(issues, f"\n❌ מבנה — {len(issues)} בעיות:", "\n✅ מבנה — תקין")
    all_issues.extend(issues)

    # 4. No duplication
    yod = next((b for b in blocks if b["block_id"] == "block-yod"), None)
    issues = check_no_duplication(
        vav["content"] if vav else "",
        yod["content"] if yod else "",
    )
    _print_section(issues, f"\n⚠ כפילויות — {len(issues)} נמצאו:", "\n✅ ללא כפילויות — תקין")
    all_issues.extend(issues)

    # Summary
    print(f"\n{'='*60}")
    if all_issues:
        print(f"סה\"כ: {len(all_issues)} בעיות נמצאו")
    else:
        print("✅ ההחלטה עומדת בכל הכללים")


if __name__ == "__main__":
    asyncio.run(main())