diff --git a/CLAUDE.md b/CLAUDE.md index 9e14e1f..558c80f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,7 +30,7 @@ - לקחים מהשוואת טיוטות לגרסאות סופיות - סקריפט ייצוא DOCX -כל החומר הועבר לתיקיית `legacy/` כקריאה בלבד. **הפרויקט הנוכחי** מעביר את הידע הזה למערכת מובנית עם PostgreSQL + pgvector + n8n. +הידע שהופק מה-vault הוטמע במערכת הנוכחית — מסמכי ייחוס (`docs/`), קורפוס אימון (`data/training/`), ומבנה 12 בלוקים. ה-vault המקורי נמחק; הפרויקט הנוכחי עובד עם PostgreSQL + pgvector. --- @@ -82,15 +82,23 @@ │ └── docx/ עיצוב DOCX ├── data/ │ ├── training/ ← 4 החלטות לאימון (DOCX) -│ ├── exports/ ← ייצוא legacy (תיקים ישנים) +│ ├── exports/ ← טיוטות DOCX מיוצאות │ └── cases/{case-number}/ ← תיקי עררים (מבנה שטוח, סטטוס ב-DB) ├── web/ ← UI + API + integration clients ├── mcp-server/ ← MCP server + services + tools -└── scripts/ ← סקריפטים וכלי עזר +└── scripts/ ← סקריפטים וכלי עזר (ראה scripts/SCRIPTS.md) + └── .archive/ ← סקריפטים שהושלמו (לא להריץ) ``` --- +## כלל: עדכון `scripts/SCRIPTS.md` + +בכל פעם שנוצר, נמחק, או משתנה סקריפט בתיקיית `scripts/` — **חובה לעדכן את `scripts/SCRIPTS.md`** בהתאם. +הקובץ מתעד את התפקיד, הסטטוס, וההחלפה (אם יש) של כל סקריפט. + +--- + ## ניהול משימות — TaskMaster AI הפרויקט משתמש ב-**TaskMaster AI** (MCP server) לניהול משימות מובנה: diff --git a/scripts/backfill_pattern_frequency.py b/scripts/.archive/backfill_pattern_frequency.py similarity index 100% rename from scripts/backfill_pattern_frequency.py rename to scripts/.archive/backfill_pattern_frequency.py diff --git a/scripts/batch_upload_training.py b/scripts/.archive/batch_upload_training.py similarity index 100% rename from scripts/batch_upload_training.py rename to scripts/.archive/batch_upload_training.py diff --git a/scripts/benchmark_embeddings.py b/scripts/.archive/benchmark_embeddings.py similarity index 100% rename from scripts/benchmark_embeddings.py rename to scripts/.archive/benchmark_embeddings.py diff --git a/scripts/benchmark_new_vs_old.py b/scripts/.archive/benchmark_new_vs_old.py similarity index 100% rename from scripts/benchmark_new_vs_old.py rename to scripts/.archive/benchmark_new_vs_old.py diff --git a/scripts/decompose-decisions.py b/scripts/.archive/decompose-decisions.py similarity index 100% rename from scripts/decompose-decisions.py rename to scripts/.archive/decompose-decisions.py diff --git a/scripts/export-decision-docx.py b/scripts/.archive/export-decision-docx.py similarity index 100% rename from scripts/export-decision-docx.py rename to scripts/.archive/export-decision-docx.py diff --git a/scripts/extract-citations.py b/scripts/.archive/extract-citations.py similarity index 100% rename from scripts/extract-citations.py rename to scripts/.archive/extract-citations.py diff --git a/scripts/extract-claims.py b/scripts/.archive/extract-claims.py similarity index 100% rename from scripts/extract-claims.py rename to scripts/.archive/extract-claims.py diff --git a/scripts/extract_all_google_vision.py b/scripts/.archive/extract_all_google_vision.py similarity index 100% rename from scripts/extract_all_google_vision.py rename to scripts/.archive/extract_all_google_vision.py diff --git a/scripts/extract_originals.py b/scripts/.archive/extract_originals.py similarity index 100% rename from scripts/extract_originals.py rename to scripts/.archive/extract_originals.py diff --git a/scripts/extract_originals_ocr.py b/scripts/.archive/extract_originals_ocr.py similarity index 100% rename from scripts/extract_originals_ocr.py rename to scripts/.archive/extract_originals_ocr.py diff --git a/scripts/generate-embeddings.py b/scripts/.archive/generate-embeddings.py similarity index 100% rename from scripts/generate-embeddings.py rename to scripts/.archive/generate-embeddings.py diff --git a/scripts/link-claims-to-discussion.py b/scripts/.archive/link-claims-to-discussion.py similarity index 100% rename from scripts/link-claims-to-discussion.py rename to scripts/.archive/link-claims-to-discussion.py diff --git a/scripts/proofread_training_corpus.py b/scripts/.archive/proofread_training_corpus.py similarity index 100% rename from scripts/proofread_training_corpus.py rename to scripts/.archive/proofread_training_corpus.py diff --git a/scripts/seed-appeals.py b/scripts/.archive/seed-appeals.py similarity index 100% rename from scripts/seed-appeals.py rename to scripts/.archive/seed-appeals.py diff --git a/scripts/seed-knowledge.py b/scripts/.archive/seed-knowledge.py similarity index 100% rename from scripts/seed-knowledge.py rename to scripts/.archive/seed-knowledge.py diff --git a/scripts/validate-decision.py b/scripts/.archive/validate-decision.py similarity index 100% rename from scripts/validate-decision.py rename to scripts/.archive/validate-decision.py diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md new file mode 100644 index 0000000..185efa9 --- /dev/null +++ b/scripts/SCRIPTS.md @@ -0,0 +1,51 @@ +# scripts/ — מדריך סקריפטים + +> **כלל:** כל עדכון, יצירה, או מחיקה של סקריפט בתיקייה זו מחייב עדכון של קובץ זה. + +--- + +## סקריפטים פעילים + +| Script | Type | Purpose | Scheduled | +|--------|------|---------|-----------| +| `auto-sync-cases.sh` | bash | סנכרון תיקי ערר ל-Gitea — רץ כל דקה | `* * * * *` (cron) | +| `backup-db.sh` | bash | גיבוי PostgreSQL יומי ל-`data/backups/` (gzip) | לתזמן: `0 2 * * *` | +| `restore-db.sh` | bash | שחזור DB מגיבוי (companion ל-backup-db.sh) | ידני | +| `notify.py` | python | שליחת מייל התראה מסוכנים via SMTP (Gmail) | נקרא ע"י סוכנים | +| `bidi_table.py` | python | יצירת טבלאות box-drawing עם תמיכה ב-BiDi (עברית+אנגלית) | ספריית עזר | + +## תיקיית `.archive/` — סקריפטים שהושלמו + +סקריפטים חד-פעמיים שהפונקציונליות שלהם הוטמעה ב-MCP server או ב-API. +נשמרים ב-git לצורך היסטוריה — **אין להריץ אותם**. + +| Script | Original Purpose | Superseded By | +|--------|-----------------|---------------| +| `backfill_pattern_frequency.py` | עדכון תדירות דפוסי סגנון ב-DB | `web/app.py::_extract_pattern_variants()` | +| `batch_upload_training.py` | העלאת קורפוס אימון (16 קבצים) | Web UI: `/api/training/upload` | +| `benchmark_embeddings.py` | השוואת מודלי embeddings (voyage-3 vs voyage-4) | הושלם — voyage-3-large נבחר | +| `benchmark_new_vs_old.py` | השוואת Google Vision vs markdown קיים | הושלם — בדיקה חד-פעמית לתיק 1130-25 | +| `decompose-decisions.py` | פירוק החלטות סופיות ל-12 בלוקים | MCP: `write_block()`, `write_all_blocks()` | +| `export-decision-docx.py` | ייצוא החלטה ל-DOCX | MCP: `export_docx()` | +| `extract-citations.py` | חילוץ ציטוטי פסיקה מבלוק י | MCP service: `references_extractor.py` | +| `extract-claims.py` | חילוץ טענות מבלוק ז | MCP: `extract_claims()` + `claims_extractor.py` | +| `extract_all_google_vision.py` | OCR בכמות עם Google Vision | MCP: `document_upload()` pipeline | +| `extract_originals.py` | חילוץ טקסט מ-PDF עם Claude Opus | MCP service: `extractor.py` | +| `extract_originals_ocr.py` | חילוץ OCR מלא מ-PDF | MCP service: `extractor.py` | +| `generate-embeddings.py` | יצירת embeddings לבלוקים ופסיקה | אוטומטי — נוצרים עם יצירת בלוקים | +| `link-claims-to-discussion.py` | קישור טענות לפסקאות דיון | MCP service: `qa_validator.py` | +| `proofread_training_corpus.py` | ניקוי Nevo מ-DOCX/PDF ל-Markdown | MCP service: `proofreader.py` + Web UI | +| `seed-appeals.py` | seeding תיקי ערר ראשוניים ל-DB | MCP: `case_create()` | +| `seed-knowledge.py` | seeding לקחים, ביטויי מעבר, פסיקה | MCP: `record_chair_feedback()`, `precedent_attach()` | +| `validate-decision.py` | ולידציה מול block-schema | MCP: `validate_decision()` + `qa_validator.py` | + +## סקריפטים שנמחקו (git history בלבד) + +| Script | Reason | +|--------|--------| +| `import-final-decisions.py` | מיגרציה הושלמה — כל ההחלטות ב-`data/training/` | +| `compare_extractions.py` | בדיקה חד-פעמית לתיק 1130-25 | +| `decompose-decisions-v2.py` | כפילות של v1 | +| `extract_google_vision.py` | hardcoded לתיק בודד | +| `extract_google_vision_single.py` | wrapper חד-פעמי | +| `test-search.py` | סקריפט דיבאג | diff --git a/scripts/compare_extractions.py b/scripts/compare_extractions.py deleted file mode 100644 index 3c639cb..0000000 --- a/scripts/compare_extractions.py +++ /dev/null @@ -1,126 +0,0 @@ -"""Compare existing MD files with freshly extracted text from PDFs.""" - -import difflib -from pathlib import Path - -DOCS_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents") -EXTRACTED_DIR = DOCS_DIR / "extracted" - -# Map: existing MD -> extracted MD -PAIRS = [ - ("2025-08-14-כתב-ערר-קובר.md", "מרק קובר-כתב ערר.md", "Appeal - Kuber"), - ("2025-09-01-כתב-תשובה-ליבמן-לערר.md", "תשובה לערר מטעם המשיבים.md", "Response - Livman"), - ("2025-09-02-כתב-תשובה-ועדת-הראל-לערר.md", "תשובת הועדה המרחבית לערר.md", "Response - Committee"), - ("2025-10-22-כתב-ערר-מטמון.md", "תשובת המשיב-יצחק מטמון.md", "Response - Matmon"), -] - - -def normalize(text: str) -> str: - """Normalize text for comparison.""" - # Remove markdown formatting, extra whitespace - lines = text.strip().split("\n") - lines = [l.strip() for l in lines if l.strip()] - return "\n".join(lines) - - -def word_overlap(a: str, b: str) -> float: - """Calculate word-level overlap ratio.""" - words_a = set(a.split()) - words_b = set(b.split()) - if not words_a or not words_b: - return 0.0 - intersection = words_a & words_b - return len(intersection) / max(len(words_a), len(words_b)) - - -def main(): - print(f"{'=' * 70}") - print("COMPARISON: Existing MD vs Fresh PDF Extraction") - print(f"{'=' * 70}\n") - - summary = [] - - for existing_name, extracted_name, label in PAIRS: - existing_path = DOCS_DIR / existing_name - extracted_path = EXTRACTED_DIR / extracted_name - - if not existing_path.exists(): - print(f"SKIP: {existing_name} not found") - continue - if not extracted_path.exists(): - print(f"SKIP: {extracted_name} not found") - continue - - existing_text = existing_path.read_text(encoding="utf-8") - extracted_text = extracted_path.read_text(encoding="utf-8") - - existing_norm = normalize(existing_text) - extracted_norm = normalize(extracted_text) - - # Stats - existing_chars = len(existing_text) - extracted_chars = len(extracted_text) - existing_words = len(existing_text.split()) - extracted_words = len(extracted_text.split()) - - # Similarity - overlap = word_overlap(existing_norm, extracted_norm) - - # Sequence matcher ratio (slower but more accurate) - # Use first 5000 chars for speed - sm = difflib.SequenceMatcher(None, existing_norm[:5000], extracted_norm[:5000]) - seq_ratio = sm.ratio() - - # Find lines in extracted but not in existing (new content) - existing_lines = set(existing_norm.split("\n")) - extracted_lines = set(extracted_norm.split("\n")) - new_lines = extracted_lines - existing_lines - missing_lines = existing_lines - extracted_lines - - print(f"{'=' * 70}") - print(f" {label}") - print(f" Existing: {existing_name}") - print(f" Extracted: {extracted_name}") - print(f"{'=' * 70}") - print(f" {'Metric':<30} {'Existing MD':>15} {'Fresh PDF':>15} {'Diff':>10}") - print(f" {'-' * 70}") - print(f" {'Characters':<30} {existing_chars:>15,} {extracted_chars:>15,} {extracted_chars - existing_chars:>+10,}") - print(f" {'Words':<30} {existing_words:>15,} {extracted_words:>15,} {extracted_words - existing_words:>+10,}") - print(f" {'Lines':<30} {len(existing_lines):>15,} {len(extracted_lines):>15,} {len(extracted_lines) - len(existing_lines):>+10,}") - print(f" {'Word overlap':<30} {overlap:>15.1%}") - print(f" {'Sequence similarity':<30} {seq_ratio:>15.1%}") - print(f" {'Lines only in fresh PDF':<30} {len(new_lines):>15}") - print(f" {'Lines only in existing MD':<30} {len(missing_lines):>15}") - - # Show sample differences - if new_lines: - print(f"\n Sample lines ONLY in fresh extraction (first 3):") - for line in sorted(new_lines)[:3]: - print(f" + {line[:100]}") - if missing_lines: - print(f"\n Sample lines ONLY in existing MD (first 3):") - for line in sorted(missing_lines)[:3]: - print(f" - {line[:100]}") - - print() - - summary.append({ - "label": label, - "existing_words": existing_words, - "extracted_words": extracted_words, - "word_overlap": overlap, - "seq_similarity": seq_ratio, - }) - - # Summary table - print(f"\n{'=' * 70}") - print("SUMMARY") - print(f"{'=' * 70}") - print(f" {'Document':<25} {'Existing':>10} {'Fresh':>10} {'Overlap':>10} {'Similarity':>12}") - print(f" {'-' * 67}") - for s in summary: - print(f" {s['label']:<25} {s['existing_words']:>10,} {s['extracted_words']:>10,} {s['word_overlap']:>10.1%} {s['seq_similarity']:>12.1%}") - - -if __name__ == "__main__": - main() diff --git a/scripts/decompose-decisions-v2.py b/scripts/decompose-decisions-v2.py deleted file mode 100644 index a6ec90d..0000000 --- a/scripts/decompose-decisions-v2.py +++ /dev/null @@ -1,289 +0,0 @@ -#!/usr/bin/env python3 -"""Decompose final decisions into 12-block structure — V2 calibrated on הכט. - -Key insight: DOCX extraction strips header blocks (א-ד). The real content -starts at block ה (opening "לפנינו"). We identify blocks by known section -headers and line-by-line analysis. -""" - -import asyncio -import json -import re -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src")) - -from legal_mcp.services.db import get_pool, init_schema, close_pool - - -BLOCK_DEFS = [ - ("block-alef", 1, "כותרת מוסדית", "template-fill"), - ("block-bet", 2, "הרכב הוועדה", "template-fill"), - ("block-gimel", 3, "צדדים", "template-fill"), - ("block-dalet", 4, "כותרת החלטה", "template-fill"), - ("block-he", 5, "פתיחה", "paraphrase"), - ("block-vav", 6, "רקע עובדתי", "reproduction"), - ("block-zayin", 7, "טענות הצדדים", "paraphrase"), - ("block-chet", 8, "הליכים בפני ועדת הערר", "reproduction"), - ("block-tet", 9, "תכניות חלות", "guided-synthesis"), - ("block-yod", 10, "דיון והכרעה", "rhetorical-construction"), - ("block-yod-alef", 11, "סיכום", "paraphrase"), - ("block-yod-bet", 12, "חתימות", "template-fill"), -] - - -def find_line(lines: list[str], pattern: str, start: int = 0) -> int: - """Find first line matching pattern (substring or regex). Returns -1 if not found.""" - pat = re.compile(pattern) - for i in range(start, len(lines)): - if pat.search(lines[i]): - return i - return -1 - - -def slice_text(lines: list[str], start: int, end: int) -> str: - """Join lines[start:end] into text.""" - if start < 0 or end <= start: - return "" - return "\n".join(lines[start:end]).strip() - - -def count_words(text: str) -> int: - return len(text.split()) if text else 0 - - -def decompose(text: str) -> dict[str, str]: - """Parse decision into blocks. Returns {block_id: content}.""" - lines = text.split("\n") - n = len(lines) - blocks = {} - - # Find key section headers - # Style 1: רישוי — descriptive headers ("תמצית טענות הצדדים", "דיון והכרעה") - # Style 2: היטל השבחה — numbered headers ("א. רקע עובדתי", "ו. דיון והכרעה") - opening = find_line(lines, r"^לפנינו\s|^בפנינו\s|^בפני\s*ועדת|^בפני\s*בקשה") - - claims = find_line(lines, r"תמצית\s*טענות|טענות\s*הצדדים|טענות\s*העוררי") - if claims == -1: - claims = find_line(lines, r"^טענות\s*העוררי") - if claims == -1: - # היטל השבחה style: "ב. טענות העורר" - claims = find_line(lines, r"^[א-ת][\.\)]\s*טענות") - - background = find_line(lines, r"^[א-ת][\.\)]\s*רקע\s*עובדתי") - - proceedings = find_line(lines, r"ההליכים\s*בפני|הליכים\s*בפני|הדיון\s*בפני\s*ועדת\s*הערר") - if proceedings == -1: - # היטל השבחה: "ד. הבהרות השמאית" or similar procedural sections - proceedings = find_line(lines, r"^[א-ת][\.\)]\s*הבהרות|^[א-ת][\.\)]\s*ההליך") - - plans = find_line(lines, r"תכניות\s*חלות|המסגרת\s*הנורמטיבית|הוראות\s*התכנית") - if plans == -1: - plans = find_line(lines, r"^[א-ת][\.\)]\s*המסגרת\s*הנורמטיבית") - - discussion = find_line(lines, r"^דיון\s*והכרעה|^דיון$|^הכרעה$") - if discussion == -1: - discussion = find_line(lines, r"^[א-ת][\.\)]\s*דיון\s*והכרעה") - - summary = find_line(lines, r"^סיכום\s*$|^סוף\s*דבר\s*$") - if summary == -1: - summary = find_line(lines, r"^[א-ת][\.\)]\s*סיכום") - signature = find_line(lines, r"^ניתנה?\s*(היום|פה\s*אחד|ביום)") - - # If no explicit discussion header, look for the opening formula - if discussion == -1: - discussion = find_line(lines, r"לאחר\s*שבחנו\s*את\s*טענות") - - # ── Header blocks (א-ד): everything before opening ── - if opening >= 0: - header_text = slice_text(lines, 0, opening) - if header_text: - # Try to split header, but usually DOCX extraction loses these - blocks["block-alef"] = header_text - else: - blocks["block-alef"] = "" - else: - blocks["block-alef"] = "" - - blocks["block-bet"] = "" # Usually lost in extraction - blocks["block-gimel"] = "" - blocks["block-dalet"] = "החלטה" - - # ── Block ה: Opening — first 1-3 paragraphs from "לפנינו" ── - if opening >= 0: - next_section = claims if claims > opening else discussion if discussion > opening else n - opening_end = opening + 1 - for i in range(opening + 1, min(opening + 5, next_section)): - line = lines[i].strip() - if not line: - break - opening_end = i + 1 - blocks["block-he"] = slice_text(lines, opening, opening_end) - else: - blocks["block-he"] = "" - - # ── Block ו: Background ── - # Style 1 (רישוי): after opening, before claims - # Style 2 (היטל השבחה): explicit "א. רקע עובדתי" section - if background >= 0: - # Explicit background header (היטל השבחה style) - bg_end = claims if claims > background else (proceedings if proceedings > background else (discussion if discussion > background else n)) - blocks["block-vav"] = slice_text(lines, background, bg_end) - # In this case, opening (ה) might not exist — "לפנינו" may be absent - elif opening >= 0 and claims > opening: - bg_start = opening + 1 - he_lines = count_words(blocks.get("block-he", "")) - if he_lines > 0: - he_end = opening - for i in range(opening, min(opening + 5, claims)): - if lines[i].strip(): - he_end = i + 1 - else: - break - bg_start = he_end - blocks["block-vav"] = slice_text(lines, bg_start, claims) - elif opening >= 0 and discussion > opening: - blocks["block-vav"] = slice_text(lines, opening + 1, discussion) - else: - blocks["block-vav"] = "" - - # ── Block ז: Claims — from claims header to next section ── - if claims >= 0: - claims_end = min( - x for x in [proceedings, plans, discussion, summary, n] - if x > claims - ) - blocks["block-zayin"] = slice_text(lines, claims, claims_end) - else: - blocks["block-zayin"] = "" - - # ── Block ח: Proceedings (optional) ── - if proceedings >= 0: - proc_end = min( - x for x in [plans, discussion, summary, n] - if x > proceedings - ) - blocks["block-chet"] = slice_text(lines, proceedings, proc_end) - else: - blocks["block-chet"] = "" - - # ── Block ט: Plans (optional) ── - if plans >= 0 and (discussion == -1 or plans < discussion): - plans_end = min( - x for x in [discussion, summary, n] - if x > plans - ) - blocks["block-tet"] = slice_text(lines, plans, plans_end) - else: - blocks["block-tet"] = "" - - # ── Block י: Discussion ── - if discussion >= 0: - disc_end = summary if summary > discussion else (signature if signature > discussion else n) - blocks["block-yod"] = slice_text(lines, discussion, disc_end) - else: - blocks["block-yod"] = "" - - # ── Block יא: Summary ── - if summary >= 0: - summ_end = signature if signature > summary else n - blocks["block-yod-alef"] = slice_text(lines, summary, summ_end) - else: - blocks["block-yod-alef"] = "" - - # ── Block יב: Signatures ── - if signature >= 0: - blocks["block-yod-bet"] = slice_text(lines, signature, n) - else: - blocks["block-yod-bet"] = "" - - return blocks - - -async def main(): - await init_schema() - pool = await get_pool() - - async with pool.acquire() as conn: - decisions = await conn.fetch( - """SELECT d.id as decision_id, c.case_number, c.title, - doc.extracted_text - FROM decisions d - JOIN cases c ON c.id = d.case_id - JOIN documents doc ON doc.case_id = d.case_id AND doc.doc_type = 'decision' - WHERE d.status = 'final' - ORDER BY c.case_number""" - ) - - for dec in decisions: - decision_id = dec["decision_id"] - case_number = dec["case_number"] - text = dec["extracted_text"] - total_words = count_words(text) - - print(f"\n{'='*60}") - print(f"מפרק: {case_number} — {dec['title']}") - print(f"סה\"כ מילים: {total_words}") - print(f"{'='*60}") - - parsed = decompose(text) - - async with pool.acquire() as conn: - # Delete existing blocks - await conn.execute( - "DELETE FROM decision_blocks WHERE decision_id = $1", decision_id - ) - - total_parsed_words = 0 - for block_id, block_index, title, gen_type in BLOCK_DEFS: - content = parsed.get(block_id, "") - wc = count_words(content) - weight = round(wc / total_words * 100, 1) if total_words > 0 and wc > 0 else 0 - status = "final" if wc > 0 else "empty" - total_parsed_words += wc - - await conn.execute( - """INSERT INTO decision_blocks - (decision_id, block_id, block_index, title, content, - word_count, weight_percent, generation_type, status) - VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)""", - decision_id, block_id, block_index, title, - content, wc, weight, gen_type, status, - ) - - marker = "✅" if wc > 0 else "⬜" - print(f" {marker} {block_id:18s} | {title:25s} | {wc:5d} מילים | {weight:5.1f}%") - - # Update decision totals - disc_words = count_words(parsed.get("block-yod", "")) - disc_paras = len([p for p in parsed.get("block-yod", "").split("\n") if p.strip() and len(p.strip()) > 20]) - await conn.execute( - "UPDATE decisions SET total_words = $1, total_paragraphs = $2, updated_at = now() WHERE id = $3", - total_words, disc_paras, decision_id, - ) - - coverage = round(total_parsed_words / total_words * 100, 1) if total_words > 0 else 0 - print(f" --- כיסוי: {total_parsed_words}/{total_words} מילים ({coverage}%)") - - # Summary - async with pool.acquire() as conn: - stats = await conn.fetch( - """SELECT block_id, count(*) as decisions, - avg(word_count) as avg_words, - avg(weight_percent) as avg_weight - FROM decision_blocks - WHERE word_count > 0 - GROUP BY block_id ORDER BY block_id""" - ) - - print(f"\n{'='*60}") - print("סטטיסטיקה לפי בלוק (רק בלוקים עם תוכן):") - for s in stats: - print(f" {s['block_id']:18s} | {s['decisions']} החלטות | ממוצע {s['avg_words']:.0f} מילים | {s['avg_weight']:.1f}%") - - await close_pool() - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/scripts/extract_google_vision.py b/scripts/extract_google_vision.py deleted file mode 100644 index 5679654..0000000 --- a/scripts/extract_google_vision.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Extract text from PDF using Google Cloud Vision API.""" - -import io -import time -from pathlib import Path - -import fitz # PyMuPDF for rendering pages to images -from google.cloud import vision - -API_KEY = "AIzaSyDZgUsxsy_FHkkREU7R_oQLJALU3_V26j8" - -PDF_PATH = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/originals/מרק קובר-כתב ערר.pdf") -OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted") - - -def main(): - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - client = vision.ImageAnnotatorClient( - client_options={"api_key": API_KEY} - ) - - doc = fitz.open(str(PDF_PATH)) - page_count = len(doc) - print(f"Processing: {PDF_PATH.name} ({page_count} pages)\n") - - pages_text = [] - total_time = 0.0 - - for i in range(page_count): - page = doc[i] - pix = page.get_pixmap(dpi=300) - img_bytes = pix.tobytes("png") - - image = vision.Image(content=img_bytes) - - print(f" Page {i+1}/{page_count}...", end=" ", flush=True) - t0 = time.time() - response = client.document_text_detection( - image=image, - image_context={"language_hints": ["he"]} - ) - elapsed = time.time() - t0 - total_time += elapsed - - if response.error.message: - print(f"ERROR: {response.error.message}") - pages_text.append("") - continue - - text = response.full_text_annotation.text if response.full_text_annotation else "" - pages_text.append(text) - print(f"{len(text):,} chars, {elapsed:.1f}s") - - doc.close() - - full_text = "\n\n".join(pages_text) - out_file = OUTPUT_DIR / f"{PDF_PATH.stem}.md" - out_file.write_text(full_text, encoding="utf-8") - - print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s") - print(f"Saved: {out_file}") - - -if __name__ == "__main__": - main() diff --git a/scripts/extract_google_vision_single.py b/scripts/extract_google_vision_single.py deleted file mode 100644 index b06f3e3..0000000 --- a/scripts/extract_google_vision_single.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Extract text from a single PDF using Google Cloud Vision API.""" - -import sys -import time -from pathlib import Path - -import fitz -from google.cloud import vision - -API_KEY = "AIzaSyDZgUsxsy_FHkkREU7R_oQLJALU3_V26j8" -OUTPUT_DIR = Path("/home/chaim/legal-ai/data/cases/1130-25/documents/extracted") - -def main(): - pdf_path = Path(sys.argv[1]) - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - - client = vision.ImageAnnotatorClient(client_options={"api_key": API_KEY}) - doc = fitz.open(str(pdf_path)) - page_count = len(doc) - print(f"Processing: {pdf_path.name} ({page_count} pages)\n") - - pages_text = [] - total_time = 0.0 - - for i in range(page_count): - page = doc[i] - pix = page.get_pixmap(dpi=300) - img_bytes = pix.tobytes("png") - image = vision.Image(content=img_bytes) - - print(f" Page {i+1}/{page_count}...", end=" ", flush=True) - t0 = time.time() - response = client.document_text_detection(image=image, image_context={"language_hints": ["he"]}) - elapsed = time.time() - t0 - total_time += elapsed - - if response.error.message: - print(f"ERROR: {response.error.message}") - pages_text.append("") - continue - - text = response.full_text_annotation.text if response.full_text_annotation else "" - pages_text.append(text) - print(f"{len(text):,} chars, {elapsed:.1f}s") - - doc.close() - full_text = "\n\n".join(pages_text) - out_file = OUTPUT_DIR / f"{pdf_path.stem}.md" - out_file.write_text(full_text, encoding="utf-8") - print(f"\nTotal: {len(full_text):,} chars, {len(full_text.split()):,} words, {total_time:.1f}s") - print(f"Saved: {out_file}") - -if __name__ == "__main__": - main() diff --git a/scripts/test-search.py b/scripts/test-search.py deleted file mode 100644 index a8566d9..0000000 --- a/scripts/test-search.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python3 -"""Test semantic search functions.""" - -import asyncio -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent / "mcp-server" / "src")) - -from legal_mcp.services.db import search_similar_paragraphs, search_similar_case_law, search_precedents, init_schema -from legal_mcp.services.embeddings import embed_query - - -async def main(): - await init_schema() - - queries = [ - "טענות קנייניות רוב דרוש בעלי דירות רכוש משותף", - "חניה תנועה חניות מצוקת חניה", - "היטל השבחה שמאי מכריע התערבות", - ] - - for query in queries: - print(f'=== שאילתה: "{query}" ===') - emb = await embed_query(query) - results = await search_precedents(emb, limit=3) - - if not results: - print(" אין תוצאות") - else: - for i, r in enumerate(results): - score = r["score"] - cn = r["case_number"] - rtype = r["type"] - content = r["content"][:120].replace("\n", " ") - print(f" {i+1}. [{rtype}] {score:.3f} | {cn} | {content}") - print() - - -asyncio.run(main())