Files
legal-ai/scripts/.archive/backfill_pattern_frequency.py
Chaim 5c9a5d702a Clean up scripts/: archive 17, delete 5, add SCRIPTS.md registry
Active scripts (5): auto-sync-cases.sh, backup-db.sh, restore-db.sh,
notify.py, bidi_table.py

Archived (17): one-time migration/seeding scripts whose functionality
is now in MCP server or web API. Moved to scripts/.archive/

Deleted (5): zero-value scripts (duplicates, hardcoded single-case,
debug scripts)

Added scripts/SCRIPTS.md — registry of all scripts with purpose,
status, and what superseded them. CLAUDE.md updated with rule:
any script change requires SCRIPTS.md update.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 16:30:19 +00:00

164 lines
5.4 KiB
Python

"""Backfill style_patterns.frequency with real occurrence counts.

The analyzer currently stores frequency=1 for every pattern (it only extracts
unique patterns and does not count occurrences). This script scans the
full_text of every decision in style_corpus and sets each pattern's frequency
to the number of decisions that contain at least one fixed text segment
derived from the pattern_text template (placeholders, ellipses and
alternatives are taken apart first; nikud is ignored on both sides).

Run once after analysis, and again whenever new decisions are added.
"""
from __future__ import annotations
import asyncio
import os
import re
import sys
import unicodedata
from pathlib import Path
# Load env
def _load_env_file(env_path: Path) -> None:
    """Export ``KEY=VALUE`` lines from *env_path* into ``os.environ``.

    Variables that are already set in the process environment are left
    untouched (``setdefault``), so the file only supplies fallbacks.

    Args:
        env_path: Location of the dotenv-style file to read.

    A missing file is not an error: the variables may already be exported
    by the caller's shell. Blank lines, comment lines (``#``, with or
    without leading whitespace), lines without ``=`` and lines with an
    empty key are skipped. Surrounding whitespace and one layer of single
    or double quotes are removed from values.
    """
    try:
        content = env_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return

    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        # Split on the first "=" only, so values may themselves contain "=".
        key, _, value = entry.partition("=")
        key = key.strip()
        if not key:
            # An empty variable name is rejected by the OS; skip it.
            continue
        os.environ.setdefault(key, value.strip().strip('"').strip("'"))


_load_env_file(Path.home() / ".env")
# Put the MCP server's source directory first on sys.path so that the
# `legal_mcp` package below can be imported. The path is hard-coded.
sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
# This import has to follow the sys.path change above, which is why the
# "module level import not at top of file" lint (E402) is silenced.
from legal_mcp.services import db as db_mod # noqa: E402
def _strip_nikud(text: str) -> str:
    """Return *text* with all Unicode combining marks dropped.

    The input is decomposed (NFD) so that every mark, Hebrew nikud
    included, becomes a separate code point; code points whose combining
    class is non-zero are then discarded. The result is left in its
    decomposed form.
    """
    decomposed = unicodedata.normalize("NFD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def _extract_searchable_variants(pattern_text: str) -> list[str]:
    """Extract searchable substrings from a pattern template.

    The analyzer stores patterns as templates with:
    - Placeholders in [brackets]: "בפנינו ערר על החלטת [הגוף] מיום [תאריך]"
    - Alternatives separated by / : "נפנה ל... / ראה והשווה / נפנה להחלטה"
    - Ellipsis (``...`` or the single character U+2026) for variable parts

    Args:
        pattern_text: The template text of one pattern.

    Returns:
        One concrete substring per alternative, in order of first
        appearance and without duplicates. For each alternative this is
        its longest fixed segment of at least 4 characters; when no
        segment is that long, the whole alternative (variable parts
        replaced by spaces) is used if it has at least 4 characters.
        Alternatives that yield nothing usable are dropped, so the list
        may be empty.
    """
    variants: list[str] = []

    # Alternatives are separated by "/" or by the Hebrew word "או" ("or").
    for alt in re.split(r"\s*/\s*|\s+או\s+", pattern_text):
        alt = alt.strip()
        if not alt:
            continue

        # Mark every variable part with the "|" separator.
        alt = re.sub(r"\[[^\]]*\]", "|", alt)   # [placeholder]
        alt = re.sub(r"\.{2,}", "|", alt)       # ASCII ellipsis: .. / ...
        # Unicode ellipsis, written as an escape so the character cannot be
        # lost: replacing an *empty* string would put "|" between every
        # character and break all the segment logic below.
        alt = alt.replace("\u2026", "|")

        # The pieces between separators are the fixed text of the template.
        segments = [s.strip(" ,.:;\"'") for s in alt.split("|")]
        good = [s for s in segments if len(s) >= 4]
        if good:
            # Longest fixed segment = most specific thing to search for.
            variants.append(max(good, key=len))
            continue

        # Fallback: use the whole cleaned alternative.
        stripped = alt.replace("|", " ").strip()
        if len(stripped) >= 4:
            variants.append(stripped)

    # dict keys keep insertion order, giving an order-preserving dedup.
    return list(dict.fromkeys(variants))
def _count_decisions_containing(variants: list[str], normalized_decisions: list) -> int:
    """Return the number of decisions whose text contains at least one variant.

    Args:
        variants: Substrings to look for.
        normalized_decisions: 3-item sequences whose last item is the
            decision text to search; the first two items are ignored.

    Each decision is counted at most once, however many variants match.
    """
    return sum(
        1
        for _id, _number, body in normalized_decisions
        if any(needle in body for needle in variants)
    )
async def main() -> int:
    """Recompute ``style_patterns.frequency`` for every pattern and report.

    Steps:
    1. Load every ``style_corpus`` row with non-empty ``full_text`` and
       every ``style_patterns`` row.
    2. For each pattern, count the decisions whose nikud-stripped text
       contains at least one searchable variant of the pattern. Patterns
       with missing/very short text, or with no usable variant, get 0.
    3. Write all counts back in one ``executemany`` call.
    4. Print the 15 most frequent patterns and the frequency distribution.

    Returns:
        0, which the caller passes to ``sys.exit``.
    """
    # NOTE(review): the pool obtained here is never closed in this script;
    # confirm whether db_mod offers a shutdown hook that should be called.
    pool = await db_mod.get_pool()
    async with pool.acquire() as conn:
        decisions = await conn.fetch(
            "SELECT id, decision_number, full_text FROM style_corpus "
            "WHERE full_text IS NOT NULL AND length(full_text) > 0"
        )
        patterns = await conn.fetch(
            "SELECT id, pattern_text, pattern_type FROM style_patterns"
        )
        print(f"Scanning {len(patterns)} patterns across {len(decisions)} decisions...")

        # Normalize decisions once, not once per pattern.
        normalized_decisions = [
            (d["id"], d["decision_number"], _strip_nikud(d["full_text"]))
            for d in decisions
        ]

        # (frequency, id) pairs, in the parameter order of the UPDATE below.
        updates = []
        for p in patterns:
            pattern_text = p["pattern_text"]
            count = 0
            if pattern_text and len(pattern_text) >= 3:
                variants = _extract_searchable_variants(_strip_nikud(pattern_text))
                if variants:
                    count = _count_decisions_containing(variants, normalized_decisions)
            updates.append((count, p["id"]))

        await conn.executemany(
            "UPDATE style_patterns SET frequency = $1 WHERE id = $2",
            updates,
        )

        # Show distribution
        rows = await conn.fetch(
            "SELECT pattern_type, pattern_text, frequency "
            "FROM style_patterns "
            "ORDER BY frequency DESC "
            "LIMIT 15"
        )
        print("\nTop 15 patterns by real frequency:")
        for r in rows:
            # NULL columns come back as None, which supports neither a
            # format spec nor slicing; print them as empty text instead.
            pattern_type = r["pattern_type"] or ""
            pattern_text = r["pattern_text"] or ""
            print(f" {r['frequency']:>3} [{pattern_type:<22}] {pattern_text[:90]}")

        dist = await conn.fetch(
            "SELECT frequency, count(*) FROM style_patterns "
            "GROUP BY frequency ORDER BY frequency DESC"
        )
        print("\nFrequency distribution:")
        for r in dist:
            # Keep a visible separator between the two numbers; without it
            # "frequency=  5" followed by "12" reads as 512.
            print(f" frequency={r['frequency']:>3} -> {r['count']} patterns")
    return 0
# Script entry point: run the async main() to completion and use its
# return value as the process exit status.
if __name__ == "__main__":
    exit_status = asyncio.run(main())
    sys.exit(exit_status)