"""Backfill style_patterns.frequency with real occurrence counts.

The analyzer currently stores frequency=1 for every pattern (it only extracts
unique patterns, doesn't count occurrences). This script scans the full_text
of every decision in style_corpus and sets each pattern's frequency to the
number of decisions that contain at least one fixed fragment ("variant") of
the pattern template. Variants are the literal pieces left after removing
[placeholders], ellipses and "/"-separated alternatives; both sides are
compared with Unicode combining marks (nikud) stripped.

Run once after analysis, and again whenever new decisions are added.
"""

from __future__ import annotations

import asyncio
import os
import re
import sys
import unicodedata
from pathlib import Path


def _load_env_file(env_path: Path) -> None:
    """Copy KEY=VALUE lines from *env_path* into ``os.environ``.

    Values already present in the process environment are kept
    (``setdefault``), so the file only supplies fallbacks. For that reason a
    missing file is skipped instead of aborting the script. Surrounding
    whitespace and one layer of double/single quotes are removed from values.
    """
    if not env_path.is_file():
        return
    for line in env_path.read_text().splitlines():
        # Ignore lines without an assignment and comment lines (also when
        # the comment is indented).
        if "=" not in line or line.lstrip().startswith("#"):
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        if not key:
            # "=value" has no variable name; os.environ rejects empty keys.
            continue
        os.environ.setdefault(key, value.strip().strip('"').strip("'"))


# Load env (must happen before the project import below).
_load_env_file(Path.home() / ".env")

sys.path.insert(0, "/home/chaim/legal-ai/mcp-server/src")
from legal_mcp.services import db as db_mod  # noqa: E402

# Alternatives inside a template are separated by "/" or the Hebrew word "או".
_ALTERNATIVE_SPLIT_RE = re.compile(r"\s*/\s*|\s+או\s+")
# "[placeholder]" spans inside a template.
_PLACEHOLDER_RE = re.compile(r"\[[^\]]*\]")
# ASCII ellipsis: two or more consecutive dots.
_ELLIPSIS_RE = re.compile(r"\.{2,}")
# Punctuation/whitespace trimmed from both ends of each fixed segment.
_SEGMENT_TRIM_CHARS = " ,.:;\"'"
# Shorter fragments are considered too unspecific to search for.
_MIN_VARIANT_LEN = 4


def _strip_nikud(text: str) -> str:
    """Remove Hebrew combining marks (nikud) for robust matching.

    The text is NFD-decomposed and every character with a non-zero combining
    class is dropped, so other combining diacritics are removed as well. The
    result is left in decomposed form; apply this to both sides of a
    comparison.
    """
    return "".join(
        c for c in unicodedata.normalize("NFD", text) if not unicodedata.combining(c)
    )


def _extract_searchable_variants(pattern_text: str) -> list[str]:
    """Extract searchable substrings from a pattern template.

    The analyzer stores patterns as templates with:
      - Placeholders in [brackets]: "בפנינו ערר על החלטת [הגוף] מיום [תאריך]"
      - Alternatives separated by / : "נפנה ל... / ראה והשווה / נפנה להחלטה"
      - Ellipsis ... for variable parts

    For each alternative the longest fixed segment of at least 4 characters
    is returned, so that matching is specific enough to be meaningful but
    still flexible. The result is de-duplicated, keeps first-seen order, and
    is empty when no alternative yields a usable fragment.
    """
    variants: list[str] = []
    for alt in _ALTERNATIVE_SPLIT_RE.split(pattern_text):
        alt = alt.strip()
        if not alt:
            continue

        # Turn every variable part (placeholder, "...", "…") into a "|"
        # separator, then cut the alternative into its fixed segments.
        alt = _PLACEHOLDER_RE.sub("|", alt)
        alt = _ELLIPSIS_RE.sub("|", alt)
        alt = alt.replace("…", "|")
        segments = [s.strip(_SEGMENT_TRIM_CHARS) for s in alt.split("|")]

        good = [s for s in segments if len(s) >= _MIN_VARIANT_LEN]
        if good:
            # Use the longest segment as the key variant for this alternative.
            variants.append(max(good, key=len))
        elif alt.strip():
            # Fallback: use the whole cleaned alternative.
            # NOTE(review): separators become spaces here, so such a variant
            # only matches text with exactly the same spacing.
            stripped = alt.replace("|", " ").strip()
            if len(stripped) >= _MIN_VARIANT_LEN:
                variants.append(stripped)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(variants))


def _count_decisions_containing(variants: list[str], normalized_decisions: list) -> int:
    """Count how many decisions contain ANY of the variants.

    *normalized_decisions* is an iterable of 3-item records whose last item
    is the text to search; the first two items are ignored. Each decision is
    counted at most once. Returns 0 when *variants* is empty.
    """
    return sum(
        1
        for _, _, text in normalized_decisions
        if any(v in text for v in variants)
    )


async def main() -> int:
    """Recompute ``style_patterns.frequency`` for every pattern and print a summary.

    Loads all decisions with non-empty ``full_text`` and all patterns, counts
    for each pattern the decisions that contain one of its variants, writes
    the counts back, then prints the 15 most frequent patterns and the
    frequency distribution. Patterns without usable text get frequency 0.

    Returns:
        0, used as the process exit status.
    """
    # presumably an asyncpg-style pool (acquire/fetch/executemany, $n params)
    pool = await db_mod.get_pool()
    async with pool.acquire() as conn:
        decisions = await conn.fetch(
            "SELECT id, decision_number, full_text FROM style_corpus "
            "WHERE full_text IS NOT NULL AND length(full_text) > 0"
        )
        patterns = await conn.fetch(
            "SELECT id, pattern_text, pattern_type FROM style_patterns"
        )
        print(f"Scanning {len(patterns)} patterns across {len(decisions)} decisions...")

        # Normalize decisions once; every pattern is matched against these.
        normalized_decisions = [
            (d["id"], d["decision_number"], _strip_nikud(d["full_text"]))
            for d in decisions
        ]

        updates = []
        for p in patterns:
            pattern_text = p["pattern_text"]
            count = 0
            # Missing or very short pattern text keeps frequency 0.
            if pattern_text and len(pattern_text) >= 3:
                variants = _extract_searchable_variants(_strip_nikud(pattern_text))
                if variants:
                    count = _count_decisions_containing(variants, normalized_decisions)
            updates.append((count, p["id"]))

        await conn.executemany(
            "UPDATE style_patterns SET frequency = $1 WHERE id = $2",
            updates,
        )

        # Show distribution
        rows = await conn.fetch(
            "SELECT pattern_type, pattern_text, frequency "
            "FROM style_patterns "
            "ORDER BY frequency DESC "
            "LIMIT 15"
        )
        print("\nTop 15 patterns by real frequency:")
        for r in rows:
            # NULL columns would break the width format / slice below; the
            # counts are already written at this point, so print blanks.
            pattern_type = r["pattern_type"] or ""
            pattern_text = r["pattern_text"] or ""
            print(f" {r['frequency']:>3} [{pattern_type:<22}] {pattern_text[:90]}")

        dist = await conn.fetch(
            "SELECT frequency, count(*) FROM style_patterns "
            "GROUP BY frequency ORDER BY frequency DESC"
        )
        print("\nFrequency distribution:")
        for r in dist:
            print(f" frequency={r['frequency']:>3} → {r['count']} patterns")

    return 0


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))