Auto-strip Nevo preambles and separate style analysis per appeal subtype

- Add strip_nevo_preamble() to extractor.py — auto-removes Nevo database
  headers (bibliography, legislation, mini-ratio) during training upload
- Add appeal_subtype column to style_patterns table — patterns are now
  stored per subtype instead of globally mixed
- Update clear_style_patterns() to support subtype-scoped deletion
- Pass appeal_subtype through analyze_corpus → store → upsert pipeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-15 14:03:06 +00:00
parent ba39707c70
commit 5dd24729e2
4 changed files with 65 additions and 18 deletions

View File

@@ -116,6 +116,7 @@ CREATE TABLE IF NOT EXISTS style_patterns (
frequency INTEGER DEFAULT 1, frequency INTEGER DEFAULT 1,
context TEXT DEFAULT '', context TEXT DEFAULT '',
examples JSONB DEFAULT '[]', examples JSONB DEFAULT '[]',
appeal_subtype TEXT DEFAULT '',
created_at TIMESTAMPTZ DEFAULT now() created_at TIMESTAMPTZ DEFAULT now()
); );
@@ -165,6 +166,9 @@ ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee'; ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
-- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר
ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
-- טבלת qa_results -- טבלת qa_results
CREATE TABLE IF NOT EXISTS qa_results ( CREATE TABLE IF NOT EXISTS qa_results (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
@@ -973,12 +977,14 @@ async def upsert_style_pattern(
pattern_text: str, pattern_text: str,
context: str = "", context: str = "",
examples: list[str] | None = None, examples: list[str] | None = None,
appeal_subtype: str = "",
) -> None: ) -> None:
pool = await get_pool() pool = await get_pool()
async with pool.acquire() as conn: async with pool.acquire() as conn:
existing = await conn.fetchrow( existing = await conn.fetchrow(
"SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2", "SELECT id, frequency FROM style_patterns "
pattern_type, pattern_text, "WHERE pattern_type = $1 AND pattern_text = $2 AND appeal_subtype = $3",
pattern_type, pattern_text, appeal_subtype,
) )
if existing: if existing:
await conn.execute( await conn.execute(
@@ -987,18 +993,27 @@ async def upsert_style_pattern(
) )
else: else:
await conn.execute( await conn.execute(
"""INSERT INTO style_patterns (pattern_type, pattern_text, context, examples) """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples, appeal_subtype)
VALUES ($1, $2, $3, $4)""", VALUES ($1, $2, $3, $4, $5)""",
pattern_type, pattern_text, context, pattern_type, pattern_text, context,
json.dumps(examples or []), json.dumps(examples or []),
appeal_subtype,
) )
async def clear_style_patterns() -> None: async def clear_style_patterns(appeal_subtype: str = "") -> None:
"""Delete all existing style patterns (used before re-analysis).""" """Delete style patterns, optionally filtered by appeal_subtype.
Empty appeal_subtype = delete ALL patterns.
"""
pool = await get_pool() pool = await get_pool()
async with pool.acquire() as conn: async with pool.acquire() as conn:
await conn.execute("DELETE FROM style_patterns") if appeal_subtype:
await conn.execute(
"DELETE FROM style_patterns WHERE appeal_subtype = $1", appeal_subtype
)
else:
await conn.execute("DELETE FROM style_patterns")
# ── Semantic Search (V2 — decision blocks & case law) ───────────── # ── Semantic Search (V2 — decision blocks & case law) ─────────────

View File

@@ -218,3 +218,30 @@ def _extract_rtf(path: Path) -> str:
"""Extract text from RTF file.""" """Extract text from RTF file."""
rtf_content = path.read_text(encoding="utf-8", errors="replace") rtf_content = path.read_text(encoding="utf-8", errors="replace")
return rtf_to_text(rtf_content) return rtf_to_text(rtf_content)
# ── Nevo preamble stripping ──────────────────────────────────────

# Section headers that mark a Nevo database preamble (bibliography,
# legislation, mini-ratio, ...). Seeing any of them near the top of a document
# is what triggers stripping.
_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",
                 "כתבי עת:", "הועתק מנבו")

# Phrases that, at the start of a line, are treated as the first line of the
# decision body itself.
_DECISION_START = re.compile(
    r"^(בפנינו|לפנינו|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן)",
    re.MULTILINE,
)

# Only this many leading characters are inspected for Nevo markers.
_NEVO_HEAD_CHARS = 400

# A decision start at or before this offset is never treated as the end of a
# preamble (there would be nothing meaningful to strip).
_MIN_PREAMBLE_CHARS = 50


def strip_nevo_preamble(text: str) -> str:
    """Remove Nevo database preamble (bibliography, legislation, mini-ratio) from decision text.

    The preamble is detected by looking for any of ``_NEVO_MARKERS`` in the
    first ``_NEVO_HEAD_CHARS`` characters. When one is found, everything before
    the first ``_DECISION_START`` line that follows the last such marker (and
    lies beyond ``_MIN_PREAMBLE_CHARS``) is dropped.

    Args:
        text: Full extracted text of a decision.

    Returns:
        The text starting at the detected decision body, or the original text
        unchanged if no preamble marker or no decision start is found.
    """
    if not text:
        return text

    head = text[:_NEVO_HEAD_CHARS]
    marker_ends = [
        head.rfind(marker) + len(marker)
        for marker in _NEVO_MARKERS
        if marker in head
    ]
    if not marker_ends:
        return text

    # Search only past the preamble markers. Searching from offset 0 would stop
    # at a decision-start phrase in the document title (e.g. the committee name
    # on the first line), and the preamble would then never be stripped.
    # With a compiled pattern, ``^`` still matches only at real line starts
    # even when ``pos`` is given, so a mid-line ``pos`` is safe.
    search_from = max(max(marker_ends), _MIN_PREAMBLE_CHARS + 1)
    m = _DECISION_START.search(text, search_from)
    if m is None:
        return text

    logger.debug("Stripped %d chars of Nevo preamble", m.start())
    return text[m.start():]

View File

@@ -134,8 +134,8 @@ async def analyze_corpus(appeal_subtype: str = "") -> dict:
if not rows: if not rows:
return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."} return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}
# Clear old patterns before re-analysis # Clear old patterns for this subtype (or all if unfiltered)
await db.clear_style_patterns() await db.clear_style_patterns(appeal_subtype)
# Calculate token budget # Calculate token budget
total_chars = sum(len(row["full_text"]) for row in rows) total_chars = sum(len(row["full_text"]) for row in rows)
@@ -147,12 +147,12 @@ async def analyze_corpus(appeal_subtype: str = "") -> dict:
) )
if estimated_tokens < MAX_INPUT_TOKENS: if estimated_tokens < MAX_INPUT_TOKENS:
return await _analyze_single_pass(rows) return await _analyze_single_pass(rows, appeal_subtype)
else: else:
return await _analyze_multi_pass(rows) return await _analyze_multi_pass(rows, appeal_subtype)
async def _analyze_single_pass(rows) -> dict: async def _analyze_single_pass(rows, appeal_subtype: str = "") -> dict:
"""Send all decisions in a single API call.""" """Send all decisions in a single API call."""
decisions_text = "" decisions_text = ""
for row in rows: for row in rows:
@@ -164,10 +164,10 @@ async def _analyze_single_pass(rows) -> dict:
timeout=claude_session.LONG_TIMEOUT, timeout=claude_session.LONG_TIMEOUT,
) )
return await _parse_and_store_patterns(raw, len(rows)) return await _parse_and_store_patterns(raw, len(rows), appeal_subtype)
async def _analyze_multi_pass(rows) -> dict: async def _analyze_multi_pass(rows, appeal_subtype: str = "") -> dict:
"""Analyze each decision individually, then synthesize patterns.""" """Analyze each decision individually, then synthesize patterns."""
all_patterns = [] all_patterns = []
@@ -197,7 +197,7 @@ async def _analyze_multi_pass(rows) -> dict:
timeout=claude_session.LONG_TIMEOUT, timeout=claude_session.LONG_TIMEOUT,
) )
return await _parse_and_store_patterns(raw, len(rows)) return await _parse_and_store_patterns(raw, len(rows), appeal_subtype)
def _extract_json(response_text: str) -> list | None: def _extract_json(response_text: str) -> list | None:
@@ -248,14 +248,16 @@ def _extract_json(response_text: str) -> list | None:
return None return None
async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> dict: async def _parse_and_store_patterns(
response_text: str, num_decisions: int, appeal_subtype: str = "",
) -> dict:
"""Parse Claude's response and store patterns in the database.""" """Parse Claude's response and store patterns in the database."""
patterns = _extract_json(response_text) patterns = _extract_json(response_text)
if patterns is None: if patterns is None:
return {"error": "Could not parse analysis results", "raw": response_text} return {"error": "Could not parse analysis results", "raw": response_text}
# Store patterns # Store patterns tagged by appeal_subtype
count = 0 count = 0
for pattern in patterns: for pattern in patterns:
await db.upsert_style_pattern( await db.upsert_style_pattern(
@@ -263,11 +265,13 @@ async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> d
pattern_text=pattern.get("text", ""), pattern_text=pattern.get("text", ""),
context=pattern.get("context", ""), context=pattern.get("context", ""),
examples=[pattern.get("example", "")], examples=[pattern.get("example", "")],
appeal_subtype=appeal_subtype,
) )
count += 1 count += 1
return { return {
"patterns_found": count, "patterns_found": count,
"decisions_analyzed": num_decisions, "decisions_analyzed": num_decisions,
"appeal_subtype": appeal_subtype or "all",
"pattern_types": list({p.get("type") for p in patterns}), "pattern_types": list({p.get("type") for p in patterns}),
} }

View File

@@ -152,8 +152,9 @@ async def document_upload_training(
if source.resolve() != dest.resolve(): if source.resolve() != dest.resolve():
shutil.copy2(str(source), str(dest)) shutil.copy2(str(source), str(dest))
# Extract text # Extract text and strip Nevo preamble
text, page_count = await extractor.extract_text(str(dest)) text, page_count = await extractor.extract_text(str(dest))
text = extractor.strip_nevo_preamble(text)
# Parse date # Parse date
d_date = None d_date = None