Auto-strip Nevo preambles and separate style analysis per appeal subtype

- Add strip_nevo_preamble() to extractor.py: automatically removes Nevo database headers (bibliography, legislation, mini-ratio) during training upload.
- Add an appeal_subtype column to the style_patterns table: patterns are now stored per subtype instead of being mixed globally.
- Update clear_style_patterns() to support subtype-scoped deletion.
- Pass appeal_subtype through the analyze_corpus → store → upsert pipeline.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:03:06 +00:00
parent ba39707c70
commit 5dd24729e2
4 changed files with 65 additions and 18 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -116,6 +116,7 @@ CREATE TABLE IF NOT EXISTS style_patterns (
    frequency INTEGER DEFAULT 1,
    context TEXT DEFAULT '',
    examples JSONB DEFAULT '[]',
+    appeal_subtype TEXT DEFAULT '',
    created_at TIMESTAMPTZ DEFAULT now()
 );

@@ -165,6 +166,9 @@ ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
 ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
 ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';

+-- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר
+ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
+
 -- טבלת qa_results
 CREATE TABLE IF NOT EXISTS qa_results (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
@@ -973,12 +977,14 @@ async def upsert_style_pattern(
    pattern_text: str,
    context: str = "",
    examples: list[str] | None = None,
+    appeal_subtype: str = "",
 ) -> None:
    pool = await get_pool()
    async with pool.acquire() as conn:
        existing = await conn.fetchrow(
-            "SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2",
-            pattern_type, pattern_text,
+            "SELECT id, frequency FROM style_patterns "
+            "WHERE pattern_type = $1 AND pattern_text = $2 AND appeal_subtype = $3",
+            pattern_type, pattern_text, appeal_subtype,
        )
        if existing:
            await conn.execute(
@@ -987,18 +993,27 @@ async def upsert_style_pattern(
            )
        else:
            await conn.execute(
-                """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples)
-                   VALUES ($1, $2, $3, $4)""",
+                """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples, appeal_subtype)
+                   VALUES ($1, $2, $3, $4, $5)""",
                pattern_type, pattern_text, context,
                json.dumps(examples or []),
+                appeal_subtype,
            )


-async def clear_style_patterns() -> None:
-    """Delete all existing style patterns (used before re-analysis)."""
+async def clear_style_patterns(appeal_subtype: str = "") -> None:
+    """Delete style patterns, optionally filtered by appeal_subtype.
+
+    Empty appeal_subtype = delete ALL patterns.
+    """
    pool = await get_pool()
    async with pool.acquire() as conn:
-        await conn.execute("DELETE FROM style_patterns")
+        if appeal_subtype:
+            await conn.execute(
+                "DELETE FROM style_patterns WHERE appeal_subtype = $1", appeal_subtype
+            )
+        else:
+            await conn.execute("DELETE FROM style_patterns")


 # ── Semantic Search (V2 — decision blocks & case law) ─────────────
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -218,3 +218,30 @@ def _extract_rtf(path: Path) -> str:
    """Extract text from RTF file."""
    rtf_content = path.read_text(encoding="utf-8", errors="replace")
    return rtf_to_text(rtf_content)
+
+
+# ── Nevo preamble stripping ──────────────────────────────────────
+
+_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",
+                 "כתבי עת:", "הועתק מנבו")
+
+_DECISION_START = re.compile(
+    r"^(בפנינו|לפנינו|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן)",
+    re.MULTILINE,
+)
+
+
+def strip_nevo_preamble(text: str) -> str:
+    """Remove Nevo database preamble (bibliography, legislation, mini-ratio) from decision text.
+
+    Returns the original text unchanged if no preamble is detected.
+    """
+    head = text[:400]
+    if not any(marker in head for marker in _NEVO_MARKERS):
+        return text
+    m = _DECISION_START.search(text)
+    if m and m.start() > 50:
+        stripped = text[m.start():]
+        logger.debug("Stripped %d chars of Nevo preamble", m.start())
+        return stripped
+    return text
--- a/mcp-server/src/legal_mcp/services/style_analyzer.py
+++ b/mcp-server/src/legal_mcp/services/style_analyzer.py
@@ -134,8 +134,8 @@ async def analyze_corpus(appeal_subtype: str = "") -> dict:
    if not rows:
        return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}

-    # Clear old patterns before re-analysis
-    await db.clear_style_patterns()
+    # Clear old patterns for this subtype (or all if unfiltered)
+    await db.clear_style_patterns(appeal_subtype)

    # Calculate token budget
    total_chars = sum(len(row["full_text"]) for row in rows)
@@ -147,12 +147,12 @@ async def analyze_corpus(appeal_subtype: str = "") -> dict:
    )

    if estimated_tokens < MAX_INPUT_TOKENS:
-        return await _analyze_single_pass(rows)
+        return await _analyze_single_pass(rows, appeal_subtype)
    else:
-        return await _analyze_multi_pass(rows)
+        return await _analyze_multi_pass(rows, appeal_subtype)


-async def _analyze_single_pass(rows) -> dict:
+async def _analyze_single_pass(rows, appeal_subtype: str = "") -> dict:
    """Send all decisions in a single API call."""
    decisions_text = ""
    for row in rows:
@@ -164,10 +164,10 @@ async def _analyze_single_pass(rows) -> dict:
        timeout=claude_session.LONG_TIMEOUT,
    )

-    return await _parse_and_store_patterns(raw, len(rows))
+    return await _parse_and_store_patterns(raw, len(rows), appeal_subtype)


-async def _analyze_multi_pass(rows) -> dict:
+async def _analyze_multi_pass(rows, appeal_subtype: str = "") -> dict:
    """Analyze each decision individually, then synthesize patterns."""
    all_patterns = []

@@ -197,7 +197,7 @@ async def _analyze_multi_pass(rows) -> dict:
        timeout=claude_session.LONG_TIMEOUT,
    )

-    return await _parse_and_store_patterns(raw, len(rows))
+    return await _parse_and_store_patterns(raw, len(rows), appeal_subtype)


 def _extract_json(response_text: str) -> list | None:
@@ -248,14 +248,16 @@ def _extract_json(response_text: str) -> list | None:
    return None


-async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> dict:
+async def _parse_and_store_patterns(
+    response_text: str, num_decisions: int, appeal_subtype: str = "",
+) -> dict:
    """Parse Claude's response and store patterns in the database."""
    patterns = _extract_json(response_text)

    if patterns is None:
        return {"error": "Could not parse analysis results", "raw": response_text}

-    # Store patterns
+    # Store patterns tagged by appeal_subtype
    count = 0
    for pattern in patterns:
        await db.upsert_style_pattern(
@@ -263,11 +265,13 @@ async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> d
            pattern_text=pattern.get("text", ""),
            context=pattern.get("context", ""),
            examples=[pattern.get("example", "")],
+            appeal_subtype=appeal_subtype,
        )
        count += 1

    return {
        "patterns_found": count,
        "decisions_analyzed": num_decisions,
+        "appeal_subtype": appeal_subtype or "all",
        "pattern_types": list({p.get("type") for p in patterns}),
    }
--- a/mcp-server/src/legal_mcp/tools/documents.py
+++ b/mcp-server/src/legal_mcp/tools/documents.py
@@ -152,8 +152,9 @@ async def document_upload_training(
    if source.resolve() != dest.resolve():
        shutil.copy2(str(source), str(dest))

-    # Extract text
+    # Extract text and strip Nevo preamble
    text, page_count = await extractor.extract_text(str(dest))
+    text = extractor.strip_nevo_preamble(text)

    # Parse date
    d_date = None