Auto-strip Nevo preambles and separate style analysis per appeal subtype

- Add strip_nevo_preamble() to extractor.py — auto-removes Nevo database headers (bibliography, legislation, mini-ratio) during training upload
- Add appeal_subtype column to style_patterns table — patterns are now stored per subtype instead of globally mixed
- Update clear_style_patterns() to support subtype-scoped deletion
- Pass appeal_subtype through analyze_corpus → store → upsert pipeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-15 14:03:06 +00:00
parent ba39707c70
commit 5dd24729e2
4 changed files with 65 additions and 18 deletions
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -218,3 +218,30 @@ def _extract_rtf(path: Path) -> str:
    """Extract text from RTF file."""
    rtf_content = path.read_text(encoding="utf-8", errors="replace")
    return rtf_to_text(rtf_content)
+
+
+# ── Nevo preamble stripping ──────────────────────────────────────
+# Hebrew section labels; strip_nevo_preamble() treats any of them near the top as a sign of a preamble.
+_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",
+                 "כתבי עת:", "הועתק מנבו")
+# Hebrew phrases that, when they begin a line, are taken as the start of the decision body itself.
+_DECISION_START = re.compile(
+    r"^(בפנינו|לפנינו|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן)",
+    re.MULTILINE,  # "^" anchors at the start of every line, not only the start of the text
+)
+
+
+def strip_nevo_preamble(text: str) -> str:
+    """Cut a leading Nevo database preamble (bibliography, legislation, mini-ratio) off decision text.
+
+    Returns text from the first _DECISION_START match onward, or text unchanged when no cut applies.
+    """
+    head = text[:400]  # Nevo markers are only looked for in the first 400 characters
+    if not any(marker in head for marker in _NEVO_MARKERS):
+        return text  # no marker near the top: nothing is stripped
+    m = _DECISION_START.search(text)  # NOTE(review): scans the whole text, not just the head
+    if m and m.start() > 50:  # a match at offset 50 or earlier is ignored and nothing is cut
+        stripped = text[m.start():]
+        logger.debug("Stripped %d chars of Nevo preamble", m.start())
+        return stripped
+    return text  # marker present but no usable decision-opening phrase: leave text as is