diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index db90819..4056790 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -116,6 +116,7 @@ CREATE TABLE IF NOT EXISTS style_patterns ( frequency INTEGER DEFAULT 1, context TEXT DEFAULT '', examples JSONB DEFAULT '[]', + appeal_subtype TEXT DEFAULT '', created_at TIMESTAMPTZ DEFAULT now() ); @@ -165,6 +166,9 @@ ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee'; ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; +-- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר +ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT ''; + -- טבלת qa_results CREATE TABLE IF NOT EXISTS qa_results ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), @@ -973,12 +977,14 @@ async def upsert_style_pattern( pattern_text: str, context: str = "", examples: list[str] | None = None, + appeal_subtype: str = "", ) -> None: pool = await get_pool() async with pool.acquire() as conn: existing = await conn.fetchrow( - "SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2", - pattern_type, pattern_text, + "SELECT id, frequency FROM style_patterns " + "WHERE pattern_type = $1 AND pattern_text = $2 AND appeal_subtype = $3", + pattern_type, pattern_text, appeal_subtype, ) if existing: await conn.execute( @@ -987,18 +993,27 @@ async def upsert_style_pattern( ) else: await conn.execute( - """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples) - VALUES ($1, $2, $3, $4)""", + """INSERT INTO style_patterns (pattern_type, pattern_text, context, examples, appeal_subtype) + VALUES ($1, $2, $3, $4, $5)""", pattern_type, pattern_text, context, json.dumps(examples or []), + appeal_subtype, ) -async def 
# ── services/db.py ───────────────────────────────────────────────


async def clear_style_patterns(appeal_subtype: str = "") -> None:
    """Delete stored style patterns, optionally limited to one appeal subtype.

    Args:
        appeal_subtype: When non-empty, only rows whose ``appeal_subtype``
            column equals this value are deleted. When empty (the default),
            every row of ``style_patterns`` is deleted.
    """
    pool = await get_pool()
    async with pool.acquire() as conn:
        if appeal_subtype:
            await conn.execute(
                "DELETE FROM style_patterns WHERE appeal_subtype = $1", appeal_subtype
            )
        else:
            await conn.execute("DELETE FROM style_patterns")


# ── services/extractor.py: Nevo preamble stripping ───────────────

# Headings that the Nevo database prepends to a decision (literature,
# legislation cited, mini-ratio, judgments cited, journals, "copied from Nevo").
_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",
                 "כתבי עת:", "הועתק מנבו")

# Phrases that open the decision body; each must sit at the start of a line.
_DECISION_START = re.compile(
    r"^(בפנינו|לפנינו|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן)",
    re.MULTILINE,
)

# Only this many leading characters are inspected for Nevo markers.
_NEVO_HEAD_CHARS = 400

# A decision start at or before this offset is not treated as following a preamble.
_MIN_PREAMBLE_CHARS = 50


def strip_nevo_preamble(text: str) -> str:
    """Remove a leading Nevo database preamble from decision text.

    A preamble is assumed when any of ``_NEVO_MARKERS`` occurs within the
    first ``_NEVO_HEAD_CHARS`` characters. The text is then cut at the first
    ``_DECISION_START`` phrase located *after* the last marker found in that
    window. Searching from the markers onward (rather than from offset 0)
    keeps a title line that precedes the preamble, such as the committee
    name, from being mistaken for the start of the decision body.

    Args:
        text: Extracted decision text, possibly prefixed by a Nevo preamble.

    Returns:
        The text from the detected decision start onward, or ``text``
        unchanged when no marker is found in the head window, no decision
        start follows the markers, or the decision start lies at or before
        offset ``_MIN_PREAMBLE_CHARS``.
    """
    head = text[:_NEVO_HEAD_CHARS]
    marker_ends = [
        head.rfind(marker) + len(marker)
        for marker in _NEVO_MARKERS
        if marker in head
    ]
    if not marker_ends:
        return text

    # With a start position, "^" still only matches at real line beginnings,
    # so a phrase in the middle of the marker's own line is not picked up.
    match = _DECISION_START.search(text, max(marker_ends))
    if match is None or match.start() <= _MIN_PREAMBLE_CHARS:
        return text

    logger.debug("Stripped %d chars of Nevo preamble", match.start())
    return text[match.start():]
_analyze_multi_pass(rows) -> dict: timeout=claude_session.LONG_TIMEOUT, ) - return await _parse_and_store_patterns(raw, len(rows)) + return await _parse_and_store_patterns(raw, len(rows), appeal_subtype) def _extract_json(response_text: str) -> list | None: @@ -248,14 +248,16 @@ def _extract_json(response_text: str) -> list | None: return None -async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> dict: +async def _parse_and_store_patterns( + response_text: str, num_decisions: int, appeal_subtype: str = "", +) -> dict: """Parse Claude's response and store patterns in the database.""" patterns = _extract_json(response_text) if patterns is None: return {"error": "Could not parse analysis results", "raw": response_text} - # Store patterns + # Store patterns tagged by appeal_subtype count = 0 for pattern in patterns: await db.upsert_style_pattern( @@ -263,11 +265,13 @@ async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> d pattern_text=pattern.get("text", ""), context=pattern.get("context", ""), examples=[pattern.get("example", "")], + appeal_subtype=appeal_subtype, ) count += 1 return { "patterns_found": count, "decisions_analyzed": num_decisions, + "appeal_subtype": appeal_subtype or "all", "pattern_types": list({p.get("type") for p in patterns}), } diff --git a/mcp-server/src/legal_mcp/tools/documents.py b/mcp-server/src/legal_mcp/tools/documents.py index e23a930..d984a80 100644 --- a/mcp-server/src/legal_mcp/tools/documents.py +++ b/mcp-server/src/legal_mcp/tools/documents.py @@ -152,8 +152,9 @@ async def document_upload_training( if source.resolve() != dest.resolve(): shutil.copy2(str(source), str(dest)) - # Extract text + # Extract text and strip Nevo preamble text, page_count = await extractor.extract_text(str(dest)) + text = extractor.strip_nevo_preamble(text) # Parse date d_date = None