Auto-strip Nevo preambles and separate style analysis per appeal subtype
- Add strip_nevo_preamble() to extractor.py — auto-removes Nevo database headers (bibliography, legislation, mini-ratio) during training upload
- Add appeal_subtype column to style_patterns table — patterns are now stored per subtype instead of globally mixed
- Update clear_style_patterns() to support subtype-scoped deletion
- Pass appeal_subtype through analyze_corpus → store → upsert pipeline

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -116,6 +116,7 @@ CREATE TABLE IF NOT EXISTS style_patterns (
|
|||||||
frequency INTEGER DEFAULT 1,
|
frequency INTEGER DEFAULT 1,
|
||||||
context TEXT DEFAULT '',
|
context TEXT DEFAULT '',
|
||||||
examples JSONB DEFAULT '[]',
|
examples JSONB DEFAULT '[]',
|
||||||
|
appeal_subtype TEXT DEFAULT '',
|
||||||
created_at TIMESTAMPTZ DEFAULT now()
|
created_at TIMESTAMPTZ DEFAULT now()
|
||||||
);
|
);
|
||||||
|
|
||||||
@@ -165,6 +166,9 @@ ALTER TABLE cases ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
|||||||
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
|
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT 'appeals_committee';
|
||||||
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
ALTER TABLE style_corpus ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
||||||
|
|
||||||
|
-- הרחבת style_patterns עם appeal_subtype לניתוח סגנון נפרד לכל סוג ערר
|
||||||
|
ALTER TABLE style_patterns ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
||||||
|
|
||||||
-- טבלת qa_results
|
-- טבלת qa_results
|
||||||
CREATE TABLE IF NOT EXISTS qa_results (
|
CREATE TABLE IF NOT EXISTS qa_results (
|
||||||
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||||||
@@ -973,12 +977,14 @@ async def upsert_style_pattern(
|
|||||||
pattern_text: str,
|
pattern_text: str,
|
||||||
context: str = "",
|
context: str = "",
|
||||||
examples: list[str] | None = None,
|
examples: list[str] | None = None,
|
||||||
|
appeal_subtype: str = "",
|
||||||
) -> None:
|
) -> None:
|
||||||
pool = await get_pool()
|
pool = await get_pool()
|
||||||
async with pool.acquire() as conn:
|
async with pool.acquire() as conn:
|
||||||
existing = await conn.fetchrow(
|
existing = await conn.fetchrow(
|
||||||
"SELECT id, frequency FROM style_patterns WHERE pattern_type = $1 AND pattern_text = $2",
|
"SELECT id, frequency FROM style_patterns "
|
||||||
pattern_type, pattern_text,
|
"WHERE pattern_type = $1 AND pattern_text = $2 AND appeal_subtype = $3",
|
||||||
|
pattern_type, pattern_text, appeal_subtype,
|
||||||
)
|
)
|
||||||
if existing:
|
if existing:
|
||||||
await conn.execute(
|
await conn.execute(
|
||||||
@@ -987,18 +993,27 @@ async def upsert_style_pattern(
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
await conn.execute(
|
await conn.execute(
|
||||||
"""INSERT INTO style_patterns (pattern_type, pattern_text, context, examples)
|
"""INSERT INTO style_patterns (pattern_type, pattern_text, context, examples, appeal_subtype)
|
||||||
VALUES ($1, $2, $3, $4)""",
|
VALUES ($1, $2, $3, $4, $5)""",
|
||||||
pattern_type, pattern_text, context,
|
pattern_type, pattern_text, context,
|
||||||
json.dumps(examples or []),
|
json.dumps(examples or []),
|
||||||
|
appeal_subtype,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def clear_style_patterns() -> None:
|
async def clear_style_patterns(appeal_subtype: str = "") -> None:
|
||||||
"""Delete all existing style patterns (used before re-analysis)."""
|
"""Delete style patterns, optionally filtered by appeal_subtype.
|
||||||
|
|
||||||
|
Empty appeal_subtype = delete ALL patterns.
|
||||||
|
"""
|
||||||
pool = await get_pool()
|
pool = await get_pool()
|
||||||
async with pool.acquire() as conn:
|
async with pool.acquire() as conn:
|
||||||
await conn.execute("DELETE FROM style_patterns")
|
if appeal_subtype:
|
||||||
|
await conn.execute(
|
||||||
|
"DELETE FROM style_patterns WHERE appeal_subtype = $1", appeal_subtype
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
await conn.execute("DELETE FROM style_patterns")
|
||||||
|
|
||||||
|
|
||||||
# ── Semantic Search (V2 — decision blocks & case law) ─────────────
|
# ── Semantic Search (V2 — decision blocks & case law) ─────────────
|
||||||
|
|||||||
@@ -218,3 +218,30 @@ def _extract_rtf(path: Path) -> str:
|
|||||||
"""Extract text from RTF file."""
|
"""Extract text from RTF file."""
|
||||||
rtf_content = path.read_text(encoding="utf-8", errors="replace")
|
rtf_content = path.read_text(encoding="utf-8", errors="replace")
|
||||||
return rtf_to_text(rtf_content)
|
return rtf_to_text(rtf_content)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Nevo preamble stripping ──────────────────────────────────────
|
||||||
|
|
||||||
|
_NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו:", "פסקי דין שאוזכרו:",
|
||||||
|
"כתבי עת:", "הועתק מנבו")
|
||||||
|
|
||||||
|
_DECISION_START = re.compile(
|
||||||
|
r"^(בפנינו|לפנינו|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן)",
|
||||||
|
re.MULTILINE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_nevo_preamble(text: str) -> str:
|
||||||
|
"""Remove Nevo database preamble (bibliography, legislation, mini-ratio) from decision text.
|
||||||
|
|
||||||
|
Returns the original text unchanged if no preamble is detected.
|
||||||
|
"""
|
||||||
|
head = text[:400]
|
||||||
|
if not any(marker in head for marker in _NEVO_MARKERS):
|
||||||
|
return text
|
||||||
|
m = _DECISION_START.search(text)
|
||||||
|
if m and m.start() > 50:
|
||||||
|
stripped = text[m.start():]
|
||||||
|
logger.debug("Stripped %d chars of Nevo preamble", m.start())
|
||||||
|
return stripped
|
||||||
|
return text
|
||||||
|
|||||||
@@ -134,8 +134,8 @@ async def analyze_corpus(appeal_subtype: str = "") -> dict:
|
|||||||
if not rows:
|
if not rows:
|
||||||
return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}
|
return {"error": "אין החלטות בקורפוס. העלה החלטות קודמות תחילה."}
|
||||||
|
|
||||||
# Clear old patterns before re-analysis
|
# Clear old patterns for this subtype (or all if unfiltered)
|
||||||
await db.clear_style_patterns()
|
await db.clear_style_patterns(appeal_subtype)
|
||||||
|
|
||||||
# Calculate token budget
|
# Calculate token budget
|
||||||
total_chars = sum(len(row["full_text"]) for row in rows)
|
total_chars = sum(len(row["full_text"]) for row in rows)
|
||||||
@@ -147,12 +147,12 @@ async def analyze_corpus(appeal_subtype: str = "") -> dict:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if estimated_tokens < MAX_INPUT_TOKENS:
|
if estimated_tokens < MAX_INPUT_TOKENS:
|
||||||
return await _analyze_single_pass(rows)
|
return await _analyze_single_pass(rows, appeal_subtype)
|
||||||
else:
|
else:
|
||||||
return await _analyze_multi_pass(rows)
|
return await _analyze_multi_pass(rows, appeal_subtype)
|
||||||
|
|
||||||
|
|
||||||
async def _analyze_single_pass(rows) -> dict:
|
async def _analyze_single_pass(rows, appeal_subtype: str = "") -> dict:
|
||||||
"""Send all decisions in a single API call."""
|
"""Send all decisions in a single API call."""
|
||||||
decisions_text = ""
|
decisions_text = ""
|
||||||
for row in rows:
|
for row in rows:
|
||||||
@@ -164,10 +164,10 @@ async def _analyze_single_pass(rows) -> dict:
|
|||||||
timeout=claude_session.LONG_TIMEOUT,
|
timeout=claude_session.LONG_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
return await _parse_and_store_patterns(raw, len(rows))
|
return await _parse_and_store_patterns(raw, len(rows), appeal_subtype)
|
||||||
|
|
||||||
|
|
||||||
async def _analyze_multi_pass(rows) -> dict:
|
async def _analyze_multi_pass(rows, appeal_subtype: str = "") -> dict:
|
||||||
"""Analyze each decision individually, then synthesize patterns."""
|
"""Analyze each decision individually, then synthesize patterns."""
|
||||||
all_patterns = []
|
all_patterns = []
|
||||||
|
|
||||||
@@ -197,7 +197,7 @@ async def _analyze_multi_pass(rows) -> dict:
|
|||||||
timeout=claude_session.LONG_TIMEOUT,
|
timeout=claude_session.LONG_TIMEOUT,
|
||||||
)
|
)
|
||||||
|
|
||||||
return await _parse_and_store_patterns(raw, len(rows))
|
return await _parse_and_store_patterns(raw, len(rows), appeal_subtype)
|
||||||
|
|
||||||
|
|
||||||
def _extract_json(response_text: str) -> list | None:
|
def _extract_json(response_text: str) -> list | None:
|
||||||
@@ -248,14 +248,16 @@ def _extract_json(response_text: str) -> list | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> dict:
|
async def _parse_and_store_patterns(
|
||||||
|
response_text: str, num_decisions: int, appeal_subtype: str = "",
|
||||||
|
) -> dict:
|
||||||
"""Parse Claude's response and store patterns in the database."""
|
"""Parse Claude's response and store patterns in the database."""
|
||||||
patterns = _extract_json(response_text)
|
patterns = _extract_json(response_text)
|
||||||
|
|
||||||
if patterns is None:
|
if patterns is None:
|
||||||
return {"error": "Could not parse analysis results", "raw": response_text}
|
return {"error": "Could not parse analysis results", "raw": response_text}
|
||||||
|
|
||||||
# Store patterns
|
# Store patterns tagged by appeal_subtype
|
||||||
count = 0
|
count = 0
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
await db.upsert_style_pattern(
|
await db.upsert_style_pattern(
|
||||||
@@ -263,11 +265,13 @@ async def _parse_and_store_patterns(response_text: str, num_decisions: int) -> d
|
|||||||
pattern_text=pattern.get("text", ""),
|
pattern_text=pattern.get("text", ""),
|
||||||
context=pattern.get("context", ""),
|
context=pattern.get("context", ""),
|
||||||
examples=[pattern.get("example", "")],
|
examples=[pattern.get("example", "")],
|
||||||
|
appeal_subtype=appeal_subtype,
|
||||||
)
|
)
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"patterns_found": count,
|
"patterns_found": count,
|
||||||
"decisions_analyzed": num_decisions,
|
"decisions_analyzed": num_decisions,
|
||||||
|
"appeal_subtype": appeal_subtype or "all",
|
||||||
"pattern_types": list({p.get("type") for p in patterns}),
|
"pattern_types": list({p.get("type") for p in patterns}),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -152,8 +152,9 @@ async def document_upload_training(
|
|||||||
if source.resolve() != dest.resolve():
|
if source.resolve() != dest.resolve():
|
||||||
shutil.copy2(str(source), str(dest))
|
shutil.copy2(str(source), str(dest))
|
||||||
|
|
||||||
# Extract text
|
# Extract text and strip Nevo preamble
|
||||||
text, page_count = await extractor.extract_text(str(dest))
|
text, page_count = await extractor.extract_text(str(dest))
|
||||||
|
text = extractor.strip_nevo_preamble(text)
|
||||||
|
|
||||||
# Parse date
|
# Parse date
|
||||||
d_date = None
|
d_date = None
|
||||||
|
|||||||
Reference in New Issue
Block a user