fix(chunker): תיקון זיהוי כותרות טענות הצדדים ביחיד/נקבה + שלוש שכבות הגנה
**סיבת-שורש:** רג'קס respondent_claims כיסה רק צורת רבים (המשיבים/המשיבין), ולא יחיד נקבה (המשיבה) ויחיד זכר (המשיב). הכותרת "טענות המשיבה:" בתיק 8181-21 נבלעה לתוך מקטע ruling → חולצה כהלכה שגויה.

**שלוש שכבות הגנה:**
1. chunker.py — הרחבת SECTION_PATTERNS לכסות יחיד/זכר/נקבה + תגובת/תשובת
2. halacha_extractor.py — עיגון חיובי: drop ל-ruling chunks שלפני legal_analysis
3. halacha_quality.py — FLAG_PARTY_CLAIM: זיהוי שפת-טענות-צד בציטוט התומך

**היקף:** 93 תיקים עם 0 chunks של טענות (כנראה בגלל כותרות ביחיד שלא הוכרו); 628 הלכות מאושרות מתיקים אלה — חלקן עשויות להיות תקינות, יש לעשות re-chunk.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -638,12 +638,20 @@ async def _select_extractable_chunks(
|
||||
extraction. Previously the fallback took *all* chunks, re-admitting exactly
|
||||
the sections the primary filter excludes.
|
||||
|
||||
Positive-anchor guard: when a document has a "דיון/הכרעה" section
|
||||
(section_type='legal_analysis'), extraction starts from that section
|
||||
onwards. Any 'ruling' chunks that appear BEFORE the first legal_analysis
|
||||
chunk by position are dropped — they most likely result from a party-claims
|
||||
section whose header was not recognised by the chunker and was therefore
|
||||
absorbed into the preceding section.
|
||||
|
||||
Returns ``(chunks, used_fallback)`` so the caller can log the fallback once.
|
||||
"""
|
||||
chunks = await db.list_precedent_chunks(
|
||||
case_law_id, section_types=EXTRACTABLE_SECTIONS,
|
||||
)
|
||||
if chunks:
|
||||
chunks = _apply_discussion_anchor(chunks)
|
||||
return chunks, False
|
||||
all_chunks = await db.list_precedent_chunks(case_law_id)
|
||||
filtered = [
|
||||
@@ -653,6 +661,37 @@ async def _select_extractable_chunks(
|
||||
return filtered, True
|
||||
|
||||
|
||||
def _apply_discussion_anchor(chunks: list[dict]) -> list[dict]:
    """Remove 'ruling' chunks positioned ahead of the first 'legal_analysis' chunk.

    The earliest ``chunk_index`` among chunks whose ``section_type`` is
    ``'legal_analysis'`` serves as the anchor.  Every chunk labelled
    ``'ruling'`` with a strictly smaller ``chunk_index`` is discarded; all
    other chunks are kept in their incoming order.

    Rationale: in Israeli planning-committee decisions the discussion
    section (דיון / הכרעה / דיון והכרעה) follows the parties' claims, so a
    'ruling' chunk located before it most likely holds a party-claims
    section whose header the chunker regex failed to match.

    Args:
        chunks: Chunk dicts.  ``section_type`` is read with ``.get``;
            ``chunk_index`` is read by key on every 'legal_analysis' and
            'ruling' chunk.

    Returns:
        The very same ``chunks`` list when it contains no 'legal_analysis'
        chunk; otherwise a new list without the pre-anchor 'ruling' chunks.
    """
    # Gather the position of every discussion-section chunk.
    anchor_positions = []
    for chunk in chunks:
        if chunk.get("section_type") == "legal_analysis":
            anchor_positions.append(chunk["chunk_index"])

    # No discussion section -> nothing to anchor on; hand the input back as is.
    if not anchor_positions:
        return chunks

    anchor = min(anchor_positions)

    kept = []
    skipped = 0
    for chunk in chunks:
        # chunk_index is only looked up for 'ruling' chunks (short-circuit).
        if chunk.get("section_type") == "ruling" and chunk["chunk_index"] < anchor:
            skipped += 1
            continue
        kept.append(chunk)

    # Log only when the guard actually removed something.
    if skipped:
        logger.info(
            "halacha_extractor: positive-anchor guard dropped %d pre-discussion "
            "'ruling' chunk(s) (first legal_analysis at chunk_index=%d)",
            skipped, anchor,
        )
    return kept
|
||||
|
||||
|
||||
async def _extract_impl(case_law_id: UUID, force: bool = False,
|
||||
effort: str | None = None) -> dict:
|
||||
"""Core extraction (caller holds the global advisory lock for the duration).
|
||||
|
||||
Reference in New Issue
Block a user