From 9618dc895b233b79f6b4d45f38b41658b012cfbc Mon Sep 17 00:00:00 2001 From: Chaim Date: Wed, 17 Jun 2026 17:17:00 +0000 Subject: [PATCH] =?UTF-8?q?feat(chunker):=20=D7=94=D7=95=D7=A1=D7=A4=D7=AA?= =?UTF-8?q?=20=D7=93=D7=A4=D7=95=D7=A1=D7=99=20=D7=98=D7=A2=D7=A0=D7=95?= =?UTF-8?q?=D7=AA=20=D7=9C=D7=A4=D7=A1=D7=99=D7=A7=D7=AA=20=D7=91=D7=99?= =?UTF-8?q?=D7=AA-=D7=94=D7=9E=D7=A9=D7=A4=D7=98=20(parties=5Fclaims)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit פסקי-דין של ביהמ"ש העליון/מנהלי משתמשים בנוסחאות שונות מוועדת הערר: - "טענות הצדדים" / "טיעוני הצדדים" / "עמדות הצדדים" — סוג חדש parties_claims - "טיעוני המערערים/ת" — מוסף ל-appellant_claims - "טיעוני המשיבים/ה" — מוסף ל-respondent_claims parties_claims הוסף ל-NON_REASONING_SECTIONS ב-halacha_extractor כדי שלא יוזנו לחילוץ הלכות (בדיוק כמו appellant_claims/respondent_claims). Co-Authored-By: Claude Sonnet 4.6 --- mcp-server/src/legal_mcp/services/chunker.py | 19 ++++++++++++------- .../legal_mcp/services/halacha_extractor.py | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/chunker.py b/mcp-server/src/legal_mcp/services/chunker.py index e41656d..54020aa 100644 --- a/mcp-server/src/legal_mcp/services/chunker.py +++ b/mcp-server/src/legal_mcp/services/chunker.py @@ -22,18 +22,23 @@ from legal_mcp import config # court rulings use slightly different vocabulary (פסק דין, נימוקים, סוף דבר). SECTION_PATTERNS = [ (r"רקע\s*עובדתי|רקע\s*כללי|העובדות|הרקע", "facts"), - # appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural - # (עוררים/עוררין, מערערים). Previously only plural was matched, so headers - # like "טענות העורר:" were silently absorbed into the preceding section. + # parties_claims: bilateral section common in Supreme Court / administrative + # court decisions ("טענות הצדדים", "טיעוני הצדדים", "עמדות הצדדים"). Not split by side. 
( - r"(?:טענות|עיקר\s*טענות)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)", + r"(?:טענות|טיעוני|עמדות)\s*הצדדים", + "parties_claims", + ), + # appellant_claims: covers singular (עורר/עוררת, מערער/מערערת) and plural + # (עוררים/עוררין, מערערים), plus the court-format noun "טיעוני". + ( + r"(?:טענות|עיקר\s*טענות|טיעוני)\s*ה(?:עוררי[םן]|עורר[ת]?|מערערי[םן]|מערער[ת]?)", "appellant_claims", ), # respondent_claims: covers singular (משיב/משיבה) and plural (משיבים/משיבין), - # plus verb forms תשובת/תגובת. "טענות המשיבה:" (feminine singular) was the - # root cause of halacha 8181-21 index-11 being extracted from party claims. + # plus the noun forms תשובת/תגובת/טיעוני. "טענות המשיבה:" (feminine singular) was + # the root cause of halacha 8181-21 index-11 being extracted from party claims. ( - r"(?:טענות|תשובת|תגובת|עיקר\s*טענות)\s*ה(?:משיבי[םן]|משיב[ה]?)", + r"(?:טענות|תשובת|תגובת|עיקר\s*טענות|טיעוני)\s*ה(?:משיבי[םן]|משיב[ה]?)", "respondent_claims", ), (r"דיון\s*והכרעה|דיון|הכרעה|ניתוח\s*משפטי|המסגרת\s*המשפטית|נימוקים", "legal_analysis"), diff --git a/mcp-server/src/legal_mcp/services/halacha_extractor.py b/mcp-server/src/legal_mcp/services/halacha_extractor.py index 796c011..6f0a148 100644 --- a/mcp-server/src/legal_mcp/services/halacha_extractor.py +++ b/mcp-server/src/legal_mcp/services/halacha_extractor.py @@ -101,7 +101,7 @@ EXTRACTABLE_SECTIONS = ("legal_analysis", "ruling", "conclusion") # reasoning that merely landed under 'other' is still reached. Raises precision # on the dominant Facts↔Reasoning confusion class (#81.6; INV-LRN2 # quality-at-source; LegalSeg / rhetorical-role labeling). -NON_REASONING_SECTIONS = ("facts", "appellant_claims", "respondent_claims", "intro") +NON_REASONING_SECTIONS = ("facts", "appellant_claims", "respondent_claims", "parties_claims", "intro") # Two prompts — choose by source's is_binding flag.