# legal-ai/mcp-server/tests/test_chunker_section_patterns.py

"""Tests for the Hebrew section-header patterns in the chunker.

Focuses on the singular/feminine forms that were previously missing and
caused party-claims sections to bleed into the preceding section type.
"""
from __future__ import annotations

import pytest

from legal_mcp.services.chunker import chunk_document


def _section_types(text: str) -> list[str]:
    """Chunk *text* and list each chunk's ``section_type``, in chunk order."""
    chunks = chunk_document(text)
    return [chunk.section_type for chunk in chunks]


def _has_section(text: str, section: str) -> bool:
    """Tell whether chunking *text* yields at least one chunk typed *section*."""
    observed_types = _section_types(text)
    return section in observed_types


# ── respondent_claims patterns ──────────────────────────────────────────────

@pytest.mark.parametrize("header", [
    # Forms that were already recognized before the fix.
    "טענות המשיבים:",          # plural masculine
    "טענות המשיבין:",          # alternative plural spelling
    "תשובת המשיבים:",          # plural, תשובה wording
    # Forms added by the bug fix.
    "טענות המשיבה:",           # singular feminine (case 8181-21)
    "טענות המשיב:",            # singular masculine
    "תשובת המשיבה:",           # singular feminine, תשובה wording
    "תגובת המשיבה:",           # singular feminine, תגובה wording
    "תגובת המשיבים:",          # plural, תגובה wording
    "עיקר טענות המשיבה:",     # "עיקר" prefix + singular feminine
])
def test_respondent_claims_recognized(header):
    """Each respondent header variant must yield a respondent_claims chunk."""
    # Three sections (heading line + repeated filler sentence), separated by
    # one blank line each: background, the header under test, and analysis.
    background = "רקע עובדתי\n" + "עובדות כלליות. " * 20
    respondent = f"{header}\n" + "הוועדה המקומית טוענת כי אין מקום לקבל את הערר. " * 15
    analysis = (
        "דיון והכרעה\n"
        + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
    )
    document = "\n\n".join([background, respondent, analysis])

    failure_message = f"Header '{header}' should produce respondent_claims chunk"
    assert _has_section(document, "respondent_claims"), failure_message


# ── appellant_claims patterns ────────────────────────────────────────────────

@pytest.mark.parametrize("header", [
    # Forms that were already recognized before the fix.
    "טענות העוררים:",          # plural masculine
    "טענות העוררין:",          # alternative plural spelling
    "טענות המערערים:",         # plural, מערערים wording
    # Forms added by the bug fix.
    "טענות העורר:",            # singular masculine
    "טענות העוררת:",           # singular feminine
    "טענות המערער:",           # singular masculine, מערער wording
    "טענות המערערת:",          # singular feminine, מערערת wording
    "עיקר טענות העורר:",      # "עיקר" prefix + singular masculine
])
def test_appellant_claims_recognized(header):
    """Each appellant header variant must yield an appellant_claims chunk."""
    # Three sections (heading line + repeated filler sentence), separated by
    # one blank line each: background, the header under test, and analysis.
    background = "רקע עובדתי\n" + "עובדות כלליות. " * 20
    appellant = f"{header}\n" + "ב\"כ העורר טוען כי יש לקבל את הערר ולבטל את ההחלטה. " * 15
    analysis = (
        "דיון והכרעה\n"
        + "לאחר בחינת הטענות אנו קובעים כי יש לקבל את הערר בחלקו. " * 15
    )
    document = "\n\n".join([background, appellant, analysis])

    failure_message = f"Header '{header}' should produce appellant_claims chunk"
    assert _has_section(document, "appellant_claims"), failure_message


# ── regression: existing plural forms still work ────────────────────────────

def test_regression_plural_respondent_and_legal_analysis():
    """Plural party headers in a full decision still yield every expected section."""
    # (heading line, filler sentence) for each section, in document order.
    sections = [
        ("מבוא", "ערר זה הוגש על החלטת הוועדה המקומית לתכנון ובנייה. "),
        ("רקע עובדתי", "העורר רכש את הנכס בשנת 2010 ופנה לקבלת היתר בנייה. "),
        ("טענות העוררים:", "ב\"כ העוררים טוען כי נפל פגם יסודי בהחלטת הוועדה. "),
        ("טענות המשיבים:", "הוועדה המקומית סבורה כי ההחלטה תקינה ומבוססת. "),
        ("דיון והכרעה", "לאחר בחינת הטענות קובעים כי הערר מתקבל בחלקו. "),
    ]
    # Each section is its heading, a newline, then the sentence ten times;
    # sections are separated by a single blank line.
    document = "\n\n".join(
        heading + "\n" + sentence * 10 for heading, sentence in sections
    )

    observed = set(_section_types(document))
    for expected in ("appellant_claims", "respondent_claims", "legal_analysis"):
        assert expected in observed


# ── party claims do NOT bleed into the preceding (ruling) section ───────────

def test_respondent_singular_does_not_bleed_into_ruling():
    """Singular "טענות המשיבה:" must open its own respondent_claims section.

    Before the fix, the header was absorbed into the preceding section.
    The test checks two things: that at least one respondent_claims chunk
    exists, and that the respondent's argument is absent from every chunk
    typed 'ruling'.

    NOTE(review): the second check is only meaningful if the opening
    "החלטה" header is typed 'ruling' by the chunker — confirm against the
    chunker's pattern table.
    """
    # Single source of truth for the phrase that is planted in the document
    # and searched for afterwards. The earlier version searched for a variant
    # without the word "כל", which never occurs in the document, so the
    # negative assertion could not fail.
    respondent_marker = "הוועדה המקומית טוענת שאין כל השבחה"
    text = (
        "החלטה\n"
        + "הוועדה דנה בערר שהוגש על החלטת הוועדה המקומית. " * 10 + "\n\n"
        + "טענות המשיבה:\n"
        + f"{respondent_marker} בנסיבות העניין. " * 10 + "\n\n"
        + "דיון\n"
        + "אנו בוחנים את הטענות לאחר עיון בחומר שהוגש. " * 10
    )
    chunks = chunk_document(text)

    # The respondent claims must NOT remain labeled as 'ruling'.
    respondent_chunks = [c for c in chunks if c.section_type == "respondent_claims"]
    assert len(respondent_chunks) > 0, (
        "Singular 'טענות המשיבה' must produce respondent_claims chunks, not bleed into ruling"
    )

    # Verify the respondent text didn't end up in ruling chunks.
    ruling_text = " ".join(c.content for c in chunks if c.section_type == "ruling")
    assert respondent_marker not in ruling_text, (
        "Respondent claim text must not appear in ruling chunks"
    )