From 8153bc9f033480eb590021e61a0740bec3255dc3 Mon Sep 17 00:00:00 2001 From: Chaim Date: Mon, 25 May 2026 16:12:20 +0000 Subject: [PATCH] fix(extractor): add regex fix for Hebrew law year gershayim corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit תש[טיכלמנסעפצ]יי[א-ט] → תש[טיכלמנסעפצ]"[א-ט] (e.g. תשכייה → תשכ"ה) Co-Authored-By: Claude Sonnet 4.6 --- mcp-server/src/legal_mcp/services/extractor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py index 44cd0e5..c882570 100644 --- a/mcp-server/src/legal_mcp/services/extractor.py +++ b/mcp-server/src/legal_mcp/services/extractor.py @@ -118,6 +118,10 @@ _ABBREV_PATTERN = re.compile( '|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True)) ) +# Matches Hebrew law years (תש + one tens letter + one units letter) whose gershayim was encoded as double-yod; the narrow letter classes and the trailing lookahead keep ordinary words such as תשתייך untouched. +# e.g. תשכייה → תשכ"ה, תשנייב → תשנ"ב +_HEBREW_YEAR_RE = re.compile(r'(תש[טיכלמנסעפצ])יי([א-ט])(?![א-ת])') + def _fix_hebrew_quotes(text: str) -> str: """Fix known Hebrew abbreviation quote replacements. @@ -126,7 +130,9 @@ def _fix_hebrew_quotes(text: str) -> str: some born-digital PDFs encode gershayim (״) as double-yod (יי), producing the same corruption patterns as OCR. """ - return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text) + text = _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text) + text = _HEBREW_YEAR_RE.sub(r'\1"\2', text) + return text # ── Extraction ───────────────────────────────────────────────────