fix(extractor): add regex fix for Hebrew law year gershayim corruption
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m36s

תש[א-ת]+יי[א-ת] → תש[א-ת]+"[א-ת]  (e.g. תשכייה → תשכ"ה)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 16:12:20 +00:00
parent 4892fb6e8f
commit 8153bc9f03

View File

@@ -118,6 +118,10 @@ _ABBREV_PATTERN = re.compile(
'|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
)
# Matches Hebrew law year abbreviations where gershayim was encoded as double-yod.
# e.g. תשכייה → תשכ"ה, תשנייב → תשנ"ב
_HEBREW_YEAR_RE = re.compile(r'(תש[א-ת]+)יי([א-ת])')
def _fix_hebrew_quotes(text: str) -> str:
"""Fix known Hebrew abbreviation quote replacements.
@@ -126,7 +130,9 @@ def _fix_hebrew_quotes(text: str) -> str:
some born-digital PDFs encode gershayim (״) as double-yod (יי), producing
the same corruption patterns as OCR.
"""
return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
text = _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
text = _HEBREW_YEAR_RE.sub(r'\1"\2', text)
return text
# ── Extraction ───────────────────────────────────────────────────