fix(extractor): add regex fix for Hebrew law year gershayim corruption
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m36s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m36s
Rewrites matches of תש[א-ת]+יי[א-ת] as תש[א-ת]+"[א-ת] (e.g. תשכייה → תשכ"ה). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -118,6 +118,10 @@ _ABBREV_PATTERN = re.compile(
|
||||
'|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
|
||||
)
|
||||
|
||||
# Matches Hebrew law year abbreviations where gershayim was encoded as double-yod.
|
||||
# e.g. תשכייה → תשכ"ה, תשנייב → תשנ"ב
|
||||
_HEBREW_YEAR_RE = re.compile(r'(תש[א-ת]+)יי([א-ת])')
|
||||
|
||||
|
||||
def _fix_hebrew_quotes(text: str) -> str:
|
||||
"""Fix known Hebrew abbreviation quote replacements.
|
||||
@@ -126,7 +130,9 @@ def _fix_hebrew_quotes(text: str) -> str:
|
||||
some born-digital PDFs encode gershayim (״) as double-yod (יי), producing
|
||||
the same corruption patterns as OCR.
|
||||
"""
|
||||
return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
|
||||
text = _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
|
||||
text = _HEBREW_YEAR_RE.sub(r'\1"\2', text)
|
||||
return text
|
||||
|
||||
|
||||
# ── Extraction ───────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user