fix(extractor): add regex fix for Hebrew law year gershayim corruption
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m36s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 1m36s
Restore gershayim in Hebrew law years where it was encoded as double-yod: תש[א-ת]+יי[א-ת] → תש[א-ת]+"[א-ת] (e.g. תשכייה → תשכ"ה).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -118,6 +118,10 @@ _ABBREV_PATTERN = re.compile(
|
|||||||
'|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
|
'|'.join(re.escape(k) for k in sorted(_HEBREW_ABBREV_FIXES, key=len, reverse=True))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Matches Hebrew law year abbreviations where gershayim was encoded as double-yod.
|
||||||
|
# e.g. תשכייה → תשכ"ה, תשנייב → תשנ"ב
|
||||||
|
_HEBREW_YEAR_RE = re.compile(r'(תש[א-ת]+)יי([א-ת])')
|
||||||
|
|
||||||
|
|
||||||
def _fix_hebrew_quotes(text: str) -> str:
|
def _fix_hebrew_quotes(text: str) -> str:
|
||||||
"""Fix known Hebrew abbreviation quote replacements.
|
"""Fix known Hebrew abbreviation quote replacements.
|
||||||
@@ -126,7 +130,9 @@ def _fix_hebrew_quotes(text: str) -> str:
|
|||||||
some born-digital PDFs encode gershayim (״) as double-yod (יי), producing
|
some born-digital PDFs encode gershayim (״) as double-yod (יי), producing
|
||||||
the same corruption patterns as OCR.
|
the same corruption patterns as OCR.
|
||||||
"""
|
"""
|
||||||
return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
|
text = _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
|
||||||
|
text = _HEBREW_YEAR_RE.sub(r'\1"\2', text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
# ── Extraction ───────────────────────────────────────────────────
|
# ── Extraction ───────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user