From 4892fb6e8f62e8932a1e4cbb7f7b142ac7f90b04 Mon Sep 17 00:00:00 2001 From: Chaim Date: Mon, 25 May 2026 15:59:39 +0000 Subject: [PATCH] fix(extractor): apply Hebrew quote fixer to direct PDF extraction path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Born-digital Hebrew PDFs from legal software often encode gershayim (״) as double-yod (יי), producing the same corruption patterns as OCR. The fixer was only called after Google Cloud Vision OCR — digitally created PDFs that passed quality checks received no correction. Changes: - Apply _fix_hebrew_quotes() in the direct extraction path - Add 'בליימ' → 'בל"מ' (בקשה להארכת מועד — systematic corruption in 1017-03-26) - Add 'תמייא' → 'תמ"א' (תכנית מתאר ארצית) - Update docstring to reflect the broader scope Co-Authored-By: Claude Sonnet 4.6 --- mcp-server/src/legal_mcp/services/extractor.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py index 18d36e0..44cd0e5 100644 --- a/mcp-server/src/legal_mcp/services/extractor.py +++ b/mcp-server/src/legal_mcp/services/extractor.py @@ -109,6 +109,9 @@ _HEBREW_ABBREV_FIXES: dict[str, str] = { 'מייר': 'מ"ר', 'יחייד': 'יח"ד', 'בייכ': 'ב"כ', + # Patterns where double-yod (יי) substitutes for gershayim (״) in born-digital PDFs + 'בליימ': 'בל"מ', # בקשה להארכת מועד — appears in RTL legal docs + 'תמייא': 'תמ"א', # תכנית מתאר ארצית } _ABBREV_PATTERN = re.compile( @@ -117,7 +120,12 @@ _ABBREV_PATTERN = re.compile( def _fix_hebrew_quotes(text: str) -> str: - """Fix known Hebrew abbreviation quote replacements from Google Vision OCR.""" + """Fix known Hebrew abbreviation quote replacements. + + Applied to both Google Vision OCR output and direct PyMuPDF extraction — + some born-digital PDFs encode gershayim (״) as double-yod (יי), producing + the same corruption patterns as OCR. + """ return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text) @@ -189,7 +197,7 @@ async def _extract_pdf(path: Path) -> tuple[str, int, list[int]]: text = page.get_text().strip() if len(text) > 50 and _text_quality_ok(text): - pages_text.append(text) + pages_text.append(_fix_hebrew_quotes(text)) logger.debug("Page %d: direct extraction (%d chars, quality OK)", page_num + 1, len(text)) else: reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer"