From 4892fb6e8f62e8932a1e4cbb7f7b142ac7f90b04 Mon Sep 17 00:00:00 2001
From: Chaim <chaim@marcus-law.co.il>
Date: Mon, 25 May 2026 15:59:39 +0000
Subject: [PATCH] fix(extractor): apply Hebrew quote fixer to direct PDF
 extraction path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Born-digital Hebrew PDFs from legal software often encode gershayim (״)
as double-yod (יי), producing the same corruption patterns as OCR.
The fixer was only called after Google Cloud Vision OCR, so pages of
born-digital PDFs whose text passed the quality check were left
uncorrected.

Changes:
- Apply _fix_hebrew_quotes() in the direct extraction path
- Add 'בליימ' → 'בל"מ' (בקשה להארכת מועד — systematic corruption in 1017-03-26)
- Add 'תמייא' → 'תמ"א' (תכנית מתאר ארצית)
- Update docstring to reflect the broader scope

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 mcp-server/src/legal_mcp/services/extractor.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mcp-server/src/legal_mcp/services/extractor.py b/mcp-server/src/legal_mcp/services/extractor.py
index 18d36e0..44cd0e5 100644
--- a/mcp-server/src/legal_mcp/services/extractor.py
+++ b/mcp-server/src/legal_mcp/services/extractor.py
@@ -109,6 +109,9 @@ _HEBREW_ABBREV_FIXES: dict[str, str] = {
     'מייר': 'מ"ר',
     'יחייד': 'יח"ד',
     'בייכ': 'ב"כ',
+    # Patterns where double-yod (יי) substitutes for gershayim (״) in born-digital PDFs
+    'בליימ': 'בל"מ',   # בקשה להארכת מועד — appears in RTL legal docs
+    'תמייא': 'תמ"א',   # תכנית מתאר ארצית
 }
 
 _ABBREV_PATTERN = re.compile(
@@ -117,7 +120,12 @@ _ABBREV_PATTERN = re.compile(
 
 
 def _fix_hebrew_quotes(text: str) -> str:
-    """Fix known Hebrew abbreviation quote replacements from Google Vision OCR."""
+    """Fix known Hebrew abbreviation quote replacements.
+
+    Applied to both Google Vision OCR output and direct PyMuPDF extraction —
+    some born-digital PDFs encode gershayim (״) as double-yod (יי), producing
+    the same corruption patterns as OCR.
+    """
     return _ABBREV_PATTERN.sub(lambda m: _HEBREW_ABBREV_FIXES[m.group()], text)
 
 
@@ -189,7 +197,7 @@ async def _extract_pdf(path: Path) -> tuple[str, int, list[int]]:
         text = page.get_text().strip()
 
         if len(text) > 50 and _text_quality_ok(text):
-            pages_text.append(text)
+            pages_text.append(_fix_hebrew_quotes(text))
             logger.debug("Page %d: direct extraction (%d chars, quality OK)", page_num + 1, len(text))
         else:
             reason = "insufficient text" if len(text) <= 50 else "low quality OCR layer"