Fix compare sections query: match by number segment

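Document titles look like '[קורפוס] ARAR-23-1188 - ...' while decision_number
is stored as '1188/23', so the previous LIKE '%1188/23%' pattern would not
match. The query now extracts the leading numeric segment (e.g. '1188') and
matches it against corpus titles, keeping the full decision number as a
second, fallback pattern.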

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-11 12:28:52 +00:00
parent 5cb0be473c
commit ffa089e1df

View File

@@ -802,16 +802,24 @@ async def training_compare(a: str, b: str):
"FROM style_patterns WHERE frequency > 0"
)
# Section breakdown via document_chunks.
# decision_number format is "NNNN/YY" but document titles are like
# "[קורפוס] ARAR-YY-NNNN - ..." so we match on the number segment only.
async def section_stats(corpus_row):
    """Return per-section character totals for one corpus decision.

    Looks up the chunks of every corpus document whose title contains the
    decision's number and sums the content length per ``section_type``.

    Args:
        corpus_row: Mapping-like row exposing a ``"decision_number"`` key
            (format "NNNN/YY" per the surrounding comments).

    Returns:
        A list of ``{"type": <section_type>, "chars": <int>}`` dicts ordered
        by ``chars`` descending, or ``[]`` when the row has no decision
        number.

    Note:
        ``conn`` and ``re`` are taken from the enclosing scope.
    """
    nm = corpus_row["decision_number"]
    if not nm:
        # Nothing to match on; skip the query entirely.
        return []
    # Extract the first numeric segment (e.g., "1188" from "1188/23").
    # Titles carry the number as "ARAR-YY-NNNN", so the raw "NNNN/YY" form
    # alone would never match; fall back to the raw value if no 3-4 digit
    # prefix is present.
    num_match = re.match(r"(\d{3,4})", nm)
    num = num_match.group(1) if num_match else nm
    # NOTE(review): both patterns are substring matches, so a short number
    # could also match unrelated corpus titles containing the same digits —
    # confirm this is acceptable for the corpus naming scheme.
    rows2 = await conn.fetch(
        "SELECT dc.section_type, sum(length(dc.content))::int as chars "
        "FROM document_chunks dc JOIN documents d ON dc.document_id=d.id "
        # Single WHERE clause: restrict to corpus documents, then match the
        # title on either the numeric segment ($1) or the full number ($2).
        "WHERE d.title LIKE '[קורפוס]%' "
        " AND (d.title LIKE $1 OR d.title LIKE $2) "
        " AND dc.section_type IS NOT NULL "
        "GROUP BY dc.section_type ORDER BY chars DESC",
        f"%{num}%",
        f"%{nm}%",
    )
    return [{"type": r["section_type"], "chars": r["chars"]} for r in rows2]