fix(digests): enrich self-cleans duplicate-yomon rows (re-sent issues)

אותו יומון יכול להגיע כשני PDF שונים (re-send/forward → בייטים שונים →
content_hash dedup מפספס), אבל yomon_number ייחודי → ה-update ב-enrich מתנגש
על uq_digests_yomon_number. עכשיו enrich תופס את ההתנגשות, מוחק את השורה
הכפולה (היומון כבר קיים), ומחזיר status='duplicate' — כך ה-cron לא מנסה אותה
שוב ושוב. סוגר לולאת-retry אינסופית פוטנציאלית במערכת הלא-מאוישת.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-08 04:59:12 +00:00
parent 4b5c8a2772
commit a3a02ca67a

View File

@@ -213,7 +213,23 @@ async def enrich_digest(digest_id: UUID | str, progress: ProgressCb | None = Non
fields["subject_tags"] = extracted["subject_tags"]
if fields:
await db.update_digest(digest_id, **fields)
try:
await db.update_digest(digest_id, **fields)
except Exception as e:
# The same yomon issue can arrive as two different PDFs (re-sent /
# forwarded twice → different bytes → content_hash dedup misses it),
# but the yomon_number is unique. The extracted number then collides
# on uq_digests_yomon_number. This row is a duplicate of an already-
# ingested yomon → drop it so it isn't retried forever by the cron.
if "uq_digests_yomon_number" in str(e):
await db.delete_digest(digest_id)
logger.info(
"digest %s is a duplicate yomon (%s) — deleted",
digest_id, fields.get("yomon_number"),
)
return {"status": "duplicate", "digest_id": str(digest_id),
"yomon_number": fields.get("yomon_number")}
raise
merged = await db.get_digest(digest_id)
await progress("embedding", 75, "מחשב embedding")