Add training corpus UI with Nevo proofreading pipeline

- New proofreader service strips Nevo editorial additions (front matter,
  postamble, page headers, watermarks, inline codes) from DOCX/PDF/MD
- PDF pages use Google Vision OCR for clean Hebrew RTL extraction
- New training page at #/training with drag-and-drop upload, automatic
  metadata extraction (decision number, date, categories), reviewable
  preview, and style pattern report grouped by type
- API endpoints: /api/training/{analyze,upload,corpus,patterns,
  analyze-style,analyze-style/status}
- Fix claude_session.query to pipe prompt via stdin, avoiding ARG_MAX
  overflow when analyzing 900K+ char corpus
- CLI scripts for batch proofreading and corpus upload

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-11 11:04:58 +00:00
parent ecda95d610
commit 32f18de049
6 changed files with 1960 additions and 3 deletions

View File

@@ -283,6 +283,120 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
}
.skill-install-result.error { background: #ffebee; border-color: #ffcdd2; }
/* ── Training Corpus Upload ───────────────────────────── */
/* Review card: one bordered box per analyzed file awaiting approval. */
.training-review {
border: 1px solid #e5e5e5; border-radius: 8px; padding: 14px 16px;
margin-bottom: 12px; background: #fafafa;
}
/* Header row: file name (stretches), size/type meta, remove button. */
.training-review .review-header {
display: flex; align-items: center; gap: 10px;
padding-bottom: 10px; margin-bottom: 12px;
border-bottom: 1px solid #eee;
}
/* flex: 1 lets the file name take all free space so meta + button sit at the far edge. */
.training-review .review-header strong { font-size: 0.95em; color: #1a1a2e; flex: 1; }
.training-review .review-meta { font-size: 0.78em; color: #888; }
/* Borderless icon button; turns red on hover. */
.training-review .btn-icon {
background: transparent; border: none; color: #aaa; cursor: pointer;
font-size: 1.1em; padding: 4px 8px; border-radius: 4px;
}
.training-review .btn-icon:hover { background: #ffebee; color: #c62828; }
/* Two-column grid: a flexible first column and a fixed 160px second column. */
.training-review .review-fields {
display: grid; grid-template-columns: 1fr 160px; gap: 14px; margin-bottom: 12px;
}
/* Each label stacks its caption above its input. */
.training-review .review-fields label {
display: flex; flex-direction: column; gap: 4px;
font-size: 0.8em; color: #666; font-weight: 500;
}
.training-review .review-fields input {
padding: 7px 10px; border: 1px solid #ddd; border-radius: 6px;
font-size: 0.88em; font-family: inherit;
}
/* Replaces the default focus outline with an accent-colored border. */
.training-review .review-fields input:focus {
outline: none; border-color: #e94560;
}
/* Category picker: caption plus a wrapping row of chips. */
.training-review .review-cats { margin-bottom: 10px; }
.training-review .review-cats-label {
font-size: 0.8em; color: #666; font-weight: 500; margin-bottom: 6px;
}
.training-review .review-cats-list { display: flex; flex-wrap: wrap; gap: 6px; }
/* Pill-shaped chip wrapping a checkbox and its text. */
.cat-chip {
display: inline-flex; align-items: center; gap: 5px;
padding: 4px 10px; border: 1px solid #ddd; border-radius: 14px;
font-size: 0.78em; cursor: pointer; background: #fff;
transition: background 0.12s;
}
.cat-chip:hover { background: #f0f0f0; }
.cat-chip input[type="checkbox"] { margin: 0; cursor: pointer; }
/* Highlights the whole chip while its checkbox is checked (relies on the :has() selector). */
.cat-chip:has(input:checked) { background: #ffe4ea; border-color: #e94560; color: #c62828; }
/* Collapsible preview box (styles a summary + pre pair). */
.review-preview {
margin-top: 6px; border: 1px solid #eee; border-radius: 6px;
background: #fff; padding: 8px 12px;
}
.review-preview summary {
cursor: pointer; font-size: 0.78em; color: #888; font-weight: 500;
}
/* Preview text: right-to-left, wraps long lines, scrolls vertically past 250px. */
.review-preview pre {
margin-top: 10px; font-size: 0.78em; color: #333; direction: rtl;
white-space: pre-wrap; font-family: inherit; line-height: 1.5;
max-height: 250px; overflow-y: auto;
}
/* One status row per file in the processing list: icon/spinner followed by text. */
.training-task {
padding: 10px 14px; margin-bottom: 6px; border-radius: 6px;
background: #f7f7f7; font-size: 0.85em;
display: flex; align-items: center; gap: 10px;
}
/* No trailing gap after the final row. */
.training-task:last-child { margin-bottom: 0; }
/* Corpus listing table: full width, collapsed borders, right-aligned cells. */
.corpus-table { width: 100%; border-collapse: collapse; font-size: 0.82em; }
.corpus-table th, .corpus-table td {
text-align: right; padding: 8px 10px; border-bottom: 1px solid #eee;
}
.corpus-table th {
background: #f7f7f7; font-weight: 600; color: #555;
font-size: 0.78em; text-transform: uppercase;
}
/* Row hover highlight is applied to the cells of the hovered row. */
.corpus-table tr:hover td { background: #fafafa; }
/* Small blue pill used for a category label. */
.cat-tag {
display: inline-block; padding: 2px 8px; margin: 0 2px;
background: #e3f2fd; color: #1565c0; border-radius: 10px;
font-size: 0.72em; font-weight: 500;
}
/* Pattern groups */
/* Collapsible group box; background changes while the group is open. */
.pattern-group {
border: 1px solid #eee; border-radius: 8px; margin-bottom: 10px;
background: #fff;
}
.pattern-group[open] { background: #fafafa; }
/* list-style: none plus the ::-webkit-details-marker rule hide the native disclosure marker. */
.pattern-group summary {
padding: 12px 16px; cursor: pointer; font-size: 0.9em;
display: flex; align-items: center; gap: 10px; list-style: none;
}
.pattern-group summary::-webkit-details-marker { display: none; }
/* Custom disclosure arrow that rotates 90 degrees when the group is open. */
.pattern-group summary::before {
content: '▸'; transition: transform 0.15s; font-size: 0.9em; color: #888;
}
.pattern-group[open] summary::before { transform: rotate(90deg); }
/* Count badge; margin-right: auto absorbs the free space on its right inside the flex summary. */
.pattern-count {
margin-right: auto; background: #e3f2fd; color: #1565c0;
padding: 2px 10px; border-radius: 10px; font-size: 0.76em; font-weight: 500;
}
/* Vertical stack of pattern cards inside a group. */
.pattern-list {
padding: 4px 16px 14px 16px; display: flex; flex-direction: column; gap: 8px;
}
.pattern-item {
padding: 10px 14px; background: #fff; border: 1px solid #eee;
border-radius: 6px; font-size: 0.84em;
}
/* Card content: main text, secondary context line, muted meta row. */
.pattern-text { color: #1a1a2e; font-weight: 500; }
.pattern-context { color: #666; font-size: 0.88em; margin-top: 4px; }
.pattern-meta {
color: #999; font-size: 0.78em; margin-top: 6px;
display: flex; gap: 10px;
}
@media (max-width: 800px) {
.main { padding: 16px; }
header { padding: 14px 16px; }
@@ -302,6 +416,7 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
<a href="#/" id="navHome">תיקים</a>
<a href="#/new" id="navNew">+ תיק חדש</a>
<a href="#/upload" id="navUpload">העלאה</a>
<a href="#/training" id="navTraining">אימון סגנון</a>
<a href="#/skills" id="navSkills">Skills</a>
</nav>
</header>
@@ -552,6 +667,75 @@ header nav a:hover, header nav a.active { color: #fff; background: rgba(255,255,
<div class="card-body" id="legacyTasksList"></div>
</div>
</div>
<!-- ══ Page: Training Corpus Upload ══ -->
<!-- Activated by the #/training route; element ids below are looked up by the training-page script. -->
<div class="page" id="page-training">
<div class="page-header">
<h2>אימון סגנון — העלאת החלטות לקורפוס</h2>
</div>
<!-- Card 1: intro text and the drop zone with its file input. -->
<div class="card">
<div class="card-body">
<p style="margin-bottom:12px;color:#555;line-height:1.6">
העלה החלטות קודמות של דפנה כדי ללמד את המערכת את סגנון הכתיבה שלה.
הקבצים יעברו <strong>הגהה אוטומטית</strong> (הסרת תוספות נבו, כותרות, סימני מים)
ו<strong>חילוץ מטא-דאטה</strong> (מספר החלטה, תאריך, קטגוריות) לסקירה לפני ההעלאה.
</p>
<div class="upload-zone" id="trainingDropZone">
<div style="font-size:3em;color:#ccc;margin-bottom:16px">&#128218;</div>
<h3>גרור קבצי החלטה לכאן או לחץ לבחירה</h3>
<p>PDF, DOCX, MD — עד 50MB. ניתן להעלות מספר קבצים בבת אחת.</p>
<!-- NOTE(review): accept also lists .txt, which the visible help text above does not mention. -->
<input type="file" id="trainingFileInput" multiple accept=".pdf,.docx,.md,.txt">
</div>
</div>
</div>
<!-- Card 2: metadata review list; hidden by default (display:none) until the script shows it. -->
<div class="card" id="trainingAnalysisCard" style="display:none">
<div class="card-header">
<span>סקירת מטא-דאטה לפני ההעלאה</span>
<span id="trainingAnalysisStatus" style="float:left;font-weight:400;color:#888;font-size:0.9em"></span>
</div>
<div class="card-body">
<div id="trainingReviewList"></div>
<!-- Cancel and upload-all actions for the reviews listed above. -->
<div style="display:flex;gap:10px;margin-top:16px;justify-content:flex-end">
<button class="btn btn-ghost" onclick="cancelTrainingReview()">בטל</button>
<button class="btn btn-primary" id="trainingUploadBtn" onclick="uploadAllTraining()">
העלה הכל לקורפוס
</button>
</div>
</div>
</div>
<!-- Card 3: per-file processing rows; hidden by default (display:none) until the script shows it. -->
<div class="card" id="trainingTasksCard" style="display:none">
<div class="card-header">עיבוד והעלאה</div>
<div class="card-body" id="trainingTasksList"></div>
</div>
<!-- Card 4: corpus listing; starts with a loading placeholder that the script replaces. -->
<div class="card">
<div class="card-header">
<span>קורפוס הסגנון</span>
<span id="corpusCount" style="float:left;font-weight:400;color:#888;font-size:0.9em"></span>
</div>
<div class="card-body" id="corpusList">
<div class="empty">טוען...</div>
</div>
</div>
<!-- Card 5: style pattern report with the button that triggers runStyleAnalysis(). -->
<div class="card">
<div class="card-header">
<span>דוח סגנון — דפוסים שחולצו</span>
<span style="float:left;display:flex;gap:10px;align-items:center">
<span id="patternsCount" style="font-weight:400;color:#888;font-size:0.9em"></span>
<button class="btn btn-primary" id="analyzeStyleBtn" onclick="runStyleAnalysis()">
נתח קורפוס
</button>
</span>
</div>
<div class="card-body" id="patternsList">
<div class="empty">טוען...</div>
</div>
</div>
</div>
</div>
<!-- Status Bar -->
@@ -614,6 +798,11 @@ function handleRoute() {
document.getElementById('navUpload').classList.add('active');
subtitle = 'העלאת מסמכים';
loadLegacyPending();
} else if (hash === '#/training') {
document.getElementById('page-training').classList.add('active');
document.getElementById('navTraining').classList.add('active');
subtitle = 'אימון סגנון';
initTrainingPage();
}
document.getElementById('pageSubtitle').textContent = subtitle;
@@ -1559,6 +1748,377 @@ async function restartPaperclip() {
// Init legacy upload listeners
setupLegacyUpload();
// ── Training Corpus Upload ─────────────────────────────────────────
// Subject categories offered as checkboxes on every review row, in display order.
const ALL_CATEGORIES = [
  'בנייה',
  'שימוש חורג',
  'תכנית',
  'היתר',
  'הקלה',
  'חלוקה',
  'תמ"א 38',
  'היטל השבחה',
  'פיצויים 197',
];

// Analyzed files whose metadata is still waiting for the user's approval.
let _trainingReviews = [];
/**
 * Prepares the training page each time it is entered: wires the drop zone,
 * starts the corpus / pattern / analysis-status loads, and clears any review
 * or task UI left over from a previous visit.
 */
function initTrainingPage() {
  setupTrainingDropZone();
  loadCorpusList();
  loadStylePatterns();
  pollStyleAnalysisStatus();

  // Start from a clean slate on re-entry: no pending reviews, both
  // transient cards hidden, both lists emptied.
  _trainingReviews = [];
  const byId = (id) => document.getElementById(id);
  ['trainingAnalysisCard', 'trainingTasksCard'].forEach((cardId) => {
    byId(cardId).style.display = 'none';
  });
  ['trainingReviewList', 'trainingTasksList'].forEach((listId) => {
    byId(listId).innerHTML = '';
  });
}
/**
 * Wires click, drag-and-drop and file-picker events of the training drop zone
 * to handleTrainingFiles. Safe to call repeatedly: listeners are attached
 * only once, tracked by a `_wired` flag on the zone element.
 */
function setupTrainingDropZone() {
  const zone = document.getElementById('trainingDropZone');
  const input = document.getElementById('trainingFileInput');
  if (zone._wired) return;
  zone._wired = true;

  zone.addEventListener('click', () => input.click());
  // preventDefault on dragover is what allows the element to receive the drop.
  zone.addEventListener('dragover', (e) => { e.preventDefault(); zone.classList.add('dragging'); });
  zone.addEventListener('dragleave', () => zone.classList.remove('dragging'));
  zone.addEventListener('drop', (e) => {
    e.preventDefault();
    zone.classList.remove('dragging');
    // A drop without file data carries nothing to process.
    if (e.dataTransfer) handleTrainingFiles(e.dataTransfer.files);
  });
  input.addEventListener('change', () => {
    handleTrainingFiles(input.files);
    // Clear the selection so choosing the same file again still fires "change".
    // handleTrainingFiles copies the FileList synchronously before its first
    // await, so the files are already captured at this point.
    input.value = '';
  });
}
/**
 * Uploads each selected file to the pending area, asks the server to analyze
 * it, and queues the result in _trainingReviews for the user to review.
 * Files are processed one after another; a failure on one file is reported
 * via toast and does not stop the remaining files.
 *
 * @param {FileList|File[]|null|undefined} fileList files from the picker or a drop
 * @returns {Promise<void>}
 */
async function handleTrainingFiles(fileList) {
  // Copy synchronously: callers may reset the underlying FileList right after this call.
  const files = Array.from(fileList || []);
  if (!files.length) return;
  const card = document.getElementById('trainingAnalysisCard');
  const status = document.getElementById('trainingAnalysisStatus');
  card.style.display = '';
  status.textContent = `מעלה ומנתח ${files.length} קבצים...`;
  for (const file of files) {
    // Name of the server-side pending file while no review entry owns it yet.
    let orphanName = null;
    try {
      // 1. Upload to pending dir
      status.textContent = `מעלה: ${file.name}...`;
      const fd = new FormData();
      fd.append('file', file);
      const upRes = await fetch(API + '/upload', { method: 'POST', body: fd });
      if (!upRes.ok) throw new Error(`Upload failed: ${await upRes.text()}`);
      const uploadInfo = await upRes.json();
      orphanName = uploadInfo.filename;
      // 2. Analyze (proofread + extract metadata)
      status.textContent = `מנתח: ${file.name}...`;
      const analyzeFd = new FormData();
      analyzeFd.append('filename', uploadInfo.filename);
      const anRes = await fetch(API + '/training/analyze', { method: 'POST', body: analyzeFd });
      if (!anRes.ok) throw new Error(`Analyze failed: ${await anRes.text()}`);
      const analysis = await anRes.json();
      _trainingReviews.push({
        ...analysis,
        _pendingName: uploadInfo.filename,
        _originalName: file.name,
        _status: 'ready',
      });
      // The review entry now owns the pending file.
      orphanName = null;
    } catch (e) {
      // Upload succeeded but analysis did not: nothing references the pending
      // file any more, so remove it best-effort (same endpoint the remove and
      // cancel actions use).
      if (orphanName) {
        fetch(API + '/uploads/' + encodeURIComponent(orphanName), { method: 'DELETE' })
          .catch(() => {});
      }
      toast(`שגיאה בעיבוד ${file.name}: ${e.message}`, 'error');
    }
  }
  status.textContent = '';
  renderTrainingReview();
}
/**
 * Re-renders the review list from _trainingReviews. With nothing to review
 * it shows the empty placeholder and hides the analysis card.
 */
function renderTrainingReview() {
  const listEl = document.getElementById('trainingReviewList');

  if (_trainingReviews.length > 0) {
    const rowsHtml = [];
    _trainingReviews.forEach((review, position) => {
      rowsHtml.push(renderReviewRow(review, position));
    });
    listEl.innerHTML = rowsHtml.join('');
    return;
  }

  listEl.innerHTML = '<div class="empty">אין קבצים לסקירה</div>';
  document.getElementById('trainingAnalysisCard').style.display = 'none';
}
/**
 * Builds the HTML for one review card: file name, size/type meta, editable
 * decision number and date, category chips and a collapsible text preview.
 *
 * @param {object} r   review entry from _trainingReviews (analysis fields plus
 *                     _originalName); must carry subject_categories, chars,
 *                     stats.source_type and preview
 * @param {number} idx position of the entry in _trainingReviews; embedded in
 *                     the inline handlers so edits write back to that entry
 * @returns {string} HTML markup for the card
 */
function renderReviewRow(r, idx) {
  const catsHtml = ALL_CATEGORIES.map((c, catIdx) => {
    const checked = r.subject_categories.includes(c) ? 'checked' : '';
    // Reference the category by its index instead of inlining its text: a
    // category name may contain a double quote (e.g. תמ"א 38), which would
    // terminate the onchange="..." attribute and break the handler.
    return `<label class="cat-chip"><input type="checkbox" ${checked} onchange="toggleCat(${idx}, ALL_CATEGORIES[${catIdx}])"> ${esc(c)}</label>`;
  }).join('');
  // Missing metadata must render as an empty input, not as "undefined"/"null".
  const decisionNumber = r.decision_number || '';
  const decisionDate = r.decision_date || '';
  return `
    <div class="training-review" data-idx="${idx}">
      <div class="review-header">
        <strong>${esc(r._originalName)}</strong>
        <span class="review-meta">${r.chars.toLocaleString('he-IL')} תווים · ${esc(r.stats.source_type)}</span>
        <button class="btn-icon" onclick="removeTrainingReview(${idx})" title="הסר">✕</button>
      </div>
      <div class="review-fields">
        <label>מספר החלטה
          <input type="text" value="${esc(decisionNumber)}"
            onchange="_trainingReviews[${idx}].decision_number=this.value"
            placeholder="NNNN/YY">
        </label>
        <label>תאריך
          <input type="date" value="${esc(decisionDate)}"
            onchange="_trainingReviews[${idx}].decision_date=this.value">
        </label>
      </div>
      <div class="review-cats">
        <div class="review-cats-label">קטגוריות:</div>
        <div class="review-cats-list">${catsHtml}</div>
      </div>
      <details class="review-preview">
        <summary>תצוגה מקדימה של טקסט מנוקה (500 תווים ראשונים)</summary>
        <pre>${esc(r.preview)}</pre>
      </details>
    </div>
  `;
}
/**
 * Flips one category on a pending review: removes it when already selected,
 * appends it otherwise.
 *
 * @param {number} idx index of the review in _trainingReviews
 * @param {string} cat category to toggle
 */
function toggleCat(idx, cat) {
  const selected = _trainingReviews[idx].subject_categories;
  const position = selected.indexOf(cat);
  if (position === -1) {
    selected.push(cat);
    return;
  }
  selected.splice(position, 1);
}
/**
 * Drops a single review from the pending list and re-renders. The matching
 * pending upload is deleted on the server best-effort (failures are ignored).
 *
 * @param {number} idx index of the review in _trainingReviews
 */
function removeTrainingReview(idx) {
  const { _pendingName: pendingName } = _trainingReviews[idx];

  if (pendingName) {
    // Fire-and-forget cleanup of the uploaded pending file.
    const url = `${API}/uploads/${encodeURIComponent(pendingName)}`;
    fetch(url, { method: 'DELETE' }).catch(() => {});
  }

  _trainingReviews.splice(idx, 1);
  renderTrainingReview();
}
/**
 * Abandons the whole review batch: requests deletion of every pending upload
 * (best-effort, failures ignored), empties the list and re-renders.
 */
function cancelTrainingReview() {
  _trainingReviews
    .filter((review) => review._pendingName)
    .forEach((review) => {
      const url = `${API}/uploads/${encodeURIComponent(review._pendingName)}`;
      fetch(url, { method: 'DELETE' }).catch(() => {});
    });

  _trainingReviews = [];
  renderTrainingReview();
}
/**
 * Commits every reviewed file to the corpus, one at a time. For each review
 * it posts the approved metadata to /training/upload, follows the returned
 * task via pollTrainingProgress, and shows the outcome in its own status row.
 * Afterwards the review list is cleared, the corpus table is reloaded and a
 * summary toast is shown.
 *
 * @returns {Promise<void>}
 */
async function uploadAllTraining() {
  const btn = document.getElementById('trainingUploadBtn');
  btn.disabled = true;
  let failed = 0;
  try {
    const tasksCard = document.getElementById('trainingTasksCard');
    const tasksList = document.getElementById('trainingTasksList');
    tasksCard.style.display = '';
    tasksList.innerHTML = '';
    for (let i = 0; i < _trainingReviews.length; i++) {
      const r = _trainingReviews[i];
      const row = document.createElement('div');
      row.className = 'training-task';
      row.innerHTML = `<span class="mini-spinner"></span> ${esc(r._originalName)} — ${esc(r.decision_number || '—')}`;
      tasksList.appendChild(row);
      try {
        const res = await fetch(API + '/training/upload', {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            filename: r._pendingName,
            decision_number: r.decision_number,
            decision_date: r.decision_date,
            subject_categories: r.subject_categories,
            // Title is the original file name without its extension.
            title: r._originalName.replace(/\.[^.]+$/, ''),
          }),
        });
        if (!res.ok) {
          const err = await res.text();
          throw new Error(err);
        }
        const { task_id } = await res.json();
        // A completed task without a result payload must not be shown as a failure.
        const result = (await pollTrainingProgress(task_id, row, r._originalName)) || {};
        const chars = Number(result.chars || 0).toLocaleString('he-IL');
        const chunks = Number(result.chunks || 0);
        row.innerHTML = `<span style="color:#0a0">✓</span> ${esc(r._originalName)} — ${chars} תווים, ${chunks} קטעים`;
      } catch (e) {
        failed++;
        const message = String((e && e.message) || e).substring(0, 200);
        row.innerHTML = `<span style="color:#c00">✗</span> ${esc(r._originalName)} — ${esc(message)}`;
      }
    }
    _trainingReviews = [];
    renderTrainingReview();
    loadCorpusList();
  } finally {
    // Never leave the button stuck disabled, even on an unexpected error.
    btn.disabled = false;
  }
  // Only claim success when every file actually made it into the corpus.
  if (failed) {
    toast(`ההעלאה הושלמה עם ${failed} שגיאות`, 'warn');
  } else {
    toast('ההעלאה הושלמה', 'success');
  }
}
// Display labels for the step / status values reported in training progress events.
const TRAINING_STEP_LABELS = {
  // waiting
  queued: 'בתור',

  // processing stages
  proofreading: 'הגהה',
  saving: 'שמירה',
  corpus: 'קליטה לקורפוס',
  chunking: 'פיצול לקטעים',
  embedding: 'יצירת embeddings',

  // terminal states
  completed: 'הושלם',
  failed: 'נכשל',
};
/**
 * Follows a training task over server-sent events and mirrors its current
 * step into the given status row.
 *
 * @param {string} taskId task id returned by /training/upload
 * @param {{innerHTML: string}} row status row element to update
 * @param {string} name file name shown in the row
 * @returns {Promise<*>} resolves with the event's `result` when the task
 *   reports status "completed"; rejects when it reports "failed" or when the
 *   event stream errors
 */
function pollTrainingProgress(taskId, row, name) {
  return new Promise((resolve, reject) => {
    const es = new EventSource(API + '/progress/' + taskId);
    es.onmessage = (e) => {
      let data;
      try {
        data = JSON.parse(e.data);
      } catch (parseError) {
        // Skip a malformed frame explicitly and keep listening, rather than
        // letting the handler throw an uncaught error.
        return;
      }
      if (!data || typeof data !== 'object') return;
      const label = TRAINING_STEP_LABELS[data.step] || TRAINING_STEP_LABELS[data.status] || data.status || '';
      row.innerHTML = `<span class="mini-spinner"></span> ${esc(name)} — ${esc(label)}...`;
      if (data.status === 'completed') {
        es.close();
        resolve(data.result);
      } else if (data.status === 'failed') {
        es.close();
        reject(new Error(data.error || 'Processing failed'));
      }
    };
    es.onerror = () => {
      // Close explicitly so the browser does not keep reconnecting to a dead task.
      es.close();
      reject(new Error('connection lost'));
    };
  });
}
// ── Style Analysis (patterns) ────────────────────────────
// Display headings for the pattern types returned by /training/patterns.
const PATTERN_TYPE_LABELS = {
  // formulas and phrases
  opening_formula: 'נוסחאות פתיחה',
  closing_formula: 'נוסחאות סיום',
  transition: 'ביטויי מעבר',
  characteristic_phrase: 'ביטויים אופייניים',

  // structure and reasoning
  argument_flow: 'זרימת טיעון',
  analysis_structure: 'מבנה ניתוח',
  evidence_handling: 'טיפול בראיות',
  citation_style: 'סגנון ציטוט',
};
/**
 * Fetches the extracted style patterns and renders them grouped by type into
 * #patternsList, updating the total in #patternsCount. Known types are shown
 * in a fixed order; any other type returned by the server is appended after
 * them. Load failures are rendered inside the container.
 *
 * @returns {Promise<void>}
 */
async function loadStylePatterns() {
  const container = document.getElementById('patternsList');
  const count = document.getElementById('patternsCount');
  try {
    const res = await fetch(API + '/training/patterns');
    // Without this check an error body would be treated as an empty corpus
    // ("undefined" total plus the "no patterns yet" message).
    if (!res.ok) {
      throw new Error((await res.text()).substring(0, 200) || `HTTP ${res.status}`);
    }
    const data = await res.json();
    const byType = data.by_type || {};
    count.textContent = `${data.total || 0} דפוסים`;
    if (!data.total) {
      container.innerHTML = '<div class="empty">אין דפוסים עדיין. לחץ "נתח קורפוס" כדי לחלץ דפוסים מההחלטות הקיימות.</div>';
      return;
    }
    const typeOrder = [
      'opening_formula', 'transition', 'characteristic_phrase',
      'argument_flow', 'analysis_structure', 'evidence_handling',
      'citation_style', 'closing_formula',
    ];
    const types = typeOrder.filter(t => byType[t]);
    // Types the client does not know about still get rendered, after the known ones.
    Object.keys(byType).forEach(t => { if (!types.includes(t)) types.push(t); });
    container.innerHTML = types.map(type => `
      <details class="pattern-group" open>
        <summary>
          <strong>${esc(PATTERN_TYPE_LABELS[type] || type)}</strong>
          <span class="pattern-count">${byType[type].length}</span>
        </summary>
        <div class="pattern-list">
          ${byType[type].map(p => `
            <div class="pattern-item">
              <div class="pattern-text">${esc(p.pattern_text)}</div>
              ${p.context ? `<div class="pattern-context">${esc(p.context)}</div>` : ''}
              <div class="pattern-meta">
                <span>תדירות: ${esc(String(p.frequency))}</span>
                ${p.examples && p.examples.length ? `<span>· ${p.examples.length} דוגמאות</span>` : ''}
              </div>
            </div>
          `).join('')}
        </div>
      </details>
    `).join('');
  } catch (e) {
    container.innerHTML = `<div class="empty">שגיאה בטעינה: ${esc(e.message)}</div>`;
  }
}
/**
 * Handler of the "analyze corpus" button: asks the server to start a style
 * analysis and then hands over to the status poller. A 409 response is
 * reported as "already running" and polled like a fresh start; any other
 * non-OK response is reported as an error and re-enables the button.
 *
 * @returns {Promise<void>}
 */
async function runStyleAnalysis() {
  const button = document.getElementById('analyzeStyleBtn');
  button.disabled = true;

  try {
    const response = await fetch(`${API}/training/analyze-style`, { method: 'POST' });
    const alreadyRunning = response.status === 409;

    if (!alreadyRunning && !response.ok) {
      throw new Error(await response.text());
    }

    if (alreadyRunning) {
      toast('ניתוח כבר רץ ברקע', 'warn');
    } else {
      toast('ניתוח סגנון התחיל — 2-5 דקות', 'success');
    }

    // From here on the poller owns the button state.
    pollStyleAnalysisStatus();
  } catch (failure) {
    toast(`שגיאה: ${failure.message}`, 'error');
    button.disabled = false;
  }
}
/**
 * Reads the style-analysis status once and updates the analyze button.
 * While the server reports `running`, the button shows a spinner with the
 * elapsed time and another poll is scheduled in 3 seconds. Once it is no
 * longer running the button is restored and an error or completion toast is
 * shown (completion also reloads the pattern report).
 *
 * @returns {Promise<void>}
 */
async function pollStyleAnalysisStatus() {
  const btn = document.getElementById('analyzeStyleBtn');
  try {
    const res = await fetch(API + '/training/analyze-style/status');
    if (!res.ok) throw new Error(await res.text());
    const state = await res.json();
    // This function is started both on page entry and after the analyze
    // button is pressed. Cancel any re-poll already scheduled so repeated
    // entries never stack up parallel 3-second polling chains.
    clearTimeout(pollStyleAnalysisStatus._timer);
    pollStyleAnalysisStatus._timer = undefined;
    if (state.running) {
      btn.disabled = true;
      // Coerce to a number: the value is interpolated into innerHTML.
      btn.innerHTML = `<span class="mini-spinner"></span> מנתח... ${Number(state.elapsed) || 0}s`;
      pollStyleAnalysisStatus._timer = setTimeout(pollStyleAnalysisStatus, 3000);
    } else {
      btn.disabled = false;
      btn.textContent = 'נתח קורפוס';
      if (state.error) {
        toast('ניתוח נכשל: ' + String(state.error).substring(0, 150), 'error');
      } else if (state.result) {
        toast('הניתוח הושלם — הדפוסים עודכנו', 'success');
        loadStylePatterns();
      }
    }
  } catch (e) {
    // Status unavailable: give the user a usable button back instead of a
    // spinner that would never update again.
    btn.disabled = false;
    btn.textContent = 'נתח קורפוס';
  }
}
/**
 * Fetches the corpus listing and renders it as a table into #corpusList,
 * updating the decision count in #corpusCount. Load failures are rendered
 * inside the container.
 *
 * @returns {Promise<void>}
 */
async function loadCorpusList() {
  const container = document.getElementById('corpusList');
  const count = document.getElementById('corpusCount');
  try {
    const res = await fetch(API + '/training/corpus');
    // An error body is not a row list; surface it instead of rendering "undefined".
    if (!res.ok) {
      throw new Error((await res.text()).substring(0, 200) || `HTTP ${res.status}`);
    }
    const rows = await res.json();
    if (!Array.isArray(rows)) throw new Error('unexpected response');
    count.textContent = `${rows.length} החלטות`;
    if (!rows.length) {
      container.innerHTML = '<div class="empty">הקורפוס ריק</div>';
      return;
    }
    container.innerHTML = `
      <table class="corpus-table">
        <thead>
          <tr><th>מספר</th><th>תאריך</th><th>קטגוריות</th><th>תווים</th><th>נוצר</th></tr>
        </thead>
        <tbody>
          ${rows.map(r => `
            <tr>
              <td>${esc(r.decision_number || '—')}</td>
              <td>${esc(r.decision_date || '—')}</td>
              <td>${(r.subject_categories || []).map(c => `<span class="cat-tag">${esc(c)}</span>`).join('')}</td>
              <td>${Number(r.chars || 0).toLocaleString('he-IL')}</td>
              <td>${esc(r.created_at ? String(r.created_at).substring(0, 10) : '—')}</td>
            </tr>
          `).join('')}
        </tbody>
      </table>
    `;
  } catch (e) {
    container.innerHTML = `<div class="empty">שגיאה בטעינה: ${esc(e.message)}</div>`;
  }
}
</script>
</body>
</html>