feat: KI-Review-Schritt in der Pipeline (Korrekturlesen vor Audio)
Alle Pairs eines Bildes (de/en/sv) gehen zusammen mit dem Bild an Sonnet zur Prüfung von Rechtschreibung, Übersetzungs-Konsistenz und Plausibilität. Korrekturen werden vor der Audio-Erzeugung angewendet; vorhandene Audios korrigierter Zellen werden invalidiert. Review-Fehler sind nicht fatal. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
// Automatische Content-Pipeline pro Bild: Pairs generieren → übersetzen → Audio → ready.
|
||||
// Automatische Content-Pipeline pro Bild: Pairs generieren → übersetzen → KI-Review → Audio → ready.
|
||||
// In-Process-Queue mit einem Worker (rate-limit-freundlich). Jeder Schritt ist idempotent,
|
||||
// d.h. ein Resume nach Crash/Redeploy überspringt bereits Erledigtes.
|
||||
const { query } = require('../db');
|
||||
const { LANGS, fillMissingRow } = require('./translate');
|
||||
const { translateWordGroup } = require('./pairContent');
|
||||
const { generatePairsForObject, persistPair } = require('./generatePairs');
|
||||
const { reviewPicturePairs } = require('./reviewPairs');
|
||||
const { generateAndStore, describeError } = require('../routes/audios');
|
||||
|
||||
const queue = [];
|
||||
@@ -153,6 +154,25 @@ async function runPicture(pictureId) {
|
||||
await setStep(pictureId, 'translate', progress);
|
||||
}
|
||||
|
||||
// ── Step 2.5: KI-Review — alle Pairs + Bild an Sonnet zum Korrekturlesen ────
|
||||
// (Rechtschreibung, Übersetzungs-Konsistenz, Plausibilität zum Bild). Korrekturen
|
||||
// landen vor der Audio-Erzeugung in der DB; Fehler sind wie beim Übersetzen nicht
|
||||
// fatal — Audio läuft trotzdem, der Lauf wird nicht abgebrochen.
|
||||
progress.reviewedPairs = 0;
|
||||
progress.correctionsApplied = 0;
|
||||
progress.reviewFailures = 0;
|
||||
await setStep(pictureId, 'review', progress);
|
||||
try {
|
||||
await reviewPicturePairs({
|
||||
pictureId, pictureUrl: picture.picture_link, pairs, progress,
|
||||
onProgress: () => setStep(pictureId, 'review', progress),
|
||||
});
|
||||
} catch (err) {
|
||||
progress.reviewFailures++;
|
||||
console.error(`Review-Fehler bei Bild ${pictureId}:`, err.message);
|
||||
}
|
||||
await setStep(pictureId, 'review', progress);
|
||||
|
||||
// ── Step 3: Audio für alle Sätze + Wörter des Bildes in allen Sprachen ──────
|
||||
try {
|
||||
const units = await collectAudioUnits(pictureId, pairs);
|
||||
|
||||
224
src/lib/reviewPairs.js
Normal file
224
src/lib/reviewPairs.js
Normal file
@@ -0,0 +1,224 @@
|
||||
// KI-Review der Pipeline: alle Pairs eines Bildes (alle Sprachen) + das Bild selbst
|
||||
// gehen an Sonnet zum Korrekturlesen (Rechtschreibung, Übersetzungs-Konsistenz,
|
||||
// Plausibilität zum Bild). Korrekturen werden vor der Audio-Erzeugung in die DB
|
||||
// geschrieben; bereits vorhandene Audios der korrigierten Zellen werden gelöscht,
|
||||
// damit Step 3 sie mit dem neuen Text neu erzeugt.
|
||||
const { query } = require('../db');
|
||||
const { callClaude, tokenize, LANGS } = require('./translate');
|
||||
const { deleteFile, keyFromUrl } = require('../s3');
|
||||
|
||||
const REVIEW_MODEL = process.env.REVIEW_MODEL || process.env.TRANSLATE_MODEL || 'claude-sonnet-4-5';
|
||||
const BATCH_SIZE = 15; // Pairs pro Claude-Call (Bild wird je Batch mitgeschickt)
|
||||
|
||||
const TOKEN_RE = /⟦(PH\d+):([^⟧]*)⟧/g;
|
||||
|
||||
// Refs der Form "q:<uuid>:sentence_de" — kompakt im Prompt, eindeutig in der itemMap.
|
||||
const TABLE_PREFIX = { questions: 'q', statements: 's', words: 'w' };
|
||||
|
||||
/**
 * Builds the review item for one text cell.
 *
 * The item carries the prompt reference (`<table prefix>:<id>:<field>_<lang>`),
 * the coordinates needed to write the cell back (table, id, column) and the
 * tokenized form of the text as returned by `tokenize`.
 *
 * @param {string} table - source table name; looked up in TABLE_PREFIX.
 * @param {*} id - row id of the cell.
 * @param {string} field - column base name without language suffix.
 * @param {string} lang - language suffix of the column.
 * @param {string} text - cell text handed to `tokenize`.
 * @returns {object} item with ref, table, id, column, field, lang, tokenized, tokens.
 */
function makeItem(table, id, field, lang, text) {
  const parsed = tokenize(text);
  const column = `${field}_${lang}`;
  const prefix = TABLE_PREFIX[table];
  return {
    ref: `${prefix}:${id}:${column}`,
    table,
    id,
    column,
    field,
    lang,
    tokenized: parsed.tokenized,
    tokens: parsed.tokens,
  };
}
|
||||
|
||||
/**
 * Loads every non-empty text cell belonging to the given pairs, plus the
 * titles of the words linked to the picture's objects, and renders them as
 * prompt blocks.
 *
 * The returned `itemMap` (ref → item) doubles as the whitelist of cells a
 * correction is allowed to touch: only refs that were put into a prompt line
 * are registered in it.
 *
 * @param {*} pictureId - picture whose object words are included.
 * @param {Array<object>} pairs - rows providing question_id,
 *   positive_statement_id, negative_statement_id and answer_type.
 * @returns {Promise<{pairBlocks: string[], wordBlock: (string|null), itemMap: Map<string, object>}>}
 */
async function loadReviewItems(pictureId, pairs) {
  const itemMap = new Map();

  // Registers one cell and returns its item; empty cells yield null.
  const register = (table, row, field, lang) => {
    const raw = row[`${field}_${lang}`] || '';
    const text = raw.trim();
    if (!text) return null;
    const candidate = makeItem(table, row.id, field, lang, text);
    const known = itemMap.get(candidate.ref);
    if (known) return known;
    itemMap.set(candidate.ref, candidate);
    return candidate;
  };

  // One prompt line per language in which the cell is filled.
  const renderLines = (table, row, field) => {
    const rendered = [];
    for (const lang of LANGS) {
      const item = register(table, row, field, lang);
      if (item) rendered.push(` ${item.ref} [${item.lang}]: "${item.tokenized}"`);
    }
    return rendered;
  };

  const indexById = (rows) => {
    const index = new Map();
    for (const row of rows) index.set(row.id, row);
    return index;
  };

  // Collect the distinct question / statement ids referenced by the pairs.
  const questionIdSet = new Set();
  const statementIdSet = new Set();
  for (const pair of pairs) {
    if (pair.question_id) questionIdSet.add(pair.question_id);
    if (pair.positive_statement_id) statementIdSet.add(pair.positive_statement_id);
    if (pair.negative_statement_id) statementIdSet.add(pair.negative_statement_id);
  }
  const questionIds = [...questionIdSet];
  const stmtIds = [...statementIdSet];

  let questions = new Map();
  if (questionIds.length) {
    const res = await query(
      `SELECT id, sentence_de, sentence_en, sentence_sv FROM questions WHERE id = ANY($1)`, [questionIds]);
    questions = indexById(res.rows);
  }

  let statements = new Map();
  if (stmtIds.length) {
    const res = await query(
      `SELECT id, positive_sentence_de, positive_sentence_en, positive_sentence_sv,
 negative_sentence_de, negative_sentence_en, negative_sentence_sv
 FROM statements WHERE id = ANY($1)`, [stmtIds]);
    statements = indexById(res.rows);
  }

  // Words come from two places: the statement link tables and the words of
  // the objects shown on the picture.
  const stmtWords = new Map(); // statementId → [wordId]
  const wordIds = new Set();
  if (stmtIds.length) {
    for (const link of ['statement_positive_words', 'statement_negative_words']) {
      const res = await query(`SELECT statement_id, word_id FROM ${link} WHERE statement_id = ANY($1)`, [stmtIds]);
      for (const { statement_id: statementId, word_id: wordId } of res.rows) {
        const linked = stmtWords.get(statementId);
        if (linked) {
          linked.push(wordId);
        } else {
          stmtWords.set(statementId, [wordId]);
        }
        wordIds.add(wordId);
      }
    }
  }

  const objectWordIds = new Set();
  const objectWordRes = await query(
    `SELECT ow.word_id FROM object_words ow
 JOIN object_pictures op ON op.object_id = ow.object_id
 WHERE op.picture_id = $1`, [pictureId]);
  for (const { word_id: wordId } of objectWordRes.rows) {
    objectWordIds.add(wordId);
    wordIds.add(wordId);
  }

  let words = new Map();
  if (wordIds.size) {
    const res = await query(
      `SELECT id, titel_de, titel_en, titel_sv FROM words WHERE id = ANY($1) AND status <> 'blocked'`,
      [[...wordIds]]);
    words = indexById(res.rows);
  }

  // Assemble one prompt block per pair; pairs without any text line are dropped.
  const pairBlocks = [];
  for (const pair of pairs) {
    const header = `PAIR (answer_type: ${pair.answer_type}):`;
    const body = [];

    const question = pair.question_id && questions.get(pair.question_id);
    if (question) body.push(...renderLines('questions', question, 'sentence'));

    const sides = [
      [pair.positive_statement_id, 'positive_sentence'],
      [pair.negative_statement_id, 'negative_sentence'],
    ];
    for (const [stmtId, label] of sides) {
      const statement = stmtId && statements.get(stmtId);
      if (!statement) continue;
      if (pair.answer_type !== 'word') {
        body.push(...renderLines('statements', statement, label));
        continue;
      }
      // Word pairs list the titles of the words linked to the statement
      // instead of the statement sentence.
      for (const wordId of stmtWords.get(stmtId) || []) {
        const word = words.get(wordId);
        if (word) body.push(...renderLines('words', word, 'titel'));
      }
    }

    if (body.length > 0) pairBlocks.push([header, ...body].join('\n'));
  }

  const wordLines = [];
  for (const wordId of objectWordIds) {
    const word = words.get(wordId);
    if (word) wordLines.push(...renderLines('words', word, 'titel'));
  }
  const wordBlock = wordLines.length
    ? `BILD-WÖRTER (Vokabeln zum Bild):\n${wordLines.join('\n')}`
    : null;

  return { pairBlocks, wordBlock, itemMap };
}
|
||||
|
||||
/**
 * Builds the system prompt and the user content (image + text) for one
 * review call.
 *
 * @param {string} pictureUrl - URL placed into the image content part.
 * @param {string[]} blocks - pre-rendered prompt blocks; joined with blank lines.
 * @returns {{system: string, user: Array<object>}} system prompt and a
 *   two-part user content array: the image reference followed by the text.
 */
function buildReviewPrompt(pictureUrl, blocks) {
  // Fragments are joined with single spaces to form one running paragraph.
  const system = [
    'Du bist Lektor für eine Kinder-Sprachlern-App (Deutsch, Englisch, Schwedisch).',
    'Du prüfst Lerninhalte zu einem Bild auf (a) Rechtschreibung und Grammatik je Sprache,',
    '(b) korrekte und konsistente Übersetzung zwischen Deutsch/Englisch/Schwedisch — die Sprachfassungen',
    'einer Zeile müssen dieselbe Bedeutung haben, (c) Plausibilität zum Bild.',
    'Korrigiere NUR echte Fehler, behalte Stil und Länge bei.',
    'Antworte AUSSCHLIESSLICH mit gültigem JSON, ohne Markdown, ohne Erklärungen.',
  ].join(' ');

  const intro = [
    'Prüfe die folgenden Inhalte zum beigefügten Bild. Jede Zeile hat eine Referenz (ref),',
    'eine Sprache und den Text.',
  ].join(' ');

  // Tells the model how to treat the ⟦PHn:…⟧ placeholders.
  const tokenRule = [
    'WICHTIG: Tokens der Form ⟦PHn:wort⟧ sind geschützte Platzhalter. Du darfst das Wort INNERHALB',
    'des Tokens korrigieren, aber das Token-Format muss exakt erhalten bleiben (⟦PHn:wort⟧).',
    'Kein Token darf gelöscht, verdoppelt oder erfunden werden.',
  ].join(' ');

  const answerFormat = [
    'Antwort-Format — NUR Zeilen, die wirklich einen Fehler enthalten (sonst leeres Array):',
    '{"corrections":[{"ref":"<ref>","corrected":"<korrigierter Text>"}]}',
  ].join('\n');

  // Sections are separated by one blank line each.
  const text = [intro, tokenRule, blocks.join('\n\n'), answerFormat].join('\n\n');

  const imagePart = { type: 'image', source: { type: 'url', url: pictureUrl } };
  const textPart = { type: 'text', text };
  return { system, user: [imagePart, textPart] };
}
|
||||
|
||||
/**
 * Validates a corrected text returned by the model and converts its
 * ⟦PHn:label⟧ tokens back into the `{{label.type:uuid}}` placeholder form.
 *
 * A correction is rejected when
 *  - it is not a non-blank string,
 *  - the multiset of token keys differs from the item's original tokens,
 *  - stray token/placeholder delimiters remain outside of tokens, or
 *  - a label inside a token contains delimiter characters (`{`, `}`, `⟦`),
 *    which would produce a malformed placeholder after detokenizing.
 *
 * An empty label falls back to the token's `sourceLabel`.
 *
 * @param {{tokens: Array<{key: string, sourceLabel: string, type: string, uuid: string}>}} item
 *   review item whose `tokens` describe the placeholders of the original text.
 * @param {*} corrected - corrected text as delivered by the model (untrusted).
 * @returns {{ok: true, detokenized: string}|{ok: false, reason: string}}
 */
function validateCorrection(item, corrected) {
  if (typeof corrected !== 'string' || !corrected.trim()) return { ok: false, reason: 'leer' };

  const matches = [...corrected.matchAll(TOKEN_RE)];

  // Compare sorted key lists so that dropped, duplicated and invented tokens
  // are all caught by one check.
  const keys = matches.map(m => m[1]).sort();
  const expected = item.tokens.map(t => t.key).sort();
  if (keys.length !== expected.length || keys.some((k, i) => k !== expected[i]))
    return { ok: false, reason: 'Platzhalter-Tokens verändert' };

  const stripped = corrected.replace(TOKEN_RE, '');
  if (/[⟦⟧]|\{\{|\}\}/.test(stripped)) return { ok: false, reason: 'Fragment im Text' };

  // Labels come from the model; delimiter characters inside a label would
  // break the {{label.type:uuid}} syntax once written back.
  const labels = new Map();
  for (const m of matches) {
    const label = m[2].trim();
    if (/[⟦{}]/.test(label)) return { ok: false, reason: 'Ungültiges Token-Label' };
    labels.set(m[1], label); // last occurrence of a key wins
  }

  // First token per key wins, matching a left-to-right pass over item.tokens.
  const tokensByKey = new Map();
  for (const t of item.tokens) {
    if (!tokensByKey.has(t.key)) tokensByKey.set(t.key, t);
  }

  // A replacer function is used on purpose: with a replacement *string*,
  // sequences such as "$&" or "$$" in a label would be expanded by replace().
  const out = corrected.replace(TOKEN_RE, (whole, key) => {
    const t = tokensByKey.get(key);
    const label = labels.get(key) || t.sourceLabel;
    return `{{${label}.${t.type}:${t.uuid}}}`;
  });
  return { ok: true, detokenized: out.trim() };
}
|
||||
|
||||
/**
 * Removes all stored audios of one text cell so that a later step can
 * regenerate them from the corrected text.
 *
 * For every matching `audios` row the stored file is deleted (when a storage
 * key can be derived from `audio_link`) and the row itself is removed.
 * File deletion is best-effort: a failure is logged and the row is still
 * deleted, so it does not block regeneration.
 *
 * @param {string} table - value matched against `source_table`.
 * @param {*} id - value matched against `source_id`.
 * @param {string} field - value matched against `source_field`.
 * @param {string} lang - value matched against `language`.
 * @returns {Promise<void>}
 */
async function invalidateAudio(table, id, field, lang) {
  const r = await query(
    `SELECT id, audio_link FROM audios
 WHERE source_table=$1 AND source_id=$2 AND source_field=$3 AND language=$4`,
    [table, id, field, lang]);
  for (const row of r.rows) {
    const k = keyFromUrl(row.audio_link);
    if (k) {
      try {
        await deleteFile(k);
      } catch (err) {
        // Keep going, but leave a trace: otherwise orphaned files would
        // accumulate in storage without anyone noticing.
        console.warn(`Review: Audio-Datei ${k} konnte nicht gelöscht werden:`, err && err.message);
      }
    }
    await query(`DELETE FROM audios WHERE id = $1`, [row.id]);
  }
}
|
||||
|
||||
/**
 * Writes a corrected text into the item's cell and then drops the audios
 * stored for that cell.
 *
 * Table and column names are interpolated into the SQL text, so `item` must
 * originate from the internally built item map, never from model output.
 *
 * @param {{table: string, column: string, id: *, field: string, lang: string}} item
 * @param {string} newText - text to store in the cell.
 * @returns {Promise<void>}
 */
async function applyCorrection(item, newText) {
  const { table, column, id, field, lang } = item;
  const sql = `UPDATE ${table} SET ${column} = $1 WHERE id = $2`;
  await query(sql, [newText, id]);
  await invalidateAudio(table, id, field, lang);
}
|
||||
|
||||
/**
 * Main entry point: reviews all pairs of a picture in batches and applies
 * the accepted corrections.
 *
 * Failures of loading the review items or of a single batch do not reject
 * the returned promise; they are counted in `progress.reviewFailures` and
 * logged, and the remaining batches are still processed. (An error thrown
 * by the caller-supplied `onProgress` callback is not intercepted.)
 *
 * Progress counters on `progress` are updated in place:
 *  - `reviewedPairs` advances per batch and equals `pairs.length` once the
 *    last batch is done, even when pairs without text were left out,
 *  - `correctionsApplied` counts corrections written to the DB,
 *  - `reviewFailures` counts failed loads/batches.
 * Counters that are missing on entry are initialised to 0.
 *
 * @param {object} args
 * @param {*} args.pictureId - picture being reviewed.
 * @param {string} args.pictureUrl - image URL sent along with every batch.
 * @param {Array<object>} args.pairs - pairs of the picture.
 * @param {object} args.progress - mutable progress object (see above).
 * @param {Function} [args.onProgress] - awaited after every batch.
 * @returns {Promise<void>}
 */
async function reviewPicturePairs({ pictureId, pictureUrl, pairs, progress, onProgress }) {
  if (!pictureUrl || !pairs.length) return;

  // Guard against NaN counters when the caller did not pre-initialise them.
  progress.reviewedPairs = progress.reviewedPairs || 0;
  progress.correctionsApplied = progress.correctionsApplied || 0;
  progress.reviewFailures = progress.reviewFailures || 0;

  let loaded;
  try {
    loaded = await loadReviewItems(pictureId, pairs);
  } catch (err) {
    // Loading is part of the review; treat its failure like a failed batch
    // instead of rejecting.
    progress.reviewFailures++;
    console.error(`Review-Ladefehler bei Bild ${pictureId}:`, err.message);
    return;
  }
  const { pairBlocks, wordBlock, itemMap } = loaded;
  if (!pairBlocks.length && !wordBlock) return;

  const batches = [];
  for (let i = 0; i < pairBlocks.length; i += BATCH_SIZE)
    batches.push(pairBlocks.slice(i, i + BATCH_SIZE));
  // A picture may have object words but no pair text: still run one batch.
  if (!batches.length) batches.push([]);
  if (wordBlock) batches[0] = [wordBlock, ...batches[0]];

  for (let b = 0; b < batches.length; b++) {
    const batch = batches[b];
    try {
      const { system, user } = buildReviewPrompt(pictureUrl, batch);
      const data = await callClaude({ system, user, maxTokens: 8000, model: REVIEW_MODEL });
      const corrections = Array.isArray(data.corrections) ? data.corrections : [];
      for (const c of corrections) {
        const item = itemMap.get(c && c.ref);
        if (!item) continue; // unknown ref → discard (itemMap is the whitelist)
        const v = validateCorrection(item, c.corrected);
        if (!v.ok) {
          console.warn(`Review: Korrektur für ${c.ref} verworfen (${v.reason})`);
          continue;
        }
        await applyCorrection(item, v.detokenized);
        progress.correctionsApplied++;
      }
    } catch (err) {
      progress.reviewFailures++;
      console.error(`Review-Batch-Fehler bei Bild ${pictureId}:`, err.message);
    }

    // pairBlocks can be shorter than pairs (pairs without text are skipped),
    // so stepping by BATCH_SIZE alone could stop short of pairs.length.
    const isLastBatch = b === batches.length - 1;
    progress.reviewedPairs = isLastBatch
      ? pairs.length
      : Math.min(progress.reviewedPairs + BATCH_SIZE, pairs.length);
    if (onProgress) await onProgress();
  }
}
|
||||
|
||||
module.exports = { reviewPicturePairs, loadReviewItems, buildReviewPrompt, validateCorrection, invalidateAudio };
|
||||
@@ -54,7 +54,7 @@ function detokenize(translated, tokens, labelsFromClaude) {
|
||||
return { text: out, missingTokens: tokens.filter(t => !seen.has(t.key)).map(t => t.key) };
|
||||
}
|
||||
|
||||
async function callClaude({ system, user, maxTokens = 2000 }) {
|
||||
async function callClaude({ system, user, maxTokens = 2000, model = TRANSLATE_MODEL }) {
|
||||
const apiKey = process.env.ANTHROPIC_API_KEY;
|
||||
if (!apiKey) { const e = new Error('ANTHROPIC_API_KEY nicht konfiguriert'); e.status = 500; throw e; }
|
||||
|
||||
@@ -69,7 +69,7 @@ async function callClaude({ system, user, maxTokens = 2000 }) {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json', 'x-api-key': apiKey, 'anthropic-version': '2023-06-01' },
|
||||
body: JSON.stringify({
|
||||
model: TRANSLATE_MODEL, max_tokens: maxTokens, system,
|
||||
model, max_tokens: maxTokens, system,
|
||||
messages: [{ role: 'user', content: user }],
|
||||
}),
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user