feat: automatische Content-Pipeline (release → pairs → übersetzen → audio → ready)

- pictures.pipeline_* Spalten + app_settings Tabelle (Migration)
- lib/placeholders.js: Placeholder-Auflösung; TTS spricht keine UUIDs mehr
- lib/pairContent.js: geteilte Pair-Logik (Readiness mit Skip-Optionen)
- lib/generatePairs.js: Claude-Generierung (konfigurierbare Anzahl, nur Nomen/Adjektive bei word-Pairs) + serverseitige Persistenz inkl. object_pairs
- lib/pipeline.js: In-Process-Runner, idempotente Schritte, Boot-Resume
- routes/pipeline.js: release/retry/overview/bundle/settings + Bild-Publish (kaskadiert Fragen/Statements/Pairs/Wörter/Objekte/Bild)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-10 20:52:11 +02:00
parent 29a260e351
commit 6af2428df5
10 changed files with 946 additions and 151 deletions
--- a/src/lib/pipeline.js
+++ b/src/lib/pipeline.js
@@ -0,0 +1,337 @@
+// Automatische Content-Pipeline pro Bild: Pairs generieren → übersetzen → Audio → ready.
+// In-Process-Queue mit einem Worker (rate-limit-freundlich). Jeder Schritt ist idempotent,
+// d.h. ein Resume nach Crash/Redeploy überspringt bereits Erledigtes.
+const { query } = require('../db');
+const { LANGS, fillMissingRow } = require('./translate');
+const { translateWordGroup } = require('./pairContent');
+const { generatePairsForObject, persistPair } = require('./generatePairs');
+const { generateAndStore } = require('../routes/audios');
+
+// Picture ids waiting to be processed; enqueue() appends to it and pump() shifts from the front.
+const queue = [];
+// True while pump() is draining the queue, so that only one drain loop is active at a time.
+let running = false;
+
+function enqueue(pictureId) {
+  if (!queue.includes(pictureId)) queue.push(pictureId);
+  pump();
+}
+
+async function pump() {
+  if (running) return;
+  running = true;
+  try {
+    while (queue.length) {
+      const id = queue.shift();
+      try { await runPicture(id); }
+      catch (err) {
+        console.error(`Pipeline für Bild ${id} fehlgeschlagen:`, err);
+        await setFailed(id, null, err).catch(() => {});
+      }
+    }
+  } finally { running = false; }
+}
+
+// Beim API-Boot: hängengebliebene Läufe (queued/running) wieder aufnehmen.
+async function resumePending() {
+  const r = await query(
+    `SELECT id FROM pictures WHERE pipeline_status IN ('queued', 'running')`);
+  for (const row of r.rows) enqueue(row.id);
+  if (r.rows.length) console.log(`Pipeline: ${r.rows.length} Bild(er) nach Neustart wieder aufgenommen`);
+}
+
+async function getPairsPerObject() {
+  const r = await query(`SELECT value FROM app_settings WHERE key = 'pipeline.pairs_per_object'`);
+  const n = parseInt(r.rows[0]?.value);
+  return Math.min(Math.max(isNaN(n) ? 5 : n, 1), 20);
+}
+
+async function setStep(pictureId, step, progress) {
+  await query(
+    `UPDATE pictures SET pipeline_step = $2, pipeline_progress = $3 WHERE id = $1`,
+    [pictureId, step, JSON.stringify(progress)]);
+}
+
+async function setFailed(pictureId, step, err) {
+  await query(
+    `UPDATE pictures SET pipeline_status='failed', pipeline_step=COALESCE($2, pipeline_step),
+            pipeline_error=$3, pipeline_finished_at=NOW() WHERE id=$1`,
+    [pictureId, step, (err.message || String(err)).slice(0, 1000)]);
+}
+
+// Objekte eines Bildes inkl. zugewiesener Wörter + Selektionen laden.
+async function loadObjects(pictureId) {
+  const objs = (await query(
+    `SELECT o.id, o.status, o.selections
+       FROM object_pictures op JOIN objects o ON o.id = op.object_id
+      WHERE op.picture_id = $1 AND o.status <> 'blocked'
+      ORDER BY o.created_at`, [pictureId])).rows;
+  for (const o of objs) {
+    o.words = (await query(
+      `SELECT w.id, w.titel_de, w.titel_en, w.titel_sv
+         FROM object_words ow JOIN words w ON w.id = ow.word_id
+        WHERE ow.object_id = $1`, [o.id])).rows;
+  }
+  return objs;
+}
+
+// Alle Pairs eines Bildes (über object_pairs), ohne geblockte.
+async function loadPairs(pictureId) {
+  return (await query(
+    `SELECT DISTINCT p.id, p.answer_type, p.status, p.question_id,
+            p.positive_statement_id, p.negative_statement_id
+       FROM object_pairs op
+       JOIN object_pictures pic ON pic.object_id = op.object_id
+       JOIN pairs p ON p.id = op.pair_id
+      WHERE pic.picture_id = $1 AND p.status <> 'blocked'
+      ORDER BY p.id`, [pictureId])).rows;
+}
+
+async function runPicture(pictureId) {
+  // Claim — nur Bilder, die in der Pipeline sind
+  const claim = await query(
+    `UPDATE pictures
+        SET pipeline_status='running', pipeline_error=NULL,
+            pipeline_started_at=COALESCE(pipeline_started_at, NOW()), pipeline_finished_at=NULL
+      WHERE id=$1 AND pipeline_status IN ('queued','running','failed')
+      RETURNING id, picture_link`, [pictureId]);
+  if (!claim.rows.length) return;
+  const picture = claim.rows[0];
+
+  const objects = await loadObjects(pictureId);
+  if (!objects.length) { await setFailed(pictureId, 'pairs', new Error('Bild hat keine Objekte')); return; }
+
+  const progress = { objectsDone: 0, objectsTotal: objects.length, pairsCreated: 0,
+                     translatedPairs: 0, pairsTotal: 0, audiosDone: 0, audiosTotal: 0, incompletePairs: 0 };
+
+  // ── Step 1: Pairs generieren (pro Objekt, skip wenn schon genug) ────────────
+  try {
+    const targetCount = await getPairsPerObject();
+    await setStep(pictureId, 'pairs', progress);
+    const claudeObjects = objects.map(o => ({ id: o.id, words: o.words, selections: o.selections || [] }));
+
+    for (const obj of objects) {
+      const have = parseInt((await query(
+        `SELECT count(*) AS c FROM object_pairs op JOIN pairs p ON p.id = op.pair_id
+          WHERE op.object_id = $1 AND p.status <> 'blocked'`, [obj.id])).rows[0].c);
+      if (have < targetCount) {
+        if (!picture.picture_link) throw new Error('Bild hat keinen picture_link für die KI-Analyse');
+        let pairs;
+        try {
+          pairs = await generatePairsForObject({
+            imageUrl: picture.picture_link, objects: claudeObjects,
+            selectedObjectId: obj.id, count: targetCount - have,
+          });
+        } catch (err) {
+          // Ein Retry bei Parse-/API-Fehlern, dann aufgeben
+          pairs = await generatePairsForObject({
+            imageUrl: picture.picture_link, objects: claudeObjects,
+            selectedObjectId: obj.id, count: targetCount - have,
+          });
+        }
+        for (const p of pairs) {
+          await persistPair(p, obj.id);
+          progress.pairsCreated++;
+          await setStep(pictureId, 'pairs', progress);
+        }
+      }
+      progress.objectsDone++;
+      await setStep(pictureId, 'pairs', progress);
+    }
+  } catch (err) { await setFailed(pictureId, 'pairs', err); return; }
+
+  // ── Step 2: Übersetzen (pro Pair, füllt nur fehlende Sprachen) ──────────────
+  const pairs = await loadPairs(pictureId);
+  progress.pairsTotal = pairs.length;
+  try {
+    await setStep(pictureId, 'translate', progress);
+    for (const p of pairs) {
+      let questionRow = null;
+      if (p.question_id) {
+        questionRow = (await query(
+          `SELECT sentence_de, sentence_en, sentence_sv FROM questions WHERE id = $1`,
+          [p.question_id])).rows[0] || null;
+        await fillMissingRow('questions', p.question_id, ['sentence']);
+      }
+      if (p.answer_type === 'word') {
+        if (p.positive_statement_id)
+          await translateWordGroup(p.positive_statement_id, 'statement_positive_words', questionRow, false);
+        if (p.negative_statement_id)
+          await translateWordGroup(p.negative_statement_id, 'statement_negative_words', questionRow, false);
+      } else {
+        if ((p.answer_type === 'text' || p.answer_type === 'question') && p.positive_statement_id)
+          await fillMissingRow('statements', p.positive_statement_id, ['positive_sentence']);
+        if (p.answer_type === 'question' && p.negative_statement_id)
+          await fillMissingRow('statements', p.negative_statement_id, ['negative_sentence']);
+      }
+      progress.translatedPairs++;
+      await setStep(pictureId, 'translate', progress);
+    }
+  } catch (err) { await setFailed(pictureId, 'translate', err); return; }
+
+  // ── Step 3: Audio für alle Sätze + Wörter des Bildes in allen Sprachen ──────
+  try {
+    const units = await collectAudioUnits(pictureId, pairs);
+    progress.audiosTotal = units.length;
+    progress.audiosDone = units.filter(u => u.hasAudio).length;
+    await setStep(pictureId, 'audio', progress);
+
+    const failures = [];
+    for (const u of units) {
+      if (u.hasAudio) continue;
+      try {
+        await generateWithBackoff(u);
+        progress.audiosDone++;
+      } catch (err) {
+        failures.push(`${u.source_table}/${u.source_field}/${u.language}: ${err.message}`);
+      }
+      await setStep(pictureId, 'audio', progress);
+    }
+    if (failures.length && progress.audiosDone === 0)
+      throw new Error(`Alle Audio-Generierungen fehlgeschlagen: ${failures[0]}`);
+    progress.audioFailures = failures.length;
+  } catch (err) { await setFailed(pictureId, 'audio', err); return; }
+
+  // ── Step 4: Abschluss — vollständige Inhalte auf 'reviewed', Bild auf 'ready'
+  try {
+    let incomplete = 0;
+    for (const p of pairs) {
+      if (await isPairComplete(p)) {
+        if (p.question_id)
+          await query(`UPDATE questions SET status='reviewed' WHERE id=$1 AND status='draft'`, [p.question_id]);
+        const stmtIds = [p.positive_statement_id, p.negative_statement_id].filter(Boolean);
+        if (stmtIds.length)
+          await query(`UPDATE statements SET status='reviewed' WHERE id = ANY($1) AND status='draft'`, [stmtIds]);
+        await query(`UPDATE pairs SET status='reviewed' WHERE id=$1 AND status='draft'`, [p.id]);
+      } else incomplete++;
+    }
+    progress.incompletePairs = incomplete;
+    await query(
+      `UPDATE objects SET status='reviewed'
+        WHERE id IN (SELECT object_id FROM object_pictures WHERE picture_id = $1) AND status='draft'`, [pictureId]);
+    await query(
+      `UPDATE pictures SET status = CASE WHEN status='uploaded' THEN 'reviewed' ELSE status END,
+              pipeline_status='ready', pipeline_step=NULL, pipeline_progress=$2, pipeline_finished_at=NOW()
+        WHERE id=$1`, [pictureId, JSON.stringify(progress)]);
+  } catch (err) { await setFailed(pictureId, 'finish', err); return; }
+}
+
+// Alle 3 Sprachen in den genutzten Feldern des Pairs gefüllt? (Spiegel des Review-Checks)
+async function isPairComplete(p) {
+  if (p.question_id) {
+    const q = (await query(
+      `SELECT sentence_de, sentence_en, sentence_sv FROM questions WHERE id=$1`, [p.question_id])).rows[0];
+    if (!q || LANGS.some(l => !(q[`sentence_${l}`] || '').trim())) return false;
+  }
+  if (p.answer_type === 'word') {
+    for (const [stmtId, link] of [[p.positive_statement_id, 'statement_positive_words'],
+                                  [p.negative_statement_id, 'statement_negative_words']]) {
+      if (!stmtId) continue;
+      const ws = (await query(
+        `SELECT w.titel_de, w.titel_en, w.titel_sv FROM ${link} lw JOIN words w ON w.id = lw.word_id
+          WHERE lw.statement_id = $1`, [stmtId])).rows;
+      if (ws.some(w => LANGS.some(l => !(w[`titel_${l}`] || '').trim()))) return false;
+    }
+  } else if (p.answer_type === 'text' || p.answer_type === 'question') {
+    if (p.positive_statement_id) {
+      const s = (await query(
+        `SELECT positive_sentence_de, positive_sentence_en, positive_sentence_sv FROM statements WHERE id=$1`,
+        [p.positive_statement_id])).rows[0];
+      if (!s || LANGS.some(l => !(s[`positive_sentence_${l}`] || '').trim())) return false;
+    }
+    if (p.answer_type === 'question' && p.negative_statement_id) {
+      const s = (await query(
+        `SELECT negative_sentence_de, negative_sentence_en, negative_sentence_sv FROM statements WHERE id=$1`,
+        [p.negative_statement_id])).rows[0];
+      const hasAny = s && LANGS.some(l => (s[`negative_sentence_${l}`] || '').trim());
+      if (hasAny && LANGS.some(l => !(s[`negative_sentence_${l}`] || '').trim())) return false;
+    }
+  }
+  return true;
+}
+
+// Audio-Einheiten des Bildes: Frage-/Statement-Sätze + verlinkte Wörter × Sprachen.
+// Nur Felder, die in ALLEN Sprachen Text haben (Regel wie audios.js computeUnits).
+async function collectAudioUnits(pictureId, pairs) {
+  const units = [];
+  const questionIds = [...new Set(pairs.map(p => p.question_id).filter(Boolean))];
+  const stmtIds = [...new Set(pairs.flatMap(p => [p.positive_statement_id, p.negative_statement_id]).filter(Boolean))];
+
+  // Wörter: über die Statements der Pairs (word-Typ) + object_words des Bildes
+  const wordIds = new Set();
+  if (stmtIds.length) {
+    for (const link of ['statement_positive_words', 'statement_negative_words']) {
+      const r = await query(`SELECT word_id FROM ${link} WHERE statement_id = ANY($1)`, [stmtIds]);
+      r.rows.forEach(x => wordIds.add(x.word_id));
+    }
+  }
+  const ow = await query(
+    `SELECT ow.word_id FROM object_words ow
+      JOIN object_pictures op ON op.object_id = ow.object_id
+     WHERE op.picture_id = $1`, [pictureId]);
+  ow.rows.forEach(x => wordIds.add(x.word_id));
+
+  const sources = [];
+  if (questionIds.length) {
+    const r = await query(
+      `SELECT id, sentence_de, sentence_en, sentence_sv FROM questions WHERE id = ANY($1)`, [questionIds]);
+    r.rows.forEach(row => sources.push({ table: 'questions', row, fields: ['sentence'] }));
+  }
+  if (stmtIds.length) {
+    const r = await query(
+      `SELECT id, positive_sentence_de, positive_sentence_en, positive_sentence_sv,
+              negative_sentence_de, negative_sentence_en, negative_sentence_sv
+         FROM statements WHERE id = ANY($1)`, [stmtIds]);
+    r.rows.forEach(row => sources.push({ table: 'statements', row, fields: ['positive_sentence', 'negative_sentence'] }));
+  }
+  if (wordIds.size) {
+    const r = await query(
+      `SELECT id, titel_de, titel_en, titel_sv FROM words WHERE id = ANY($1) AND status <> 'blocked'`,
+      [[...wordIds]]);
+    r.rows.forEach(row => sources.push({ table: 'words', row, fields: ['titel'] }));
+  }
+
+  // Vorhandene Audios in einem Rutsch laden
+  const sourceIds = sources.map(s => s.row.id);
+  const have = new Set();
+  if (sourceIds.length) {
+    const a = await query(
+      `SELECT source_table, source_id, source_field, language FROM audios
+        WHERE source_id = ANY($1) AND status <> 'blocked'`, [sourceIds]);
+    a.rows.forEach(x => have.add(`${x.source_table}|${x.source_id}|${x.source_field}|${x.language}`));
+  }
+
+  for (const { table, row, fields } of sources) {
+    for (const f of fields) {
+      const allFilled = LANGS.every(l => (row[`${f}_${l}`] || '').trim());
+      if (!allFilled) continue;
+      for (const l of LANGS) {
+        units.push({
+          source_table: table, source_id: row.id, source_field: f, language: l,
+          text: (row[`${f}_${l}`] || '').trim(),
+          hasAudio: have.has(`${table}|${row.id}|${f}|${l}`),
+        });
+      }
+    }
+  }
+  return units;
+}
+
+// ElevenLabs mit Backoff bei Rate-Limit (429).
+async function generateWithBackoff(u) {
+  const delays = [2000, 8000, 30000];
+  for (let attempt = 0; ; attempt++) {
+    try {
+      return await generateAndStore({
+        text: u.text, language: u.language,
+        source_table: u.source_table, source_id: u.source_id, source_field: u.source_field,
+      });
+    } catch (err) {
+      if (err.status === 429 && attempt < delays.length) {
+        await new Promise(r => setTimeout(r, delays[attempt]));
+        continue;
+      }
+      throw err;
+    }
+  }
+}
+
+// Public API of this module: only enqueue and resumePending are exported.
+module.exports = { enqueue, resumePending };