feat: Placeholder in der Auto-Generierung + Token-Leak-Fix

- Pair-Generierung markiert Nomen per [surface|lemma]-Markup und löst sie zu
  {{label.o:objectId}} / {{label.w:wordId}} auf (Words werden auto-erstellt)
- Pipeline übersetzt + vertont Placeholder-Wörter aus den Sätzen mit
- translateText halluziniert keine ⟦PHn⟧-Tokens mehr (kein Token-Prompt ohne
  Tokens, defensives Strippen); TTS/Review lösen geleakte Tokens auf
- POST /api/pipeline/repair-tokens repariert bestehende Sätze + Audios

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
2026-06-12 22:43:39 +02:00
parent 25d1e89446
commit 895d7c56a1
6 changed files with 199 additions and 18 deletions

View File

@@ -5,7 +5,8 @@ const { LANGS } = require('../lib/translate');
const { loadPairContext, computeReadiness, loadPairContent } = require('../lib/pairContent');
const { enqueue, loadPairs, collectAudioUnits, generateWithBackoff, translatePair } = require('../lib/pipeline');
const { describeError } = require('./audios');
const { PLACEHOLDER_RE } = require('../lib/placeholders');
const { PLACEHOLDER_RE, TOKEN_RE, stripLeakedTokens } = require('../lib/placeholders');
const { invalidateAudio } = require('../lib/reviewPairs');
// ── Objekt-Wort-Erkennung in Sätzen (für die manuelle Zuweisung beim Review) ──
@@ -241,6 +242,74 @@ router.post('/picture/:id/audio-fill', async (req, res, next) => {
} catch (err) { next(err); }
});
// POST /api/pipeline/repair-tokens — data repair: strips leaked ⟦PHn:…⟧ tokens
// (hallucinated by Claude while translating, before the fix) from all sentences.
// Affected audios are invalidated and immediately regenerated from the repaired text.
//
// Responds with JSON { cells_fixed, audios_regenerated, audios_failed, details }.
// Each `details` entry names the table / id / column that was touched; `fixed`
// carries the repaired text (step 1 only) and `audio_error` is set when the audio
// for that entry could not be regenerated.
router.post('/repair-tokens', async (req, res, next) => {
  try {
    // Reset lastIndex before every test so a stateful (g/y-flagged) regex always
    // starts matching at the beginning of the string.
    const hasToken = v => { TOKEN_RE.lastIndex = 0; return TOKEN_RE.test(v || ''); };

    // Table/column names cannot be bound as query parameters, so anything that is
    // read from the database and spliced into SQL must be a plain identifier.
    const IDENT_RE = /^[A-Za-z_][A-Za-z0-9_]*$/;
    const isIdent = v => typeof v === 'string' && IDENT_RE.test(v);

    const result = { cells_fixed: 0, audios_regenerated: 0, audios_failed: 0, details: [] };

    // Regenerates one audio unit and records the outcome. A failure is counted and
    // attached to `detail` instead of aborting the whole repair run.
    const regenerate = async (unit, detail) => {
      try {
        await generateWithBackoff(unit);
        result.audios_regenerated++;
      } catch (err) {
        result.audios_failed++;
        detail.audio_error = describeError(err);
      }
    };

    const targets = [
      { table: 'questions', fields: ['sentence'] },
      { table: 'statements', fields: ['positive_sentence', 'negative_sentence'] },
      { table: 'words', fields: ['titel'] },
    ];

    // 1) Clean the text cells, then invalidate and regenerate the matching audios.
    for (const t of targets) {
      const cols = t.fields.flatMap(f => LANGS.map(l => `${f}_${l}`));
      const r = await query(
        `SELECT id, ${cols.join(', ')} FROM ${t.table}
         WHERE ${cols.map(c => `${c} LIKE '%⟦PH%'`).join(' OR ')}`);
      for (const row of r.rows) {
        for (const f of t.fields) {
          for (const l of LANGS) {
            const col = `${f}_${l}`;
            if (!hasToken(row[col])) continue;
            // Removing a token can leave doubled whitespace behind; collapse it.
            const fixed = stripLeakedTokens(row[col]).replace(/\s{2,}/g, ' ').trim();
            await query(`UPDATE ${t.table} SET ${col} = $1 WHERE id = $2`, [fixed, row.id]);
            await invalidateAudio(t.table, row.id, f, l);
            result.cells_fixed++;
            const detail = { table: t.table, id: row.id, column: col, fixed };
            // Nothing to voice if the cell consisted of tokens only (same guard as step 2).
            if (fixed) {
              await regenerate(
                { text: fixed, language: l, source_table: t.table, source_id: row.id, source_field: f },
                detail);
            }
            result.details.push(detail);
          }
        }
      }
    }

    // 2) Audios whose voiced text still contains tokens (the cell may already have
    //    been corrected some other way) → invalidate and regenerate from the
    //    current cell text.
    const audios = await query(
      `SELECT id, source_table, source_id, source_field, language FROM audios WHERE text LIKE '%⟦PH%'`);
    for (const a of audios.rows) {
      const detail = { table: 'audios', id: a.id, column: `${a.source_field}_${a.language}` };
      // These values come from stored rows and are interpolated into SQL below;
      // report and skip a malformed row rather than failing the whole request.
      if (!isIdent(a.source_table) || !isIdent(a.source_field) || !isIdent(a.language)) {
        result.audios_failed++;
        detail.audio_error = 'invalid source reference';
        result.details.push(detail);
        continue;
      }
      const r = await query(
        `SELECT ${a.source_field}_${a.language} AS text FROM ${a.source_table} WHERE id = $1`, [a.source_id]);
      const text = (r.rows[0]?.text || '').trim();
      await invalidateAudio(a.source_table, a.source_id, a.source_field, a.language);
      if (text) {
        await regenerate(
          { text, language: a.language, source_table: a.source_table, source_id: a.source_id, source_field: a.source_field },
          detail);
      }
      result.details.push(detail);
    }

    res.json(result);
  } catch (err) { next(err); }
});
// GET /api/pipeline/settings
router.get('/settings', async (req, res, next) => {
try {