feat: Objekt-Wörter deterministisch tokenisieren (Forward + Backfill)
Objekt-Hervorhebung (Chip + Bildregion) hängt an {{label.o:uuid}}-Tokens im
Satz. Bisher entstanden die nur aus dem LLM-Nomen-Markup, das Haiku oft
ausließ -> Objekt blieb un-getokt (z.B. "ryggsäcken"/Rucksack), obwohl korrekt
verlinkt.
- src/lib/objectTagging.js: deterministischer, flexions-toleranter Tagger
(schwed. bestimmte Form -en/-et/...), idempotent, schützt bestehende Tokens.
- generatePairs.resolveNounMarkup: Sweep als Sicherheitsnetz + titel_sv im Lookup.
- pipeline.retagPair/retagObjects: per-Pair Nachtokenisierung (Hybrid-LLM-Fallback
nur für in anderer Sprache bestätigte Objekte), Backfill über Bild/alle Bilder.
- POST /api/pipeline/retag-objects (dry_run/use_llm/picture_id).
Ändert nur Satz-Textfelder -> Audio/Alignment bleiben gültig.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
// Pair-Generierung via Claude (Vision) + serverseitige Persistenz.
|
// Pair-Generierung via Claude (Vision) + serverseitige Persistenz.
|
||||||
// Genutzt von lib/pipeline.js (Automatik) und routes/claude.js (manueller Endpoint).
|
// Genutzt von lib/pipeline.js (Automatik) und routes/claude.js (manueller Endpoint).
|
||||||
const { query } = require('../db');
|
const { query } = require('../db');
|
||||||
|
const { tagObjectWords } = require('./objectTagging');
|
||||||
|
|
||||||
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';
|
const ANTHROPIC_API_URL = 'https://api.anthropic.com/v1/messages';
|
||||||
const GENERATE_MODEL = process.env.GENERATE_MODEL || 'claude-haiku-4-5-20251001';
|
const GENERATE_MODEL = process.env.GENERATE_MODEL || 'claude-haiku-4-5-20251001';
|
||||||
@@ -92,7 +93,7 @@ async function resolveNounMarkup(text, objects, selectedObjectId) {
|
|||||||
(a.id === selectedObjectId ? -1 : 0) - (b.id === selectedObjectId ? -1 : 0));
|
(a.id === selectedObjectId ? -1 : 0) - (b.id === selectedObjectId ? -1 : 0));
|
||||||
for (const obj of ordered) {
|
for (const obj of ordered) {
|
||||||
for (const w of obj.words || []) {
|
for (const w of obj.words || []) {
|
||||||
for (const t of [w.titel_de, w.titel_en]) {
|
for (const t of [w.titel_de, w.titel_en, w.titel_sv]) {
|
||||||
const key = (t || '').trim().toLowerCase();
|
const key = (t || '').trim().toLowerCase();
|
||||||
if (key && !objectByLemma.has(key)) objectByLemma.set(key, obj.id);
|
if (key && !objectByLemma.has(key)) objectByLemma.set(key, obj.id);
|
||||||
}
|
}
|
||||||
@@ -121,6 +122,9 @@ async function resolveNounMarkup(text, objects, selectedObjectId) {
|
|||||||
}
|
}
|
||||||
let out = text;
|
let out = text;
|
||||||
for (const [from, to] of replacements) out = out.split(from).join(to);
|
for (const [from, to] of replacements) out = out.split(from).join(to);
|
||||||
|
// Sicherheitsnetz: Objekt-Wörter, die das Modell NICHT als [..]-Nomen markiert hat,
|
||||||
|
// deterministisch nachtokenisieren (der deutsche Satz wird hier verarbeitet).
|
||||||
|
out = tagObjectWords(out, 'de', objects);
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
151
src/lib/objectTagging.js
Normal file
151
src/lib/objectTagging.js
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
// Deterministisches Tokenisieren von OBJEKT-Wörtern in Sätzen.
|
||||||
|
//
|
||||||
|
// Hintergrund: Objekt-Tokens ({{label.o:objectId}}) entstehen bisher nur aus dem
|
||||||
|
// Nomen-Markup [Oberfläche|Grundform], das das Generierungs-Modell setzen SOLL. Tut es das
|
||||||
|
// nicht (häufig bei kleinen Modellen), fehlt der Token komplett und das Frontend kann das
|
||||||
|
// Objekt weder als Chip noch als Bildregion hervorheben.
|
||||||
|
//
|
||||||
|
// Dieser Tagger findet Objekt-Wörter direkt im Satz – anhand der Wort-Titel der Objekte des
|
||||||
|
// Bildes – und ist damit unabhängig vom LLM-Markup. Er wird genutzt:
|
||||||
|
//   - Forward: als Sicherheitsnetz in generatePairs.resolveNounMarkup sowie über pipeline.retagPair (in runPicture nach dem Übersetzen)
|
||||||
|
//   - Backfill: pipeline.retagObjects (POST /api/pipeline/retag-objects) über bestehende Daten
|
||||||
|
//
|
||||||
|
// Wichtig: bereits vorhandene Tokens ({{…}}, ⟦PHn:…⟧) bleiben unangetastet, und es werden NUR
|
||||||
|
// Objekt-Tokens (.o:) erzeugt – Wort-Tokens (.w:) fasst dieser Tagger nicht an.
|
||||||
|
|
||||||
|
const { PLACEHOLDER_RE } = require('./placeholders');
|
||||||
|
|
||||||
|
// Flexions-Endungen je Sprache (bestimmte Form / Plural / Genitiv), längere zuerst, damit der
|
||||||
|
// Regex greedy die längste Form greift (z.B. "ryggsäcken" statt nur "ryggsäck").
|
||||||
|
const SUFFIXES = {
|
||||||
|
sv: ['ens', 'ets', 'na', 'en', 'et', 'or', 'ar', 'er', 'n', 'a', 's'],
|
||||||
|
de: ['en', 'es', 'er', 'em', 'e', 'n', 's'],
|
||||||
|
en: ['es', 's'],
|
||||||
|
};
|
||||||
|
// Lemmata, die kürzer als das sind, werden NUR exakt gematcht (keine Flexion) – sonst matchen
|
||||||
|
// kurze Wörter wie "bi" zu viel ("bil", "bin", …).
|
||||||
|
const MIN_LEN_FOR_SUFFIX = 4;
|
||||||
|
|
||||||
|
/**
 * Escapes every regular-expression metacharacter in `s` so the result can be
 * embedded in a RegExp source and match the input literally.
 *
 * @param {string} s - Literal text.
 * @returns {string} `s` with each metacharacter prefixed by a backslash.
 */
function escapeRegex(s) {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  const escaped = s.replace(metaChars, '\\$&');
  return escaped;
}
|
||||||
|
|
||||||
|
// Bestehende Tokens (sowohl {{label.type:uuid}} als auch ⟦PHn:label⟧) erkennen, damit wir
|
||||||
|
// nicht in sie hineinschreiben.
|
||||||
|
const EXISTING_TOKEN_RE = /\{\{[^.{}]+\.[wo]:[0-9a-f-]{36}\}\}|⟦PH\d+:[^⟧]*⟧/g;
|
||||||
|
|
||||||
|
/**
 * Collects the word titles of `objects` for one language as lemma records.
 *
 * Titles are trimmed; empty titles are skipped. Titles are de-duplicated
 * case-insensitively, and the first object that carries a title keeps it.
 *
 * @param {Iterable<{id: *, words?: Array<Object>}>|null|undefined} objects
 *   Objects whose `words` entries hold `titel_<lang>` properties.
 * @param {string} lang - Language suffix used to pick the `titel_<lang>` property.
 * @returns {Array<{lemma: string, lemmaLc: string, objectId: *}>}
 *   Lemma records sorted by descending lemma length.
 */
function buildLemmas(objects, lang) {
  const titleField = `titel_${lang}`;
  const byLowercase = new Map(); // insertion-ordered: first owner of a title wins
  for (const object of objects || []) {
    for (const word of object.words || []) {
      const lemma = (word[titleField] || '').trim();
      const lemmaLc = lemma.toLowerCase();
      if (lemma && !byLowercase.has(lemmaLc)) {
        byLowercase.set(lemmaLc, { lemma, lemmaLc, objectId: object.id });
      }
    }
  }
  const lemmas = [...byLowercase.values()];
  lemmas.sort((left, right) => right.lemma.length - left.lemma.length);
  return lemmas;
}
|
||||||
|
|
||||||
|
/**
 * Tags object words inside one contiguous plain-text passage, i.e. text that
 * contains no existing tokens.
 *
 * One combined regex is built over all lemmas. A lemma of at least
 * MIN_LEN_FOR_SUFFIX characters may be followed by one of `suffixes`; shorter
 * lemmas match exactly. Word boundaries are enforced with Unicode lookarounds
 * because `\b` is unreliable around å/ä/ö/ü.
 *
 * @param {string} text - Plain-text passage.
 * @param {Array<{lemma: string, lemmaLc: string, objectId: *}>} lemmas
 *   Lemma records, longest first (the first matching record decides the object).
 * @param {string[]} suffixes - Allowed inflection endings for the language.
 * @returns {string} `text` with recognised words wrapped as `{{surface.o:objectId}}`.
 */
function tagPlainSegment(text, lemmas, suffixes) {
  if (!text || !lemmas.length) return text;

  // The suffix alternation is identical for every lemma, so build it once.
  const suffixAlt = suffixes.map(escapeRegex).join('|');
  const alts = lemmas.map(({ lemma }) => {
    const esc = escapeRegex(lemma);
    return suffixAlt && lemma.length >= MIN_LEN_FOR_SUFFIX ? `${esc}(?:${suffixAlt})?` : esc;
  });
  const re = new RegExp(`(?<![\\p{L}\\p{N}])(${alts.join('|')})(?![\\p{L}\\p{N}])`, 'giu');

  return text.replace(re, (surface) => {
    // The token label is the surface itself. EXISTING_TOKEN_RE only accepts labels
    // without '.', '{' or '}', so a surface containing one of them would produce a
    // token that is not recognised on the next pass (and would then be re-tagged
    // from the inside). Leave such surfaces untouched to keep the tagger idempotent.
    if (/[.{}]/.test(surface)) return surface;

    const surfaceLc = surface.toLowerCase();
    // First (= longest) lemma that prefixes the surface and whose remainder is
    // either empty or an allowed ending.
    const hit = lemmas.find(({ lemma, lemmaLc }) => {
      if (!surfaceLc.startsWith(lemmaLc)) return false;
      const rest = surfaceLc.slice(lemmaLc.length);
      return rest === '' || (lemma.length >= MIN_LEN_FOR_SUFFIX && suffixes.includes(rest));
    });
    // No clean hit (e.g. case folding differs between regex and toLowerCase): keep as is.
    return hit ? `{{${surface}.o:${hit.objectId}}}` : surface;
  });
}
|
||||||
|
|
||||||
|
/**
 * Main entry point: tags object words in `sentence` for language `lang`.
 *
 * The sentence is split at every match of EXISTING_TOKEN_RE. Only the plain-text
 * parts between matches are passed to tagPlainSegment; matched tokens are copied
 * through unchanged.
 *
 * @param {string} sentence - Sentence text, possibly already containing tokens.
 * @param {string} lang - Language key used for the lemma lookup and SUFFIXES.
 * @param {Array<{id: *, words: Array<Object>}>} objects
 *   Objects shaped like `[{ id, words: [{titel_de,titel_en,titel_sv}] }]`.
 * @returns {string} The sentence with additional object tokens; a falsy
 *   `sentence` is returned as is.
 */
function tagObjectWords(sentence, lang, objects) {
  if (!sentence) return sentence;

  const lemmas = buildLemmas(objects, lang);
  if (!lemmas.length) return sentence;

  const suffixes = SUFFIXES[lang] || [];
  const pieces = [];
  let cursor = 0;

  // The shared regex is global; start scanning from the beginning of the sentence.
  EXISTING_TOKEN_RE.lastIndex = 0;
  for (const token of sentence.matchAll(EXISTING_TOKEN_RE)) {
    const plainBefore = sentence.slice(cursor, token.index);
    pieces.push(tagPlainSegment(plainBefore, lemmas, suffixes));
    pieces.push(token[0]); // existing token is carried over untouched
    cursor = token.index + token[0].length;
  }
  pieces.push(tagPlainSegment(sentence.slice(cursor), lemmas, suffixes));

  return pieces.join('');
}
|
||||||
|
|
||||||
|
/**
 * Wraps the first occurrence of `surface` in an object token
 * `{{surface.o:objectId}}`. The match is the exact string, at word boundaries,
 * and never inside an existing token.
 *
 * Intended for the LLM fallback, which supplies the inflected surface form that
 * the deterministic tagger did not recognise.
 *
 * @param {string} sentence - Sentence text, possibly already containing tokens.
 * @param {string} surface - Exact surface form to wrap (trimmed before use).
 * @param {*} objectId - Id written into the token.
 * @returns {string} The sentence with at most one new token. It is returned
 *   unchanged when the inputs are empty, when the surface is not found in plain
 *   text, or when the surface cannot serve as a token label.
 */
function wrapSurface(sentence, surface, objectId) {
  const surf = (surface || '').trim();
  if (!sentence || !surf) return sentence;

  // The surface becomes the token label. EXISTING_TOKEN_RE only accepts labels
  // without '.', '{' or '}'; a surface such as "Rucksack." would create a
  // malformed token and swallow the sentence's punctuation, so refuse it.
  if (/[.{}]/.test(surf)) return sentence;

  // Not global, hence stateless: safe to reuse for every plain-text segment.
  const surfaceRe = new RegExp(`(?<![\\p{L}\\p{N}])${escapeRegex(surf)}(?![\\p{L}\\p{N}])`, 'u');
  let wrapped = false;

  // Wraps the first hit in a plain-text segment until one hit has been wrapped.
  const wrapFirst = (plain) => {
    if (wrapped) return plain;
    const hit = surfaceRe.exec(plain);
    if (!hit) return plain;
    wrapped = true;
    const end = hit.index + hit[0].length;
    return `${plain.slice(0, hit.index)}{{${hit[0]}.o:${objectId}}}${plain.slice(end)}`;
  };

  let out = '';
  let cursor = 0;
  // The shared regex is global; start scanning from the beginning of the sentence.
  EXISTING_TOKEN_RE.lastIndex = 0;
  for (const token of sentence.matchAll(EXISTING_TOKEN_RE)) {
    out += wrapFirst(sentence.slice(cursor, token.index)) + token[0];
    cursor = token.index + token[0].length;
  }
  return out + wrapFirst(sentence.slice(cursor));
}
|
||||||
|
|
||||||
|
/**
 * Returns the set of object ids that occur as object tokens in a sentence.
 *
 * Every PLACEHOLDER_RE match whose capture group 2 equals 'o' contributes its
 * capture group 3. (PLACEHOLDER_RE is defined in ./placeholders; presumably
 * group 2 is the token type and group 3 the id — confirm there.)
 *
 * @param {*} sentence - Sentence text; falsy values are treated as ''.
 * @returns {Set<string>} Ids found in object tokens.
 */
function objectIdsInSentence(sentence) {
  const text = String(sentence || '');
  const objectHits = [...text.matchAll(PLACEHOLDER_RE)].filter((hit) => hit[2] === 'o');
  return new Set(objectHits.map((hit) => hit[3]));
}
|
||||||
|
|
||||||
|
module.exports = { tagObjectWords, wrapSurface, buildLemmas, objectIdsInSentence };
|
||||||
@@ -2,8 +2,9 @@
|
|||||||
// In-Process-Queue mit einem Worker (rate-limit-freundlich). Jeder Schritt ist idempotent,
|
// In-Process-Queue mit einem Worker (rate-limit-freundlich). Jeder Schritt ist idempotent,
|
||||||
// d.h. ein Resume nach Crash/Redeploy überspringt bereits Erledigtes.
|
// d.h. ein Resume nach Crash/Redeploy überspringt bereits Erledigtes.
|
||||||
const { query } = require('../db');
|
const { query } = require('../db');
|
||||||
const { LANGS, fillMissingRow } = require('./translate');
|
const { LANGS, fillMissingRow, callClaude } = require('./translate');
|
||||||
const { PLACEHOLDER_RE } = require('./placeholders');
|
const { PLACEHOLDER_RE } = require('./placeholders');
|
||||||
|
const { tagObjectWords, wrapSurface, objectIdsInSentence } = require('./objectTagging');
|
||||||
const { translateWordGroup } = require('./pairContent');
|
const { translateWordGroup } = require('./pairContent');
|
||||||
const { generatePairsForObject, persistPair } = require('./generatePairs');
|
const { generatePairsForObject, persistPair } = require('./generatePairs');
|
||||||
const { reviewPicturePairs } = require('./reviewPairs');
|
const { reviewPicturePairs } = require('./reviewPairs');
|
||||||
@@ -87,6 +88,137 @@ async function loadPairs(pictureId) {
|
|||||||
ORDER BY p.id`, [pictureId])).rows;
|
ORDER BY p.id`, [pictureId])).rows;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Lists the sentence fields of ONE pair as `{ table, id, col, lang }` records,
 * covering the question and the positive/negative statement.
 *
 * A source whose row id is falsy contributes no fields. `lang` is the last two
 * characters of the column name.
 *
 * @param {{question_id?: *, positive_statement_id?: *, negative_statement_id?: *}} p
 *   Pair row.
 * @returns {Array<{table: string, id: *, col: string, lang: string}>}
 */
function pairSentenceFields(p) {
  const sources = [
    ['questions', p.question_id, ['sentence_de', 'sentence_en', 'sentence_sv']],
    ['statements', p.positive_statement_id, ['positive_sentence_de', 'positive_sentence_en', 'positive_sentence_sv']],
    ['statements', p.negative_statement_id, ['negative_sentence_de', 'negative_sentence_en', 'negative_sentence_sv']],
  ];
  return sources
    .filter(([, rowId]) => rowId)
    .flatMap(([table, id, columns]) =>
      columns.map((col) => ({ table, id, col, lang: col.slice(-2) })));
}
|
||||||
|
|
||||||
|
/**
 * LLM fallback: asks the model for the exact (possibly inflected) surface form
 * of an object word inside a sentence.
 *
 * The answer is only trusted when it is a non-empty string that really occurs in
 * `sentence` at word boundaries and can serve as a token label. Any failure of
 * the model call is logged and treated as "not found", so this helper stays
 * best-effort and never throws because of the call.
 *
 * @param {string} sentence - Sentence to search in.
 * @param {string} label - Base form / meaning of the word to look for.
 * @returns {Promise<string|null>} The verified surface form, or null.
 */
async function locateSurfaceLLM(sentence, label) {
  let data;
  try {
    // NOTE(review): callClaude is assumed to resolve to the parsed JSON reply — confirm in ./translate.
    data = await callClaude({
      system: 'Du findest die exakte Oberflächenform eines Wortes in einem Satz. Antworte AUSSCHLIESSLICH mit gültigem JSON.',
      user: `Satz: "${sentence}"\nGesuchtes Wort (Grundform/Bedeutung): "${label}"\n\n` +
        `Gib die EXAKTE Zeichenkette zurück, genau so wie das Wort (ggf. gebeugt / bestimmte Form / Plural) ` +
        `im Satz vorkommt. Kommt es NICHT vor: null.\nFormat: {"surface":"…"|null}`,
      maxTokens: 80,
    });
  } catch (err) {
    // Best-effort fallback: report the failure instead of hiding it, then give up.
    console.error('Objekt-Tagging: LLM-Fallback fehlgeschlagen:', err?.message ?? err);
    return null;
  }

  const surface = typeof data?.surface === 'string' ? data.surface.trim() : '';
  if (!surface) return null;

  // The surface becomes a token label; '.', '{' and '}' cannot appear in one
  // (e.g. the model answering "Rucksack." including the sentence's full stop).
  if (/[.{}]/.test(surface)) return null;

  // Accept only if the form really occurs in the sentence at word boundaries.
  const escaped = surface.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const atWordBoundary = new RegExp(`(?<![\\p{L}\\p{N}])${escaped}(?![\\p{L}\\p{N}])`, 'u');
  return atWordBoundary.test(sentence) ? surface : null;
}
|
||||||
|
|
||||||
|
/**
 * Re-tokenises OBJECT words in the sentences of ONE pair.
 *
 * Steps:
 *  1. Load the current text of every sentence field (one SELECT per table row).
 *  2. Deterministic sweep with tagObjectWords (in memory).
 *  3. Optional hybrid LLM fallback (`useLLM`): only for object ids that already
 *     occur as a token in at least one field of the pair, and only in fields
 *     whose language has no token for that id yet. This keeps the number of
 *     model calls small and avoids tagging objects the pair does not mention.
 *  4. Diff against the loaded text and, unless `dryRun`, write changed columns
 *     back with one UPDATE per row.
 *
 * @param {Object} p - Pair row; see pairSentenceFields for the ids it reads.
 * @param {Array<{id: *, words: Array<Object>}>} objects - Objects of the picture.
 * @param {{dryRun?: boolean, useLLM?: boolean}} [options]
 * @returns {Promise<Array<{table: string, id: *, col: string, lang: string, before: string, after: string}>>}
 *   The changed fields (also reported when `dryRun` suppresses the UPDATE).
 */
async function retagPair(p, objects, { dryRun = false, useLLM = false } = {}) {
  const fields = pairSentenceFields(p);
  if (!fields.length) return [];

  // Group the columns per table row so each row is read (and written) once.
  const byRow = new Map(); // `${table}|${id}` → { table, id, cols:Set }
  for (const f of fields) {
    const rowKey = `${f.table}|${f.id}`;
    if (!byRow.has(rowKey)) byRow.set(rowKey, { table: f.table, id: f.id, cols: new Set() });
    byRow.get(rowKey).cols.add(f.col);
  }

  // Table and column names come from the fixed lists in pairSentenceFields, not
  // from user input; the row id is passed as a bound parameter.
  const text = {}; // `${table}|${id}|${col}` → string
  for (const { table, id, cols } of byRow.values()) {
    const colList = [...cols];
    const row = (await query(`SELECT ${colList.join(', ')} FROM ${table} WHERE id = $1`, [id])).rows[0] || {};
    for (const col of colList) text[`${table}|${id}|${col}`] = row[col] || '';
  }
  const key = (f) => `${f.table}|${f.id}|${f.col}`;

  // 1) Deterministic sweep (in memory); blank sentences are left as they are.
  const tagged = {};
  for (const f of fields) {
    const before = text[key(f)];
    tagged[key(f)] = before && before.trim() ? tagObjectWords(before, f.lang, objects) : before;
  }

  // 2) Hybrid LLM fallback: look for ids that are tokenised in >= 1 language in
  //    the languages where they are still missing.
  if (useLLM) {
    const presentByObj = new Map(); // objectId → Set<lang>
    for (const f of fields) {
      for (const oid of objectIdsInSentence(tagged[key(f)])) {
        if (!presentByObj.has(oid)) presentByObj.set(oid, new Set());
        presentByObj.get(oid).add(f.lang);
      }
    }
    // First non-empty title of the object in the given language, or null.
    const labelOf = (oid, lang) => {
      const o = objects.find((x) => x.id === oid);
      for (const w of o?.words || []) {
        const title = (w[`titel_${lang}`] || '').trim();
        if (title) return title;
      }
      return null;
    };
    for (const f of fields) {
      const k = key(f);
      if (!tagged[k] || !tagged[k].trim()) continue;
      for (const [oid, langs] of presentByObj) {
        if (langs.has(f.lang)) continue; // already tokenised in this language
        // Re-read on every iteration: an earlier object may just have been wrapped
        // into this sentence, and both the safety check and the model must see the
        // current text rather than a stale copy.
        const cur = tagged[k];
        if (objectIdsInSentence(cur).has(oid)) continue; // (safety)
        const label = labelOf(oid, f.lang);
        if (!label) continue;
        const surface = await locateSurfaceLLM(cur, label);
        if (surface) tagged[k] = wrapSurface(cur, surface, oid);
      }
    }
  }

  // 3) Diff + (optionally) write
  const changes = [];
  for (const { table, id, cols } of byRow.values()) {
    const set = {};
    for (const col of cols) {
      const k = `${table}|${id}|${col}`;
      if (tagged[k] !== text[k]) {
        set[col] = tagged[k];
        changes.push({ table, id, col, lang: col.slice(-2), before: text[k], after: tagged[k] });
      }
    }
    const cells = Object.keys(set);
    if (!dryRun && cells.length) {
      await query(
        `UPDATE ${table} SET ${cells.map((c, i) => `${c} = $${i + 1}`).join(', ')} WHERE id = $${cells.length + 1}`,
        [...cells.map((c) => set[c]), id]);
    }
  }
  return changes;
}
|
||||||
|
|
||||||
|
/**
 * Backfill/retag over one picture or over all pictures.
 *
 * Without `pictureId`, all picture ids are read from `pictures` ordered by
 * `created_at`. Pictures without objects are skipped and not counted. A pair
 * whose retagPair call throws is logged and skipped; the run continues.
 *
 * @param {{pictureId?: *, dryRun?: boolean, useLLM?: boolean}} [options]
 * @returns {Promise<{pictures: number, pairs: number, changedPairs: number,
 *   changedFields: number, dryRun: boolean, useLLM: boolean, samples: Array}>}
 *   Summary; `samples` holds at most 25 entries `{ pair, changes: [{col, after}] }`.
 */
async function retagObjects({ pictureId = null, dryRun = false, useLLM = false } = {}) {
  let pictureIds;
  if (pictureId) {
    pictureIds = [pictureId];
  } else {
    const result = await query(`SELECT id FROM pictures ORDER BY created_at`);
    pictureIds = result.rows.map((row) => row.id);
  }

  const report = { pictures: 0, pairs: 0, changedPairs: 0, changedFields: 0, dryRun, useLLM, samples: [] };

  for (const currentId of pictureIds) {
    const objects = await loadObjects(currentId);
    if (!objects.length) continue; // nothing to tag against

    const pairs = await loadPairs(currentId);
    report.pictures += 1;

    for (const pair of pairs) {
      report.pairs += 1;

      let changes;
      try {
        changes = await retagPair(pair, objects, { dryRun, useLLM });
      } catch (err) {
        console.error(`Retag-Fehler bei Pair ${pair.id}:`, err.message);
        continue;
      }
      if (!changes.length) continue;

      report.changedPairs += 1;
      report.changedFields += changes.length;
      if (report.samples.length < 25) {
        const preview = changes.map(({ col, after }) => ({ col, after }));
        report.samples.push({ pair: pair.id, changes: preview });
      }
    }
  }
  return report;
}
|
||||||
|
|
||||||
// Word-IDs aller {{label.w:uuid}}-Placeholder in den Sätzen der Pairs.
|
// Word-IDs aller {{label.w:uuid}}-Placeholder in den Sätzen der Pairs.
|
||||||
// Diese Wörter entstehen bei der Generierung (Nomen im Satz) und hängen nicht an
|
// Diese Wörter entstehen bei der Generierung (Nomen im Satz) und hängen nicht an
|
||||||
// statement_words/object_words — für Übersetzung + Audio müssen sie mitgenommen werden.
|
// statement_words/object_words — für Übersetzung + Audio müssen sie mitgenommen werden.
|
||||||
@@ -179,6 +311,13 @@ async function runPicture(pictureId) {
|
|||||||
progress.translatedPairs++;
|
progress.translatedPairs++;
|
||||||
await setStep(pictureId, 'translate', progress);
|
await setStep(pictureId, 'translate', progress);
|
||||||
}
|
}
|
||||||
|
// Objekt-Wörter, die das Modell nicht als Nomen markiert hat, deterministisch nachtokenisieren
|
||||||
|
// (Sicherheitsnetz; bestehende Tokens bleiben unangetastet).
|
||||||
|
for (const p of pairs) {
|
||||||
|
try { await retagPair(p, objects); }
|
||||||
|
catch (err) { console.error(`Objekt-Tagging-Fehler bei Pair ${p.id}:`, err.message); }
|
||||||
|
}
|
||||||
|
|
||||||
// Nomen-Wörter aus Satz-Placeholdern ({{label.w:id}}) mitübersetzen
|
// Nomen-Wörter aus Satz-Placeholdern ({{label.w:id}}) mitübersetzen
|
||||||
try {
|
try {
|
||||||
for (const wid of await collectPlaceholderWordIds(pairs)) {
|
for (const wid of await collectPlaceholderWordIds(pairs)) {
|
||||||
@@ -398,4 +537,4 @@ async function generateWithBackoff(u) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { enqueue, resumePending, loadPairs, collectAudioUnits, generateWithBackoff, translatePair };
|
module.exports = { enqueue, resumePending, loadObjects, loadPairs, collectAudioUnits, generateWithBackoff, translatePair, retagPair, retagObjects };
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ const router = require('express').Router();
|
|||||||
const { query } = require('../db');
|
const { query } = require('../db');
|
||||||
const { LANGS } = require('../lib/translate');
|
const { LANGS } = require('../lib/translate');
|
||||||
const { loadPairContext, computeReadiness, loadPairContent } = require('../lib/pairContent');
|
const { loadPairContext, computeReadiness, loadPairContent } = require('../lib/pairContent');
|
||||||
const { enqueue, loadPairs, collectAudioUnits, generateWithBackoff, translatePair } = require('../lib/pipeline');
|
const { enqueue, loadPairs, collectAudioUnits, generateWithBackoff, translatePair, retagObjects } = require('../lib/pipeline');
|
||||||
const { describeError } = require('./audios');
|
const { describeError } = require('./audios');
|
||||||
const { PLACEHOLDER_RE, TOKEN_RE, stripLeakedTokens } = require('../lib/placeholders');
|
const { PLACEHOLDER_RE, TOKEN_RE, stripLeakedTokens } = require('../lib/placeholders');
|
||||||
const { invalidateAudio } = require('../lib/reviewPairs');
|
const { invalidateAudio } = require('../lib/reviewPairs');
|
||||||
@@ -310,6 +310,24 @@ router.post('/repair-tokens', async (req, res, next) => {
|
|||||||
} catch (err) { next(err); }
|
} catch (err) { next(err); }
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// POST /api/pipeline/retag-objects — backfill: re-tokenise object words in existing
// sentences (deterministic, plus an optional hybrid LLM fallback for inflected forms).
// Body: { picture_id?, dry_run?, use_llm? }. Without picture_id the run covers ALL pictures.
// Responds 404 when picture_id is given but no such picture exists; otherwise responds
// with the report returned by retagObjects. Errors are forwarded to next().
router.post('/retag-objects', async (req, res, next) => {
  try {
    const { picture_id: rawPictureId, dry_run: rawDryRun, use_llm: rawUseLLM } = req.body ?? {};
    const pictureId = rawPictureId || null;

    if (pictureId) {
      const { rows } = await query(`SELECT id FROM pictures WHERE id = $1`, [pictureId]);
      if (!rows.length) return res.status(404).json({ error: 'Bild nicht gefunden' });
    }

    const report = await retagObjects({
      pictureId,
      dryRun: Boolean(rawDryRun),
      useLLM: Boolean(rawUseLLM),
    });
    res.json(report);
  } catch (err) {
    next(err);
  }
});
|
||||||
|
|
||||||
// GET /api/pipeline/settings
|
// GET /api/pipeline/settings
|
||||||
router.get('/settings', async (req, res, next) => {
|
router.get('/settings', async (req, res, next) => {
|
||||||
try {
|
try {
|
||||||
|
|||||||
Reference in New Issue
Block a user