feat: Objekt-Wörter deterministisch tokenisieren (Forward + Backfill)
Objekt-Hervorhebung (Chip + Bildregion) hängt an {{label.o:uuid}}-Tokens im
Satz. Bisher entstanden die nur aus dem LLM-Nomen-Markup, das Haiku oft
ausließ -> Objekt blieb un-getokt (z.B. "ryggsäcken"/Rucksack), obwohl korrekt
verlinkt.
- src/lib/objectTagging.js: deterministischer, flexions-toleranter Tagger
(schwed. bestimmte Form -en/-et/...), idempotent, schützt bestehende Tokens.
- generatePairs.resolveNounMarkup: Sweep als Sicherheitsnetz + titel_sv im Lookup.
- pipeline.retagPair/retagObjects: per-Pair Nachtokenisierung (Hybrid-LLM-Fallback
nur für in anderer Sprache bestätigte Objekte), Backfill über Bild/alle Bilder.
- POST /api/pipeline/retag-objects (dry_run/use_llm/picture_id).
Ändert nur Satz-Textfelder -> Audio/Alignment bleiben gültig.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@ const router = require('express').Router();
|
||||
const { query } = require('../db');
|
||||
const { LANGS } = require('../lib/translate');
|
||||
const { loadPairContext, computeReadiness, loadPairContent } = require('../lib/pairContent');
|
||||
const { enqueue, loadPairs, collectAudioUnits, generateWithBackoff, translatePair } = require('../lib/pipeline');
|
||||
const { enqueue, loadPairs, collectAudioUnits, generateWithBackoff, translatePair, retagObjects } = require('../lib/pipeline');
|
||||
const { describeError } = require('./audios');
|
||||
const { PLACEHOLDER_RE, TOKEN_RE, stripLeakedTokens } = require('../lib/placeholders');
|
||||
const { invalidateAudio } = require('../lib/reviewPairs');
|
||||
@@ -310,6 +310,24 @@ router.post('/repair-tokens', async (req, res, next) => {
|
||||
} catch (err) { next(err); }
|
||||
});
|
||||
|
||||
// POST /api/pipeline/retag-objects — backfill: re-tokenise object words in
// existing sentences (deterministic, plus an optional hybrid LLM fallback for
// inflected forms).
// Body: { picture_id?, dry_run?, use_llm? }. Without picture_id it runs over ALL pictures.
// Only the sentence text fields are changed; audio/alignment stay valid
// (the spoken text is unchanged).
router.post('/retag-objects', async (req, res, next) => {
  try {
    // Normalise the optional body fields: a missing/falsy id becomes null,
    // the two flags are coerced to strict booleans.
    const body = req.body;
    const options = {
      pictureId: body?.picture_id || null,
      dryRun: Boolean(body?.dry_run),
      useLLM: Boolean(body?.use_llm),
    };

    // When a specific picture is requested, answer 404 before starting the
    // backfill if no such row exists.
    if (options.pictureId) {
      const { rows } = await query(`SELECT id FROM pictures WHERE id = $1`, [options.pictureId]);
      if (!rows.length) {
        return res.status(404).json({ error: 'Bild nicht gefunden' });
      }
    }

    // The report produced by retagObjects is returned to the client as-is.
    res.json(await retagObjects(options));
  } catch (err) {
    // Hand any failure to the error handler via next().
    next(err);
  }
});
|
||||
|
||||
// GET /api/pipeline/settings
|
||||
router.get('/settings', async (req, res, next) => {
|
||||
try {
|
||||
|
||||
Reference in New Issue
Block a user