fix: bessere Wort-Kategorisierung, weniger "Sonstiges"
- Taxonomie um "Eigenschaften" (Adjektive) und "Verben & Handlungen" ergänzt → Wortarten haben ein Zuhause statt Sonstiges.
- Klassifizierer geschärft: klare Wortart-/Themen-Regeln, "Sonstiges" nur als letzter Ausweg; Sofort-Pfad nutzt jetzt Beispielsätze und kleinere Batches (15) für deutlich genauere Treffer.
- ?reset=true: bestehende Zuordnungen verwerfen und neu klassifizieren.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -154,6 +154,8 @@ async function migrate() {
|
||||
['Farben', 'Colors', 'Färger'],
|
||||
['Zahlen & Zeit', 'Numbers & Time', 'Tal & tid'],
|
||||
['Werkzeuge', 'Tools', 'Verktyg'],
|
||||
['Eigenschaften', 'Properties', 'Egenskaper'],
|
||||
['Verben & Handlungen','Verbs & Actions', 'Verb & handlingar'],
|
||||
['Sonstiges', 'Other', 'Övrigt'],
|
||||
];
|
||||
for (const [de, en, sv] of CATEGORY_TAXONOMY) {
|
||||
|
||||
@@ -85,6 +85,17 @@ async function examplesForWord(wordId, max = MAX_EXAMPLES) {
|
||||
return out;
|
||||
}
|
||||
|
||||
// Shared classification rules, spliced into both the single-word and the batch prompt.
// They push back hard on "Sonstiges" and give part-of-speech hints.
|
||||
const CLASSIFY_RULES =
|
||||
`Rules:\n` +
|
||||
`- Pick the SINGLE best-fitting category by its exact German name.\n` +
|
||||
`- Most concrete nouns DO fit a topic: animals→Tiere, food/fruit/vegetables→Lebensmittel, ` +
|
||||
`sky/star/fire/water/mountain/plants→Natur & Pflanzen, furniture/window/carpet/cushion→Wohnen & Möbel, ` +
|
||||
`street/building/lamp post→Stadt & Gebäude, books/pages→Schule & Bildung.\n` +
|
||||
`- Adjectives / properties (warm, fast, sweet, old, fragile, transparent…) → "Eigenschaften".\n` +
|
||||
`- Verbs / actions → "Verben & Handlungen".\n` +
|
||||
`- Use "Sonstiges" ONLY as a true last resort when nothing else fits at all.`;
|
||||
|
||||
function buildPrompt(word, examples, categoryNamesDe) {
|
||||
const title = word.titel_en || word.titel_de || word.titel_sv || '';
|
||||
const titleDe = word.titel_de ? ` (de: "${word.titel_de}")` : '';
|
||||
@@ -92,10 +103,8 @@ function buildPrompt(word, examples, categoryNamesDe) {
|
||||
? `\n\nExample sentences using the word:\n${examples.map(e => `- ${e}`).join('\n')}`
|
||||
: '';
|
||||
return (
|
||||
`Categories (choose exactly one, by its German name):\n${categoryNamesDe.join(', ')}\n\n` +
|
||||
`Classify this single vocabulary word into the best-fitting category. ` +
|
||||
`If none fits, use "Sonstiges".\n\n` +
|
||||
`Word: "${title}"${titleDe}${ex}\n\n` +
|
||||
`Categories (German names):\n${categoryNamesDe.join(', ')}\n\n${CLASSIFY_RULES}\n\n` +
|
||||
`Classify this single vocabulary word.\n\nWord: "${title}"${titleDe}${ex}\n\n` +
|
||||
`Reply with JSON only: {"category":"<exact German category name>"}`
|
||||
);
|
||||
}
|
||||
@@ -237,7 +246,7 @@ async function runCategorizationTick() {
|
||||
// Sofortiger One-Shot-Backfill (synchron, ohne 24h-Batch-Verzug): klassifiziert bestehende,
|
||||
// in Pairs verwendete Wörter ohne Kategorie in Schüben per /v1/messages und materialisiert
|
||||
// pair_categories direkt. Für den Live-Test gedacht; der Stundenjob bleibt für laufenden Nachschub.
|
||||
async function classifyWordsSync({ max = 2000 } = {}) {
|
||||
async function classifyWordsSync({ max = 2000, reset = false } = {}) {
|
||||
if (running) return { skipped: true };
|
||||
running = true;
|
||||
try {
|
||||
@@ -247,24 +256,29 @@ async function classifyWordsSync({ max = 2000 } = {}) {
|
||||
const system = 'Du bist ein präziser Klassifizierer. Antworte AUSSCHLIESSLICH mit gültigem JSON, ohne Markdown.';
|
||||
let processed = 0, linked = 0;
|
||||
|
||||
// reset → bestehende Zuordnungen verwerfen und mit verbesserter Logik/Taxonomie neu klassifizieren
|
||||
if (reset) await query(`DELETE FROM word_categories`).catch(() => {});
|
||||
|
||||
while (processed < max) {
|
||||
const words = await findUncategorizedUsedWords(Math.min(40, max - processed));
|
||||
const words = await findUncategorizedUsedWords(Math.min(15, max - processed));
|
||||
if (!words.length) break;
|
||||
|
||||
const list = words.map(w => {
|
||||
const lines = [];
|
||||
for (const w of words) {
|
||||
const t = w.titel_en || w.titel_de || w.titel_sv || '';
|
||||
const de = w.titel_de && w.titel_de !== t ? ` (de: ${w.titel_de})` : '';
|
||||
return `${w.id}\t${t}${de}`;
|
||||
}).join('\n');
|
||||
const ex = await examplesForWord(w.id, 2);
|
||||
const exStr = ex.length ? ` | e.g.: ${ex.map(e => `"${e}"`).join('; ')}` : '';
|
||||
lines.push(`${w.id}\t${t}${de}${exStr}`);
|
||||
}
|
||||
const user =
|
||||
`Categories (choose exactly one German name per word):\n${names.join(', ')}\n\n` +
|
||||
`Classify each vocabulary word into the best-fitting category. If none fits, use "Sonstiges".\n` +
|
||||
`Words (id<TAB>title):\n${list}\n\n` +
|
||||
`Reply with JSON only: {"assignments":[{"id":"<id>","category":"<German category name>"}]}`;
|
||||
`Categories (German names):\n${names.join(', ')}\n\n${CLASSIFY_RULES}\n\n` +
|
||||
`Classify each vocabulary word below.\nWords (id<TAB>title | examples):\n${lines.join('\n')}\n\n` +
|
||||
`Reply with JSON only: {"assignments":[{"id":"<id>","category":"<exact German category name>"}]}`;
|
||||
|
||||
let assignments = [];
|
||||
try {
|
||||
const data = await messagesCall(system, user, 2000);
|
||||
const data = await messagesCall(system, user, 1500);
|
||||
assignments = Array.isArray(data.assignments) ? data.assignments : [];
|
||||
} catch { /* Fehler → ganze Charge bekommt Fallback, damit der Lauf fortschreitet */ }
|
||||
|
||||
|
||||
@@ -6,11 +6,13 @@ const STATUSES = ['requested', 'blocked', 'published'];
|
||||
|
||||
// POST /api/categories/auto-assign — trigger categorization.
|
||||
// ?sync=true → immediate one-shot backfill of existing words (synchronous, no 24h delay)
|
||||
// ?sync=true&reset=true → discard existing assignments and re-classify everything
|
||||
// (reset is only forwarded on the sync path; without sync it has no effect here)
|
||||
// otherwise → one asynchronous batch tick (submit/collect via the Message Batches API)
|
||||
router.post('/auto-assign', async (req, res, next) => {
|
||||
try {
|
||||
const sync = req.query.sync === 'true' || req.body?.sync === true;
|
||||
const result = sync ? await classifyWordsSync({}) : await runCategorizationTick();
|
||||
const reset = req.query.reset === 'true' || req.body?.reset === true;
|
||||
const result = sync ? await classifyWordsSync({ reset }) : await runCategorizationTick();
|
||||
res.json(result);
|
||||
} catch (err) { next(err); }
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user