diff --git a/src/db-migrate.js b/src/db-migrate.js index 2341696..8088851 100644 --- a/src/db-migrate.js +++ b/src/db-migrate.js @@ -154,6 +154,8 @@ async function migrate() { ['Farben', 'Colors', 'Färger'], ['Zahlen & Zeit', 'Numbers & Time', 'Tal & tid'], ['Werkzeuge', 'Tools', 'Verktyg'], + ['Eigenschaften', 'Properties', 'Egenskaper'], + ['Verben & Handlungen','Verbs & Actions', 'Verb & handlingar'], ['Sonstiges', 'Other', 'Övrigt'], ]; for (const [de, en, sv] of CATEGORY_TAXONOMY) { diff --git a/src/lib/classifyWords.js b/src/lib/classifyWords.js index d57f244..959d7c9 100644 --- a/src/lib/classifyWords.js +++ b/src/lib/classifyWords.js @@ -85,6 +85,17 @@ async function examplesForWord(wordId, max = MAX_EXAMPLES) { return out; } +// Gemeinsame Klassifizierungs-Regeln. Drückt Sonstiges stark zurück und gibt Wortart-Hinweise. +const CLASSIFY_RULES = + `Rules:\n` + + `- Pick the SINGLE best-fitting category by its exact German name.\n` + + `- Most concrete nouns DO fit a topic: animals→Tiere, food/fruit/vegetables→Lebensmittel, ` + + `sky/star/fire/water/mountain/plants→Natur & Pflanzen, furniture/window/carpet/cushion→Wohnen & Möbel, ` + + `street/building/lamp post→Stadt & Gebäude, books/pages→Schule & Bildung.\n` + + `- Adjectives / properties (warm, fast, sweet, old, fragile, transparent…) → "Eigenschaften".\n` + + `- Verbs / actions → "Verben & Handlungen".\n` + + `- Use "Sonstiges" ONLY as a true last resort when nothing else fits at all.`; + function buildPrompt(word, examples, categoryNamesDe) { const title = word.titel_en || word.titel_de || word.titel_sv || ''; const titleDe = word.titel_de ? ` (de: "${word.titel_de}")` : ''; @@ -92,10 +103,8 @@ function buildPrompt(word, examples, categoryNamesDe) { ? `\n\nExample sentences using the word:\n${examples.map(e => `- ${e}`).join('\n')}` : ''; return ( - `Categories (choose exactly one, by its German name):\n${categoryNamesDe.join(', ')}\n\n` + - `Classify this single vocabulary word into the best-fitting category. ` + - `If none fits, use "Sonstiges".\n\n` + - `Word: "${title}"${titleDe}${ex}\n\n` + + `Categories (German names):\n${categoryNamesDe.join(', ')}\n\n${CLASSIFY_RULES}\n\n` + + `Classify this single vocabulary word.\n\nWord: "${title}"${titleDe}${ex}\n\n` + `Reply with JSON only: {"category":""}` ); } @@ -237,7 +246,7 @@ async function runCategorizationTick() { // Sofortiger One-Shot-Backfill (synchron, ohne 24h-Batch-Verzug): klassifiziert bestehende, // in Pairs verwendete Wörter ohne Kategorie in Schüben per /v1/messages und materialisiert // pair_categories direkt. Für den Live-Test gedacht; der Stundenjob bleibt für laufenden Nachschub. -async function classifyWordsSync({ max = 2000 } = {}) { +async function classifyWordsSync({ max = 2000, reset = false } = {}) { if (running) return { skipped: true }; running = true; try { @@ -247,24 +256,29 @@ async function classifyWordsSync({ max = 2000 } = {}) { const system = 'Du bist ein präziser Klassifizierer. Antworte AUSSCHLIESSLICH mit gültigem JSON, ohne Markdown.'; let processed = 0, linked = 0; + // reset → bestehende Zuordnungen verwerfen und mit verbesserter Logik/Taxonomie neu klassifizieren + if (reset) await query(`DELETE FROM word_categories`).catch(() => {}); + while (processed < max) { - const words = await findUncategorizedUsedWords(Math.min(40, max - processed)); + const words = await findUncategorizedUsedWords(Math.min(15, max - processed)); if (!words.length) break; - const list = words.map(w => { + const lines = []; + for (const w of words) { const t = w.titel_en || w.titel_de || w.titel_sv || ''; const de = w.titel_de && w.titel_de !== t ? ` (de: ${w.titel_de})` : ''; - return `${w.id}\t${t}${de}`; - }).join('\n'); + const ex = await examplesForWord(w.id, 2); + const exStr = ex.length ? ` | e.g.: ${ex.map(e => `"${e}"`).join('; ')}` : ''; + lines.push(`${w.id}\t${t}${de}${exStr}`); + } const user = - `Categories (choose exactly one German name per word):\n${names.join(', ')}\n\n` + - `Classify each vocabulary word into the best-fitting category. If none fits, use "Sonstiges".\n` + - `Words (idtitle):\n${list}\n\n` + - `Reply with JSON only: {"assignments":[{"id":"","category":""}]}`; + `Categories (German names):\n${names.join(', ')}\n\n${CLASSIFY_RULES}\n\n` + + `Classify each vocabulary word below.\nWords (idtitle | examples):\n${lines.join('\n')}\n\n` + + `Reply with JSON only: {"assignments":[{"id":"","category":""}]}`; let assignments = []; try { - const data = await messagesCall(system, user, 2000); + const data = await messagesCall(system, user, 1500); assignments = Array.isArray(data.assignments) ? data.assignments : []; } catch { /* Fehler → ganze Charge bekommt Fallback, damit der Lauf fortschreitet */ } diff --git a/src/routes/categories.js b/src/routes/categories.js index 3d3058b..e545919 100644 --- a/src/routes/categories.js +++ b/src/routes/categories.js @@ -5,12 +5,14 @@ const { runCategorizationTick, classifyWordsSync } = require('../lib/classifyWor const STATUSES = ['requested', 'blocked', 'published']; // POST /api/categories/auto-assign — Kategorisierung anstoßen. -// ?sync=true → sofortiger One-Shot-Backfill bestehender Wörter (synchron, kein 24h-Verzug) -// sonst → ein asynchroner Batch-Tick (submit/collect über die Message Batches API) +// ?sync=true → sofortiger One-Shot-Backfill bestehender Wörter (synchron, kein 24h-Verzug) +// ?sync=true&reset=true → bestehende Zuordnungen verwerfen und alles neu klassifizieren +// sonst → ein asynchroner Batch-Tick (submit/collect über die Message Batches API) router.post('/auto-assign', async (req, res, next) => { try { const sync = req.query.sync === 'true' || req.body?.sync === true; - const result = sync ? await classifyWordsSync({}) : await runCategorizationTick(); + const reset = req.query.reset === 'true' || req.body?.reset === true; + const result = sync ? await classifyWordsSync({ reset }) : await runCategorizationTick(); res.json(result); } catch (err) { next(err); } });