Gather data

This commit is contained in:
mike
2025-12-25 00:21:58 +01:00
parent 85ebfd3013
commit 49a1aa4152
12 changed files with 494 additions and 27933 deletions

100
.db/init.sql Normal file
View File

@@ -0,0 +1,100 @@
-- 0) Optional but handy: index on frequency(Word).
CREATE INDEX IF NOT EXISTS idx_frequency_word ON frequency(Word);
-- 1) View word_norm: per-word Zipf value and length, each min-max scaled.
--    Columns: Word, zipf_num, len, freq_norm, len_norm.
--    Rows whose Zipf value is NULL or an empty string are left out entirely,
--    both from the min/max bounds and from the result.
DROP VIEW IF EXISTS word_norm;
CREATE VIEW word_norm AS
WITH parsed AS (
    -- Zipf may be stored with a decimal comma; turn it into a REAL.
    SELECT
        Word,
        CAST(NULLIF(REPLACE(Zipf, ',', '.'), '') AS REAL) AS zipf_num,
        LENGTH(Word) AS len
    FROM frequency
    WHERE Word IS NOT NULL
),
usable AS (
    SELECT Word, zipf_num, len
    FROM parsed
    WHERE zipf_num IS NOT NULL
),
bounds AS (
    SELECT
        MIN(zipf_num) AS zipf_lo,
        MAX(zipf_num) AS zipf_hi,
        MIN(len) AS len_lo,
        MAX(len) AS len_hi
    FROM usable
)
SELECT
    u.Word,
    u.zipf_num,
    u.len,
    -- freq_norm: 0 for the highest Zipf value (most frequent), 1 for the lowest.
    -- NULL when every word has the same Zipf value (zero range).
    1.0 - (u.zipf_num - k.zipf_lo) / NULLIF(k.zipf_hi - k.zipf_lo, 0.0) AS freq_norm,
    -- len_norm: 0 for the shortest word, 1 for the longest.
    -- NULL when every word has the same length (zero range).
    (u.len - k.len_lo) / NULLIF(CAST(k.len_hi - k.len_lo AS REAL), 0.0) AS len_norm
FROM usable u
CROSS JOIN bounds k;
-- 2) Build the character frequency table letter_freq.
--    The table is emptied and refilled on every run of this script.
CREATE TABLE IF NOT EXISTS letter_freq (
    ch  TEXT PRIMARY KEY,
    cnt INTEGER NOT NULL
);
DELETE FROM letter_freq;
-- Explode every word into one row per character position (lower-cased),
-- then count how often each character occurs across all words.
WITH RECURSIVE exploded(src, idx, letter) AS (
    SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1))
    FROM frequency
    WHERE Word IS NOT NULL AND LENGTH(Word) > 0
    UNION ALL
    SELECT src, idx + 1, LOWER(SUBSTR(src, idx + 1, 1))
    FROM exploded
    WHERE idx < LENGTH(src)
)
INSERT INTO letter_freq(ch, cnt)
SELECT letter, COUNT(*)
FROM exploded
WHERE letter <> '' -- basic guard against empty characters
GROUP BY letter;
-- 3) View letter_rarity: letter rarity per word, in 0..1.
--    For every character of a word, rarity = 1 - (count of that character /
--    total character count in letter_freq); the word's score is the average
--    over its characters. The view reads letter_freq at query time, so that
--    table must be populated first. Characters without a letter_freq row are
--    dropped by the inner join.
DROP VIEW IF EXISTS letter_rarity;
CREATE VIEW letter_rarity AS
-- RECURSIVE is spelled out because word_chars references itself; this also
-- matches the letter_freq build statement earlier in this script.
WITH RECURSIVE total AS (
    SELECT CAST(SUM(cnt) AS REAL) AS total_cnt
    FROM letter_freq
),
word_chars(word, pos, ch) AS (
    -- One row per character position of every word, lower-cased.
    SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1))
    FROM frequency
    WHERE Word IS NOT NULL AND LENGTH(Word) > 0
    UNION ALL
    SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1))
    FROM word_chars
    WHERE pos < LENGTH(word)
)
SELECT
    wc.word AS Word,
    AVG(1.0 - (lf.cnt / t.total_cnt)) AS letter_rarity_norm
FROM word_chars wc
JOIN letter_freq lf ON lf.ch = wc.ch
CROSS JOIN total t
GROUP BY wc.word;
-- 4) View word_complexity_v1: weighted complexity score per word.
--    Weights sum to 1.0: frequency 0.35, length 0.20, n-gram probability 0.20,
--    letter rarity 0.15, morphology 0.10. The n-gram and morphology terms are
--    placeholders fixed at 0.5.
DROP VIEW IF EXISTS word_complexity_v1;
CREATE VIEW word_complexity_v1 AS
SELECT
    n.Word,
    -- freq_norm / len_norm are NULL in word_norm when all words share the same
    -- Zipf value or the same length (zero range). Without a fallback a single
    -- NULL term would make the whole score NULL, so use the neutral 0.5 that
    -- is already applied to a missing letter rarity.
    (0.35 * COALESCE(n.freq_norm, 0.5)) +
    (0.20 * COALESCE(n.len_norm, 0.5)) +
    (0.20 * 0.5) + -- n-gram probability placeholder
    (0.15 * COALESCE(r.letter_rarity_norm, 0.5)) +
    (0.10 * 0.5) -- morphology placeholder
    AS complexity
FROM word_norm n
LEFT JOIN letter_rarity r USING (Word);

1
.gitignore vendored
View File

@@ -6,3 +6,4 @@ target/
out/puzzle/
*.log
.output.txt
out/

142
import.py Normal file
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
import argparse
import json
import re
import sqlite3
from pathlib import Path
RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$")
RE_SPACE = re.compile(r"\s+")
RE_PARENS = re.compile(r"\s*\([^)]*\)\s*")  # remove (labels)
RE_BRACKETS = re.compile(r"\s*\[[^]]*]\s*")  # remove [labels]
# Typographic apostrophe variants that are folded to a plain ASCII apostrophe.
_APOSTROPHES = str.maketrans(
    {"\u2018": "'", "\u2019": "'", "\u02bc": "'", "\u0092": "'"}
)


def clean_hint(s: str) -> str:
    """Normalize a raw gloss into a compact hint string.

    Strips bracketed and parenthesized labels, folds typographic apostrophes
    to ``'``, collapses whitespace runs to a single space, and trims
    surrounding spaces, tabs and the punctuation ``-;:,.``.

    Args:
        s: Raw gloss text.

    Returns:
        The cleaned hint; may be empty when nothing but labels was present.
    """
    s = s.strip()
    s = RE_BRACKETS.sub(" ", s)
    s = RE_PARENS.sub(" ", s)
    # The search string here used to be empty, which made str.replace insert
    # an apostrophe between every character. Only real apostrophe variants
    # are mapped now.
    s = s.translate(_APOSTROPHES)
    return RE_SPACE.sub(" ", s).strip(" -;:,.\t")
def pick_gloss(obj: dict) -> tuple[str | None, str | None]:
    """Pick a hint and part of speech from one Wiktextract JSON object.

    Senses are scanned in order and, within each sense, glosses in order.
    The first string gloss whose cleaned form has at least 3 characters wins.

    Args:
        obj: Parsed JSON object; the keys ``pos`` and ``senses`` are read.

    Returns:
        ``(hint, pos)`` where ``hint`` is ``None`` if no gloss qualified and
        ``pos`` is whatever ``obj`` holds under ``"pos"`` (``None`` if absent).
    """
    # Lazy pipeline: nothing past the first acceptable gloss gets cleaned.
    cleaned = (
        clean_hint(gloss)
        for sense in obj.get("senses") or []
        for gloss in sense.get("glosses") or []
        if isinstance(gloss, str)
    )
    hint = next((text for text in cleaned if len(text) >= 3), None)
    return hint, obj.get("pos")
def main():
    """Import Wiktextract glosses from a JSONL file into the SQLite ``hints`` table.

    Command-line arguments:
        --db       path to the SQLite database (required)
        --jsonl    path to the Wiktextract JSONL dump (required)
        --minlen   minimum word length to keep (default 2)
        --maxlen   maximum word length to keep (default 8)
        --maxhint  maximum hint length in characters (default 80)

    Only entries whose ``lang_code`` is missing or ``"nl"``, whose upper-cased
    word is pure ASCII letters within the length bounds, and for which
    ``pick_gloss`` finds a hint are stored. Rows are written in batches with
    ``INSERT OR IGNORE`` inside one transaction that is committed at the end.
    If anything fails, the connection is closed without committing, so the
    partial import is discarded.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="pad naar jouw sqlite db")
    ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl")
    ap.add_argument("--minlen", type=int, default=2)
    ap.add_argument("--maxlen", type=int, default=8)
    ap.add_argument("--maxhint", type=int, default=80)
    args = ap.parse_args()

    db_path = Path(args.db)
    jsonl_path = Path(args.jsonl)
    batch_size = 2000
    insert_sql = (
        "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)"
    )

    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()
        # Speed pragmas (only relevant during the import).
        cur.execute("PRAGMA journal_mode=WAL;")
        cur.execute("PRAGMA synchronous=NORMAL;")
        cur.execute("PRAGMA temp_store=MEMORY;")
        cur.execute("""
            CREATE TABLE IF NOT EXISTS hints (
                word TEXT NOT NULL,
                hint TEXT NOT NULL,
                source TEXT NOT NULL DEFAULT 'wiktionary',
                pos TEXT,
                quality INTEGER NOT NULL DEFAULT 80,
                PRIMARY KEY (word, hint, source)
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);")
        con.commit()

        def flush(rows: list) -> int:
            """Write the pending rows, empty the list, return the reported row count."""
            if not rows:
                return 0
            cur.executemany(insert_sql, rows)
            written = cur.rowcount if cur.rowcount != -1 else 0
            rows.clear()
            return written

        batch = []
        inserted = 0
        seen = 0  # rows queued for insert, not raw input lines
        con.execute("BEGIN;")
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line that is not an object cannot be an entry.
                if not isinstance(obj, dict):
                    continue
                # Kaikki/Wiktextract: often lang_code = "nl" and lang = "Dutch".
                lang_code = obj.get("lang_code")
                if lang_code and lang_code != "nl":
                    continue
                word = obj.get("word")
                if not word or not isinstance(word, str):
                    continue
                word_up = word.upper().strip()
                if not (args.minlen <= len(word_up) <= args.maxlen):
                    continue
                if not RE_ASCII_WORD.match(word_up):
                    continue
                hint, pos = pick_gloss(obj)
                if not hint:
                    continue
                # Keep the hint short.
                hint = hint[: args.maxhint].rstrip()
                # Simple quality score: slightly higher when the POS is known.
                quality = 85 if pos else 80
                batch.append((word_up, hint, "wiktionary", pos, quality))
                seen += 1
                if len(batch) >= batch_size:
                    inserted += flush(batch)
        inserted += flush(batch)
        con.commit()
    finally:
        # Closing without a commit drops any uncommitted rows, so a failed
        # run neither leaks the connection nor leaves a half-written import.
        con.close()
    print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).")


if __name__ == "__main__":
    main()

27831
out/pool.txt

File diff suppressed because it is too large Load Diff

View File

@@ -1,38 +0,0 @@
Date: 2025-12-22
Feeds: https://feeds.nos.nl/nosnieuwsalgemeen, https://feeds.nos.nl/nosnieuwstech
Model: mistralai/mistral-nemo-instruct-2407
Master size: 91871
Theme kept (in master): 0
Bridge size: 21125
Shorts kept: 130
Pool total: 27831
Enforced minima:
2: 1000
3: 1000
4: 1000
5: 1000
6: 2000
7: 2000
8: 2000
Counts per length (theme):
2: 0
3: 0
4: 0
5: 0
6: 0
7: 0
8: 0
Counts per length (pool):
2: 242
3: 1000
4: 1000
5: 1000
6: 2000
7: 2000
8: 20589

File diff suppressed because one or more lines are too long

View File

View File

@@ -0,0 +1,220 @@
package puzzle;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * Attaches clues to the words of an exported puzzle.
 *
 * <p>Clues come from a prebuilt CSV file first; words without a prebuilt clue
 * are sent in one prompt to a local Ollama chat endpoint (invoked via the
 * {@code curl} executable). Words that still have no clue fall back to the
 * word itself.
 */
public class ClueGenerator {

    private static final String OLLAMA_URL = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b";
    private static final String HINTS_FILE = "export_with_hints.csv";

    /** Upper-cased word -> clue, loaded lazily from HINTS_FILE; null until first use. */
    private static Map<String, String> prebuiltClues = null;

    /**
     * Loads HINTS_FILE once. Each line is split on the first two commas;
     * column 1 is the word, column 3 the clue (optionally wrapped in double
     * quotes with doubled inner quotes). A read failure is reported on stderr
     * and leaves an empty map, so the load is not retried.
     */
    private static synchronized void ensurePrebuiltCluesLoaded() {
        if (prebuiltClues != null) return;
        Map<String, String> loaded = new HashMap<>();
        try {
            List<String> lines = Files.readAllLines(Path.of(HINTS_FILE), StandardCharsets.UTF_8);
            for (String line : lines) {
                String[] parts = line.split(",", 3);
                if (parts.length < 3) continue;
                String word = parts[0].trim().toUpperCase(Locale.ROOT);
                String rawClue = parts[2].trim();
                // The length check keeps a lone '"' from hitting substring(1, 0).
                if (rawClue.length() >= 2 && rawClue.startsWith("\"") && rawClue.endsWith("\"")) {
                    rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\"");
                }
                if (!word.isEmpty() && !rawClue.isEmpty()) {
                    loaded.put(word, rawClue);
                }
            }
        } catch (IOException e) {
            System.err.println("Warning: " + HINTS_FILE + " not found or could not be read.");
        }
        prebuiltClues = loaded;
    }

    /**
     * Returns a copy of {@code puzzle} in which every word carries a clue.
     *
     * @param puzzle exported puzzle; returned unchanged when null or without words
     * @return a new puzzle with the same grid, difficulty and rewards, and clues filled in
     */
    public static ExportFormat.ExportedPuzzle applyClues(ExportFormat.ExportedPuzzle puzzle) {
        if (puzzle == null || puzzle.words().isEmpty()) {
            return puzzle;
        }
        ensurePrebuiltCluesLoaded();

        Map<String, String> finalClueMap = new HashMap<>();
        List<String> wordsMissingClues = new ArrayList<>();
        for (var w : puzzle.words()) {
            String prebuilt = prebuiltClues.get(w.word().toUpperCase(Locale.ROOT));
            if (prebuilt != null) {
                finalClueMap.put(w.word(), prebuilt);
            } else {
                wordsMissingClues.add(w.word());
            }
        }
        if (!wordsMissingClues.isEmpty()) {
            finalClueMap.putAll(generateClues(wordsMissingClues));
        }

        List<ExportFormat.WordOut> wordsWithClues = new ArrayList<>();
        for (var w : puzzle.words()) {
            // Last resort: the word itself is used as its clue.
            String clue = finalClueMap.getOrDefault(w.word(), w.word());
            wordsWithClues.add(new ExportFormat.WordOut(
                    w.word(),
                    clue,
                    w.startRow(),
                    w.startCol(),
                    w.direction(),
                    w.answer(),
                    w.arrowRow(),
                    w.arrowCol(),
                    w.isReversed()
            ));
        }
        return new ExportFormat.ExportedPuzzle(puzzle.gridv2(), wordsWithClues, puzzle.difficulty(), puzzle.rewards());
    }

    /**
     * Asks the chat model for clues for all given words in a single request.
     *
     * @param words words that need a clue
     * @return word -> clue for the words the reply covered; empty when
     *         {@code words} is null/empty, the reply has no content, or the call fails
     */
    public static Map<String, String> generateClues(List<String> words) {
        if (words == null || words.isEmpty()) {
            return Collections.emptyMap();
        }
        String prompt = createCluePrompt(words);
        try {
            // Sampling parameters such as temperature belong inside "options"
            // for the Ollama chat endpoint, not at the top level of the request.
            String jsonRequest = String.format(
                    "{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"%s\"}],\"stream\":false,\"options\":{\"temperature\":0.7}}",
                    MODEL, escapeJson(prompt)
            );
            String responseBody = curlPostJson(OLLAMA_URL, jsonRequest, 120);
            String content = extractChatContent(responseBody);
            if (content == null || content.isEmpty()) {
                return Collections.emptyMap();
            }
            return parseCluesFromReply(words, content);
        } catch (Exception e) {
            System.err.println("Failed to generate clues: " + e.getMessage());
            return Collections.emptyMap();
        }
    }

    /** Builds the Dutch prompt that asks for one "word:clue" line per word. */
    private static String createCluePrompt(List<String> words) {
        return "Je bent een expert in het maken van kruiswoordpuzzels. Geef voor elk van de onderstaande woorden een korte, uitdagende maar duidelijke cryptische of beschrijvende aanwijzing in het Nederlands.\n\n" +
                "Output ALLEEN in dit formaat:\n" +
                "woord1:aanwijzing\n" +
                "woord2:aanwijzing\n\n" +
                "GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
                "Lijst:\n" +
                String.join("\n", words);
    }

    /**
     * Parses "word:clue" lines from the model reply. Leading list markers
     * (digits, dots, parentheses, asterisks, dashes, whitespace) before the
     * word are removed, and words are matched case-insensitively.
     *
     * @return expected word -> clue, only for words found in the reply
     */
    private static Map<String, String> parseCluesFromReply(List<String> expectedWords, String reply) {
        Map<String, String> wordClueMap = new HashMap<>();
        for (String rawLine : reply.split("\n")) {
            String line = rawLine.trim();
            int colon = line.indexOf(':');
            if (colon < 0) continue;
            // Locale.ROOT keeps case folding independent of the default locale,
            // matching the upper-casing used for the prebuilt clues.
            String wordPart = line.substring(0, colon).trim()
                    .replaceAll("^[\\d+.)*\\-\\s]+", "")
                    .toLowerCase(Locale.ROOT);
            String clue = line.substring(colon + 1).trim();
            if (!clue.isEmpty()) {
                wordClueMap.put(wordPart, clue);
            }
        }
        Map<String, String> results = new HashMap<>();
        for (String word : expectedWords) {
            String clue = wordClueMap.get(word.toLowerCase(Locale.ROOT));
            if (clue != null) {
                results.put(word, clue);
            }
        }
        return results;
    }

    /**
     * POSTs {@code jsonBody} to {@code url} by running {@code curl}; the body
     * is passed through a temporary file that is deleted afterwards.
     *
     * @return the process output (stdout and stderr merged) decoded as UTF-8
     * @throws IOException when curl exits with a non-zero code
     */
    private static String curlPostJson(String url, String jsonBody, int timeoutSeconds) throws Exception {
        var tempFile = Files.createTempFile("clue-request-", ".json");
        try {
            Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
            List<String> cmd = List.of(
                    "curl",
                    "-fsSL",
                    "--connect-timeout", "10",
                    "--max-time", String.valueOf(timeoutSeconds),
                    "-H", "Content-Type: application/json",
                    "-d", "@" + tempFile,
                    url
            );
            var p = new ProcessBuilder(cmd)
                    .redirectErrorStream(true)
                    .start();
            var bytes = p.getInputStream().readAllBytes();
            var code = p.waitFor();
            var output = new String(bytes, StandardCharsets.UTF_8);
            if (code != 0) {
                throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" + output);
            }
            return output;
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

    /**
     * Extracts the first {@code "content"} string value from a chat response
     * without a JSON parser. The search starts at {@code "choices"} when that
     * key is present and otherwise at the beginning of the document.
     * JSON escapes, including {@code \\uXXXX}, are decoded.
     *
     * @return the decoded content, or null when no content string is found
     */
    private static String extractChatContent(String json) {
        if (json == null) return null;
        var choices = json.indexOf("\"choices\"");
        var i = json.indexOf("\"content\"", Math.max(choices, 0));
        if (i < 0) {
            // Ollama /api/chat returns {"model":"...","message":{"role":"assistant","content":"..."}}
            i = json.indexOf("\"content\"");
            if (i < 0) return null;
        }
        var colon = json.indexOf(':', i);
        if (colon < 0) return null;
        var q = json.indexOf('"', colon + 1);
        if (q < 0) return null;

        var sb = new StringBuilder();
        var esc = false;
        for (var k = q + 1; k < json.length(); k++) {
            var ch = json.charAt(k);
            if (!esc) {
                if (ch == '\\') esc = true;
                else if (ch == '"') break;
                else sb.append(ch);
                continue;
            }
            esc = false;
            switch (ch) {
                case 'n': sb.append('\n'); break;
                case 't': sb.append('\t'); break;
                case 'r': sb.append('\r'); break;
                case 'b': sb.append('\b'); break;
                case 'f': sb.append('\f'); break;
                case 'u': {
                    // Decode the four hex digits that follow; a malformed
                    // escape keeps the 'u' literally, as before.
                    int cp = parseHex4(json, k + 1);
                    if (cp >= 0) {
                        sb.append((char) cp);
                        k += 4;
                    } else {
                        sb.append(ch);
                    }
                    break;
                }
                default: sb.append(ch); // covers \" \\ \/
            }
        }
        return sb.toString();
    }

    /** Returns the value of the 4 hex digits at {@code from}, or -1 if they are missing or invalid. */
    private static int parseHex4(String s, int from) {
        if (from + 4 > s.length()) return -1;
        int value = 0;
        for (int i = from; i < from + 4; i++) {
            int digit = Character.digit(s.charAt(i), 16);
            if (digit < 0) return -1;
            value = (value << 4) | digit;
        }
        return value;
    }

    /**
     * Escapes a string for embedding inside a JSON string literal: backslash,
     * double quote and every control character below U+0020.
     */
    private static String escapeJson(String str) {
        var sb = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            switch (ch) {
                case '\\': sb.append("\\\\"); break;
                case '"': sb.append("\\\""); break;
                case '\n': sb.append("\\n"); break;
                case '\r': sb.append("\\r"); break;
                case '\t': sb.append("\\t"); break;
                default:
                    if (ch < 0x20) {
                        sb.append(String.format("\\u%04x", (int) ch));
                    } else {
                        sb.append(ch);
                    }
            }
        }
        return sb.toString();
    }
}

View File

@@ -31,7 +31,7 @@ public class DailyGenerator {
public static void main(String[] args) {
var outDir = env("OUT_DIR", "/home/mike/dev/puzzle-generator/data/");
var wordsPath = env("WORDS_PATH", "./word-list.txt");
var wordsPath = env("WORDS_PATH", "./export_words_only.txt");
var puzzlesPerDay = envInt("PUZZLES_PER_DAY", 3);
var seed = envInt("SEED", (int) System.currentTimeMillis());
var themeFilter = envBool("THEME_FILTER", true);
@@ -119,6 +119,10 @@ public class DailyGenerator {
result, 1, new ExportFormat.Rewards(50, 2, 1)
);
// Generate clues via LLM
System.out.println("Generating clues for " + exported.words().size() + " words...");
exported = ClueGenerator.applyClues(exported);
// Write to JSON file
var filename = String.format("crossword_%s_%02d_%s.json", dateStr, i, safeSlug(theme));
var outputPath = Paths.get(outDir, filename);

View File

@@ -7,7 +7,10 @@ import java.nio.file.Paths;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
public class Main {
// ---------------- CLI ----------------
@@ -15,7 +18,7 @@ public class Main {
public static class Opts {
public int seed = 1;
public int pop = 18;
public int gens = 200;
public int gens = 1000;
public int tries = 5;
public String wordsPath = "./out/pool.txt";
public double minSimplicity = 0; // 0 means no limit
@@ -74,6 +77,11 @@ public class Main {
System.out.println(SwedishGenerator.renderHuman(res.filled().grid));
System.out.printf(Locale.ROOT, "Puzzle Simplicity: %.2f%n", res.filled().simplicity);
var out = ExportFormat.exportFormatFromFilled(res, 1, new ExportFormat.Rewards(50, 2, 1));
// Generate clues via LLM
System.out.println("Generating clues for " + out.words().size() + " words...");
out = ClueGenerator.applyClues(out);
System.out.println("gridv2:");
for (String row : out.gridv2()) System.out.println(row);
System.out.println("words: " + out.words().size());

View File

@@ -153,7 +153,7 @@ public class SwedishGenerator {
static Map<String, Integer> loadScores() {
var scores = new HashMap<String, Integer>();
try {
var lines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
var lines = Files.readAllLines(Path.of("export_words.csv"), StandardCharsets.UTF_8);
var first = true;
for (var line : lines) {
if (first) {
@@ -161,16 +161,11 @@ public class SwedishGenerator {
continue;
}
var parts = line.split(",");
if (parts.length >= 3) {
if (parts.length >= 2) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scores.put(word, score);
} else {
System.err.println("Skipping:" +Arrays.toString( parts));
}
} catch (NumberFormatException ignored) {
System.err.println("Illegal number format: " + line);
}

View File

@@ -56,7 +56,7 @@ public class ThemePoolBuilderLength {
static final class Opts {
String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
String wordsPath = "/home/mike/dev/puzzle-generator/export_words_only.txt";
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
String outDir = "./out";
@@ -303,10 +303,10 @@ public class ThemePoolBuilderLength {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
// var status = parts[2].trim();
// if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
}
//}
} catch (NumberFormatException ignored) { }
}
}
@@ -731,7 +731,7 @@ public class ThemePoolBuilderLength {
// Optionally filter out VERY complex words from the bridge (e.g. lScore < 3)
// But since we sort by score (which is now dominated by lScore),
// they will be at the very bottom anyway.
if (lex.score[i] < 800) continue;
// if (lex.score[i] < 800) continue;
ids.add(i);
}
@@ -774,8 +774,8 @@ public class ThemePoolBuilderLength {
var out = new ArrayList<String>(ids.size());
for (var id : ids) {
/* if (lex.score[id] < 680)
continue;*/
if (lex.score[id] < 680)
continue;
out.add(lex.words.get(id));
}
Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);