Gather data

This commit is contained in:
mike
2025-12-25 04:38:16 +01:00
parent 49a1aa4152
commit 541e101ae0
26 changed files with 1539 additions and 185988 deletions

View File

@@ -10,7 +10,7 @@ public class ClueGenerator {
private static final String OLLAMA_URL = "http://localhost:11434/api/chat";
private static final String MODEL = "qwen2.5:14b";
private static final String HINTS_FILE = "export_with_hints.csv";
private static final String HINTS_FILE = "/data/puzzle/export_with_hints.csv";
private static Map<String, String> prebuiltClues = null;
private static synchronized void ensurePrebuiltCluesLoaded() {

View File

@@ -1,6 +1,5 @@
package puzzle;
import puzzle.DutchWordScorer.WordScore;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.*;

View File

@@ -29,9 +29,9 @@ public class DailyGenerator {
return "true".equalsIgnoreCase(val) || "1".equals(val);
}
public static void main(String[] args) {
static void main(String[] args) {
var outDir = env("OUT_DIR", "/home/mike/dev/puzzle-generator/data/");
var wordsPath = env("WORDS_PATH", "./export_words_only.txt");
var wordsPath = env("WORDS_PATH", "/data/puzzle/export_with_hints.csv");
var puzzlesPerDay = envInt("PUZZLES_PER_DAY", 3);
var seed = envInt("SEED", (int) System.currentTimeMillis());
var themeFilter = envBool("THEME_FILTER", true);
@@ -49,7 +49,7 @@ public class DailyGenerator {
// Load word list
SwedishGenerator.Dict dict;
var llmScores = SwedishGenerator.loadScores();
var llmScores = SwedishGenerator.loadScores();
try {
dict = SwedishGenerator.loadWords(wordsPath, llmScores);
System.out.println("Loaded " + dict.words.size() + " words");
@@ -82,7 +82,6 @@ public class DailyGenerator {
// Filter word list by theme
List<String> filteredWords = dict.words;
if (themeFilter && !theme.equals("algemeen")) {
filteredWords = ThemeGraph.filterByTheme(dict.words, theme, themeMinScore);
System.out.println("Filtered to " + filteredWords.size() + " words for theme '" + theme + "'");
// If too few words, fall back to general
@@ -98,11 +97,11 @@ public class DailyGenerator {
// Generate puzzle
var opts = new Main.Opts();
opts.seed = seed + i;
opts.pop = 18;
opts.gens = 100;
opts.tries = 50;
opts.wordsPath = wordsPath;
opts.seed = seed + i;
opts.pop = 18;
opts.gens = 100;
opts.tries = 50;
opts.wordsPath = wordsPath;
opts.minSimplicity = 0; // default
var result = generateWithFilteredDict(opts, themedDict, llmScores);

View File

@@ -1,229 +0,0 @@
package puzzle;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Ollama Dutch wordlist scorer.
 *
 * <p>Reads {@code word-list.txt}, sends the words in batches to a local Ollama
 * chat endpoint, asks the model for a 1-10 popularity score per word, and writes
 * the result to {@code word_scores.csv} as {@code word,score,status} rows.
 * Words whose batch failed or that the model did not score are written with a
 * score of {@code -1} and a status of {@code FAILED} or {@code MISSING}.
 */
public class DutchWordScorer {
    // Configuration
    private static final String OLLAMA_ENDPOINT = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b"; // or "llama3.1:latest"
    private static final int BATCH_SIZE = 50; // Words per API call
    private static final int RATE_LIMIT_DELAY_MS = 500; // Be nice to local Ollama
    private static final int MAX_RETRIES = 3;
    // Input/output files
    private static final String INPUT_WORDLIST = "word-list.txt";
    private static final String OUTPUT_SCORES = "word_scores.csv";

    /**
     * Scores every non-blank line of the input word list and writes the CSV.
     *
     * @param args unused
     * @throws Exception if the word list cannot be read, the CSV cannot be
     *                   written, or the thread is interrupted while waiting
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting Dutch wordlist scoring...");

        // Read all words; blank lines carry no word and would only waste a prompt slot.
        List<String> words = new ArrayList<>();
        for (String line : Files.readAllLines(Paths.get(INPUT_WORDLIST))) {
            String word = line.strip();
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
        System.out.printf("Loaded %d words from %s%n", words.size(), INPUT_WORDLIST);

        // Process in batches
        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(java.time.Duration.ofSeconds(30))
                .build();
        List<WordScore> allScores = new ArrayList<>();
        for (int i = 0; i < words.size(); i += BATCH_SIZE) {
            int end = Math.min(i + BATCH_SIZE, words.size());
            List<String> batch = words.subList(i, end);
            System.out.printf("Processing batch %d-%d...%n", i + 1, end);

            boolean success = false;
            int retries = 0;
            while (!success && retries < MAX_RETRIES) {
                try {
                    List<WordScore> batchScores = processBatch(client, batch);
                    allScores.addAll(batchScores);
                    success = true;
                    // Rate limiting (not needed after the final batch)
                    if (end < words.size()) {
                        Thread.sleep(RATE_LIMIT_DELAY_MS);
                    }
                } catch (InterruptedException e) {
                    // An interrupt is a request to stop, not a failed batch: restore the flag and bail out.
                    Thread.currentThread().interrupt();
                    throw e;
                } catch (Exception e) {
                    retries++;
                    System.err.printf("Batch %d-%d failed (attempt %d/%d): %s%n",
                            i + 1, end, retries, MAX_RETRIES, e.getMessage());
                    if (retries >= MAX_RETRIES) {
                        System.err.println("Max retries reached, skipping batch");
                        // Add placeholder scores for the failed batch so every input word has a row
                        batch.forEach(w -> allScores.add(new WordScore(w, -1, "FAILED")));
                    } else {
                        Thread.sleep(2000L * retries); // Linear backoff: 2 s, 4 s, ...
                    }
                }
            }
        }

        // Write results
        writeScoresToCsv(allScores);
        System.out.printf("Completed! Scored %d words. Results saved to %s%n",
                allScores.size(), OUTPUT_SCORES);
    }

    /**
     * Sends one batch of words to the chat endpoint and parses the scores.
     *
     * @param client HTTP client used for the request
     * @param batch  words to score
     * @return one {@link WordScore} per word in {@code batch}, in the same order
     * @throws Exception on I/O failure, interruption, a non-200 status, or a
     *                   response without a readable {@code content} string
     */
    public static List<WordScore> processBatch(HttpClient client, List<String> batch) throws Exception {
        String prompt = createScoringPrompt(batch);
        // Build JSON request
        String jsonRequest = buildChatRequestJson(prompt);
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(OLLAMA_ENDPOINT))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(jsonRequest))
                .build();
        HttpResponse<String> response = client.send(request,
                HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() != 200) {
            throw new RuntimeException("HTTP " + response.statusCode() + ": " + response.body());
        }
        // Parse response
        String assistantReply = extractMessageContent(response.body());
        return parseScoresFromReply(batch, assistantReply);
    }

    /** Builds the (Dutch) instruction prompt followed by the words, one per line. */
    private static String createScoringPrompt(List<String> words) {
        return """
                Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.
                Score criteria:
                - 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon
                - 10 = Zeer algemeen, dagelijks gebruikt door iedereen
                Geef ALLEEN een lijst in dit exacte formaat, niets anders:
                woord1:score
                woord2:score
                enz.
                Woorden om te scoren:
                """ + String.join("\n", words);
    }

    /**
     * Builds the JSON body for a non-streaming chat request.
     * Sampling parameters are placed under {@code options}, which is where the
     * Ollama API reads them; a top-level {@code temperature} is not applied.
     */
    private static String buildChatRequestJson(String prompt) {
        // Simple JSON building (in production use a library like Jackson)
        return String.format("""
                {
                  "model": "%s",
                  "messages": [
                    {
                      "role": "user",
                      "content": "%s"
                    }
                  ],
                  "stream": false,
                  "options": {
                    "temperature": 0.1
                  }
                }
                """, MODEL, escapeJson(prompt));
    }

    /** Escapes a string for use inside a JSON string literal, including all control characters. */
    private static String escapeJson(String str) {
        StringBuilder out = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '\\' -> out.append("\\\\");
                case '"' -> out.append("\\\"");
                case '\n' -> out.append("\\n");
                case '\r' -> out.append("\\r");
                case '\t' -> out.append("\\t");
                default -> {
                    if (c < 0x20) {
                        // JSON forbids raw control characters inside strings.
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
                }
            }
        }
        return out.toString();
    }

    /**
     * Extracts and unescapes the first {@code "content"} string value of the response,
     * i.e. the assistant reply in {@code "message":{"role":"assistant","content":"..."}}.
     *
     * @throws IllegalStateException if no well-formed {@code content} string is present
     */
    private static String extractMessageContent(String responseBody) {
        final String key = "\"content\"";
        int keyAt = responseBody.indexOf(key);
        if (keyAt < 0) {
            throw new IllegalStateException("No \"content\" field in response: " + responseBody);
        }
        int len = responseBody.length();
        int i = keyAt + key.length();
        while (i < len && Character.isWhitespace(responseBody.charAt(i))) i++;
        if (i >= len || responseBody.charAt(i) != ':') {
            throw new IllegalStateException("Malformed \"content\" field in response: " + responseBody);
        }
        i++;
        while (i < len && Character.isWhitespace(responseBody.charAt(i))) i++;
        if (i >= len || responseBody.charAt(i) != '"') {
            throw new IllegalStateException("\"content\" is not a string in response: " + responseBody);
        }
        i++;

        // Walk the string value, honoring backslash escapes so an escaped quote does not end it.
        StringBuilder out = new StringBuilder();
        while (i < len) {
            char c = responseBody.charAt(i++);
            if (c == '"') {
                return out.toString();
            }
            if (c != '\\') {
                out.append(c);
                continue;
            }
            if (i >= len) {
                break;
            }
            char esc = responseBody.charAt(i++);
            switch (esc) {
                case 'n' -> out.append('\n');
                case 'r' -> out.append('\r');
                case 't' -> out.append('\t');
                case 'b' -> out.append('\b');
                case 'f' -> out.append('\f');
                case 'u' -> {
                    if (i + 4 > len) {
                        throw new IllegalStateException("Truncated unicode escape in response");
                    }
                    out.append((char) Integer.parseInt(responseBody.substring(i, i + 4), 16));
                    i += 4;
                }
                default -> out.append(esc); // \" \\ \/
            }
        }
        throw new IllegalStateException("Unterminated \"content\" string in response");
    }

    /**
     * Parses {@code word:score} lines from the model reply and matches them to the
     * expected words (case-insensitively), preserving the input order.
     * Scores are clamped to 1-10; unparsable lines are skipped; words without a
     * score are reported on stderr and returned with {@code -1}/{@code MISSING}.
     */
    private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
        Map<String, Integer> wordScoreMap = new HashMap<>();
        for (String rawLine : reply.split("\n")) {
            String line = rawLine.trim();
            int colon = line.indexOf(':');
            if (colon < 0) {
                continue;
            }
            // Locale.ROOT keeps the lookup key stable regardless of the JVM default locale.
            String word = line.substring(0, colon).trim().toLowerCase(Locale.ROOT);
            try {
                int score = Integer.parseInt(line.substring(colon + 1).trim());
                wordScoreMap.put(word, Math.max(1, Math.min(10, score)));
            } catch (NumberFormatException ignored) {
                // Not a "word:score" line (e.g. model chatter); skip it.
            }
        }

        // Match scores to original words (maintaining order)
        List<WordScore> results = new ArrayList<>(expectedWords.size());
        for (String word : expectedWords) {
            Integer score = wordScoreMap.get(word.toLowerCase(Locale.ROOT));
            if (score != null) {
                results.add(new WordScore(word, score, "OK"));
            } else {
                System.err.printf("Warning: No score found for '%s'%n", word);
                results.add(new WordScore(word, -1, "MISSING"));
            }
        }
        return results;
    }

    /** Writes all scores to {@code word_scores.csv} with a {@code word,score,status} header. */
    private static void writeScoresToCsv(List<WordScore> scores) throws Exception {
        List<String> lines = new ArrayList<>(scores.size() + 1);
        lines.add("word,score,status");
        for (WordScore ws : scores) {
            lines.add(String.format("%s,%d,%s", ws.word, ws.score, ws.status));
        }
        Files.write(Paths.get(OUTPUT_SCORES), lines);
    }

    // ===== DATA CLASS =====
    /**
     * Score of a single word. {@code endpoint} and {@code batchId} are only set by
     * the five-argument constructor; otherwise they stay {@code null} and {@code 0}.
     */
    static class WordScore {
        String word;
        int score;
        String status;
        String endpoint;
        int batchId;

        WordScore(String word, int score, String status, String endpoint, int batchId) {
            this.word = word;
            this.score = score;
            this.status = status;
            this.endpoint = endpoint;
            this.batchId = batchId;
        }

        WordScore(String word, int score, String status) {
            this.word = word;
            this.score = score;
            this.status = status;
        }
    }
}

View File

@@ -1,4 +1,5 @@
package puzzle;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -7,10 +8,7 @@ import java.nio.file.Paths;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
public class Main {
// ---------------- CLI ----------------
@@ -18,9 +16,9 @@ public class Main {
public static class Opts {
public int seed = 1;
public int pop = 18;
public int gens = 1000;
public int gens = 500;
public int tries = 5;
public String wordsPath = "./out/pool.txt";
public String wordsPath = "/data/puzzle/pool.txt";
public double minSimplicity = 0; // 0 means no limit
}
@@ -34,7 +32,7 @@ public class Main {
--pop 18
--gens 100
--tries 50
--words ./out/pool.txt
--words /data/pool.txt
--min-simplicity 0 (no limit)
""");
}

View File

@@ -153,18 +153,18 @@ public class SwedishGenerator {
static Map<String, Integer> loadScores() {
var scores = new HashMap<String, Integer>();
try {
var lines = Files.readAllLines(Path.of("export_words.csv"), StandardCharsets.UTF_8);
var lines = Files.readAllLines(Path.of("/data/puzzle/export_with_hints.csv"), StandardCharsets.UTF_8);
var first = true;
for (var line : lines) {
if (first) {
first = false;
continue;
}
var parts = line.split(",");
var parts = line.split(",",3);
if (parts.length >= 2) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var score = 10-Integer.parseInt(parts[1].trim());
scores.put(word, score);
} catch (NumberFormatException ignored) {
System.err.println("Illegal number format: " + line);
@@ -200,7 +200,8 @@ public class SwedishGenerator {
var words = new ArrayList<WordDifficulty>();
for (var line : raw.split("\\R")) {
var s = line.trim().toUpperCase(Locale.ROOT);
var word = line.split(",",3)[0].trim();
var s = word.trim().toUpperCase(Locale.ROOT);
if (s.matches("^[A-Z]{2,8}$")) {
var score = llmScores.getOrDefault(s, 5); // Default to middle
words.add(new WordDifficulty(s, score));

View File

@@ -1,205 +0,0 @@
package puzzle;
import java.util.*;
/**
* ThemeGraph - Creates a graph between words and themes for filtering.
* Uses word embeddings approach: co-occurrence and semantic similarity.
*/
public class ThemeGraph {
// Predefined theme keywords for Dutch word filtering
private static final Map<String, Set<String>> THEME_KEYWORDS = new HashMap<>();
static {
// News/Politics
THEME_KEYWORDS.put("nieuws", Set.of(
"POLITIEK", "VERKIEZING", "MINISTER", "PARLEMENT", "WET", "BELEID",
"REGERING", "PARTIJ", "STEM", "KAMER", "RAAD", "STAAT"
));
// Technology
THEME_KEYWORDS.put("technologie", Set.of(
"COMPUTER", "INTERNET", "SOFTWARE", "APP", "DATA", "CODE",
"NETWERK", "SYSTEEM", "DIGITAAL", "TECH", "ROBOT", "AI"
));
// Sports
THEME_KEYWORDS.put("sport", Set.of(
"VOETBAL", "TENNIS", "WIELREN", "SPELER", "WEDSTRIJD", "TEAM",
"GOAL", "BAL", "SPEL", "WINNEN", "COACH", "ATLEET"
));
// Weather/Nature
THEME_KEYWORDS.put("weer", Set.of(
"REGEN", "ZON", "WIND", "WOLKEN", "STORM", "SNEEUW",
"WEER", "KLIMAAT", "NATUUR", "LUCHT", "WARMTE", "KOU"
));
// Economy
THEME_KEYWORDS.put("economie", Set.of(
"GELD", "EURO", "MARKT", "PRIJS", "KOPEN", "VERKOOP",
"BEDRIJF", "BANK", "HANDEL", "WINST", "SCHULD", "BUDGET"
));
// Health
THEME_KEYWORDS.put("gezondheid", Set.of(
"ZORG", "DOKTER", "MEDICIJN", "PATIENT", "ZIEKENHUIS", "GEZOND",
"VIRUS", "VACCIN", "THERAPIE", "BEHANDEL", "ARTS", "KLINIEK"
));
// General/Common
THEME_KEYWORDS.put("algemeen", Set.of(
"HUIS", "AUTO", "BOOM", "WATER", "MENS", "TIJD",
"LEVEN", "WERK", "SCHOOL", "FAMILIE", "STAD", "LAND"
));
}
/**
* Score a word against a theme (0.0 = no match, 1.0 = perfect match)
*/
public static double scoreWordTheme(String word, String theme) {
var keywords = THEME_KEYWORDS.get(theme.toLowerCase());
if (keywords == null) {
return 0.5; // unknown theme = neutral score
}
word = word.toUpperCase();
// Direct match
if (keywords.contains(word)) {
return 1.0;
}
// Substring match (partial relevance)
for (var kw : keywords) {
if (word.contains(kw) || kw.contains(word)) {
return 0.7;
}
}
// Edit distance similarity (for typos/variations)
for (var kw : keywords) {
var similarity = editDistanceSimilarity(word, kw);
if (similarity > 0.8) {
return similarity * 0.9;
}
}
return 0.0;
}
/**
* Filter word list by theme with minimum score threshold
*/
public static List<String> filterByTheme(List<String> words, String theme, double minScore) {
List<String> filtered = new ArrayList<>();
for (var word : words) {
var score = scoreWordTheme(word, theme);
if (score >= minScore) {
filtered.add(word);
}
}
return filtered;
}
/**
* Get theme suggestions for a word (sorted by score)
*/
public static List<ThemeScore> getThemesForWord(String word) {
List<ThemeScore> scores = new ArrayList<>();
for (var theme : THEME_KEYWORDS.keySet()) {
var score = scoreWordTheme(word, theme);
if (score > 0.0) {
scores.add(new ThemeScore(theme, score));
}
}
scores.sort(Comparator.comparingDouble(ts -> -ts.score));
return scores;
}
/**
* Auto-detect best theme from a word list
*/
public static String detectTheme(List<String> words) {
Map<String, Double> themeScores = new HashMap<>();
for (var theme : THEME_KEYWORDS.keySet()) {
double totalScore = 0;
for (var word : words) {
totalScore += scoreWordTheme(word, theme);
}
themeScores.put(theme, totalScore / words.size());
}
return themeScores.entrySet().stream()
.max(Comparator.comparingDouble(Map.Entry::getValue))
.map(Map.Entry::getKey)
.orElse("algemeen");
}
/**
* Simple edit distance similarity (normalized Levenshtein)
*/
private static double editDistanceSimilarity(String a, String b) {
var dist = levenshtein(a, b);
var maxLen = Math.max(a.length(), b.length());
if (maxLen == 0) return 1.0;
return 1.0 - ((double) dist / maxLen);
}
private static int levenshtein(String a, String b) {
var dp = new int[a.length() + 1][b.length() + 1];
for (var i = 0; i <= a.length(); i++) dp[i][0] = i;
for (var j = 0; j <= b.length(); j++) dp[0][j] = j;
for (var i = 1; i <= a.length(); i++) {
for (var j = 1; j <= b.length(); j++) {
var cost = (a.charAt(i - 1) == b.charAt(j - 1)) ? 0 : 1;
dp[i][j] = Math.min(
Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1),
dp[i - 1][j - 1] + cost
);
}
}
return dp[a.length()][b.length()];
}
public record ThemeScore(String theme, double score) {
@Override
public String toString() {
return String.format("%s: %.2f", theme, score);
}
}
// ---- Main for testing ----
public static void main(String[] args) {
System.out.println("=== Theme Graph Test ===\n");
// Test word scoring
var testWords = new String[]{ "POLITIEK", "VOETBAL", "COMPUTER", "REGEN", "AUTO" };
for (var word : testWords) {
System.out.println("Word: " + word);
var themes = getThemesForWord(word);
for (var ts : themes) {
System.out.println(" " + ts);
}
System.out.println();
}
// Test theme detection
var techWords = Arrays.asList("COMPUTER", "INTERNET", "SOFTWARE", "DATA");
var detected = detectTheme(techWords);
System.out.println("Detected theme for tech words: " + detected);
// Test filtering
var allWords = Arrays.asList(
"POLITIEK", "COMPUTER", "AUTO", "VOETBAL", "INTERNET", "BOOM"
);
var filtered = filterByTheme(allWords, "technologie", 0.5);
System.out.println("\nFiltered for 'technologie' (min 0.5): " + filtered);
}
}

View File

@@ -10,18 +10,23 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.security.SecureRandom;
import java.security.cert.X509Certificate;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.text.Normalizer;
import java.time.LocalDate;
import java.util.*;
@SuppressWarnings("ALL")
public class ThemePoolBuilderLength {
private static final List<String> DEFAULT_FEEDS = List.of(
private static final List<String> DEFAULT_FEEDS = List.of(
"https://feeds.nos.nl/nosnieuwsalgemeen",
"https://feeds.nos.nl/nosnieuwstech"
);
"https://feeds.nos.nl/nosnieuwstech");
static final String url = "jdbc:postgresql://192.168.1.159:5432/postgres";
static final String user = "puzzle";
static final String pass = "heel-goed-wachtwoord";
// NOTE: normalizeDutchToken strips non A-Z. Keep entries 2-8 after normalization.
private static final List<String> DEFAULT_SHORTS = List.of(
"EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL",
@@ -51,46 +56,157 @@ public class ThemePoolBuilderLength {
"XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"
);
private static final String BROWSER_UA =
private static final String BROWSER_UA =
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
static int MIN_SIMPLICITY = 520;
static final class Opts {
String wordsPath = "/home/mike/dev/puzzle-generator/export_words_only.txt";
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
String outDir = "./out";
int bridgeN = 40000;
int themeN = 800;
int relatedN = 2200;
int rssItemsPerFeed = 10;
String model = "mistralai/mistral-nemo-instruct-2407";
int timeoutSeconds = 180;
int retries = 2;
// ---- NEW: enforce minimum counts per length in the final pool ----
// Tune these to your puzzle generators appetite for short words.
int minLen2 = 1000;
int minLen3 = 1000;
int minLen4 = 1000;
int minLen5 = 1000; // set if you also want to force 5-letter words, etc.
int minLen6 = 2000;
int minLen7 = 2000;
int minLen8 = 2000;
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
String outDir = "/data/puzzle";
int bridgeN = 30000;
int themeN = 800;
int relatedN = 2200;
int rssItemsPerFeed = 10;
String model = "mistralai/mistral-nemo-instruct-2407";
int timeoutSeconds = 180;
int retries = 2;
int minLen2 = 1000;
int minLen3 = 1000;
int minLen4 = 1000;
int minLen5 = 1000; // set if you also want to force 5-letter words, etc.
int minLen6 = 1000;
int minLen7 = 1000;
int minLen8 = 1000;
}
public static void main(String[] args) throws Exception {
var o = parseArgs(args);
var outDir = Path.of(o.outDir);
Files.createDirectories(outDir);
System.out.println("Loading lexicon...");
Lexicon lex;
Class.forName("org.postgresql.Driver");
try (var c = DriverManager.getConnection(url, user, pass);) {
lex = loadLexicon(c);
}
System.out.println("Master words (2-8, A-Z): " + lex.words.size());
// RSS via curl (browser-like)
var all = new ArrayList<RssItem>();
for (var feed : o.feeds) {
var f = feed.trim();
if (f.isEmpty()) continue;
System.out.println("Fetching RSS: " + f);
all.addAll(fetchRssViaCurlBrowser(f, o.rssItemsPerFeed, o.timeoutSeconds));
}
var rssText = new StringBuilder();
var k = 0;
for (var it : all) {
k++;
rssText.append(k).append(". ").append(it.title).append("\n");
if (!it.desc.isBlank()) rssText.append(" ").append(it.desc).append("\n");
}
Files.writeString(outDir.resolve("rss.txt"), rssText.toString(), StandardCharsets.UTF_8);
// LM Studio via curl
var modelId = o.model;
if (modelId == null) {
var modelsUrl = apiUrl(o.endpoint, "/models");
System.out.println("LM Studio GET: " + modelsUrl);
var modelsJson = curlGetJson(o, modelsUrl);
modelId = pickModelId(modelsJson);
if (modelId == null) {
throw new IOException("Could not auto-pick model id from /v1/models. Use --model <id>.\n--- /models ---\n" + modelsJson);
}
}
System.out.println("Using model: " + modelId);
System.out.println("Generating theme words via LM Studio...");
var llmWords = llmThemeWords(o, modelId, rssText.toString());
var themeKept = new LinkedHashSet<String>();
for (var wRaw : llmWords) {
var w = normalizeDutchToken(wRaw);
if (w == null) continue;
if (lex.idOf.containsKey(w)) themeKept.add(w);
}
Files.write(outDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8);
// BitSets
var themeBs = bitmapFromWords(lex, themeKept);
var bridgeBs = buildBridgeBitmap(lex, o.bridgeN);
var shortBs = bitmapFromWords(lex, DEFAULT_SHORTS);
var pool = new BitSet(lex.words.size());
pool.or(themeBs);
pool.or(bridgeBs);
pool.or(shortBs);
// ---- NEW: enforce minimum counts per length ----
enforceMinima(o, lex, pool);
// Report
var themeCounts = countsPerLen(lex, themeBs);
var poolCounts = countsPerLen(lex, pool);
var report = """
Date: %s
Feeds: %s
Model: %s
Master size: %d
Theme kept (in master): %d
Bridge size: %d
Shorts kept: %d
Pool total: %d
Enforced minima:
2: %d
3: %d
4: %d
5: %d
6: %d
7: %d
8: %d
Counts per length (theme):
%s
Counts per length (pool):
%s
""".formatted(
LocalDate.now(),
String.join(", ", o.feeds),
modelId,
lex.words.size(),
themeBs.cardinality(),
bridgeBs.cardinality(),
shortBs.cardinality(),
pool.cardinality(),
o.minLen2, o.minLen3, o.minLen4, o.minLen5, o.minLen6, o.minLen7, o.minLen8,
mapToLines(themeCounts),
mapToLines(poolCounts)
);
Files.writeString(outDir.resolve("report.txt"), report, StandardCharsets.UTF_8);
System.out.println(report);
// Output pool list
var poolFile = outDir.resolve("pool.txt");
writeWordList(poolFile, lex, pool);
System.out.println("Wrote: " + poolFile.toAbsolutePath());
}
static Opts parseArgs(String[] args) {
var o = new Opts();
for (var i = 0; i < args.length; i++) {
var a = args[i];
var v = (i + 1 < args.length) ? args[i + 1] : null;
switch (a) {
case "--words" -> {
o.wordsPath = v;
i++;
}
case "--endpoint" -> {
o.endpoint = v;
i++;
@@ -193,7 +309,6 @@ public class ThemePoolBuilderLength {
default -> throw new IllegalArgumentException("Unknown arg: " + a);
}
}
if (o.wordsPath == null) throw new IllegalArgumentException("--words is required");
return o;
}
@@ -264,54 +379,50 @@ public class ThemePoolBuilderLength {
*/
record Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) { }
static Lexicon loadLexicon(String path) throws IOException {
var lines = Files.readAllLines(Path.of(path), StandardCharsets.UTF_8);
/**
* Loads lexicon from PostgreSQL view/table: export_words_with_hints_2_8
* Columns: WOORD, level_1_to_10, hint
*
* Notes:
* - Normalizes words via normalizeDutchToken(...)
* - Dedupes on normalized word
* - Uses level_1_to_10 as the "LLM score" (fallback 5)
* - Ignores hint for scoring (but you can store it elsewhere if needed)
*/
static Lexicon loadLexicon(Connection conn) throws SQLException {
var out = new ArrayList<String>(200_000);
var idOf = new HashMap<String, Integer>(400_000);
var out = new ArrayList<String>(lines.size());
var idOf = new HashMap<String, Integer>(lines.size() * 2);
// Store level per normalized word while loading so we can compute scores later
var levelOf = new HashMap<String, Integer>(400_000);
// 1) master lexicon
for (var line : lines) {
var w = normalizeDutchToken(line);
if (w == null) continue;
if (idOf.containsKey(w)) continue;
idOf.put(w, out.size());
out.add(w);
}
final var sql = """
SELECT woord, 10-level_1_to_10, hint
FROM export_words_with_hints_2_8
order by level_1_to_10 asc
""";
/* // 2) ensure DEFAULT_SHORTS are present even if absent in word-list.txt
for (var raw : DEFAULT_SHORTS) {
var w = normalizeDutchToken(raw);
if (w == null) continue;
if (idOf.containsKey(w)) continue;
idOf.put(w, out.size());
out.add(w);
}*/
// Load LLM scores
var llmScores = new HashMap<String, Integer>();
try {
var scoreLines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
var first = true;
for (var line : scoreLines) {
if (first) {
first = false;
continue;
}
var parts = line.split(",", 3);
if (parts.length >= 3) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
// var status = parts[2].trim();
// if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
//}
} catch (NumberFormatException ignored) { }
}
try (var ps = conn.prepareStatement(sql);
var rs = ps.executeQuery()) {
while (rs.next()) {
var rawWord = rs.getString(1);
var lvlObj = (Integer) rs.getObject(2); // nullable
// String hint = rs.getString(3); // available if you want it later
var w = normalizeDutchToken(rawWord);
if (w == null) continue;
if (idOf.containsKey(w)) continue;
idOf.put(w, out.size());
out.add(w);
var lvl = (lvlObj == null ? 5 : lvlObj.intValue());
levelOf.put(w, lvl);
}
} catch (IOException e) {
System.err.println("Warning: word_scores.csv not found, using default scores.");
} catch (SQLException e) {
throw new RuntimeException(e);
}
var n = out.size();
@@ -322,11 +433,12 @@ public class ThemePoolBuilderLength {
for (var i = 0; i < n; i++) {
var w = out.get(i);
var crossScore = crossabilityScore(w);
var lScore = llmScores.getOrDefault(w, 5);
var lScore = levelOf.getOrDefault(w, 5);
// Prioritize simple words (high lScore) and long words.
// lScore (1-10) adds up to 1000 points (weight 100).
// Length (2-8) adds up to 160 points (weight 20).
score[i] = crossScore + (lScore * 100) + (w.length() * 30);
score[i] = crossScore + (lScore * 100) + (w.length() * 40);
byLen[w.length()].set(i);
}
@@ -335,15 +447,7 @@ public class ThemePoolBuilderLength {
// ---------------- RSS via curl (browser-like) ----------------
static final class RssItem {
final String title;
final String desc;
RssItem(String title, String desc) {
this.title = title;
this.desc = desc;
}
}
record RssItem(String title, String desc) { }
static String textOfFirst(Element parent, String tag) {
var nl = parent.getElementsByTagName(tag);
@@ -353,7 +457,7 @@ public class ThemePoolBuilderLength {
}
static List<RssItem> fetchRssViaCurlBrowser(String url, int limit, int timeoutSeconds) throws Exception {
List<String> cmd = new ArrayList<>();
var cmd = new ArrayList<String>();
cmd.add("curl");
cmd.add("-fsSL");
cmd.add("-L");
@@ -433,38 +537,6 @@ public class ThemePoolBuilderLength {
return base + path;
}
static HttpClient buildHttpClient(int timeoutSeconds) {
try {
return HttpClient.newBuilder()
.connectTimeout(java.time.Duration.ofSeconds(Math.max(10, timeoutSeconds)))
.build();
} catch (RuntimeException ignored) { }
try {
var ssl = insecureSslContext();
return HttpClient.newBuilder()
.connectTimeout(java.time.Duration.ofSeconds(Math.max(10, timeoutSeconds)))
.sslContext(ssl)
.build();
} catch (Exception e) {
throw new RuntimeException("Could not initialize HttpClient. Fix Java truststore or use curl for all HTTP.", e);
}
}
static SSLContext insecureSslContext() throws Exception {
var trustAll = new TrustManager[]{
new X509TrustManager() {
public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
public void checkClientTrusted(X509Certificate[] chain, String authType) { }
public void checkServerTrusted(X509Certificate[] chain, String authType) { }
}
};
var ssl = SSLContext.getInstance("TLS");
ssl.init(null, trustAll, new SecureRandom());
return ssl;
}
static void sleepBackoff(int attempt) {
try {
var ms = (long) (300L * Math.pow(2, attempt - 1)); // 300, 600, 1200, ...
@@ -476,7 +548,7 @@ public class ThemePoolBuilderLength {
Exception last = null;
for (var attempt = 1; attempt <= o.retries; attempt++) {
try {
List<String> cmd = new ArrayList<>();
var cmd = new ArrayList<String>();
cmd.add("curl");
cmd.add("-fsSL");
cmd.add("--connect-timeout");
@@ -524,7 +596,6 @@ public class ThemePoolBuilderLength {
var tempFile = Files.createTempFile("lm-request-", ".json");
try {
Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
List<String> cmd = new ArrayList<>();
cmd.add("curl");
cmd.add("-fsSL");
@@ -774,7 +845,7 @@ public class ThemePoolBuilderLength {
var out = new ArrayList<String>(ids.size());
for (var id : ids) {
if (lex.score[id] < 680)
if (lex.score[id] < MIN_SIMPLICITY)
continue;
out.add(lex.words.get(id));
}
@@ -833,123 +904,4 @@ public class ThemePoolBuilderLength {
ensureMinLen(lex, pool, 8, o.minLen8);
}
// ---------------- Main ----------------
public static void main(String[] args) throws Exception {
var o = parseArgs(args);
var outDir = Path.of(o.outDir);
Files.createDirectories(outDir);
System.out.println("Loading lexicon...");
var lex = loadLexicon(o.wordsPath);
System.out.println("Master words (2-8, A-Z): " + lex.words.size());
// RSS via curl (browser-like)
var all = new ArrayList<RssItem>();
for (var feed : o.feeds) {
var f = feed.trim();
if (f.isEmpty()) continue;
System.out.println("Fetching RSS: " + f);
all.addAll(fetchRssViaCurlBrowser(f, o.rssItemsPerFeed, o.timeoutSeconds));
}
var rssText = new StringBuilder();
var k = 0;
for (var it : all) {
k++;
rssText.append(k).append(". ").append(it.title).append("\n");
if (!it.desc.isBlank()) rssText.append(" ").append(it.desc).append("\n");
}
Files.writeString(outDir.resolve("rss.txt"), rssText.toString(), StandardCharsets.UTF_8);
// LM Studio via curl
var modelId = o.model;
if (modelId == null) {
var modelsUrl = apiUrl(o.endpoint, "/models");
System.out.println("LM Studio GET: " + modelsUrl);
var modelsJson = curlGetJson(o, modelsUrl);
modelId = pickModelId(modelsJson);
if (modelId == null) {
throw new IOException("Could not auto-pick model id from /v1/models. Use --model <id>.\n--- /models ---\n" + modelsJson);
}
}
System.out.println("Using model: " + modelId);
System.out.println("Generating theme words via LM Studio...");
List<String> llmWords = llmThemeWords(o, modelId, rssText.toString());
// Normalize + keep only those present in master lexicon
var themeKept = new LinkedHashSet<String>();
for (var wRaw : llmWords) {
var w = normalizeDutchToken(wRaw);
if (w == null) continue;
if (lex.idOf.containsKey(w)) themeKept.add(w);
}
Files.write(outDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8);
// BitSets
var themeBs = bitmapFromWords(lex, themeKept);
var bridgeBs = buildBridgeBitmap(lex, o.bridgeN);
var shortBs = bitmapFromWords(lex, DEFAULT_SHORTS);
var pool = new BitSet(lex.words.size());
pool.or(themeBs);
pool.or(bridgeBs);
pool.or(shortBs);
// ---- NEW: enforce minimum counts per length ----
enforceMinima(o, lex, pool);
// Report
var themeCounts = countsPerLen(lex, themeBs);
var poolCounts = countsPerLen(lex, pool);
var report = """
Date: %s
Feeds: %s
Model: %s
Master size: %d
Theme kept (in master): %d
Bridge size: %d
Shorts kept: %d
Pool total: %d
Enforced minima:
2: %d
3: %d
4: %d
5: %d
6: %d
7: %d
8: %d
Counts per length (theme):
%s
Counts per length (pool):
%s
""".formatted(
LocalDate.now(),
String.join(", ", o.feeds),
modelId,
lex.words.size(),
themeBs.cardinality(),
bridgeBs.cardinality(),
shortBs.cardinality(),
pool.cardinality(),
o.minLen2, o.minLen3, o.minLen4, o.minLen5, o.minLen6, o.minLen7, o.minLen8,
mapToLines(themeCounts),
mapToLines(poolCounts)
);
Files.writeString(outDir.resolve("report.txt"), report, StandardCharsets.UTF_8);
System.out.println(report);
// Output pool list
var poolFile = outDir.resolve("pool.txt");
writeWordList(poolFile, lex, pool);
System.out.println("Wrote: " + poolFile.toAbsolutePath());
}
}

24
src/puzzle/WordScore.java Normal file
View File

@@ -0,0 +1,24 @@
package puzzle;
// ===== DATA CLASS =====
class WordScore {
String word;
int score;
String status;
String endpoint;
int batchId;
WordScore(String word, int score, String status, String endpoint, int batchId) {
this.word = word;
this.score = score;
this.status = status;
this.endpoint = endpoint;
this.batchId = batchId;
}
WordScore(String word, int score, String status) {
this.word = word;
this.score = score;
this.status = status;
}
}

Binary file not shown.