update them

This commit is contained in:
mike
2025-12-21 19:29:14 +01:00
parent cdb609b746
commit e2bad52d1f
4 changed files with 33024 additions and 829 deletions

View File

@@ -0,0 +1,484 @@
package puzzle;
import puzzle.DutchWordScorer.WordScore;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.*;
import java.util.concurrent.*;
import java.io.*;
import java.time.*;
import java.util.concurrent.atomic.*;
/**
* CONCURRENT MULTI-ENDPOINT Dutch Wordlist Scorer
* Distributes batches across Ollama, LM-Studio, and a third endpoint simultaneously
*/
public class ConcurrentWordScorer {
// ===== CONFIGURATION =====
/** Word list to score; read line by line at start-up. */
private static final String INPUT_WORDLIST = "word-list.txt";
/** CSV that results are appended to; also read at start-up to find words that already have a row. */
private static final String OUTPUT_SCORES = "word_scores.csv";
/** Number of words sent to an endpoint in one request. */
private static final int BATCH_SIZE = 30; // Smaller for better distribution
/** Attempts per batch before its words are recorded as FAILED. */
private static final int MAX_RETRIES = 3;
/**
 * Endpoints that share the work queue; each one gets {@code maxConcurrent} worker threads.
 * Names must be unique: the name is the only thing identifying an endpoint in the log
 * output and in the CSV {@code endpoint} column.
 */
private static final LLMEndpoint[] ENDPOINTS = {
    new OllamaEndpoint(),
    new LMStudioEndpoint(),
    // Second LM Studio host. It used to be named "LM-Studio" as well, which made its rows
    // indistinguishable from those of the default LM Studio endpoint above.
    new LMStudioEndpoint("LM-Studio-2", "http://192.168.1.74:1234/v1/chat/completions",
        "mistralai/mistral-nemo-instruct-2407", 1)
    // new CustomEndpoint()
};
// ===== ENDPOINT CLASSES =====
/**
 * One chat server that batches can be sent to.
 * <p>
 * Subclasses decide what the request JSON looks like and how the reply text is pulled
 * out of the response; this base class owns the per-endpoint concurrency cap and the
 * prompt -> HTTP -> parse pipeline.
 */
abstract static class LLMEndpoint {
    String name;
    String baseUrl;
    String model;
    /** Permits for in-flight requests; created with {@code maxConcurrent} permits. */
    Semaphore rateLimiter;
    int maxConcurrent;

    LLMEndpoint(String name, String baseUrl, String model, int maxConcurrent) {
        this.maxConcurrent = maxConcurrent;
        this.rateLimiter = new Semaphore(maxConcurrent);
        this.model = model;
        this.baseUrl = baseUrl;
        this.name = name;
    }

    /** Builds the JSON request body that carries {@code prompt} to this endpoint. */
    abstract String buildRequestJson(String prompt);

    /** Pulls the assistant's reply text out of the raw response body. */
    abstract String extractResponseContent(String responseBody);

    /**
     * Scores one batch, waiting for a free permit first.
     *
     * @param batch words to score in a single request
     * @return one entry per word in {@code batch}
     * @throws Exception if interrupted while waiting for a permit, if the request fails,
     *                   or if the reply carries no content
     */
    List<WordScore> execute(List<String> batch) throws Exception {
        rateLimiter.acquire();
        try {
            return executeInternal(batch);
        } finally {
            // Always hand the permit back, also when the request blew up.
            rateLimiter.release();
        }
    }

    /** Runs prompt creation, the POST (120 s limit) and reply parsing for one batch. */
    private List<WordScore> executeInternal(List<String> batch) throws Exception {
        var requestJson = buildRequestJson(createScoringPrompt(batch));
        var reply = extractResponseContent(curlPostJson(baseUrl, requestJson, 120));
        if (reply == null || reply.isEmpty()) {
            throw new IOException(String.format("[%s] Empty response content", name));
        }
        return parseScoresFromReply(batch, reply);
    }
}
/**
 * Endpoint for the Ollama {@code /api/chat} route on localhost.
 */
static class OllamaEndpoint
        extends LLMEndpoint {
    OllamaEndpoint() {
        super("Ollama", "http://localhost:11434/api/chat",
            "qwen2.5:14b", 2); // 2 concurrent requests
    }

    /**
     * Builds a non-streaming chat request.
     * Sampling parameters are sent inside {@code "options"}: a top-level
     * {@code "temperature"} is not a request field of Ollama's chat API and is ignored.
     */
    @Override String buildRequestJson(String prompt) {
        return String.format("{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"%s\"}],\"stream\":false,\"options\":{\"temperature\":0.1}}",
            model, escapeJson(prompt));
    }

    /**
     * Returns the reply text, or an empty string when the response has no content field.
     * Uses the shared escape-aware scanner instead of cutting at the first quote
     * character, which truncated any reply containing an escaped quote.
     */
    @Override String extractResponseContent(String responseBody) {
        var content = extractChatContent(responseBody);
        return content == null ? "" : content;
    }
}
/**
 * Endpoint that posts to a {@code /v1/chat/completions} URL and reads the reply with
 * {@code extractChatContent}.
 */
static class LMStudioEndpoint
        extends LLMEndpoint {
    /** Default host and model, limited to one request at a time. */
    LMStudioEndpoint() {
        super("LM-Studio", "http://192.168.1.159:1234/v1/chat/completions",
            "mistralai/mistral-nemo-instruct-2407", 1);
    }

    /**
     * @param name          label used in log lines and in the CSV endpoint column
     * @param baseUrl       full URL the request is posted to
     * @param model         model id placed in the request body
     * @param maxConcurrent number of requests allowed in flight at once
     */
    public LMStudioEndpoint(String name, String baseUrl, String model, int maxConcurrent) {
        super(name, baseUrl, model, maxConcurrent);
    }

    /** Builds a single-message chat request with temperature 0.1 and a 2048-token cap. */
    @Override String buildRequestJson(String prompt) {
        var escapedPrompt = escapeJson(prompt);
        return "{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"%s\"}],\"temperature\":0.1,\"max_tokens\":2048}"
            .formatted(model, escapedPrompt);
    }

    /** Delegates to the shared chat-content scanner. */
    @Override String extractResponseContent(String responseBody) {
        return extractChatContent(responseBody);
    }
}
/**
 * Third endpoint; uses the same request and response shape as {@code LMStudioEndpoint}
 * but its own URL and model.
 */
static class CustomEndpoint
        extends LLMEndpoint {
    CustomEndpoint() {
        super("Custom", "http://192.168.1.74:1234/v1/chat/completions",
            "qwen2.5-vl-7b-abliterated-caption-it_gguf", 2);
    }

    /**
     * Builds the request with THIS endpoint's model.
     * The previous version delegated to a freshly constructed {@code LMStudioEndpoint},
     * which put that class's default model into the request instead of the one configured
     * here (and allocated a new endpoint object on every call).
     */
    @Override String buildRequestJson(String prompt) {
        // Adapt to your third endpoint's format
        return String.format("{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"%s\"}],\"temperature\":0.1,\"max_tokens\":2048}",
            model, escapeJson(prompt));
    }

    /** Reads the reply text with the shared chat-content scanner. */
    @Override String extractResponseContent(String responseBody) {
        return extractChatContent(responseBody);
    }
}
// ===== MAIN COORDINATOR =====
/**
 * Entry point: loads the word list, skips words that already have a CSV row, and fans
 * the remaining batches out over all configured endpoints.
 * <p>
 * Declared {@code public} so the class can be launched on every JDK; a package-private
 * {@code main} is only accepted by launchers that support the newer instance/main rules.
 *
 * @param args unused
 * @throws Exception if the input files cannot be read or the thread is interrupted while waiting
 */
public static void main(String[] args) throws Exception {
    System.out.println("=== CONCURRENT 3-Endpoint Scorer ===");
    for (var ep : ENDPOINTS) {
        System.out.printf("- %s: %s%n", ep.name, ep.baseUrl);
    }
    System.out.println();

    // Load work queue
    var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
    var scoredWords = loadAlreadyScoredWords();
    var workQueue = createWorkQueue(allWords, scoredWords);
    System.out.printf("Total words: %d | Already scored: %d | Remaining: %d%n%n",
        allWords.size(), scoredWords.size(), workQueue.size());
    if (workQueue.isEmpty()) {
        System.out.println("All done!");
        return;
    }

    // Start result writer thread
    BlockingQueue<List<WordScore>> resultQueue = new LinkedBlockingQueue<>();
    var writerThread = startResultWriter(resultQueue);

    // One worker thread per permitted concurrent request of every endpoint
    var totalThreads = 0;
    for (var ep : ENDPOINTS) {
        totalThreads += ep.maxConcurrent;
    }
    var executor = Executors.newFixedThreadPool(totalThreads);
    // Progress counter starts at the number of words that were already in the CSV.
    var totalProcessed = new AtomicInteger(scoredWords.size());
    for (var endpoint : ENDPOINTS) {
        for (var i = 0; i < endpoint.maxConcurrent; i++) {
            executor.submit(() -> {
                processBatches(endpoint, workQueue, resultQueue, totalProcessed, allWords.size());
            });
        }
    }

    // Wait for completion
    executor.shutdown();
    executor.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);

    // Signal writer to stop: a single-element list whose status is "STOP"
    resultQueue.put(Collections.singletonList(new WordScore(null, 0, "STOP")));
    writerThread.join();
    System.out.println("\n✓ All endpoints finished!");
}
// ===== WORKER THREAD LOGIC =====
/**
 * Worker loop bound to one endpoint: takes batches off the shared queue, scores them and
 * hands the results to the writer queue until the work queue runs dry, the thread is
 * interrupted, or an unexpected exception occurs.
 *
 * @param endpoint       endpoint this worker sends its batches to
 * @param workQueue      shared queue of pending batches
 * @param resultQueue    queue consumed by the result writer
 * @param totalProcessed shared counter used for progress output
 * @param totalWords     size of the full word list, for the percentage
 */
private static void processBatches(LLMEndpoint endpoint,
                                   BlockingQueue<WorkItem> workQueue,
                                   BlockingQueue<List<WordScore>> resultQueue,
                                   AtomicInteger totalProcessed,
                                   int totalWords) {
    System.out.printf("[%s] Worker started%n", endpoint.name);
    var self = Thread.currentThread();
    while (!self.isInterrupted()) {
        try {
            var item = workQueue.poll(1, TimeUnit.SECONDS);
            if (item == null) {
                if (workQueue.isEmpty()) {
                    break; // nothing arrived within a second and the queue is drained
                }
                continue;
            }

            var scored = processWithRetry(endpoint, item.batch);
            // Stamp every row with where it came from.
            for (var ws : scored) {
                ws.endpoint = endpoint.name;
                ws.batchId = item.batchId;
            }
            resultQueue.put(scored);

            // Only print when the counter sits just past a multiple of 100, to limit console output.
            var done = totalProcessed.addAndGet(scored.size());
            var justPassedHundred = done % 100 < BATCH_SIZE;
            if (justPassedHundred) {
                System.out.printf("Progress: %d/%d (%.1f%%)%n",
                    done, totalWords, (done * 100.0 / totalWords));
            }
        } catch (InterruptedException e) {
            self.interrupt();
            break;
        } catch (Exception e) {
            System.err.printf("[%s] Fatal error: %s%n", endpoint.name, e.getMessage());
            break;
        }
    }
    System.out.printf("[%s] Worker stopped%n", endpoint.name);
}
/**
 * Sends a batch to {@code endpoint}, retrying up to {@code MAX_RETRIES} times with a
 * pause of 2 s, 4 s, ... between attempts.
 *
 * @return the endpoint's scores, or FAILED placeholders for every word when all attempts
 *         failed or the thread was interrupted while pausing
 */
private static List<WordScore> processWithRetry(LLMEndpoint endpoint, List<String> batch) {
    for (var attempt = 1; attempt <= MAX_RETRIES; attempt++) {
        try {
            return endpoint.execute(batch);
        } catch (Exception e) {
            System.err.printf("[%s] Attempt %d/%d failed: %s%n",
                endpoint.name, attempt, MAX_RETRIES, e.getMessage());
            if (attempt == MAX_RETRIES) {
                break; // out of attempts
            }
            try {
                Thread.sleep(2000L * attempt);
            } catch (InterruptedException ie) {
                Thread.currentThread().interrupt();
                break; // give up quietly, leaving the interrupt flag set for the caller
            }
        }
    }
    return createFailedScores(batch, endpoint.name);
}
// ===== RESULT WRITER THREAD =====
/**
 * Opens the output CSV in append mode and starts the thread that drains
 * {@code resultQueue} into it.
 * <p>
 * The file is written as UTF-8 explicitly: the resume logic reads it back with
 * {@code Files.readAllLines}, which decodes UTF-8, so relying on the platform default
 * charset could make accented words unreadable on the next run. The writer is closed
 * on every exit path of the thread, including failures.
 *
 * @param resultQueue queue of score batches; a single-element list whose status is
 *                    {@code "STOP"} ends the thread
 * @return the started writer thread
 * @throws Exception if the file cannot be opened or the header cannot be written
 */
private static Thread startResultWriter(BlockingQueue<List<WordScore>> resultQueue) throws Exception {
    var outputPath = Paths.get(OUTPUT_SCORES);
    var needsHeader = !Files.exists(outputPath) || Files.size(outputPath) == 0;
    var writer = Files.newBufferedWriter(outputPath, StandardCharsets.UTF_8,
        StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    try {
        if (needsHeader) {
            writer.write("word,score,status,endpoint,batch_id,timestamp\n");
            writer.flush();
        }
    } catch (IOException e) {
        // Do not leak the file handle when the header cannot be written.
        try {
            writer.close();
        } catch (IOException closeError) {
            e.addSuppressed(closeError);
        }
        throw e;
    }

    var thread = new Thread(() -> {
        try (writer) {
            while (true) {
                var scores = resultQueue.take();
                // Stop signal (constant first, so a row without a status cannot cause an NPE)
                if (scores.size() == 1 && "STOP".equals(scores.get(0).status)) {
                    break;
                }
                writeBatch(writer, scores);
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            System.err.println("Writer thread error: interrupted");
        } catch (Exception e) {
            System.err.println("Writer thread error: " + e.getMessage());
        }
    });
    thread.start();
    return thread;
}
/**
 * Appends one CSV row per score and flushes, so finished batches reach the file immediately.
 * All rows of a batch carry the same timestamp, taken when the batch is written.
 *
 * @param writer open writer on the output CSV
 * @param scores rows to append
 * @throws Exception if writing or flushing fails
 */
private static synchronized void writeBatch(BufferedWriter writer, List<WordScore> scores) throws Exception {
    final var writtenAt = Instant.now().toString();
    for (var entry : scores) {
        var row = String.format("%s,%d,%s,%s,%d,%s\n",
            entry.word, entry.score, entry.status, entry.endpoint, entry.batchId, writtenAt);
        writer.write(row);
    }
    writer.flush();
}
// ===== QUEUE & DATA STRUCTURES =====
/**
 * One unit of work taken off the queue by a worker thread.
 *
 * @param batchId sequence number assigned while the work queue is built
 * @param batch   the words sent to an endpoint in a single request
 */
record WorkItem(int batchId, List<String> batch) {
}
/**
 * Cuts the word list into consecutive windows of {@code BATCH_SIZE} words and queues the
 * not-yet-scored words of each window as one batch. Windows in which every word is
 * already scored produce no batch; batch ids are consecutive over the batches produced.
 *
 * @param allWords full word list in file order
 * @param scored   lower-cased words that already have a CSV row
 * @return queue holding every batch that still needs scoring
 */
private static BlockingQueue<WorkItem> createWorkQueue(List<String> allWords, Set<String> scored) {
    var queue = new LinkedBlockingQueue<WorkItem>();
    var nextId = 0;
    final var total = allWords.size();
    for (var start = 0; start < total; start += BATCH_SIZE) {
        var end = Math.min(start + BATCH_SIZE, total);
        var pending = new ArrayList<String>();
        for (var word : allWords.subList(start, end)) {
            if (scored.contains(word.toLowerCase())) {
                continue;
            }
            pending.add(word);
        }
        if (pending.isEmpty()) {
            continue;
        }
        queue.add(new WorkItem(nextId++, pending));
    }
    return queue;
}
// ===== LOADING & PARSING =====
/**
 * Reads the first column of every data row of the output CSV.
 * The first line is always treated as the header and skipped. Every remaining row counts,
 * whatever its status column says.
 *
 * @return lower-cased, trimmed words found in the CSV; empty when the file does not exist
 * @throws Exception if the file exists but cannot be read
 */
private static Set<String> loadAlreadyScoredWords() throws Exception {
    var seen = new HashSet<String>();
    var csv = new File(OUTPUT_SCORES);
    if (!csv.exists()) {
        return seen;
    }
    var rows = Files.readAllLines(csv.toPath());
    for (var i = 1; i < rows.size(); i++) { // row 0 is the header
        var row = rows.get(i);
        var comma = row.indexOf(',');
        var firstColumn = (comma < 0) ? row : row.substring(0, comma);
        seen.add(firstColumn.trim().toLowerCase());
    }
    return seen;
}
/**
 * Builds placeholder rows (score -1, status FAILED, batch id -1) for a batch that could
 * not be scored.
 *
 * @param words    words of the failed batch
 * @param endpoint name of the endpoint that failed
 */
private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
    var placeholders = new ArrayList<WordScore>();
    words.forEach(w -> placeholders.add(new WordScore(w, -1, "FAILED", endpoint, -1)));
    return placeholders;
}
// Parsing logic
/**
 * Turns a {@code word:score} reply into one result per expected word.
 * <p>
 * Each reply line is split at its first colon; the score is the run of digits at the
 * start of the right-hand side, clamped to 1..10. Lines without a colon, without leading
 * digits, or with a number too large for an int are ignored. Words are matched
 * case-insensitively; a later line for the same word replaces an earlier one.
 *
 * @param expectedWords words that were sent, in request order
 * @param reply         raw reply text
 * @return results in the order of {@code expectedWords}; words without a usable line get
 *         score -1 and status MISSING
 */
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
    var scoreByWord = new HashMap<String, Integer>();
    for (var rawLine : reply.split("\n")) {
        var line = rawLine.trim();
        var sep = line.indexOf(':');
        if (sep < 0) {
            continue;
        }
        var key = line.substring(0, sep).trim().toLowerCase();
        // Handle potential non-numeric junk after the number
        var digits = line.substring(sep + 1).trim().replaceAll("[^0-9].*", "");
        if (digits.isEmpty()) {
            continue;
        }
        try {
            var clamped = Math.max(1, Math.min(10, Integer.parseInt(digits)));
            scoreByWord.put(key, clamped);
        } catch (NumberFormatException e) {
            // Skip invalid lines
        }
    }

    // Match scores to original words (maintaining order)
    var results = new ArrayList<WordScore>();
    for (var word : expectedWords) {
        var score = scoreByWord.get(word.toLowerCase());
        results.add(score == null
            ? new WordScore(word, -1, "MISSING")
            : new WordScore(word, score, "OK"));
    }
    return results;
}
// Prompt creation
/**
 * Builds the (Dutch) instruction text followed by the words to score, one per line.
 *
 * @param words words of one batch
 * @return the complete prompt; it ends with the last word and has no trailing newline
 */
private static String createScoringPrompt(List<String> words) {
    var instructions = String.join("\n",
        "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.",
        "",
        "Score criteria:",
        "- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon",
        "- 10 = Zeer algemeen, dagelijks gebruikt door iedereen",
        "",
        "Geef ALLEEN een lijst in dit exacte formaat, niets anders:",
        "woord1:score",
        "woord2:score",
        "",
        "Woorden om te scoren:",
        ""); // trailing empty element yields the newline before the first word
    return instructions + String.join("\n", words);
}
// Utility methods
/**
 * Escapes text for use inside a JSON string literal.
 * <p>
 * Besides backslash, double quote and newline this also escapes carriage return, tab,
 * backspace, form feed and every other control character below U+0020; JSON forbids
 * those characters unescaped inside strings, so a word containing e.g. a tab would
 * otherwise produce a request the server rejects.
 *
 * @param str text to escape; must not be null
 * @return the escaped text, without surrounding quotes
 */
private static String escapeJson(String str) {
    var out = new StringBuilder(str.length() + 16);
    for (var i = 0; i < str.length(); i++) {
        var ch = str.charAt(i);
        switch (ch) {
            case '\\' -> out.append("\\\\");
            case '"' -> out.append("\\\"");
            case '\n' -> out.append("\\n");
            case '\r' -> out.append("\\r");
            case '\t' -> out.append("\\t");
            case '\b' -> out.append("\\b");
            case '\f' -> out.append("\\f");
            default -> {
                if (ch < 0x20) {
                    // remaining control characters: six-character hex escape
                    out.append(String.format("\\u%04x", (int) ch));
                } else {
                    out.append(ch);
                }
            }
        }
    }
    return out.toString();
}
/**
 * POSTs a JSON body by running the {@code curl} executable and returns what curl printed.
 * The body goes through a temporary file (deleted afterwards) rather than the command line.
 *
 * @param url            target URL
 * @param jsonBody       request body
 * @param timeoutSeconds value passed to curl's {@code --max-time}
 * @return curl's combined stdout/stderr output, decoded as UTF-8
 * @throws Exception if the temp file cannot be written, curl cannot be started, the wait
 *                   is interrupted, or curl exits with a non-zero code
 */
private static String curlPostJson(String url, String jsonBody, int timeoutSeconds) throws Exception {
    // Write JSON body to temp file to avoid shell escaping issues
    var bodyFile = Files.createTempFile("lm-request-", ".json");
    try {
        Files.writeString(bodyFile, jsonBody, StandardCharsets.UTF_8);
        var command = List.of(
            "curl",
            "-fsSL",
            "--connect-timeout", "10",
            "--max-time", String.valueOf(timeoutSeconds),
            "-H", "Content-Type: application/json",
            "-d", "@" + bodyFile,
            url);
        var process = new ProcessBuilder(command)
            .redirectErrorStream(true)
            .start();
        // Drain the output before waiting so curl can never block on a full pipe.
        var output = new String(process.getInputStream().readAllBytes(), StandardCharsets.UTF_8);
        var exitCode = process.waitFor();
        if (exitCode != 0) {
            throw new IOException("curl POST failed (" + exitCode + ") url=" + url + "\nOutput:\n" + output);
        }
        return output;
    } finally {
        Files.deleteIfExists(bodyFile);
    }
}
/**
 * Best-effort extraction of the first {@code "content"} string from a chat response,
 * searching from {@code "choices"} when that key is present and from the start otherwise.
 * <p>
 * JSON escapes are decoded, including unicode escapes (backslash, 'u', four hex digits),
 * which were previously emitted as a literal 'u' followed by the digits. A content value
 * that is not a string (for example JSON null) now yields {@code null} instead of text
 * picked up from a later part of the document.
 *
 * @param json raw response body, may be null
 * @return the decoded content, or {@code null} when no string-valued content field is found;
 *         an unterminated string yields what was read up to the end of the input
 */
private static String extractChatContent(String json) {
    if (json == null) return null;
    var anchor = Math.max(json.indexOf("\"choices\""), 0);
    var key = json.indexOf("\"content\"", anchor);
    if (key < 0) return null;
    var colon = json.indexOf(':', key);
    if (colon < 0) return null;

    // The value must start with a quote right after the colon (whitespace allowed).
    var open = colon + 1;
    while (open < json.length() && Character.isWhitespace(json.charAt(open))) {
        open++;
    }
    if (open >= json.length() || json.charAt(open) != '"') return null;

    var sb = new StringBuilder();
    for (var k = open + 1; k < json.length(); k++) {
        var ch = json.charAt(k);
        if (ch == '"') break;          // closing quote
        if (ch != '\\') {
            sb.append(ch);
            continue;
        }
        if (++k >= json.length()) break; // dangling backslash at end of input
        var esc = json.charAt(k);
        switch (esc) {
            case 'n' -> sb.append('\n');
            case 't' -> sb.append('\t');
            case 'r' -> sb.append('\r');
            case 'b' -> sb.append('\b');
            case 'f' -> sb.append('\f');
            case 'u' -> {
                var code = decodeHex4(json, k + 1);
                if (code < 0) {
                    sb.append('u');     // malformed escape: keep the old lenient behaviour
                } else {
                    sb.append((char) code);
                    k += 4;
                }
            }
            default -> sb.append(esc);  // covers escaped quote, backslash and slash
        }
    }
    return sb.toString();
}

/** Parses four hex digits starting at {@code from}; returns -1 if they are missing or not hex. */
private static int decodeHex4(String s, int from) {
    if (from + 4 > s.length()) return -1;
    var value = 0;
    for (var i = from; i < from + 4; i++) {
        var digit = Character.digit(s.charAt(i), 16);
        if (digit < 0) return -1;
        value = value * 16 + digit;
    }
    return value;
}
}

View File

@@ -0,0 +1,229 @@
package puzzle;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
* Ollama Dutch Wordlist Scorer
* Scores 90k Dutch words on popularity/complexity using local Ollama API
*/
public class DutchWordScorer {
// Configuration
/** URL every scoring request is POSTed to. */
private static final String OLLAMA_ENDPOINT = "http://localhost:11434/api/chat";
/** Model name placed in the request body's "model" field. */
private static final String MODEL = "qwen2.5:14b"; // or "llama3.1:latest"
/** Number of words sent per request. */
private static final int BATCH_SIZE = 50; // Words per API call
/** Pause in milliseconds after each successful batch except the last. */
private static final int RATE_LIMIT_DELAY_MS = 500; // Be nice to local Ollama
/** Attempts per batch before its words are recorded with status FAILED. */
private static final int MAX_RETRIES = 3;
// Input/output files
/** Word list that is read line by line. */
private static final String INPUT_WORDLIST = "word-list.txt";
/** CSV the results are written to in one go at the end of the run, replacing any existing file. */
private static final String OUTPUT_SCORES = "word_scores.csv";
/**
 * Scores the whole word list sequentially in batches and writes the CSV at the end.
 * A batch is attempted up to {@code MAX_RETRIES} times; when all attempts fail its words
 * are recorded with score -1 and status FAILED so the output keeps one row per word.
 *
 * @param args unused
 * @throws Exception if the word list cannot be read, the CSV cannot be written, or a
 *                   back-off pause is interrupted
 */
public static void main(String[] args) throws Exception {
    System.out.println("Starting Dutch wordlist scoring...");

    // Read all words
    List<String> words = Files.readAllLines(Paths.get(INPUT_WORDLIST));
    System.out.printf("Loaded %d words from %s%n", words.size(), INPUT_WORDLIST);

    HttpClient client = HttpClient.newBuilder()
        .connectTimeout(java.time.Duration.ofSeconds(30))
        .build();
    List<WordScore> allScores = new ArrayList<>();
    final int total = words.size();

    // Process in batches
    for (int start = 0; start < total; start += BATCH_SIZE) {
        final int end = Math.min(start + BATCH_SIZE, total);
        final int firstWord = start + 1; // 1-based, for messages only
        List<String> batch = words.subList(start, end);
        System.out.printf("Processing batch %d-%d...%n", firstWord, end);

        boolean done = false;
        int failures = 0;
        while (!done && failures < MAX_RETRIES) {
            try {
                allScores.addAll(processBatch(client, batch));
                done = true;
                // Rate limiting: pause unless this was the final batch
                if (end < total) {
                    Thread.sleep(RATE_LIMIT_DELAY_MS);
                }
            } catch (Exception e) {
                failures++;
                System.err.printf("Batch %d-%d failed (attempt %d/%d): %s%n",
                    firstWord, end, failures, MAX_RETRIES, e.getMessage());
                if (failures < MAX_RETRIES) {
                    // Linear back-off: 2 s after the first failure, 4 s after the second
                    Thread.sleep(2000 * failures);
                } else {
                    System.err.println("Max retries reached, skipping batch");
                    // Add placeholder scores for the failed batch to maintain alignment
                    for (String w : batch) {
                        allScores.add(new WordScore(w, -1, "FAILED"));
                    }
                }
            }
        }
    }

    // Write results
    writeScoresToCsv(allScores);
    System.out.printf("Completed! Scored %d words. Results saved to %s%n",
        allScores.size(), OUTPUT_SCORES);
}
/**
 * Upper bound for one complete request/response exchange. Without it a server that
 * accepts the connection but never answers would block the run forever, because the
 * client's connect timeout does not cover the response.
 */
private static final java.time.Duration REQUEST_TIMEOUT = java.time.Duration.ofMinutes(10);

/**
 * Sends one batch of words to the configured endpoint and parses the reply.
 *
 * @param client HTTP client to send the request with
 * @param batch  words to score
 * @return one result per word in {@code batch}, in the same order
 * @throws RuntimeException if the server answers with a status other than 200
 * @throws Exception        if sending fails, the request times out, the thread is
 *                          interrupted, or the reply cannot be read
 */
public static List<WordScore> processBatch(HttpClient client, List<String> batch) throws Exception {
    String prompt = createScoringPrompt(batch);
    // Build JSON request
    String jsonRequest = buildChatRequestJson(prompt);
    HttpRequest request = HttpRequest.newBuilder()
        .uri(URI.create(OLLAMA_ENDPOINT))
        .timeout(REQUEST_TIMEOUT)
        .header("Content-Type", "application/json")
        .POST(HttpRequest.BodyPublishers.ofString(jsonRequest))
        .build();
    HttpResponse<String> response = client.send(request,
        HttpResponse.BodyHandlers.ofString());
    if (response.statusCode() != 200) {
        throw new RuntimeException("HTTP " + response.statusCode() + ": " + response.body());
    }
    // Parse response
    String responseBody = response.body();
    String assistantReply = extractMessageContent(responseBody);
    return parseScoresFromReply(batch, assistantReply);
}
/**
 * Builds the (Dutch) instruction text followed by the words to score, one per line.
 *
 * @param words words of one batch
 * @return the complete prompt; it ends with the last word and has no trailing newline
 */
private static String createScoringPrompt(List<String> words) {
    String instructions = String.join("\n",
        "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.",
        "Score criteria:",
        "- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon",
        "- 10 = Zeer algemeen, dagelijks gebruikt door iedereen",
        "Geef ALLEEN een lijst in dit exacte formaat, niets anders:",
        "woord1:score",
        "woord2:score",
        "enz.",
        "Woorden om te scoren:",
        ""); // trailing empty element yields the newline before the first word
    return instructions + String.join("\n", words);
}
/**
 * Builds the request body: a single user message carrying {@code prompt}, with
 * streaming switched off.
 * The prompt is passed through {@code escapeJson}; {@code MODEL} is inserted as-is.
 *
 * @param prompt unescaped prompt text
 * @return the JSON document as a string
 */
private static String buildChatRequestJson(String prompt) {
// Simple JSON building (in production use a library like Jackson)
// NOTE(review): "temperature" is sent as a top-level field; Ollama documents sampling
// parameters under "options" - confirm that this value actually takes effect.
return String.format("""
{
"model": "%s",
"messages": [
{
"role": "user",
"content": "%s"
}
],
"stream": false,
"temperature": 0.1
}
""", MODEL, escapeJson(prompt));
}
/**
 * Escapes backslash, double quote, newline, carriage return and tab so the text can be
 * placed inside a JSON string literal. All other characters are copied unchanged.
 *
 * @param str text to escape; must not be null
 * @return the escaped text, without surrounding quotes
 */
private static String escapeJson(String str) {
    StringBuilder escaped = new StringBuilder(str.length());
    for (char c : str.toCharArray()) {
        switch (c) {
            case '\\' -> escaped.append("\\\\");
            case '"' -> escaped.append("\\\"");
            case '\n' -> escaped.append("\\n");
            case '\r' -> escaped.append("\\r");
            case '\t' -> escaped.append("\\t");
            default -> escaped.append(c);
        }
    }
    return escaped.toString();
}
/**
 * Extracts and decodes the first {@code "content"} string of a chat response, e.g.
 * {@code "message":{"role":"assistant","content":"..."}}.
 * <p>
 * The previous implementation cut the value at the first quote character (so a reply
 * containing an escaped quote was truncated) and, when the key was absent, returned an
 * arbitrary slice of the body. This version walks the string escape-aware, tolerates
 * whitespace around the colon, decodes the standard JSON escapes, and fails loudly when
 * there is no usable content so the caller's retry logic can react.
 *
 * @param responseBody raw response body
 * @return the decoded reply text
 * @throws IllegalStateException if the body is null, has no string-valued content field,
 *                               or the string is malformed or unterminated
 */
private static String extractMessageContent(String responseBody) {
    if (responseBody == null) {
        throw new IllegalStateException("Response body is null");
    }
    final String key = "\"content\"";
    final int len = responseBody.length();
    int keyAt = responseBody.indexOf(key);
    if (keyAt < 0) {
        throw new IllegalStateException("No \"content\" field in response: " + preview(responseBody));
    }

    // Expect: optional whitespace, ':', optional whitespace, opening quote.
    int pos = skipWhitespace(responseBody, keyAt + key.length());
    if (pos >= len || responseBody.charAt(pos) != ':') {
        throw new IllegalStateException("Malformed \"content\" field in response: " + preview(responseBody));
    }
    pos = skipWhitespace(responseBody, pos + 1);
    if (pos >= len || responseBody.charAt(pos) != '"') {
        throw new IllegalStateException("\"content\" is not a string in response: " + preview(responseBody));
    }

    StringBuilder content = new StringBuilder();
    for (int i = pos + 1; i < len; i++) {
        char c = responseBody.charAt(i);
        if (c == '"') {
            return content.toString();
        }
        if (c != '\\') {
            content.append(c);
            continue;
        }
        if (++i >= len) {
            break; // backslash was the last character
        }
        char esc = responseBody.charAt(i);
        switch (esc) {
            case 'n' -> content.append('\n');
            case 't' -> content.append('\t');
            case 'r' -> content.append('\r');
            case 'b' -> content.append('\b');
            case 'f' -> content.append('\f');
            case 'u' -> {
                // unicode escape: exactly four hex digits follow
                if (i + 4 >= len) {
                    throw new IllegalStateException("Truncated unicode escape in response content");
                }
                int code = 0;
                for (int h = 1; h <= 4; h++) {
                    int digit = Character.digit(responseBody.charAt(i + h), 16);
                    if (digit < 0) {
                        throw new IllegalStateException("Malformed unicode escape in response content");
                    }
                    code = code * 16 + digit;
                }
                content.append((char) code);
                i += 4;
            }
            default -> content.append(esc); // escaped quote, backslash or slash
        }
    }
    throw new IllegalStateException("Unterminated \"content\" string in response: " + preview(responseBody));
}

/** Returns the index of the first non-whitespace character at or after {@code from}. */
private static int skipWhitespace(String s, int from) {
    int i = from;
    while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
        i++;
    }
    return i;
}

/** Shortens a response body to at most 200 characters for use in error messages. */
private static String preview(String body) {
    return body.length() <= 200 ? body : body.substring(0, 200) + "...";
}
/**
 * Turns a {@code word:score} reply into one result per expected word.
 * <p>
 * Each reply line is split at its first colon; the right-hand side must be a plain
 * integer (after trimming) and is clamped to 1..10. Other lines are ignored. Words are
 * matched case-insensitively; a later line for the same word replaces an earlier one.
 *
 * @param expectedWords words that were sent, in request order
 * @param reply         raw reply text
 * @return results in the order of {@code expectedWords}; words without a usable line get
 *         score -1 and status MISSING, and a warning is printed to stderr
 */
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
    Map<String, Integer> parsed = new HashMap<>();
    for (String rawLine : reply.split("\n")) {
        String line = rawLine.trim();
        int sep = line.indexOf(':');
        if (sep < 0) {
            continue;
        }
        String word = line.substring(0, sep).trim().toLowerCase();
        try {
            int score = Integer.parseInt(line.substring(sep + 1).trim());
            parsed.put(word, Math.max(1, Math.min(10, score)));
        } catch (NumberFormatException e) {
            // Skip invalid lines
        }
    }

    // Match scores to original words (maintaining order)
    List<WordScore> results = new ArrayList<>();
    for (String word : expectedWords) {
        Integer score = parsed.get(word.toLowerCase());
        if (score == null) {
            System.err.printf("Warning: No score found for '%s'%n", word);
            results.add(new WordScore(word, -1, "MISSING"));
            continue;
        }
        results.add(new WordScore(word, score, "OK"));
    }
    return results;
}
/**
 * Writes a header line plus one {@code word,score,status} row per score to the output
 * file, replacing whatever the file held before.
 *
 * @param scores rows to write, in output order
 * @throws Exception if the file cannot be written
 */
private static void writeScoresToCsv(List<WordScore> scores) throws Exception {
    List<String> rows = new ArrayList<>(scores.size() + 1);
    rows.add("word,score,status");
    scores.forEach(ws -> rows.add(String.format("%s,%d,%s", ws.word, ws.score, ws.status)));
    Files.write(Paths.get(OUTPUT_SCORES), rows);
}
// ===== DATA CLASS =====
/**
 * Mutable result row for one word.
 * {@code endpoint} and {@code batchId} are optional bookkeeping; the short constructor
 * leaves them at {@code null} and {@code 0}.
 */
static class WordScore {
    String word;
    int score;
    String status;
    String endpoint;
    int batchId;

    /** Creates a row without endpoint information. */
    WordScore(String word, int score, String status) {
        this(word, score, status, null, 0);
    }

    /** Creates a fully populated row. */
    WordScore(String word, int score, String status, String endpoint, int batchId) {
        this.word = word;
        this.score = score;
        this.status = status;
        this.endpoint = endpoint;
        this.batchId = batchId;
    }
}
}

View File

@@ -1,829 +0,0 @@
package puzzle;
import org.w3c.dom.*;
import javax.net.ssl.*;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.*;
import java.net.http.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.security.SecureRandom;
import java.security.cert.X509Certificate;
import java.text.Normalizer;
import java.time.LocalDate;
import java.util.*;
public class ThemePoolBuilder {
/** RSS feeds used unless --feeds is given; initial value of {@code Opts.feeds}. */
private static final List<String> DEFAULT_FEEDS = List.of(
"https://feeds.nos.nl/nosnieuwsalgemeen",
"https://feeds.nos.nl/nosnieuwstech"
);
/**
 * Extra tokens that {@code loadLexicon} adds after the word-list file.
 * The list contains repeated entries; {@code loadLexicon} skips tokens it has already seen.
 */
private static final List<String> DEFAULT_SHORTS = List.of(
"EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL",
"UUR", "MIN", "TV", "GPS", "AI", "IT", "CPU", "GPU",
"ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR",
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
"PVV", "VVD", "CDA", "FNV","EN","IN","OP","OM","TE","ER","DE","HET","EEN","VAN","MET","NOG","OOK","MAAR","WEL","NIET",
"HOE","ALS","EEN",
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
"ZO", "DO", "WO", "VR", "ZO", "MO", "WA", "WE", "TAAL",
"LAND", "GEMEENTE", "STAAT", "BUREAU", "HUIS", "SCHOOL", "STR", "BAAN",
"WERK", "KLUS",
"FONDS", "RAAD", "CONGRESS", "GROEP", "STRAAT", "BRUG", "PARK",
"BUURT",
"BOUW", "HOTEL", "CAFE", "BAR",
"BIJBAAN", "STUDENT", "DOCENT",
"WINKEL", "MARKT", "KIOSK", "AUTO", "MOBILE", "FIETS", "SCOOTER",
// abbreviations (they are forced to A-Z anyway)
"DHR","MEVR","DR","ST","CA","IVM","MBT","TAV","TOV","DWZ","MAW","OA","TM",
"EU","VS","NAVO","NOS","NS","ANWB","KVK","BTW","BRP","CBS","NPO","RTL","RIVM",
// Roman numerals (original note read "28 tekens"; presumably "2-8 characters" with a lost dash)
"II","III","IV","VI","VII","VIII","IX",
"XI","XII","XIII","XIV","XV","XVI","XVII","XVIII","XIX","XX"
);
/** User-Agent header value sent by the curl-based helpers. */
private static final String BROWSER_UA =
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
/**
 * Command-line options with their defaults; filled in by {@code parseArgs}.
 */
static final class Opts {
// --words: path of the word-list file
String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
// --endpoint: base URL of the chat API
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
// --feeds: comma-separated RSS feed URLs (mutable copy of the defaults)
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
// --out: output directory
String outDir = "./out";
// --bridge / --theme / --related: pool sizes (how they are applied is outside this part of the file)
int bridgeN = 52000;
int themeN = 800;
int relatedN = 2200;
// --items: number of items taken per feed
int rssItemsPerFeed = 10;
// --model: model id
String model = "mistralai/mistral-nemo-instruct-2407";
// --timeout: passed to curl as --max-time by the JSON helpers
int timeoutSeconds = 180; // LM Studio needs more time for generation
// --retries: number of attempts made by the curl JSON helpers
int retries = 2;
}
/**
 * Parses the command line into an {@code Opts} instance, starting from the defaults.
 * <p>
 * Every option except {@code -h}/{@code --help} takes exactly one value. An option that
 * appears last without a value is now rejected with a clear message; previously this led
 * to a {@code NullPointerException} or silently stored {@code null}.
 * {@code --help} prints the usage text and exits the JVM.
 *
 * @param args raw command-line arguments
 * @return the populated options
 * @throws IllegalArgumentException for an unknown option, a missing value, or a
 *                                  non-numeric value where a number is expected
 */
static Opts parseArgs(String[] args) {
    var o = new Opts();
    for (var i = 0; i < args.length; i++) {
        var a = args[i];
        switch (a) {
            case "-h", "--help" -> {
                System.out.println("""
                    Usage:
                    java puzzle.ThemePoolBuilder --words WORDS.txt [options]
                    Options:
                    --endpoint http://HOST:1234/v1 (LM Studio)
                    --feeds url1,url2
                    --out ./out
                    --bridge 5000
                    --theme 300
                    --related 1200
                    --items 20 (per feed)
                    --model <id> (recommended; skips /v1/models)
                    --timeout 60 (seconds)
                    --retries 4
                    """);
                System.exit(0);
            }
            // ++i consumes the option's value so the loop continues with the next option
            case "--words" -> o.wordsPath = requireValue(args, ++i, a);
            case "--endpoint" -> o.endpoint = requireValue(args, ++i, a);
            case "--feeds" -> o.feeds = Arrays.asList(requireValue(args, ++i, a).split(","));
            case "--out" -> o.outDir = requireValue(args, ++i, a);
            case "--bridge" -> o.bridgeN = Integer.parseInt(requireValue(args, ++i, a));
            case "--theme" -> o.themeN = Integer.parseInt(requireValue(args, ++i, a));
            case "--related" -> o.relatedN = Integer.parseInt(requireValue(args, ++i, a));
            case "--items" -> o.rssItemsPerFeed = Integer.parseInt(requireValue(args, ++i, a));
            case "--model" -> o.model = requireValue(args, ++i, a);
            case "--timeout" -> o.timeoutSeconds = Integer.parseInt(requireValue(args, ++i, a));
            case "--retries" -> o.retries = Integer.parseInt(requireValue(args, ++i, a));
            default -> throw new IllegalArgumentException("Unknown arg: " + a);
        }
    }
    if (o.wordsPath == null) throw new IllegalArgumentException("--words is required");
    return o;
}

/** Returns {@code args[index]}, or fails with a message naming the option that lacks a value. */
private static String requireValue(String[] args, int index, String option) {
    if (index >= args.length) {
        throw new IllegalArgumentException("Missing value for " + option);
    }
    return args[index];
}
/**
 * Tells whether {@code s} consists solely of the upper-case letters A to Z.
 * The empty string qualifies.
 */
static boolean isAZ(String s) {
    return s.chars().allMatch(c -> c >= 'A' && c <= 'Z');
}
/**
 * Reduces raw text to an upper-case A-Z token: trims, strips diacritics (NFD
 * decomposition, combining marks removed), upper-cases with the root locale and drops
 * every remaining character outside A-Z.
 *
 * @param raw input text, may be null
 * @return the token, or {@code null} when the input is null or blank or the resulting
 *         token is shorter than 2 or longer than 8 letters
 */
static String normalizeDutchToken(String raw) {
    if (raw == null) return null;
    var trimmed = raw.trim();
    if (trimmed.isEmpty()) return null;
    var token = Normalizer.normalize(trimmed, Normalizer.Form.NFD)
        .replaceAll("\\p{M}+", "")
        .toUpperCase(Locale.ROOT)
        .replaceAll("[^A-Z]", "");
    var lengthOk = token.length() >= 2 && token.length() <= 8;
    return (lengthOk && isAZ(token)) ? token : null;
}
/**
 * Removes tags from an HTML fragment, decodes common character entities and collapses
 * whitespace.
 * <p>
 * {@code &amp;} is decoded last. Decoding it first (as before) turned an escaped entity
 * such as {@code &amp;lt;} into {@code &lt;} and then into {@code <}, i.e. decoded it
 * twice. {@code &nbsp;}, {@code &quot;}, {@code &#39;} and {@code &apos;} are handled as
 * well, so their names no longer survive as text.
 *
 * @param s HTML fragment, may be null
 * @return plain text; empty when {@code s} is null
 */
static String stripHtml(String s) {
    if (s == null) return "";
    var text = s.replaceAll("<[^>]+>", " ");
    text = text.replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&"); // must stay last, see above
    return text.replaceAll("\\s+", " ").trim();
}
/**
 * Weight per upper-case letter, 0 to 10; {@code crossabilityScore} adds up the weight of
 * every character of a word. All 26 letters A-Z have an entry.
 */
static final Map<Character, Integer> LETTER_WEIGHT = Map.ofEntries(
Map.entry('E', 10), Map.entry('N', 9), Map.entry('A', 9), Map.entry('R', 8),
Map.entry('I', 8), Map.entry('O', 7), Map.entry('S', 7), Map.entry('T', 7),
Map.entry('D', 6), Map.entry('L', 6), Map.entry('K', 5), Map.entry('M', 5),
Map.entry('U', 5), Map.entry('P', 4), Map.entry('G', 4), Map.entry('H', 4),
Map.entry('V', 4), Map.entry('B', 3), Map.entry('W', 3),
Map.entry('C', 2), Map.entry('F', 2), Map.entry('Z', 2),
Map.entry('J', 1), Map.entry('Y', 1), Map.entry('Q', 0), Map.entry('X', 0)
);
/** Tells whether {@code ch} is one of the upper-case vowels A, E, I, O, U. */
static boolean isVowel(char ch) {
    return "AEIOU".indexOf(ch) >= 0;
}
/**
 * Heuristic score for a word: the sum of its letters' weights, plus 8 when between 35%
 * and 65% of its letters are vowels, minus 6 when it contains Q or X, minus 2 when it
 * contains Y or J.
 *
 * @param w upper-case word
 * @return the score
 */
static int crossabilityScore(String w) {
    var total = 0;
    var vowelCount = 0;
    for (var letter : w.toCharArray()) {
        total += LETTER_WEIGHT.getOrDefault(letter, 2);
        if (isVowel(letter)) {
            vowelCount++;
        }
    }
    var vowelRatio = vowelCount / (double) w.length();
    var balanced = vowelRatio >= 0.35 && vowelRatio <= 0.65;
    if (balanced) {
        total += 8;
    }
    if (w.contains("Q") || w.contains("X")) {
        total -= 6;
    }
    if (w.contains("Y") || w.contains("J")) {
        total -= 2;
    }
    return total;
}
/**
 * Word list with per-word lookups, as built by {@code loadLexicon}. A word's id is its
 * index in {@code words}.
 *
 * @param words id -> word
 * @param idOf word -> id
 * @param score id -> crossability
 * @param byLen byLen[L] for L 0..8: bit i is set when word i has length L
 */
record Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) {
}
/**
 * Reads the word list and builds the lexicon.
 * Tokens come from three sources, in this order: the lines of the file, the built-in
 * short tokens, and a small extra list defined here. Each is normalized; tokens that
 * normalize to nothing or were already seen are skipped, so ids follow first appearance.
 *
 * @param path word-list file, read as UTF-8
 * @return the lexicon with score and length index filled in
 * @throws IOException if the file cannot be read
 */
static Lexicon loadLexicon(String path) throws IOException {
    var lines = Files.readAllLines(Path.of(path), StandardCharsets.UTF_8);
    var out = new ArrayList<String>(lines.size());
    var idOf = new HashMap<String, Integer>(lines.size() * 2);

    // extra short words (presumably "2-4 letters mostly"; the original note read "24")
    var extraShorts = List.of(
        "EN","IN","OP","OM","TE","ER","DE","HET","EEN","VAN","MET",
        "AL","NU","ZO","TO","NA","BIJ","TOT","ALS","DAN","WAT","DAT",
        "IK","JE","WE","WIJ","JIJ","ZIJ","HIJ","HEN","ONS","JOU",
        "EIS","WET","RAAD","PLAN","TEAM","MAAT"
    );

    // master lexicon first, then the injected tokens
    var candidates = new ArrayList<String>(lines.size() + DEFAULT_SHORTS.size() + extraShorts.size());
    candidates.addAll(lines);
    candidates.addAll(DEFAULT_SHORTS);
    candidates.addAll(extraShorts);
    for (var raw : candidates) {
        var token = normalizeDutchToken(raw);
        if (token == null) {
            continue;
        }
        // putIfAbsent returns null only for a token not seen before
        if (idOf.putIfAbsent(token, out.size()) == null) {
            out.add(token);
        }
    }

    var count = out.size();
    var score = new int[count];
    var byLen = new BitSet[9];
    for (var len = 0; len < byLen.length; len++) {
        byLen[len] = new BitSet(count);
    }
    for (var id = 0; id < count; id++) {
        var word = out.get(id);
        score[id] = crossabilityScore(word);
        byLen[word.length()].set(id);
    }
    return new Lexicon(out, idOf, score, byLen);
}
// ---------------- RSS via curl (browser-like) ----------------
/** Immutable holder for the title and description text of one RSS item. */
static final class RssItem {
final String title;
final String desc;
RssItem(String title, String desc) {
this.title = title;
this.desc = desc;
}
}
/**
 * Returns the text content of the first descendant element named {@code tag}.
 *
 * @param parent element to search below
 * @param tag    element name to look for
 * @return the text content, or {@code null} when there is no such element
 */
static String textOfFirst(Element parent, String tag) {
    var matches = parent.getElementsByTagName(tag);
    return matches.getLength() == 0 ? null : matches.item(0).getTextContent();
}
/**
 * Downloads an RSS feed with {@code curl}, sending browser-like request headers, and
 * returns the first {@code limit} items with HTML stripped from title and description.
 * <p>
 * The feed is untrusted input, so the XML parser is configured with secure processing
 * and with external entities and external DTD loading switched off (XXE hardening).
 *
 * @param url            feed URL
 * @param limit          maximum number of items to return
 * @param timeoutSeconds value passed to curl's {@code --max-time}
 * @return the parsed items, in feed order
 * @throws Exception if curl fails or exits non-zero, the wait is interrupted, or the
 *                   response is not well-formed XML
 */
static List<RssItem> fetchRssViaCurlBrowser(String url, int limit, int timeoutSeconds) throws Exception {
    List<String> cmd = new ArrayList<>();
    cmd.add("curl");
    cmd.add("-fsSL");
    cmd.add("-L");
    cmd.add("--compressed");
    cmd.add("--connect-timeout");
    cmd.add("10");
    cmd.add("--max-time");
    cmd.add(String.valueOf(timeoutSeconds));
    cmd.add("--retry");
    cmd.add("5");
    cmd.add("--retry-all-errors");
    cmd.add("--retry-delay");
    cmd.add("1");
    cmd.add("-H");
    cmd.add("User-Agent: " + BROWSER_UA);
    cmd.add("-H");
    cmd.add("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    cmd.add("-H");
    cmd.add("Accept-Language: nl-NL,nl;q=0.9,en;q=0.7");
    cmd.add("-H");
    cmd.add("Cache-Control: no-cache");
    cmd.add("-H");
    cmd.add("Pragma: no-cache");
    cmd.add("-H");
    cmd.add("Sec-Fetch-Dest: document");
    cmd.add("-H");
    cmd.add("Sec-Fetch-Mode: navigate");
    cmd.add("-H");
    cmd.add("Sec-Fetch-Site: none");
    cmd.add("-H");
    cmd.add("Sec-Fetch-User: ?1");
    cmd.add(url);

    var p = new ProcessBuilder(cmd)
        .redirectErrorStream(true)
        .start();
    var bytes = p.getInputStream().readAllBytes();
    var code = p.waitFor();
    if (code != 0) {
        throw new IOException("curl RSS failed (" + code + ") url=" + url + " output=" +
            new String(bytes, StandardCharsets.UTF_8));
    }

    try (InputStream is = new ByteArrayInputStream(bytes)) {
        var dbf = DocumentBuilderFactory.newInstance();
        // XXE hardening; feature URIs are given as strings (the first one is the value of
        // XMLConstants.FEATURE_SECURE_PROCESSING) so no additional import is needed.
        dbf.setFeature("http://javax.xml.XMLConstants/feature/secure-processing", true);
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbf.setXIncludeAware(false);
        var doc = dbf.newDocumentBuilder().parse(is);

        var items = doc.getElementsByTagName("item");
        var out = new ArrayList<RssItem>();
        for (var i = 0; i < items.getLength() && out.size() < limit; i++) {
            var item = (Element) items.item(i);
            var title = textOfFirst(item, "title");
            var desc = textOfFirst(item, "description");
            // Missing elements become empty strings so RssItem never carries null.
            if (title == null) title = "";
            if (desc == null) desc = "";
            out.add(new RssItem(stripHtml(title), stripHtml(desc)));
        }
        return out;
    }
}
// ---------------- LM Studio (OpenAI-compatible) ----------------
/**
 * Build stable API URLs:
 * - Accepts endpoint with or without /v1 (and with or without a trailing slash)
 * - Accepts path with or without a leading slash and with or without /v1
 * - Ensures exactly one /v1 prefix
 * <p>
 * The previous version prepended "/" instead of "/v1" to paths lacking the prefix, which
 * produced URLs such as {@code http://host//models} with no /v1 at all.
 *
 * @param endpointArg base URL as given by the user
 * @param path        API path, e.g. {@code models} or {@code /v1/chat/completions}
 * @return {@code <base>/v1/<path>}
 */
static String apiUrl(String endpointArg, String path) {
    var base = endpointArg.trim();
    if (base.endsWith("/")) base = base.substring(0, base.length() - 1);
    if (base.endsWith("/v1")) base = base.substring(0, base.length() - 3);
    if (!path.startsWith("/")) path = "/" + path;
    if (!path.equals("/v1") && !path.startsWith("/v1/")) path = "/v1" + path;
    return base + path;
}
/**
 * Creates an {@code HttpClient}, working around JVMs whose default trust store is broken.
 * <p>
 * The normal builder is tried first. If it throws a runtime exception, a second client is
 * built with the SSL context from {@code insecureSslContext()}, which performs NO
 * certificate validation.
 * NOTE(review): the original comment says the client is only used over plain HTTP; the
 * callers are not visible here, so confirm that before relying on the fallback.
 *
 * @param timeoutSeconds connect timeout in seconds; values below 10 are raised to 10
 * @return a usable client
 * @throws RuntimeException if neither attempt succeeds
 */
static HttpClient buildHttpClient(int timeoutSeconds) {
    var connectTimeout = java.time.Duration.ofSeconds(Math.max(10, timeoutSeconds));
    try {
        return HttpClient.newBuilder()
            .connectTimeout(connectTimeout)
            .build();
    } catch (RuntimeException ignored) {
        // default SSL setup failed - fall back to the permissive context below
    }
    try {
        var permissiveSsl = insecureSslContext();
        return HttpClient.newBuilder()
            .connectTimeout(connectTimeout)
            .sslContext(permissiveSsl)
            .build();
    } catch (Exception e) {
        throw new RuntimeException("Could not initialize HttpClient. Fix Java truststore or use curl for all HTTP.", e);
    }
}
/**
 * Creates a TLS context whose trust manager accepts every certificate chain.
 * This disables server authentication entirely; use it only as a last-resort fallback.
 *
 * @return an initialized TLS context
 * @throws Exception if the TLS context cannot be created or initialized
 */
static SSLContext insecureSslContext() throws Exception {
    TrustManager acceptEverything = new X509TrustManager() {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) {
            // intentionally no validation
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) {
            // intentionally no validation
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    };
    var context = SSLContext.getInstance("TLS");
    context.init(null, new TrustManager[]{acceptEverything}, new SecureRandom());
    return context;
}
/**
 * Pauses before a retry: 300 ms for attempt 1, doubling per attempt, capped at 3 s.
 * <p>
 * If the thread is interrupted the pause ends early and the interrupt flag is set again,
 * so callers can still notice the interruption; it used to be swallowed.
 *
 * @param attempt 1-based number of the attempt that just failed
 */
static void sleepBackoff(int attempt) {
    var ms = (long) (300L * Math.pow(2, attempt - 1)); // 300, 600, 1200, ...
    try {
        Thread.sleep(Math.min(ms, 3000));
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
/**
 * GET JSON via curl (Java HttpClient has network stack issues on some systems).
 * <p>
 * Makes {@code o.retries} attempts, but always at least one: with a retry count of zero
 * or less the old loop never ran and ended in {@code throw null}. An interrupt aborts
 * immediately, with the flag restored, instead of being retried like an I/O error.
 *
 * @param o   options supplying the timeout and the number of attempts
 * @param url URL to fetch
 * @return curl's combined stdout/stderr output, decoded as UTF-8
 * @throws Exception the failure of the last attempt, or {@code InterruptedException}
 */
static String curlGetJson(Opts o, String url) throws Exception {
    var attempts = Math.max(1, o.retries);
    Exception last = null;
    for (var attempt = 1; attempt <= attempts; attempt++) {
        try {
            List<String> cmd = new ArrayList<>();
            cmd.add("curl");
            cmd.add("-fsSL");
            cmd.add("--connect-timeout");
            cmd.add("10");
            cmd.add("--max-time");
            cmd.add(String.valueOf(o.timeoutSeconds));
            cmd.add("--retry");
            cmd.add("3");
            cmd.add("--retry-all-errors");
            cmd.add("--retry-delay");
            cmd.add("1");
            cmd.add("-H");
            cmd.add("Accept: application/json");
            cmd.add("-H");
            cmd.add("User-Agent: " + BROWSER_UA);
            cmd.add(url);
            var p = new ProcessBuilder(cmd)
                .redirectErrorStream(true)
                .start();
            var bytes = p.getInputStream().readAllBytes();
            var code = p.waitFor();
            if (code != 0) {
                throw new IOException("curl GET failed (" + code + ") url=" + url + "\nOutput:\n" +
                    new String(bytes, StandardCharsets.UTF_8));
            }
            return new String(bytes, StandardCharsets.UTF_8);
        } catch (InterruptedException e) {
            // Someone asked this thread to stop: do not start another attempt.
            Thread.currentThread().interrupt();
            throw e;
        } catch (Exception e) {
            last = e;
            if (attempt < attempts) sleepBackoff(attempt);
        }
    }
    // Reached only after at least one failed attempt, so 'last' is never null here.
    throw last;
}
/**
 * POST JSON via curl (Java HttpClient has network stack issues on some systems).
 * <p>
 * The body is handed to curl through a temporary file, deleted after each attempt.
 * Makes {@code o.retries} attempts, but always at least one: with a retry count of zero
 * or less the old loop never ran and ended in {@code throw null}. An interrupt aborts
 * immediately, with the flag restored, instead of being retried like an I/O error.
 *
 * @param o        options supplying the timeout and the number of attempts
 * @param url      URL to post to
 * @param jsonBody request body
 * @return curl's combined stdout/stderr output, decoded as UTF-8
 * @throws Exception the failure of the last attempt, or {@code InterruptedException}
 */
static String curlPostJson(Opts o, String url, String jsonBody) throws Exception {
    var attempts = Math.max(1, o.retries);
    Exception last = null;
    for (var attempt = 1; attempt <= attempts; attempt++) {
        try {
            System.out.println(" Attempt " + attempt + "/" + attempts + " via curl...");
            // Write JSON body to temp file to avoid shell escaping issues
            var tempFile = Files.createTempFile("lm-request-", ".json");
            try {
                Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
                List<String> cmd = new ArrayList<>();
                cmd.add("curl");
                cmd.add("-fsSL");
                cmd.add("--connect-timeout");
                cmd.add("10");
                cmd.add("--max-time");
                cmd.add(String.valueOf(o.timeoutSeconds));
                cmd.add("--retry");
                cmd.add("3");
                cmd.add("--retry-all-errors");
                cmd.add("--retry-delay");
                cmd.add("1");
                cmd.add("-H");
                cmd.add("Content-Type: application/json");
                cmd.add("-H");
                cmd.add("Accept: application/json");
                cmd.add("-H");
                cmd.add("User-Agent: " + BROWSER_UA);
                cmd.add("-d");
                cmd.add("@" + tempFile.toString());
                cmd.add(url);
                var p = new ProcessBuilder(cmd)
                    .redirectErrorStream(true)
                    .start();
                var bytes = p.getInputStream().readAllBytes();
                var code = p.waitFor();
                if (code != 0) {
                    throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" +
                        new String(bytes, StandardCharsets.UTF_8));
                }
                return new String(bytes, StandardCharsets.UTF_8);
            } finally {
                Files.deleteIfExists(tempFile);
            }
        } catch (InterruptedException e) {
            // Someone asked this thread to stop: do not start another attempt.
            Thread.currentThread().interrupt();
            throw e;
        } catch (Exception e) {
            System.err.println(" Error: " + e.getClass().getName() + ": " + e.getMessage());
            last = e;
            if (attempt < attempts) sleepBackoff(attempt);
        }
    }
    // Reached only after at least one failed attempt, so 'last' is never null here.
    throw last;
}
/**
 * Picks the first model id from a response shaped like {@code {"data":[{"id":"..."} ...]}}.
 *
 * <p>This is a lightweight text scan, not a JSON parser: it locates the first {@code "id"} key
 * after {@code "data"} and returns the raw characters of its string value (escape sequences are
 * not decoded).
 *
 * @param modelsJson response text, may be {@code null}
 * @return the id, or {@code null} when the input is {@code null}, the keys are missing, or the
 *         value of {@code "id"} is not a string
 */
static String pickModelId(String modelsJson) {
    if (modelsJson == null) return null;
    var data = modelsJson.indexOf("\"data\"");
    if (data < 0) return null;
    var id = modelsJson.indexOf("\"id\"", data);
    if (id < 0) return null;
    // Without this check a missing colon would restart the quote search at index 0.
    var colon = modelsJson.indexOf(':', id + 4);
    if (colon < 0) return null;
    // The value must be a string: only whitespace may sit between the colon and the opening quote.
    var q1 = colon + 1;
    while (q1 < modelsJson.length() && Character.isWhitespace(modelsJson.charAt(q1))) q1++;
    if (q1 >= modelsJson.length() || modelsJson.charAt(q1) != '"') return null;
    var q2 = modelsJson.indexOf('"', q1 + 1);
    if (q2 < 0) return null;
    return modelsJson.substring(q1 + 1, q2);
}
/**
 * Extracts the assistant {@code "content"} string from a chat/completions response
 * (best-effort text scan, not a full JSON parser).
 *
 * <p>The scan starts at {@code "choices"} when present (otherwise at the beginning), takes the
 * first {@code "content"} key after that point and decodes its string value. The standard JSON
 * escapes are decoded, including Unicode escapes (backslash, 'u', four hex digits); an unknown
 * or malformed escape yields the escaped character itself. An unterminated string yields
 * whatever was read up to the end of the input.
 *
 * @param json response text, may be {@code null}
 * @return the decoded content, or {@code null} when the input is {@code null}, no
 *         {@code "content"} key is found, or its value is not a string (for example
 *         {@code null})
 */
static String extractChatContent(String json) {
    if (json == null) return null;
    // Try to anchor near "choices"
    var choices = json.indexOf("\"choices\"");
    var key = json.indexOf("\"content\"", Math.max(choices, 0));
    if (key < 0) return null;
    var colon = json.indexOf(':', key);
    if (colon < 0) return null;
    // The value must be a string: only whitespace may sit between the colon and the opening quote.
    var q = colon + 1;
    while (q < json.length() && Character.isWhitespace(json.charAt(q))) q++;
    if (q >= json.length() || json.charAt(q) != '"') return null;

    var sb = new StringBuilder();
    var esc = false;
    for (var k = q + 1; k < json.length(); k++) {
        var ch = json.charAt(k);
        if (!esc) {
            if (ch == '\\') esc = true;
            else if (ch == '"') break;
            else sb.append(ch);
            continue;
        }
        esc = false;
        switch (ch) {
            case 'n' -> sb.append('\n');
            case 't' -> sb.append('\t');
            case 'r' -> sb.append('\r');
            case 'b' -> sb.append('\b');
            case 'f' -> sb.append('\f');
            case 'u' -> {
                // Decode four hex digits into one UTF-16 code unit.
                var unit = 0;
                var valid = k + 4 < json.length();
                for (var h = 1; valid && h <= 4; h++) {
                    var digit = Character.digit(json.charAt(k + h), 16);
                    if (digit < 0) valid = false;
                    else unit = (unit << 4) | digit;
                }
                if (valid) {
                    sb.append((char) unit);
                    k += 4;
                } else {
                    sb.append(ch);
                }
            }
            default -> sb.append(ch); // covers the quote, backslash and slash escapes
        }
    }
    return sb.toString();
}
/**
 * Parses a JSON array of strings out of model output.
 *
 * <p>Extra text around the array is accepted: only the span from the first '[' to the last ']'
 * is examined, and every double-quoted string inside it becomes one element. Anything between
 * strings (commas, whitespace, stray tokens) is ignored. The standard JSON escapes are decoded,
 * including Unicode escapes (backslash, 'u', four hex digits); an unknown or malformed escape
 * yields the escaped character itself. A string that is still open at the end of the span is
 * dropped.
 *
 * @param s model output, may be {@code null}
 * @return the strings in order of appearance; empty when {@code s} is {@code null} or contains
 *         no bracketed span
 */
static List<String> parseStringArray(String s) {
    if (s == null) return List.of();
    var a = s.indexOf('[');
    var b = s.lastIndexOf(']');
    if (a < 0 || b < 0 || b <= a) return List.of();
    var body = s.substring(a + 1, b);
    var out = new ArrayList<String>();
    var cur = new StringBuilder();
    boolean inString = false, esc = false;
    for (var i = 0; i < body.length(); i++) {
        var ch = body.charAt(i);
        if (!inString) {
            // Outside a string only an opening quote matters.
            if (ch == '"') {
                inString = true;
                cur.setLength(0);
                esc = false;
            }
            continue;
        }
        if (!esc) {
            if (ch == '\\') {
                esc = true;
            } else if (ch == '"') {
                out.add(cur.toString());
                inString = false;
            } else {
                cur.append(ch);
            }
            continue;
        }
        esc = false;
        switch (ch) {
            case 'n' -> cur.append('\n');
            case 't' -> cur.append('\t');
            case 'r' -> cur.append('\r');
            case 'b' -> cur.append('\b');
            case 'f' -> cur.append('\f');
            case 'u' -> {
                // Decode four hex digits into one UTF-16 code unit.
                var unit = 0;
                var valid = i + 4 < body.length();
                for (var h = 1; valid && h <= 4; h++) {
                    var digit = Character.digit(body.charAt(i + h), 16);
                    if (digit < 0) valid = false;
                    else unit = (unit << 4) | digit;
                }
                if (valid) {
                    cur.append((char) unit);
                    i += 4;
                } else {
                    cur.append(ch);
                }
            }
            default -> cur.append(ch); // covers the quote, backslash and slash escapes
        }
    }
    return out;
}
/**
 * Renders a Java string as a JSON value.
 *
 * <p>Backslash and double quote are backslash-escaped, the common control characters use their
 * short escapes, and every other character below U+0020 is written as a six-character Unicode
 * escape, because JSON does not allow raw control characters inside strings. All other
 * characters are copied unchanged.
 *
 * @param s text to quote, may be {@code null}
 * @return the quoted JSON string literal, or the bare token {@code null} when {@code s} is
 *         {@code null}
 */
static String jsonQuote(String s) {
    if (s == null) return "null";
    var sb = new StringBuilder(s.length() + 2);
    sb.append('"');
    for (var i = 0; i < s.length(); i++) {
        var ch = s.charAt(i);
        switch (ch) {
            case '\\', '"' -> sb.append('\\').append(ch);
            case '\n' -> sb.append("\\n");
            case '\r' -> sb.append("\\r");
            case '\t' -> sb.append("\\t");
            case '\b' -> sb.append("\\b");
            case '\f' -> sb.append("\\f");
            default -> {
                if (ch < 0x20) {
                    // Remaining control characters: backslash, 'u', then four hex digits.
                    sb.append("\\u00")
                      .append(Character.forDigit(ch >> 4, 16))
                      .append(Character.forDigit(ch & 0xF, 16));
                } else {
                    sb.append(ch);
                }
            }
        }
    }
    sb.append('"');
    return sb.toString();
}
/**
 * Asks the chat/completions endpoint for crossword theme words based on the news text.
 *
 * <p>A Dutch prompt is built from the requested word counts and at most the first 8000
 * characters of {@code rssText}, posted via {@link #curlPostJson}, and the JSON array in the
 * model's answer is parsed with {@link #parseStringArray}. The returned words are exactly what
 * the model produced; this method does not normalize or filter them.
 *
 * @param o       options; {@code themeN}, {@code relatedN} and {@code endpoint} are read here
 * @param modelId model identifier placed in the request
 * @param rssText news headlines/summaries; shorter texts are used in full, {@code null} is
 *                treated as empty
 * @return the strings found in the model's answer, possibly empty
 * @throws Exception if the request fails, or an {@link IOException} when no chat content can be
 *                   extracted from the response
 */
static List<String> llmThemeWords(Opts o, String modelId, String rssText) throws Exception {
    final var maxNewsChars = 8000;
    var news = (rssText == null) ? "" : rssText;
    if (news.length() > maxNewsChars) {
        var cut = maxNewsChars;
        // Never cut between the two halves of a surrogate pair; a lone half cannot be UTF-8 encoded.
        if (Character.isHighSurrogate(news.charAt(cut - 1))) cut--;
        news = news.substring(0, cut);
    }
    var prompt = """
        Je genereert woorden voor een Nederlandse kruiswoordpuzzel.
        Regels:
        - Output MOET exact één JSON array zijn: ["WOORD", ...]
        - Alleen A-Z, 2-8 letters woorden
        - Geen spaties, streepjes, cijfers, accenten, apostrofs, punten
        - Geen duplicaten
        - Focus op zelfstandige naamwoorden/termen uit het nieuws en relevante Zweedse kruiswoordpuzzel koppelwoorden in het thema.
        - Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d).
        - Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk.
        Nieuws (koppen/samenvattingen):
        %s
        """.formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), news);
    var body = """
        {
        "model": %s,
        "messages": [
        {"role":"system","content":"Je bent een strikte JSON generator. Antwoord ALLEEN met een JSON array van strings."},
        {"role":"user","content": %s}
        ],
        "temperature": 0.35,
        "max_tokens": 20000
        }
        """.formatted(jsonQuote(modelId), jsonQuote(prompt));
    var url = apiUrl(o.endpoint, "/chat/completions");
    System.out.println("LM Studio POST: " + url);
    System.out.println("Request body length: " + body.length() + " bytes");
    var resp = curlPostJson(o, url, body);
    var content = extractChatContent(resp);
    if (content == null) {
        throw new IOException("Could not extract chat content from LM Studio response.\n--- response ---\n" + resp);
    }
    return parseStringArray(content);
}
// ---------------- Pool building ----------------
/**
 * Marks the highest-scoring lexicon entries.
 *
 * <p>Word indices are ordered by descending {@code lex.score}; the sort is stable, so entries
 * with equal scores keep their index order. The first {@code bridgeN} of them (or all, when the
 * lexicon is smaller) are set in the result.
 *
 * @param lex     lexicon providing {@code words} and {@code score}
 * @param bridgeN maximum number of entries to mark
 * @return bit set indexed by word id
 */
static BitSet buildBridgeBitmap(Lexicon lex, int bridgeN) {
    final int total = lex.words.size();

    // Boxed indices so that a comparator can be used.
    Integer[] ranked = new Integer[total];
    for (int idx = 0; idx < total; idx++) {
        ranked[idx] = idx;
    }
    Arrays.sort(ranked, (left, right) -> Integer.compare(lex.score[right], lex.score[left]));

    BitSet bridge = new BitSet(total);
    int rank = 0;
    while (rank < bridgeN && rank < total) {
        bridge.set(ranked[rank]);
        rank++;
    }
    return bridge;
}
/**
 * Builds a bit set of the lexicon ids that correspond to the given words.
 *
 * <p>Each word is passed through {@code normalizeDutchToken}; words for which that returns
 * {@code null}, and words that have no entry in {@code lex.idOf}, are ignored.
 *
 * @param lex   lexicon providing {@code words} (for sizing) and {@code idOf}
 * @param words raw candidate words
 * @return bit set indexed by word id
 */
static BitSet bitmapFromWords(Lexicon lex, Collection<String> words) {
    BitSet matches = new BitSet(lex.words.size());
    for (String candidate : words) {
        var token = normalizeDutchToken(candidate);
        if (token != null) {
            var wordId = lex.idOf.get(token);
            if (wordId != null) {
                matches.set(wordId);
            }
        }
    }
    return matches;
}
/**
 * Counts, for every word length from 2 to 8, how many set bits of {@code bs} are also set in
 * {@code lex.byLen[length]}.
 *
 * @param lex lexicon providing the {@code byLen} bit sets
 * @param bs  selection to count; it is not modified
 * @return map from length (2..8) to the number of selected entries of that length
 */
static Map<Integer, Integer> countsPerLen(Lexicon lex, BitSet bs) {
    Map<Integer, Integer> perLength = new HashMap<Integer, Integer>();
    int length = 2;
    while (length <= 8) {
        // Intersect on a copy so the caller's bit set stays untouched.
        BitSet overlap = (BitSet) bs.clone();
        overlap.and(lex.byLen[length]);
        perLength.put(length, overlap.cardinality());
        length++;
    }
    return perLength;
}
/**
 * Writes the words selected by {@code bs} to {@code path}, one per line, sorted in natural
 * {@code String} order and encoded as UTF-8. The file is created if missing and truncated if it
 * already exists.
 *
 * @param path target file
 * @param lex  lexicon providing {@code words}
 * @param bs   selection of word ids to write
 * @throws IOException if the file cannot be written
 */
static void writeWordList(Path path, Lexicon lex, BitSet bs) throws IOException {
    var selected = new ArrayList<String>(bs.cardinality());
    int bit = bs.nextSetBit(0);
    while (bit >= 0) {
        selected.add(lex.words.get(bit));
        bit = bs.nextSetBit(bit + 1);
    }
    selected.sort((left, right) -> left.compareTo(right));
    Files.write(
            path,
            selected,
            StandardCharsets.UTF_8,
            StandardOpenOption.CREATE,
            StandardOpenOption.TRUNCATE_EXISTING);
}
/**
 * Formats per-length counts as text, one line per length from 2 to 8.
 *
 * <p>Each line is a space, the length, a colon and space, the count, and a newline. Lengths
 * missing from the map are reported as 0.
 *
 * @param m counts keyed by word length
 * @return the formatted lines
 */
static String mapToLines(Map<Integer, Integer> m) {
    var lines = new StringBuilder();
    int length = 2;
    do {
        lines.append(" " + length + ": " + m.getOrDefault(length, 0) + "\n");
        length++;
    } while (length <= 8);
    return lines.toString();
}
// ---------------- Main ----------------
/**
 * Command-line entry point: builds the word pool for a themed puzzle.
 *
 * <p>Steps, in order: parse options and create the output directory; load the lexicon; fetch
 * every configured RSS feed and write the numbered titles/descriptions to {@code rss.txt}; pick
 * a model (the configured one, or the first listed by the endpoint); ask the model for theme
 * words and keep those present in the lexicon ({@code theme.txt}); combine the theme, bridge and
 * default-short selections into one pool; write {@code report.txt} and {@code pool.txt}.
 *
 * @param args command-line arguments, interpreted by {@code parseArgs}
 * @throws Exception if any step fails, including when no model id can be determined
 */
public static void main(String[] args) throws Exception {
    var opts = parseArgs(args);
    var outputDir = Path.of(opts.outDir);
    Files.createDirectories(outputDir);

    System.out.println("Loading lexicon...");
    var lexicon = loadLexicon(opts.wordsPath);
    System.out.println("Master words (2-8, A-Z): " + lexicon.words.size());

    // RSS via curl (browser-like)
    var items = new ArrayList<RssItem>();
    for (var rawFeed : opts.feeds) {
        var feedUrl = rawFeed.trim();
        if (!feedUrl.isEmpty()) {
            System.out.println("Fetching RSS: " + feedUrl);
            items.addAll(fetchRssViaCurlBrowser(feedUrl, opts.rssItemsPerFeed, opts.timeoutSeconds));
        }
    }

    // Numbered digest of all items: title line, then the description when it is not blank.
    var digest = new StringBuilder();
    for (var idx = 0; idx < items.size(); idx++) {
        var item = items.get(idx);
        digest.append(idx + 1).append(". ").append(item.title).append("\n");
        if (!item.desc.isBlank()) {
            digest.append(" ").append(item.desc).append("\n");
        }
    }
    Files.writeString(outputDir.resolve("rss.txt"), digest.toString(), StandardCharsets.UTF_8);

    // LM Studio via curl
    var modelId = opts.model;
    if (modelId == null) {
        var modelsUrl = apiUrl(opts.endpoint, "/models");
        System.out.println("LM Studio GET: " + modelsUrl);
        var modelsJson = curlGetJson(opts, modelsUrl);
        modelId = pickModelId(modelsJson);
        if (modelId == null) {
            throw new IOException("Could not auto-pick model id from /v1/models. Use --model <id>.\n--- /models ---\n" + modelsJson);
        }
    }
    System.out.println("Using model: " + modelId);
    System.out.println("Generating theme words via LM Studio...");
    var suggested = llmThemeWords(opts, modelId, digest.toString());

    // Normalize + keep only those present in master lexicon (first occurrence wins the order)
    var themeKept = new LinkedHashSet<String>();
    for (var rawWord : suggested) {
        var word = normalizeDutchToken(rawWord);
        if (word != null && lexicon.idOf.containsKey(word)) {
            themeKept.add(word);
        }
    }
    Files.write(outputDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8);

    // BitSets: the pool is the union of theme, bridge and default short words
    var themeBits = bitmapFromWords(lexicon, themeKept);
    var bridgeBits = buildBridgeBitmap(lexicon, opts.bridgeN);
    var shortBits = bitmapFromWords(lexicon, DEFAULT_SHORTS);
    var pool = new BitSet(lexicon.words.size());
    pool.or(themeBits);
    pool.or(bridgeBits);
    pool.or(shortBits);

    // Report
    var themeCounts = countsPerLen(lexicon, themeBits);
    var poolCounts = countsPerLen(lexicon, pool);
    var template = """
        Date: %s
        Feeds: %s
        Model: %s
        Master size: %d
        Theme kept (in master): %d
        Bridge size: %d
        Shorts kept: %d
        Pool total: %d
        Counts per length (theme):
        %s
        Counts per length (pool):
        %s
        """;
    var report = template.formatted(
            LocalDate.now(),
            String.join(", ", opts.feeds),
            modelId,
            lexicon.words.size(),
            themeBits.cardinality(),
            bridgeBits.cardinality(),
            shortBits.cardinality(),
            pool.cardinality(),
            mapToLines(themeCounts),
            mapToLines(poolCounts));
    Files.writeString(outputDir.resolve("report.txt"), report, StandardCharsets.UTF_8);
    System.out.println(report);

    // Output pool list
    var poolFile = outputDir.resolve("pool.txt");
    writeWordList(poolFile, lexicon, pool);
    System.out.println("Wrote: " + poolFile.toAbsolutePath());
}
}

32311
word_scores.csv Normal file

File diff suppressed because it is too large Load Diff