update them
This commit is contained in:
@@ -1,4 +1,9 @@
|
||||
package puzzle;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.time.LocalDate;
|
||||
|
||||
public class Main {
|
||||
// ---------------- CLI ----------------
|
||||
@@ -68,5 +73,72 @@ public class Main {
|
||||
System.out.printf("%s %s start=(%d,%d) arrow=(%d,%d)%n",
|
||||
w.word(), w.direction(), w.startRow(), w.startCol(), w.arrowRow(), w.arrowCol());
|
||||
}
|
||||
|
||||
// Export to JSON file
|
||||
var dateStr = LocalDate.now().toString();
|
||||
var theme = "algemeen";
|
||||
var filename = String.format("crossword_%s_%02d_%s.json", dateStr, 1, safeSlug(theme));
|
||||
var outDir = "data";
|
||||
var outputPath = Paths.get(outDir, filename);
|
||||
|
||||
try {
|
||||
Files.createDirectories(Paths.get(outDir));
|
||||
var json = toJson(out, dateStr, theme);
|
||||
Files.writeString(outputPath, json, StandardCharsets.UTF_8);
|
||||
System.out.println("\nSaved to: " + outputPath);
|
||||
} catch (IOException e) {
|
||||
System.err.println("Failed to write " + filename + ": " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private static String toJson(ExportFormat.ExportedPuzzle puzzle, String date, String theme) {
|
||||
var sb = new StringBuilder();
|
||||
sb.append("{\n");
|
||||
sb.append(" \"date\": \"").append(escapeJson(date)).append("\",\n");
|
||||
sb.append(" \"theme\": \"").append(escapeJson(theme)).append("\",\n");
|
||||
sb.append(" \"difficulty\": ").append(puzzle.difficulty()).append(",\n");
|
||||
sb.append(" \"rewards\": {\n");
|
||||
sb.append(" \"coins\": ").append(puzzle.rewards().coins()).append(",\n");
|
||||
sb.append(" \"stars\": ").append(puzzle.rewards().stars()).append(",\n");
|
||||
sb.append(" \"hints\": ").append(puzzle.rewards().hints()).append("\n");
|
||||
sb.append(" },\n");
|
||||
sb.append(" \"gridv2\": [\n");
|
||||
for (var i = 0; i < puzzle.gridv2().size(); i++) {
|
||||
sb.append(" \"").append(escapeJson(puzzle.gridv2().get(i))).append("\"");
|
||||
if (i < puzzle.gridv2().size() - 1) sb.append(",");
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append(" ],\n");
|
||||
sb.append(" \"words\": [\n");
|
||||
for (var i = 0; i < puzzle.words().size(); i++) {
|
||||
var w = puzzle.words().get(i);
|
||||
sb.append(" {\n");
|
||||
sb.append(" \"word\": \"").append(escapeJson(w.word())).append("\",\n");
|
||||
sb.append(" \"clue\": \"").append(escapeJson(w.clue())).append("\",\n");
|
||||
sb.append(" \"startRow\": ").append(w.startRow()).append(",\n");
|
||||
sb.append(" \"startCol\": ").append(w.startCol()).append(",\n");
|
||||
sb.append(" \"direction\": \"").append(escapeJson(w.direction())).append("\",\n");
|
||||
sb.append(" \"answer\": \"").append(escapeJson(w.answer())).append("\",\n");
|
||||
sb.append(" \"arrowRow\": ").append(w.arrowRow()).append(",\n");
|
||||
sb.append(" \"arrowCol\": ").append(w.arrowCol()).append("\n");
|
||||
sb.append(" }");
|
||||
if (i < puzzle.words().size() - 1) sb.append(",");
|
||||
sb.append("\n");
|
||||
}
|
||||
sb.append(" ]\n");
|
||||
sb.append("}\n");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String escapeJson(String s) {
|
||||
return s.replace("\\", "\\\\")
|
||||
.replace("\"", "\\\"")
|
||||
.replace("\n", "\\n")
|
||||
.replace("\r", "\\r")
|
||||
.replace("\t", "\\t");
|
||||
}
|
||||
|
||||
private static String safeSlug(String s) {
|
||||
return s.toLowerCase().replaceAll("[^a-z0-9]+", "-").replaceAll("^-|-$", "");
|
||||
}
|
||||
}
|
||||
@@ -5,6 +5,7 @@ import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
/**
|
||||
* SwedishGenerator.java
|
||||
@@ -108,6 +109,10 @@ public class SwedishGenerator {
|
||||
if (n >= a.length) a = Arrays.copyOf(a, a.length * 2);
|
||||
a[n++] = v;
|
||||
}
|
||||
void replaceAll(int[] newData) {
|
||||
this.a = newData;
|
||||
this.n = newData.length;
|
||||
}
|
||||
int size() { return n; }
|
||||
int[] data() { return a; } // note: may have extra capacity
|
||||
}
|
||||
@@ -124,6 +129,18 @@ public class SwedishGenerator {
|
||||
}
|
||||
}
|
||||
|
||||
static class WordDifficulty {
|
||||
|
||||
final String word;
|
||||
final int difficulty;
|
||||
|
||||
public WordDifficulty(String word) {
|
||||
this.word = word;
|
||||
// Simple heuristic for difficulty: shorter words have lower difficulty
|
||||
this.difficulty = -Math.min(40,word.length() * 5);
|
||||
}
|
||||
}
|
||||
|
||||
static final class Dict {
|
||||
|
||||
final ArrayList<String> words;
|
||||
@@ -135,7 +152,6 @@ public class SwedishGenerator {
|
||||
this.lenCounts = lenCounts;
|
||||
}
|
||||
}
|
||||
|
||||
static Dict loadWords(String wordsPath) {
|
||||
String raw;
|
||||
try {
|
||||
@@ -144,16 +160,26 @@ public class SwedishGenerator {
|
||||
raw = "EU\nUUR\nAUTO\nBOOM\nHUIS\nKAT\nZEE\nRODE\nDRAAD\nKENNIS\nNETWERK\nPAKTE\n";
|
||||
}
|
||||
|
||||
var words = new ArrayList<String>();
|
||||
var words = new ArrayList<WordDifficulty>();
|
||||
for (var line : raw.split("\\R")) {
|
||||
var s = line.trim().toUpperCase(Locale.ROOT);
|
||||
if (s.matches("^[A-Z]{2,8}$")) words.add(s);
|
||||
if (s.matches("^[A-Z]{2,8}$")) {
|
||||
words.add(new WordDifficulty(s));
|
||||
}
|
||||
}
|
||||
|
||||
// Sort words by difficulty in ascending order
|
||||
words.sort(Comparator.comparingInt(wd -> wd.difficulty));
|
||||
|
||||
var dictWords = new ArrayList<String>();
|
||||
for (var wd : words) {
|
||||
dictWords.add(wd.word);
|
||||
}
|
||||
|
||||
var index = new HashMap<Integer, DictEntry>();
|
||||
var lenCounts = new HashMap<Integer, Integer>();
|
||||
|
||||
for (var w : words) {
|
||||
for (var w : dictWords) {
|
||||
var L = w.length();
|
||||
lenCounts.put(L, lenCounts.getOrDefault(L, 0) + 1);
|
||||
|
||||
@@ -172,7 +198,7 @@ public class SwedishGenerator {
|
||||
}
|
||||
}
|
||||
|
||||
return new Dict(words, index, lenCounts);
|
||||
return new Dict(dictWords, index, lenCounts);
|
||||
}
|
||||
|
||||
static int[] intersectSorted(int[] a, int aLen, int[] b, int bLen) {
|
||||
@@ -195,7 +221,6 @@ public class SwedishGenerator {
|
||||
int[] indices; // null => unconstrained
|
||||
int count;
|
||||
}
|
||||
|
||||
static CandidateInfo candidateInfoForPattern(DictEntry entry, char[] pattern /* 0 means null */) {
|
||||
var lists = new ArrayList<IntList>();
|
||||
for (var i = 0; i < pattern.length; i++) {
|
||||
@@ -204,6 +229,7 @@ public class SwedishGenerator {
|
||||
lists.add(entry.pos[i][ch - 'A']);
|
||||
}
|
||||
}
|
||||
|
||||
var ci = new CandidateInfo();
|
||||
if (lists.isEmpty()) {
|
||||
ci.indices = null;
|
||||
@@ -211,8 +237,6 @@ public class SwedishGenerator {
|
||||
return ci;
|
||||
}
|
||||
|
||||
lists.sort(Comparator.comparingInt(IntList::size));
|
||||
|
||||
var first = lists.get(0);
|
||||
var cur = Arrays.copyOf(first.data(), first.size());
|
||||
var curLen = cur.length;
|
||||
@@ -230,6 +254,11 @@ public class SwedishGenerator {
|
||||
ci.count = curLen;
|
||||
return ci;
|
||||
}
|
||||
static int indexToDifficulty(DictEntry entry, int index) {
|
||||
var word = entry.words.get(index);
|
||||
return new WordDifficulty(word).difficulty;
|
||||
}
|
||||
|
||||
|
||||
// ---------------- Slots ----------------
|
||||
|
||||
@@ -751,11 +780,14 @@ public class SwedishGenerator {
|
||||
var L = idxs.length;
|
||||
var tries = Math.min(MAX_TRIES_PER_SLOT, L);
|
||||
|
||||
var start = (L == 1) ? 0 : rng.randint(0, L - 1);
|
||||
var step = (L <= 1) ? 1 : rng.randint(1, L - 1);
|
||||
|
||||
// When picking words from sorted indices, we want to favor the beginning
|
||||
// (lower difficulty) but still have some randomness.
|
||||
for (var t = 0; t < tries; t++) {
|
||||
var idx = idxs[(start + t * step) % L];
|
||||
// Power law or similar to favor lower indices:
|
||||
// pick a random double in [0, 1), square it to bias towards 0.
|
||||
double r = rng.nextFloat();
|
||||
int idxInArray = (int) (r * r * L);
|
||||
var idx = idxs[idxInArray];
|
||||
var w = entry.words.get(idx);
|
||||
if (tryWord.apply(w)) return true;
|
||||
}
|
||||
@@ -770,12 +802,10 @@ public class SwedishGenerator {
|
||||
}
|
||||
|
||||
var tries = Math.min(MAX_TRIES_PER_SLOT, N);
|
||||
var start = (N == 1) ? 0 : rng.randint(0, N - 1);
|
||||
var step = (N <= 1) ? 1 : rng.randint(1, N - 1);
|
||||
|
||||
for (var t = 0; t < tries; t++) {
|
||||
var idx = (start + t * step) % N;
|
||||
var w = entry.words.get(idx);
|
||||
double r = rng.nextFloat();
|
||||
int idxInArray = (int) (r * r * N);
|
||||
var w = entry.words.get(idxInArray);
|
||||
if (tryWord.apply(w)) return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ import javax.net.ssl.*;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URI;
|
||||
import java.net.http.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.*;
|
||||
@@ -24,28 +23,44 @@ public class ThemePoolBuilder {
|
||||
"EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL",
|
||||
"UUR", "MIN", "TV", "GPS", "AI", "IT", "CPU", "GPU",
|
||||
"ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR",
|
||||
"PVV", "VVD", "CDA", "FNV"
|
||||
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
|
||||
"PVV", "VVD", "CDA", "FNV","EN","IN","OP","OM","TE","ER","DE","HET","EEN","VAN","MET","NOG","OOK","MAAR","WEL","NIET",
|
||||
"HOE","ALS","EEN",
|
||||
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
|
||||
"ZO", "DO", "WO", "VR", "ZO", "MO", "WA", "WE", "TAAL",
|
||||
"LAND", "GEMEENTE", "STAAT", "BUREAU", "HUIS", "SCHOOL", "STR", "BAAN",
|
||||
"WERK", "KLUS",
|
||||
"FONDS", "RAAD", "CONGRESS", "GROEP", "STRAAT", "BRUG", "PARK",
|
||||
"BUURT",
|
||||
"BOUW", "HOTEL", "CAFE", "BAR",
|
||||
"BIJBAAN", "STUDENT", "DOCENT",
|
||||
"WINKEL", "MARKT", "KIOSK", "AUTO", "MOBILE", "FIETS", "SCOOTER",
|
||||
|
||||
// afkortingen (worden toch A-Z geforceerd)
|
||||
"DHR","MEVR","DR","ST","CA","IVM","MBT","TAV","TOV","DWZ","MAW","OA","TM",
|
||||
"EU","VS","NAVO","NOS","NS","ANWB","KVK","BTW","BRP","CBS","NPO","RTL","RIVM",
|
||||
|
||||
// romeinse cijfers (2–8 tekens)
|
||||
"II","III","IV","VI","VII","VIII","IX",
|
||||
"XI","XII","XIII","XIV","XV","XVI","XVII","XVIII","XIX","XX"
|
||||
);
|
||||
|
||||
// Browser-like UA (no shell quoting issues because we use ProcessBuilder args)
|
||||
private static final String BROWSER_UA =
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
|
||||
|
||||
// ---------------- CLI ----------------
|
||||
|
||||
static final class Opts {
|
||||
|
||||
String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
|
||||
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/chat/completions/";
|
||||
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
|
||||
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
|
||||
String outDir = "./out";
|
||||
|
||||
int bridgeN = 5000;
|
||||
int themeN = 300;
|
||||
int relatedN = 1200;
|
||||
int rssItemsPerFeed = 20;
|
||||
int bridgeN = 52000;
|
||||
int themeN = 800;
|
||||
int relatedN = 2200;
|
||||
int rssItemsPerFeed = 10;
|
||||
|
||||
String model = "openai/gpt-oss-20b";
|
||||
String model = "mistralai/mistral-nemo-instruct-2407";
|
||||
int timeoutSeconds = 180; // LM Studio needs more time for generation
|
||||
int retries = 2;
|
||||
}
|
||||
@@ -126,8 +141,6 @@ public class ThemePoolBuilder {
|
||||
return o;
|
||||
}
|
||||
|
||||
// ---------------- Normalization ----------------
|
||||
|
||||
static boolean isAZ(String s) {
|
||||
for (var i = 0; i < s.length(); i++) {
|
||||
var ch = s.charAt(i);
|
||||
@@ -141,11 +154,9 @@ public class ThemePoolBuilder {
|
||||
var s = raw.trim();
|
||||
if (s.isEmpty()) return null;
|
||||
|
||||
// strip diacritics
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{M}+", "");
|
||||
s = s.toUpperCase(Locale.ROOT);
|
||||
|
||||
// keep only A-Z
|
||||
s = s.replaceAll("[^A-Z]", "");
|
||||
if (s.length() < 2 || s.length() > 8) return null;
|
||||
if (!isAZ(s)) return null;
|
||||
@@ -160,8 +171,6 @@ public class ThemePoolBuilder {
|
||||
return x;
|
||||
}
|
||||
|
||||
// ---------------- Crossability score ----------------
|
||||
|
||||
static final Map<Character, Integer> LETTER_WEIGHT = Map.ofEntries(
|
||||
Map.entry('E', 10), Map.entry('N', 9), Map.entry('A', 9), Map.entry('R', 8),
|
||||
Map.entry('I', 8), Map.entry('O', 7), Map.entry('S', 7), Map.entry('T', 7),
|
||||
@@ -191,21 +200,14 @@ public class ThemePoolBuilder {
|
||||
return score;
|
||||
}
|
||||
|
||||
// ---------------- Lexicon ----------------
|
||||
|
||||
static final class Lexicon {
|
||||
|
||||
final List<String> words; // id -> word
|
||||
final Map<String, Integer> idOf; // word -> id
|
||||
final int[] score; // id -> crossability
|
||||
final BitSet[] byLen; // byLen[L] for L 0..8
|
||||
|
||||
Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) {
|
||||
this.words = words;
|
||||
this.idOf = idOf;
|
||||
this.score = score;
|
||||
this.byLen = byLen;
|
||||
}
|
||||
/**
|
||||
* @param words id -> word
|
||||
* @param idOf word -> id
|
||||
* @param score id -> crossability
|
||||
* @param byLen byLen[L] for L 0..8 */
|
||||
record Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) {
|
||||
|
||||
}
|
||||
|
||||
static Lexicon loadLexicon(String path) throws IOException {
|
||||
@@ -214,6 +216,7 @@ public class ThemePoolBuilder {
|
||||
var out = new ArrayList<String>(lines.size());
|
||||
var idOf = new HashMap<String, Integer>(lines.size() * 2);
|
||||
|
||||
// 1) master lexicon
|
||||
for (var line : lines) {
|
||||
var w = normalizeDutchToken(line);
|
||||
if (w == null) continue;
|
||||
@@ -222,6 +225,29 @@ public class ThemePoolBuilder {
|
||||
out.add(w);
|
||||
}
|
||||
|
||||
// 2) inject extra short words (2–4 letters mostly)
|
||||
var extraShorts = List.of(
|
||||
"EN","IN","OP","OM","TE","ER","DE","HET","EEN","VAN","MET",
|
||||
"AL","NU","ZO","TO","NA","BIJ","TOT","ALS","DAN","WAT","DAT",
|
||||
"IK","JE","WE","WIJ","JIJ","ZIJ","HIJ","HEN","ONS","JOU",
|
||||
"EIS","WET","RAAD","PLAN","TEAM","MAAT"
|
||||
);
|
||||
for (var raw : DEFAULT_SHORTS) {
|
||||
var w = normalizeDutchToken(raw);
|
||||
if (w == null) continue;
|
||||
if (idOf.containsKey(w)) continue;
|
||||
idOf.put(w, out.size());
|
||||
out.add(w);
|
||||
}
|
||||
for (var wRaw : extraShorts) {
|
||||
var w = normalizeDutchToken(wRaw);
|
||||
if (w == null) continue;
|
||||
if (idOf.containsKey(w)) continue;
|
||||
idOf.put(w, out.size());
|
||||
out.add(w);
|
||||
}
|
||||
|
||||
|
||||
var n = out.size();
|
||||
var score = new int[n];
|
||||
var byLen = new BitSet[9];
|
||||
@@ -336,7 +362,7 @@ public class ThemePoolBuilder {
|
||||
if (base.endsWith("/v1")) base = base.substring(0, base.length() - 3);
|
||||
|
||||
if (!path.startsWith("/")) path = "/" + path;
|
||||
if (!path.startsWith("/v1/")) path = "/v1" + path;
|
||||
if (!path.startsWith("/v1/")) path = "/" + path;
|
||||
|
||||
return base + path;
|
||||
}
|
||||
@@ -606,16 +632,16 @@ public class ThemePoolBuilder {
|
||||
|
||||
Regels:
|
||||
- Output MOET exact één JSON array zijn: ["WOORD", ...]
|
||||
- Alleen A-Z, 2-8 letters
|
||||
- Alleen A-Z, 2-8 letters woorden
|
||||
- Geen spaties, streepjes, cijfers, accenten, apostrofs, punten
|
||||
- Geen duplicaten
|
||||
- Focus op zelfstandige naamwoorden/termen uit het nieuws
|
||||
- Focus op zelfstandige naamwoorden/termen uit het nieuws en relevante Zweedse kruiswoordpuzzel koppelwoorden in het thema.
|
||||
- Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d).
|
||||
- Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk.
|
||||
|
||||
Nieuws (koppen/samenvattingen):
|
||||
%s
|
||||
""".formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText);
|
||||
""".formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText.substring(0,8000));
|
||||
|
||||
var body = """
|
||||
{
|
||||
@@ -625,7 +651,7 @@ public class ThemePoolBuilder {
|
||||
{"role":"user","content": %s}
|
||||
],
|
||||
"temperature": 0.35,
|
||||
"max_tokens": 2000
|
||||
"max_tokens": 20000
|
||||
}
|
||||
""".formatted(jsonQuote(modelId), jsonQuote(prompt));
|
||||
|
||||
|
||||
855
src/puzzle/ThemePoolBuilderLength.java
Normal file
855
src/puzzle/ThemePoolBuilderLength.java
Normal file
@@ -0,0 +1,855 @@
|
||||
package puzzle;
|
||||
|
||||
import org.w3c.dom.*;
|
||||
import javax.net.ssl.*;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.http.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.*;
|
||||
import java.security.SecureRandom;
|
||||
import java.security.cert.X509Certificate;
|
||||
import java.text.Normalizer;
|
||||
import java.time.LocalDate;
|
||||
import java.util.*;
|
||||
|
||||
public class ThemePoolBuilderLength {
|
||||
|
||||
private static final List<String> DEFAULT_FEEDS = List.of(
|
||||
"https://feeds.nos.nl/nosnieuwsalgemeen",
|
||||
"https://feeds.nos.nl/nosnieuwstech"
|
||||
);
|
||||
|
||||
// NOTE: normalizeDutchToken strips non A-Z. Keep entries 2-8 after normalization.
|
||||
private static final List<String> DEFAULT_SHORTS = List.of(
|
||||
"EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL",
|
||||
"UUR", "MIN", "TV", "GPS", "AI", "IT", "CPU", "GPU",
|
||||
"ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR",
|
||||
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
|
||||
"PVV", "VVD", "CDA", "FNV",
|
||||
"EN","IN","OP","OM","TE","ER","DE","HET","EEN","VAN","MET","NOG","OOK","MAAR","WEL","NIET",
|
||||
"HOE","ALS",
|
||||
|
||||
"ZO", "DO", "WO", "VR", "MO", "WA", "WE", "TAAL",
|
||||
"LAND", "GEMEENTE", "STAAT", "BUREAU", "HUIS", "SCHOOL", "STR", "BAAN",
|
||||
"WERK", "KLUS",
|
||||
"FONDS", "RAAD", "CONGRESS", "GROEP", "STRAAT", "BRUG", "PARK",
|
||||
"BUURT",
|
||||
"BOUW", "HOTEL", "CAFE", "BAR",
|
||||
"BIJBAAN", "STUDENT", "DOCENT",
|
||||
"WINKEL", "MARKT", "KIOSK", "AUTO", "MOBILE", "FIETS", "SCOOTER",
|
||||
|
||||
// afkortingen
|
||||
"DHR","MEVR","DR","ST","CA","IVM","MBT","TAV","TOV","DWZ","MAW","OA","TM",
|
||||
"ANWB","BRP","CBS",
|
||||
|
||||
// romeinse cijfers (2-8)
|
||||
"II","III","IV","VI","VII","VIII","IX",
|
||||
"XI","XII","XIII","XIV","XV","XVI","XVII","XVIII","XIX","XX"
|
||||
);
|
||||
|
||||
private static final String BROWSER_UA =
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
|
||||
|
||||
static final class Opts {
|
||||
|
||||
String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
|
||||
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
|
||||
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
|
||||
String outDir = "./out";
|
||||
|
||||
int bridgeN = 42000;
|
||||
int themeN = 800;
|
||||
int relatedN = 2200;
|
||||
int rssItemsPerFeed = 10;
|
||||
|
||||
String model = "mistralai/mistral-nemo-instruct-2407";
|
||||
int timeoutSeconds = 180;
|
||||
int retries = 2;
|
||||
|
||||
// ---- NEW: enforce minimum counts per length in the final pool ----
|
||||
// Tune these to your puzzle generator’s appetite for short words.
|
||||
int minLen2 = 4000;
|
||||
int minLen3 = 7000;
|
||||
int minLen4 = 9000;
|
||||
int minLen5 = 0; // set if you also want to force 5-letter words, etc.
|
||||
int minLen6 = 0;
|
||||
int minLen7 = 0;
|
||||
int minLen8 = 0;
|
||||
}
|
||||
|
||||
static Opts parseArgs(String[] args) {
|
||||
var o = new Opts();
|
||||
for (var i = 0; i < args.length; i++) {
|
||||
var a = args[i];
|
||||
var v = (i + 1 < args.length) ? args[i + 1] : null;
|
||||
switch (a) {
|
||||
case "--words" -> { o.wordsPath = v; i++; }
|
||||
case "--endpoint" -> { o.endpoint = v; i++; }
|
||||
case "--feeds" -> { o.feeds = Arrays.asList(v.split(",")); i++; }
|
||||
case "--out" -> { o.outDir = v; i++; }
|
||||
case "--bridge" -> { o.bridgeN = Integer.parseInt(v); i++; }
|
||||
case "--theme" -> { o.themeN = Integer.parseInt(v); i++; }
|
||||
case "--related" -> { o.relatedN = Integer.parseInt(v); i++; }
|
||||
case "--items" -> { o.rssItemsPerFeed = Integer.parseInt(v); i++; }
|
||||
case "--model" -> { o.model = v; i++; }
|
||||
case "--timeout" -> { o.timeoutSeconds = Integer.parseInt(v); i++; }
|
||||
case "--retries" -> { o.retries = Integer.parseInt(v); i++; }
|
||||
|
||||
// ---- NEW: minima per length ----
|
||||
case "--min2" -> { o.minLen2 = Integer.parseInt(v); i++; }
|
||||
case "--min3" -> { o.minLen3 = Integer.parseInt(v); i++; }
|
||||
case "--min4" -> { o.minLen4 = Integer.parseInt(v); i++; }
|
||||
case "--min5" -> { o.minLen5 = Integer.parseInt(v); i++; }
|
||||
case "--min6" -> { o.minLen6 = Integer.parseInt(v); i++; }
|
||||
case "--min7" -> { o.minLen7 = Integer.parseInt(v); i++; }
|
||||
case "--min8" -> { o.minLen8 = Integer.parseInt(v); i++; }
|
||||
|
||||
case "-h", "--help" -> {
|
||||
System.out.println("""
|
||||
Usage:
|
||||
java puzzle.ThemePoolBuilder --words WORDS.txt [options]
|
||||
|
||||
Options:
|
||||
--endpoint http://HOST:1234/v1 (LM Studio)
|
||||
--feeds url1,url2
|
||||
--out ./out
|
||||
--bridge 5000
|
||||
--theme 300
|
||||
--related 1200
|
||||
--items 20 (per feed)
|
||||
--model <id> (recommended; skips /v1/models)
|
||||
--timeout 60 (seconds)
|
||||
--retries 4
|
||||
|
||||
# enforce minima per length in final pool
|
||||
--min2 4000
|
||||
--min3 7000
|
||||
--min4 9000
|
||||
--min5 0
|
||||
--min6 0
|
||||
--min7 0
|
||||
--min8 0
|
||||
""");
|
||||
System.exit(0);
|
||||
}
|
||||
default -> throw new IllegalArgumentException("Unknown arg: " + a);
|
||||
}
|
||||
}
|
||||
if (o.wordsPath == null) throw new IllegalArgumentException("--words is required");
|
||||
return o;
|
||||
}
|
||||
|
||||
static boolean isAZ(String s) {
|
||||
for (var i = 0; i < s.length(); i++) {
|
||||
var ch = s.charAt(i);
|
||||
if (ch < 'A' || ch > 'Z') return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static String normalizeDutchToken(String raw) {
|
||||
if (raw == null) return null;
|
||||
var s = raw.trim();
|
||||
if (s.isEmpty()) return null;
|
||||
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{M}+", "");
|
||||
s = s.toUpperCase(Locale.ROOT);
|
||||
|
||||
s = s.replaceAll("[^A-Z]", "");
|
||||
if (s.length() < 2 || s.length() > 8) return null;
|
||||
if (!isAZ(s)) return null;
|
||||
return s;
|
||||
}
|
||||
|
||||
static String stripHtml(String s) {
|
||||
if (s == null) return "";
|
||||
var x = s.replaceAll("<[^>]+>", " ");
|
||||
x = x.replace("&", "&").replace("<", "<").replace(">", ">");
|
||||
x = x.replaceAll("\\s+", " ").trim();
|
||||
return x;
|
||||
}
|
||||
|
||||
static final Map<Character, Integer> LETTER_WEIGHT = Map.ofEntries(
|
||||
Map.entry('E', 10), Map.entry('N', 9), Map.entry('A', 9), Map.entry('R', 8),
|
||||
Map.entry('I', 8), Map.entry('O', 7), Map.entry('S', 7), Map.entry('T', 7),
|
||||
Map.entry('D', 6), Map.entry('L', 6), Map.entry('K', 5), Map.entry('M', 5),
|
||||
Map.entry('U', 5), Map.entry('P', 4), Map.entry('G', 4), Map.entry('H', 4),
|
||||
Map.entry('V', 4), Map.entry('B', 3), Map.entry('W', 3),
|
||||
Map.entry('C', 2), Map.entry('F', 2), Map.entry('Z', 2),
|
||||
Map.entry('J', 1), Map.entry('Y', 1), Map.entry('Q', 0), Map.entry('X', 0)
|
||||
);
|
||||
|
||||
static boolean isVowel(char ch) {
|
||||
return ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U';
|
||||
}
|
||||
|
||||
static int crossabilityScore(String w) {
|
||||
var score = 0;
|
||||
var vowels = 0;
|
||||
for (var i = 0; i < w.length(); i++) {
|
||||
var ch = w.charAt(i);
|
||||
score += LETTER_WEIGHT.getOrDefault(ch, 2);
|
||||
if (isVowel(ch)) vowels++;
|
||||
}
|
||||
var ratio = vowels / (double) w.length();
|
||||
if (ratio >= 0.35 && ratio <= 0.65) score += 8;
|
||||
if (w.indexOf('Q') >= 0 || w.indexOf('X') >= 0) score -= 6;
|
||||
if (w.indexOf('Y') >= 0 || w.indexOf('J') >= 0) score -= 2;
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param words id -> word
|
||||
* @param idOf word -> id
|
||||
* @param score id -> crossability
|
||||
* @param byLen byLen[L] for L 0..8
|
||||
*/
|
||||
record Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) { }
|
||||
|
||||
static Lexicon loadLexicon(String path) throws IOException {
|
||||
var lines = Files.readAllLines(Path.of(path), StandardCharsets.UTF_8);
|
||||
|
||||
var out = new ArrayList<String>(lines.size());
|
||||
var idOf = new HashMap<String, Integer>(lines.size() * 2);
|
||||
|
||||
// 1) master lexicon
|
||||
for (var line : lines) {
|
||||
var w = normalizeDutchToken(line);
|
||||
if (w == null) continue;
|
||||
if (idOf.containsKey(w)) continue;
|
||||
idOf.put(w, out.size());
|
||||
out.add(w);
|
||||
}
|
||||
|
||||
// 2) ensure DEFAULT_SHORTS are present even if absent in word-list.txt
|
||||
for (var raw : DEFAULT_SHORTS) {
|
||||
var w = normalizeDutchToken(raw);
|
||||
if (w == null) continue;
|
||||
if (idOf.containsKey(w)) continue;
|
||||
idOf.put(w, out.size());
|
||||
out.add(w);
|
||||
}
|
||||
|
||||
// 3) small extra injects (optional)
|
||||
var extraShorts = List.of(
|
||||
"AL","NU","TO","NA","BIJ","TOT","DAN","WAT","DAT",
|
||||
"IK","JE","WE","WIJ","JIJ","ZIJ","HIJ","HEN","ONS","JOU"
|
||||
);
|
||||
for (var wRaw : extraShorts) {
|
||||
var w = normalizeDutchToken(wRaw);
|
||||
if (w == null) continue;
|
||||
if (idOf.containsKey(w)) continue;
|
||||
idOf.put(w, out.size());
|
||||
out.add(w);
|
||||
}
|
||||
|
||||
var n = out.size();
|
||||
var score = new int[n];
|
||||
var byLen = new BitSet[9];
|
||||
for (var L = 0; L <= 8; L++) byLen[L] = new BitSet(n);
|
||||
|
||||
for (var i = 0; i < n; i++) {
|
||||
var w = out.get(i);
|
||||
score[i] = crossabilityScore(w);
|
||||
byLen[w.length()].set(i);
|
||||
}
|
||||
|
||||
return new Lexicon(out, idOf, score, byLen);
|
||||
}
|
||||
|
||||
// ---------------- RSS via curl (browser-like) ----------------
|
||||
|
||||
static final class RssItem {
|
||||
final String title;
|
||||
final String desc;
|
||||
RssItem(String title, String desc) {
|
||||
this.title = title;
|
||||
this.desc = desc;
|
||||
}
|
||||
}
|
||||
|
||||
static String textOfFirst(Element parent, String tag) {
|
||||
var nl = parent.getElementsByTagName(tag);
|
||||
if (nl.getLength() == 0) return null;
|
||||
var n = nl.item(0);
|
||||
return n.getTextContent();
|
||||
}
|
||||
|
||||
static List<RssItem> fetchRssViaCurlBrowser(String url, int limit, int timeoutSeconds) throws Exception {
|
||||
List<String> cmd = new ArrayList<>();
|
||||
cmd.add("curl");
|
||||
cmd.add("-fsSL");
|
||||
cmd.add("-L");
|
||||
cmd.add("--compressed");
|
||||
|
||||
cmd.add("--connect-timeout");
|
||||
cmd.add("10");
|
||||
cmd.add("--max-time");
|
||||
cmd.add(String.valueOf(timeoutSeconds));
|
||||
|
||||
cmd.add("--retry");
|
||||
cmd.add("5");
|
||||
cmd.add("--retry-all-errors");
|
||||
cmd.add("--retry-delay");
|
||||
cmd.add("1");
|
||||
|
||||
cmd.add("-H");
|
||||
cmd.add("User-Agent: " + BROWSER_UA);
|
||||
cmd.add("-H");
|
||||
cmd.add("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
|
||||
cmd.add("-H");
|
||||
cmd.add("Accept-Language: nl-NL,nl;q=0.9,en;q=0.7");
|
||||
cmd.add("-H");
|
||||
cmd.add("Cache-Control: no-cache");
|
||||
cmd.add("-H");
|
||||
cmd.add("Pragma: no-cache");
|
||||
cmd.add("-H");
|
||||
cmd.add("Sec-Fetch-Dest: document");
|
||||
cmd.add("-H");
|
||||
cmd.add("Sec-Fetch-Mode: navigate");
|
||||
cmd.add("-H");
|
||||
cmd.add("Sec-Fetch-Site: none");
|
||||
cmd.add("-H");
|
||||
cmd.add("Sec-Fetch-User: ?1");
|
||||
|
||||
cmd.add(url);
|
||||
|
||||
var p = new ProcessBuilder(cmd)
|
||||
.redirectErrorStream(true)
|
||||
.start();
|
||||
|
||||
var bytes = p.getInputStream().readAllBytes();
|
||||
var code = p.waitFor();
|
||||
if (code != 0) {
|
||||
throw new IOException("curl RSS failed (" + code + ") url=" + url + " output=" +
|
||||
new String(bytes, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
try (InputStream is = new ByteArrayInputStream(bytes)) {
|
||||
var dbf = DocumentBuilderFactory.newInstance();
|
||||
var doc = dbf.newDocumentBuilder().parse(is);
|
||||
var items = doc.getElementsByTagName("item");
|
||||
|
||||
var out = new ArrayList<RssItem>();
|
||||
for (var i = 0; i < items.getLength() && out.size() < limit; i++) {
|
||||
var item = (Element) items.item(i);
|
||||
var title = textOfFirst(item, "title");
|
||||
var desc = textOfFirst(item, "description");
|
||||
if (title == null) title = "";
|
||||
if (desc == null) desc = "";
|
||||
out.add(new RssItem(stripHtml(title), stripHtml(desc)));
|
||||
}
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------- LM Studio (OpenAI-compatible) ----------------
|
||||
|
||||
static String apiUrl(String endpointArg, String path) {
|
||||
var base = endpointArg.trim();
|
||||
if (base.endsWith("/")) base = base.substring(0, base.length() - 1);
|
||||
if (base.endsWith("/v1")) base = base.substring(0, base.length() - 3);
|
||||
|
||||
if (!path.startsWith("/")) path = "/" + path;
|
||||
if (!path.startsWith("/v1/")) path = "/" + path;
|
||||
|
||||
return base + path;
|
||||
}
|
||||
|
||||
static HttpClient buildHttpClient(int timeoutSeconds) {
|
||||
try {
|
||||
return HttpClient.newBuilder()
|
||||
.connectTimeout(java.time.Duration.ofSeconds(Math.max(10, timeoutSeconds)))
|
||||
.build();
|
||||
} catch (RuntimeException ignored) { }
|
||||
|
||||
try {
|
||||
var ssl = insecureSslContext();
|
||||
return HttpClient.newBuilder()
|
||||
.connectTimeout(java.time.Duration.ofSeconds(Math.max(10, timeoutSeconds)))
|
||||
.sslContext(ssl)
|
||||
.build();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Could not initialize HttpClient. Fix Java truststore or use curl for all HTTP.", e);
|
||||
}
|
||||
}
|
||||
|
||||
static SSLContext insecureSslContext() throws Exception {
|
||||
var trustAll = new TrustManager[]{
|
||||
new X509TrustManager() {
|
||||
public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
|
||||
public void checkClientTrusted(X509Certificate[] chain, String authType) { }
|
||||
public void checkServerTrusted(X509Certificate[] chain, String authType) { }
|
||||
}
|
||||
};
|
||||
var ssl = SSLContext.getInstance("TLS");
|
||||
ssl.init(null, trustAll, new SecureRandom());
|
||||
return ssl;
|
||||
}
|
||||
|
||||
static void sleepBackoff(int attempt) {
|
||||
try {
|
||||
var ms = (long) (300L * Math.pow(2, attempt - 1)); // 300, 600, 1200, ...
|
||||
Thread.sleep(Math.min(ms, 3000));
|
||||
} catch (InterruptedException ignored) { }
|
||||
}
|
||||
|
||||
static String curlGetJson(Opts o, String url) throws Exception {
|
||||
Exception last = null;
|
||||
for (var attempt = 1; attempt <= o.retries; attempt++) {
|
||||
try {
|
||||
List<String> cmd = new ArrayList<>();
|
||||
cmd.add("curl");
|
||||
cmd.add("-fsSL");
|
||||
cmd.add("--connect-timeout");
|
||||
cmd.add("10");
|
||||
cmd.add("--max-time");
|
||||
cmd.add(String.valueOf(o.timeoutSeconds));
|
||||
cmd.add("--retry");
|
||||
cmd.add("3");
|
||||
cmd.add("--retry-all-errors");
|
||||
cmd.add("--retry-delay");
|
||||
cmd.add("1");
|
||||
cmd.add("-H");
|
||||
cmd.add("Accept: application/json");
|
||||
cmd.add("-H");
|
||||
cmd.add("User-Agent: " + BROWSER_UA);
|
||||
cmd.add(url);
|
||||
|
||||
var p = new ProcessBuilder(cmd)
|
||||
.redirectErrorStream(true)
|
||||
.start();
|
||||
|
||||
var bytes = p.getInputStream().readAllBytes();
|
||||
var code = p.waitFor();
|
||||
|
||||
if (code != 0) {
|
||||
throw new IOException("curl GET failed (" + code + ") url=" + url + "\nOutput:\n" +
|
||||
new String(bytes, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
} catch (Exception e) {
|
||||
last = e;
|
||||
if (attempt < o.retries) sleepBackoff(attempt);
|
||||
}
|
||||
}
|
||||
throw last;
|
||||
}
|
||||
|
||||
static String curlPostJson(Opts o, String url, String jsonBody) throws Exception {
|
||||
Exception last = null;
|
||||
for (var attempt = 1; attempt <= o.retries; attempt++) {
|
||||
try {
|
||||
System.out.println(" Attempt " + attempt + "/" + o.retries + " via curl...");
|
||||
|
||||
var tempFile = Files.createTempFile("lm-request-", ".json");
|
||||
try {
|
||||
Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
|
||||
|
||||
List<String> cmd = new ArrayList<>();
|
||||
cmd.add("curl");
|
||||
cmd.add("-fsSL");
|
||||
cmd.add("--connect-timeout");
|
||||
cmd.add("10");
|
||||
cmd.add("--max-time");
|
||||
cmd.add(String.valueOf(o.timeoutSeconds));
|
||||
cmd.add("--retry");
|
||||
cmd.add("3");
|
||||
cmd.add("--retry-all-errors");
|
||||
cmd.add("--retry-delay");
|
||||
cmd.add("1");
|
||||
cmd.add("-H");
|
||||
cmd.add("Content-Type: application/json");
|
||||
cmd.add("-H");
|
||||
cmd.add("Accept: application/json");
|
||||
cmd.add("-H");
|
||||
cmd.add("User-Agent: " + BROWSER_UA);
|
||||
cmd.add("-d");
|
||||
cmd.add("@" + tempFile.toString());
|
||||
cmd.add(url);
|
||||
|
||||
var p = new ProcessBuilder(cmd)
|
||||
.redirectErrorStream(true)
|
||||
.start();
|
||||
|
||||
var bytes = p.getInputStream().readAllBytes();
|
||||
var code = p.waitFor();
|
||||
|
||||
if (code != 0) {
|
||||
throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" +
|
||||
new String(bytes, StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
} finally {
|
||||
Files.deleteIfExists(tempFile);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.err.println(" Error: " + e.getClass().getName() + ": " + e.getMessage());
|
||||
last = e;
|
||||
if (attempt < o.retries) sleepBackoff(attempt);
|
||||
}
|
||||
}
|
||||
throw last;
|
||||
}
|
||||
|
||||
static String pickModelId(String modelsJson) {
|
||||
if (modelsJson == null) return null;
|
||||
var data = modelsJson.indexOf("\"data\"");
|
||||
if (data < 0) return null;
|
||||
var id = modelsJson.indexOf("\"id\"", data);
|
||||
if (id < 0) return null;
|
||||
var q1 = modelsJson.indexOf('"', modelsJson.indexOf(':', id) + 1);
|
||||
if (q1 < 0) return null;
|
||||
var q2 = modelsJson.indexOf('"', q1 + 1);
|
||||
if (q2 < 0) return null;
|
||||
return modelsJson.substring(q1 + 1, q2);
|
||||
}
|
||||
|
||||
static String extractChatContent(String json) {
|
||||
if (json == null) return null;
|
||||
|
||||
var choices = json.indexOf("\"choices\"");
|
||||
var p = (choices >= 0) ? choices : 0;
|
||||
|
||||
var i = json.indexOf("\"content\"", p);
|
||||
if (i < 0) return null;
|
||||
var colon = json.indexOf(':', i);
|
||||
if (colon < 0) return null;
|
||||
|
||||
var q = json.indexOf('"', colon + 1);
|
||||
if (q < 0) return null;
|
||||
|
||||
var sb = new StringBuilder();
|
||||
var esc = false;
|
||||
for (var k = q + 1; k < json.length(); k++) {
|
||||
var ch = json.charAt(k);
|
||||
if (esc) {
|
||||
if (ch == 'n') sb.append('\n');
|
||||
else if (ch == 't') sb.append('\t');
|
||||
else if (ch == 'r') sb.append('\r');
|
||||
else sb.append(ch);
|
||||
esc = false;
|
||||
} else {
|
||||
if (ch == '\\') esc = true;
|
||||
else if (ch == '"') break;
|
||||
else sb.append(ch);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
static List<String> parseStringArray(String s) {
|
||||
if (s == null) return List.of();
|
||||
var a = s.indexOf('[');
|
||||
var b = s.lastIndexOf(']');
|
||||
if (a < 0 || b < 0 || b <= a) return List.of();
|
||||
|
||||
var body = s.substring(a + 1, b);
|
||||
var out = new ArrayList<String>();
|
||||
|
||||
var cur = new StringBuilder();
|
||||
boolean in = false, esc = false;
|
||||
|
||||
for (var i = 0; i < body.length(); i++) {
|
||||
var ch = body.charAt(i);
|
||||
if (!in) {
|
||||
if (ch == '"') {
|
||||
in = true;
|
||||
cur.setLength(0);
|
||||
esc = false;
|
||||
}
|
||||
} else {
|
||||
if (esc) {
|
||||
cur.append(ch);
|
||||
esc = false;
|
||||
} else if (ch == '\\') {
|
||||
esc = true;
|
||||
} else if (ch == '"') {
|
||||
out.add(cur.toString());
|
||||
in = false;
|
||||
} else {
|
||||
cur.append(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
static String jsonQuote(String s) {
|
||||
if (s == null) return "null";
|
||||
var sb = new StringBuilder();
|
||||
sb.append('"');
|
||||
for (var i = 0; i < s.length(); i++) {
|
||||
var ch = s.charAt(i);
|
||||
if (ch == '\\' || ch == '"') sb.append('\\').append(ch);
|
||||
else if (ch == '\n') sb.append("\\n");
|
||||
else if (ch == '\r') sb.append("\\r");
|
||||
else if (ch == '\t') sb.append("\\t");
|
||||
else sb.append(ch);
|
||||
}
|
||||
sb.append('"');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
static List<String> llmThemeWords(Opts o, String modelId, String rssText) throws Exception {
|
||||
var prompt = """
|
||||
Je genereert woorden voor een Nederlandse kruiswoordpuzzel.
|
||||
|
||||
Regels:
|
||||
- Output MOET exact één JSON array zijn: ["WOORD", ...]
|
||||
- Alleen A-Z, 2-8 letters woorden
|
||||
- Geen spaties, streepjes, cijfers, accenten, apostrofs, punten
|
||||
- Geen duplicaten
|
||||
- Focus op zelfstandige naamwoorden/termen uit het nieuws en relevante Zweedse kruiswoordpuzzel koppelwoorden in het thema.
|
||||
- Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d).
|
||||
- Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk.
|
||||
|
||||
Nieuws (koppen/samenvattingen):
|
||||
%s
|
||||
""".formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText.substring(0, 8000));
|
||||
|
||||
var body = """
|
||||
{
|
||||
"model": %s,
|
||||
"messages": [
|
||||
{"role":"system","content":"Je bent een strikte JSON generator. Antwoord ALLEEN met een JSON array van strings."},
|
||||
{"role":"user","content": %s}
|
||||
],
|
||||
"temperature": 0.35,
|
||||
"max_tokens": 20000
|
||||
}
|
||||
""".formatted(jsonQuote(modelId), jsonQuote(prompt));
|
||||
|
||||
var url = apiUrl(o.endpoint, "/chat/completions");
|
||||
System.out.println("LM Studio POST: " + url);
|
||||
System.out.println("Request body length: " + body.length() + " bytes");
|
||||
|
||||
var resp = curlPostJson(o, url, body);
|
||||
var content = extractChatContent(resp);
|
||||
if (content == null) {
|
||||
throw new IOException("Could not extract chat content from LM Studio response.\n--- response ---\n" + resp);
|
||||
}
|
||||
return parseStringArray(content);
|
||||
}
|
||||
|
||||
// ---------------- Pool building ----------------
|
||||
|
||||
static BitSet buildBridgeBitmap(Lexicon lex, int bridgeN) {
|
||||
var n = lex.words.size();
|
||||
var ids = new Integer[n];
|
||||
for (var i = 0; i < n; i++) ids[i] = i;
|
||||
|
||||
Arrays.sort(ids, (a, b) -> Integer.compare(lex.score[b], lex.score[a]));
|
||||
|
||||
var bs = new BitSet(n);
|
||||
var take = Math.min(bridgeN, n);
|
||||
for (var i = 0; i < take; i++) bs.set(ids[i]);
|
||||
return bs;
|
||||
}
|
||||
|
||||
static BitSet bitmapFromWords(Lexicon lex, Collection<String> words) {
|
||||
var bs = new BitSet(lex.words.size());
|
||||
for (var raw : words) {
|
||||
var w = normalizeDutchToken(raw);
|
||||
if (w == null) continue;
|
||||
var id = lex.idOf.get(w);
|
||||
if (id != null) bs.set(id);
|
||||
}
|
||||
return bs;
|
||||
}
|
||||
|
||||
static Map<Integer, Integer> countsPerLen(Lexicon lex, BitSet bs) {
|
||||
var out = new HashMap<Integer, Integer>();
|
||||
for (var L = 2; L <= 8; L++) {
|
||||
var tmp = (BitSet) bs.clone();
|
||||
tmp.and(lex.byLen[L]);
|
||||
out.put(L, tmp.cardinality());
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
static void writeWordList(Path path, Lexicon lex, BitSet bs) throws IOException {
|
||||
var out = new ArrayList<String>(bs.cardinality());
|
||||
for (var i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
|
||||
out.add(lex.words.get(i));
|
||||
}
|
||||
out.sort(String::compareTo);
|
||||
Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||
}
|
||||
|
||||
static String mapToLines(Map<Integer, Integer> m) {
|
||||
var sb = new StringBuilder();
|
||||
for (var L = 2; L <= 8; L++) {
|
||||
sb.append(" ").append(L).append(": ").append(m.getOrDefault(L, 0)).append("\n");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
// ---------------- NEW: enforce minima per length ----------------
|
||||
|
||||
static int countLen(Lexicon lex, BitSet bs, int L) {
|
||||
var tmp = (BitSet) bs.clone();
|
||||
tmp.and(lex.byLen[L]);
|
||||
return tmp.cardinality();
|
||||
}
|
||||
|
||||
static void ensureMinLen(Lexicon lex, BitSet pool, int L, int minWanted) {
|
||||
if (minWanted <= 0) return;
|
||||
|
||||
var current = countLen(lex, pool, L);
|
||||
if (current >= minWanted) return;
|
||||
|
||||
var need = minWanted - current;
|
||||
|
||||
// Collect candidate ids of exactly length L that are not already in pool.
|
||||
var candidates = new ArrayList<Integer>(Math.max(need * 2, 1024));
|
||||
for (var id = lex.byLen[L].nextSetBit(0); id >= 0; id = lex.byLen[L].nextSetBit(id + 1)) {
|
||||
if (!pool.get(id)) candidates.add(id);
|
||||
}
|
||||
if (candidates.isEmpty()) return;
|
||||
|
||||
// Sort by crossability score (desc)
|
||||
candidates.sort((a, b) -> Integer.compare(lex.score[b], lex.score[a]));
|
||||
|
||||
var added = 0;
|
||||
for (var id : candidates) {
|
||||
pool.set(id);
|
||||
added++;
|
||||
if (added >= need) break;
|
||||
}
|
||||
}
|
||||
|
||||
static void enforceMinima(Opts o, Lexicon lex, BitSet pool) {
|
||||
ensureMinLen(lex, pool, 2, o.minLen2);
|
||||
ensureMinLen(lex, pool, 3, o.minLen3);
|
||||
ensureMinLen(lex, pool, 4, o.minLen4);
|
||||
ensureMinLen(lex, pool, 5, o.minLen5);
|
||||
ensureMinLen(lex, pool, 6, o.minLen6);
|
||||
ensureMinLen(lex, pool, 7, o.minLen7);
|
||||
ensureMinLen(lex, pool, 8, o.minLen8);
|
||||
}
|
||||
|
||||
// ---------------- Main ----------------
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
var o = parseArgs(args);
|
||||
|
||||
var outDir = Path.of(o.outDir);
|
||||
Files.createDirectories(outDir);
|
||||
|
||||
System.out.println("Loading lexicon...");
|
||||
var lex = loadLexicon(o.wordsPath);
|
||||
System.out.println("Master words (2-8, A-Z): " + lex.words.size());
|
||||
|
||||
// RSS via curl (browser-like)
|
||||
var all = new ArrayList<RssItem>();
|
||||
for (var feed : o.feeds) {
|
||||
var f = feed.trim();
|
||||
if (f.isEmpty()) continue;
|
||||
System.out.println("Fetching RSS: " + f);
|
||||
all.addAll(fetchRssViaCurlBrowser(f, o.rssItemsPerFeed, o.timeoutSeconds));
|
||||
}
|
||||
|
||||
var rssText = new StringBuilder();
|
||||
var k = 0;
|
||||
for (var it : all) {
|
||||
k++;
|
||||
rssText.append(k).append(". ").append(it.title).append("\n");
|
||||
if (!it.desc.isBlank()) rssText.append(" ").append(it.desc).append("\n");
|
||||
}
|
||||
Files.writeString(outDir.resolve("rss.txt"), rssText.toString(), StandardCharsets.UTF_8);
|
||||
|
||||
// LM Studio via curl
|
||||
var modelId = o.model;
|
||||
if (modelId == null) {
|
||||
var modelsUrl = apiUrl(o.endpoint, "/models");
|
||||
System.out.println("LM Studio GET: " + modelsUrl);
|
||||
var modelsJson = curlGetJson(o, modelsUrl);
|
||||
modelId = pickModelId(modelsJson);
|
||||
if (modelId == null) {
|
||||
throw new IOException("Could not auto-pick model id from /v1/models. Use --model <id>.\n--- /models ---\n" + modelsJson);
|
||||
}
|
||||
}
|
||||
System.out.println("Using model: " + modelId);
|
||||
|
||||
System.out.println("Generating theme words via LM Studio...");
|
||||
var llmWords = llmThemeWords(o, modelId, rssText.toString());
|
||||
|
||||
// Normalize + keep only those present in master lexicon
|
||||
var themeKept = new LinkedHashSet<String>();
|
||||
for (var wRaw : llmWords) {
|
||||
var w = normalizeDutchToken(wRaw);
|
||||
if (w == null) continue;
|
||||
if (lex.idOf.containsKey(w)) themeKept.add(w);
|
||||
}
|
||||
Files.write(outDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8);
|
||||
|
||||
// BitSets
|
||||
var themeBs = bitmapFromWords(lex, themeKept);
|
||||
var bridgeBs = buildBridgeBitmap(lex, o.bridgeN);
|
||||
var shortBs = bitmapFromWords(lex, DEFAULT_SHORTS);
|
||||
|
||||
var pool = new BitSet(lex.words.size());
|
||||
pool.or(themeBs);
|
||||
pool.or(bridgeBs);
|
||||
pool.or(shortBs);
|
||||
|
||||
// ---- NEW: enforce minimum counts per length ----
|
||||
enforceMinima(o, lex, pool);
|
||||
|
||||
// Report
|
||||
var themeCounts = countsPerLen(lex, themeBs);
|
||||
var poolCounts = countsPerLen(lex, pool);
|
||||
|
||||
var report = """
|
||||
Date: %s
|
||||
Feeds: %s
|
||||
Model: %s
|
||||
|
||||
Master size: %d
|
||||
Theme kept (in master): %d
|
||||
Bridge size: %d
|
||||
Shorts kept: %d
|
||||
Pool total: %d
|
||||
|
||||
Enforced minima:
|
||||
2: %d
|
||||
3: %d
|
||||
4: %d
|
||||
5: %d
|
||||
6: %d
|
||||
7: %d
|
||||
8: %d
|
||||
|
||||
Counts per length (theme):
|
||||
%s
|
||||
|
||||
Counts per length (pool):
|
||||
%s
|
||||
""".formatted(
|
||||
LocalDate.now(),
|
||||
String.join(", ", o.feeds),
|
||||
modelId,
|
||||
lex.words.size(),
|
||||
themeBs.cardinality(),
|
||||
bridgeBs.cardinality(),
|
||||
shortBs.cardinality(),
|
||||
pool.cardinality(),
|
||||
o.minLen2, o.minLen3, o.minLen4, o.minLen5, o.minLen6, o.minLen7, o.minLen8,
|
||||
mapToLines(themeCounts),
|
||||
mapToLines(poolCounts)
|
||||
);
|
||||
|
||||
Files.writeString(outDir.resolve("report.txt"), report, StandardCharsets.UTF_8);
|
||||
System.out.println(report);
|
||||
|
||||
// Output pool list
|
||||
var poolFile = outDir.resolve("pool.txt");
|
||||
writeWordList(poolFile, lex, pool);
|
||||
System.out.println("Wrote: " + poolFile.toAbsolutePath());
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user