diff --git a/run.sh b/run.sh
index 4783f43..20c3ad9 100755
--- a/run.sh
+++ b/run.sh
@@ -1,2 +1,3 @@
 #!/bin/bash
 java -cp ~/dev/.target puzzle.Main "$@"
+java -cp ~/dev/.target puzzle.DailyGenerator "$@"
diff --git a/src/puzzle/DailyGenerator.java b/src/puzzle/DailyGenerator.java
index 9fce6f5..5648b02 100644
--- a/src/puzzle/DailyGenerator.java
+++ b/src/puzzle/DailyGenerator.java
@@ -30,12 +30,12 @@ public class DailyGenerator {
     }
 
     public static void main(String[] args) {
-        var outDir = env("OUT_DIR", "/data/puzzles");
+        var outDir = env("OUT_DIR", "/home/mike/dev/puzzle-generator/data/");
         var wordsPath = env("WORDS_PATH", "./word-list.txt");
         var puzzlesPerDay = envInt("PUZZLES_PER_DAY", 3);
         var seed = envInt("SEED", (int) System.currentTimeMillis());
         var themeFilter = envBool("THEME_FILTER", true);
-        var themeMinScore = Double.parseDouble(env("THEME_MIN_SCORE", "0.6"));
+        var themeMinScore = Double.parseDouble(env("THEME_MIN_SCORE", "0.0"));
 
         var today = LocalDate.now();
         var dateStr = today.toString();
diff --git a/src/puzzle/Main.java b/src/puzzle/Main.java
index 72ac618..3adfc1b 100644
--- a/src/puzzle/Main.java
+++ b/src/puzzle/Main.java
@@ -8,7 +8,7 @@ public class Main {
         public int pop = 18;
         public int gens = 100;
         public int tries = 50;
-        public String wordsPath = "./word-list.txt";
+        public String wordsPath = "./out/pool.txt";
     }
 
     static void usage() {
diff --git a/src/puzzle/ThemePoolBuilder.java b/src/puzzle/ThemePoolBuilder.java
new file mode 100644
index 0000000..42df0e6
--- /dev/null
+++ b/src/puzzle/ThemePoolBuilder.java
@@ -0,0 +1,886 @@
+package puzzle;
+
+import org.w3c.dom.*;
+import javax.net.ssl.*;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import java.io.*;
+import java.net.URI;
+import java.net.http.*;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.*;
+import java.security.SecureRandom;
+import java.security.cert.X509Certificate;
+import java.text.Normalizer;
+import java.time.LocalDate;
+import java.util.*;
+import java.util.regex.Pattern;
+
+/**
+ * Builds the daily word pool for the puzzle generator.
+ *
+ * <p>Pipeline: load the master lexicon, fetch news items from RSS feeds, ask an
+ * OpenAI-compatible chat endpoint (LM Studio) for theme words, keep the suggestions that occur
+ * in the lexicon, union them with the highest-scoring "bridge" words and a fixed list of short
+ * abbreviations, and write {@code rss.txt}, {@code theme.txt}, {@code report.txt} and
+ * {@code pool.txt} to the output directory. All HTTP traffic goes through the {@code curl}
+ * executable, which therefore has to be on the PATH.
+ */
+public class ThemePoolBuilder {
+
+    private static final List<String> DEFAULT_FEEDS = List.of(
+        "https://feeds.nos.nl/nosnieuwsalgemeen",
+        "https://feeds.nos.nl/nosnieuwstech"
+    );
+    private static final List<String> DEFAULT_SHORTS = List.of(
+        "EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL",
+        "UUR", "MIN", "TV", "GPS", "AI", "IT", "CPU", "GPU",
+        "ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR",
+        "PVV", "VVD", "CDA", "FNV"
+    );
+
+    // Browser-like UA (no shell quoting issues because we use ProcessBuilder args)
+    private static final String BROWSER_UA =
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
+
+    // Precompiled once: these run for every lexicon line and every feed item.
+    private static final Pattern COMBINING_MARKS = Pattern.compile("\\p{M}+");
+    private static final Pattern NON_AZ = Pattern.compile("[^A-Z]");
+    private static final Pattern HTML_TAG = Pattern.compile("<[^>]+>");
+    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");
+
+    // ---------------- CLI ----------------
+
+    /** Command-line options; the field initializers are the defaults. */
+    static final class Opts {
+
+        // NOTE(review): the words path and endpoint defaults are machine-specific.
+        String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
+        String endpoint = "http://192.168.1.159:1234/v1";
+        List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
+        String outDir = "./out";
+
+        int bridgeN = 5000;
+        int themeN = 300;
+        int relatedN = 1200;
+        int rssItemsPerFeed = 20;
+
+        String model = "openai/gpt-oss-20b";
+        int timeoutSeconds = 180; // LM Studio needs more time for generation
+        int retries = 2;
+    }
+
+    /**
+     * Parses command-line flags into an {@link Opts}. {@code -h}/{@code --help} prints the
+     * usage text and exits the JVM.
+     *
+     * @throws IllegalArgumentException on an unknown flag, a flag without a value, or a
+     *     non-numeric value for a numeric flag
+     */
+    static Opts parseArgs(String[] args) {
+        var o = new Opts();
+        for (var i = 0; i < args.length; i++) {
+            var a = args[i];
+            switch (a) {
+                case "--words" -> o.wordsPath = argValue(args, ++i, a);
+                case "--endpoint" -> o.endpoint = argValue(args, ++i, a);
+                case "--feeds" -> o.feeds = List.of(argValue(args, ++i, a).split(","));
+                case "--out" -> o.outDir = argValue(args, ++i, a);
+                case "--bridge" -> o.bridgeN = intArgValue(args, ++i, a);
+                case "--theme" -> o.themeN = intArgValue(args, ++i, a);
+                case "--related" -> o.relatedN = intArgValue(args, ++i, a);
+                case "--items" -> o.rssItemsPerFeed = intArgValue(args, ++i, a);
+                case "--model" -> {
+                    // An empty id selects auto-pick from /v1/models in main().
+                    var id = argValue(args, ++i, a);
+                    o.model = id.isBlank() ? null : id;
+                }
+                case "--timeout" -> o.timeoutSeconds = intArgValue(args, ++i, a);
+                case "--retries" -> o.retries = intArgValue(args, ++i, a);
+                case "-h", "--help" -> {
+                    printUsage();
+                    System.exit(0);
+                }
+                default -> throw new IllegalArgumentException("Unknown arg: " + a);
+            }
+        }
+        return o;
+    }
+
+    /** Returns {@code args[index]}, the value belonging to {@code flag}, or fails if absent. */
+    private static String argValue(String[] args, int index, String flag) {
+        if (index >= args.length) {
+            throw new IllegalArgumentException("Missing value for " + flag);
+        }
+        return args[index];
+    }
+
+    /** Like {@link #argValue} but parses the value as an int, naming the flag on failure. */
+    private static int intArgValue(String[] args, int index, String flag) {
+        var v = argValue(args, index, flag);
+        try {
+            return Integer.parseInt(v);
+        } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Invalid number for " + flag + ": " + v, e);
+        }
+    }
+
+    /** Prints the command-line help. */
+    private static void printUsage() {
+        System.out.println("""
+            Usage:
+              java puzzle.ThemePoolBuilder [options]
+
+            Options:
+              --words WORDS.txt              (master word list)
+              --endpoint http://HOST:1234/v1 (LM Studio)
+              --feeds url1,url2
+              --out ./out
+              --bridge 5000
+              --theme 300
+              --related 1200
+              --items 20      (per feed)
+              --model <id>    (recommended; skips /v1/models; "" auto-picks)
+              --timeout 180   (seconds)
+              --retries 2
+            """);
+    }
+
+    // ---------------- Normalization ----------------
+
+    /** Returns true when every character of {@code s} is an uppercase ASCII letter. */
+    static boolean isAZ(String s) {
+        for (var i = 0; i < s.length(); i++) {
+            var ch = s.charAt(i);
+            if (ch < 'A' || ch > 'Z') return false;
+        }
+        return true;
+    }
+
+    /**
+     * Normalizes a raw token to the lexicon form: diacritics stripped, upper-cased, everything
+     * but A-Z removed.
+     *
+     * @return the normalized word, or {@code null} when the input is null/blank or the result
+     *     is not 2-8 letters long
+     */
+    static String normalizeDutchToken(String raw) {
+        if (raw == null) return null;
+        var s = raw.trim();
+        if (s.isEmpty()) return null;
+
+        // NFD splits accented letters into base letter + combining mark, which is then dropped.
+        s = COMBINING_MARKS.matcher(Normalizer.normalize(s, Normalizer.Form.NFD)).replaceAll("");
+        s = NON_AZ.matcher(s.toUpperCase(Locale.ROOT)).replaceAll("");
+        if (s.length() < 2 || s.length() > 8) return null;
+        return s;
+    }
+
+    /**
+     * Removes HTML tags from {@code s}, decodes the basic entities and collapses whitespace.
+     * Returns an empty string for {@code null}.
+     */
+    static String stripHtml(String s) {
+        if (s == null) return "";
+        var x = HTML_TAG.matcher(s).replaceAll(" ");
+        // Decode "&amp;" last so that "&amp;lt;" becomes "&lt;" rather than "<".
+        x = x.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&");
+        return WHITESPACE_RUN.matcher(x).replaceAll(" ").trim();
+    }
+
+    // ---------------- Crossability score ----------------
+
+    /** Per-letter weights summed by {@link #crossabilityScore}. */
+    static final Map<Character, Integer> LETTER_WEIGHT = Map.ofEntries(
+        Map.entry('E', 10), Map.entry('N', 9), Map.entry('A', 9), Map.entry('R', 8),
+        Map.entry('I', 8), Map.entry('O', 7), Map.entry('S', 7), Map.entry('T', 7),
+        Map.entry('D', 6), Map.entry('L', 6), Map.entry('K', 5), Map.entry('M', 5),
+        Map.entry('U', 5), Map.entry('P', 4), Map.entry('G', 4), Map.entry('H', 4),
+        Map.entry('V', 4), Map.entry('B', 3), Map.entry('W', 3),
+        Map.entry('C', 2), Map.entry('F', 2), Map.entry('Z', 2),
+        Map.entry('J', 1), Map.entry('Y', 1), Map.entry('Q', 0), Map.entry('X', 0)
+    );
+
+    static boolean isVowel(char ch) {
+        return ch == 'A' || ch == 'E' || ch == 'I' || ch == 'O' || ch == 'U';
+    }
+
+    /**
+     * Scores a normalized (A-Z) word: the sum of its letter weights, +8 when 35-65% of the
+     * letters are vowels, -6 if it contains Q or X, and -2 if it contains Y or J.
+     */
+    static int crossabilityScore(String w) {
+        var score = 0;
+        var vowels = 0;
+        for (var i = 0; i < w.length(); i++) {
+            var ch = w.charAt(i);
+            score += LETTER_WEIGHT.getOrDefault(ch, 2);
+            if (isVowel(ch)) vowels++;
+        }
+        var ratio = vowels / (double) w.length();
+        if (ratio >= 0.35 && ratio <= 0.65) score += 8;
+        if (w.indexOf('Q') >= 0 || w.indexOf('X') >= 0) score -= 6;
+        if (w.indexOf('Y') >= 0 || w.indexOf('J') >= 0) score -= 2;
+        return score;
+    }
+
+    // ---------------- Lexicon ----------------
+
+    /** The master word list plus lookup structures indexed by word id. */
+    static final class Lexicon {
+
+        final List<String> words;        // id -> word
+        final Map<String, Integer> idOf; // word -> id
+        final int[] score;               // id -> crossability
+        final BitSet[] byLen;            // byLen[L] = ids of the words of length L, L in 0..8
+
+        Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) {
+            this.words = words;
+            this.idOf = idOf;
+            this.score = score;
+            this.byLen = byLen;
+        }
+    }
+
+    /**
+     * Reads a UTF-8 word list (one word per line), normalizes each line and builds the
+     * {@link Lexicon}. Lines that do not normalize to 2-8 letters and duplicates are skipped.
+     */
+    static Lexicon loadLexicon(String path) throws IOException {
+        var lines = Files.readAllLines(Path.of(path), StandardCharsets.UTF_8);
+
+        var out = new ArrayList<String>(lines.size());
+        var idOf = new HashMap<String, Integer>(lines.size() * 2);
+
+        for (var line : lines) {
+            var w = normalizeDutchToken(line);
+            if (w == null) continue;
+            // putIfAbsent keeps the first id, so duplicates never reach the word list.
+            if (idOf.putIfAbsent(w, out.size()) == null) out.add(w);
+        }
+
+        var n = out.size();
+        var score = new int[n];
+        var byLen = new BitSet[9];
+        for (var L = 0; L <= 8; L++) byLen[L] = new BitSet(n);
+
+        for (var i = 0; i < n; i++) {
+            var w = out.get(i);
+            score[i] = crossabilityScore(w);
+            byLen[w.length()].set(i);
+        }
+
+        return new Lexicon(out, idOf, score, byLen);
+    }
+
+    // ---------------- curl plumbing ----------------
+
+    /** Exit code and combined stdout/stderr of one finished curl process. */
+    private static final class CurlResult {
+
+        final int code;
+        final byte[] output;
+
+        CurlResult(int code, byte[] output) {
+            this.code = code;
+            this.output = output;
+        }
+
+        String text() {
+            return new String(output, StandardCharsets.UTF_8);
+        }
+    }
+
+    /** Starts the curl arguments shared by all requests: timeouts plus curl-level retries. */
+    private static List<String> curlCommand(int timeoutSeconds, int curlRetries) {
+        return new ArrayList<>(List.of(
+            "curl", "-fsSL",
+            "--connect-timeout", "10",
+            "--max-time", String.valueOf(timeoutSeconds),
+            "--retry", String.valueOf(curlRetries),
+            "--retry-all-errors",
+            "--retry-delay", "1"
+        ));
+    }
+
+    /** Runs {@code cmd} to completion; stderr is merged into the captured output. */
+    private static CurlResult runCurl(List<String> cmd) throws IOException, InterruptedException {
+        var p = new ProcessBuilder(cmd)
+            .redirectErrorStream(true)
+            .start();
+        // Drain the pipe before waitFor() so a full pipe buffer cannot block the child.
+        var bytes = p.getInputStream().readAllBytes();
+        return new CurlResult(p.waitFor(), bytes);
+    }
+
+    // ---------------- RSS via curl (browser-like) ----------------
+
+    /** Title and description text of one RSS item. */
+    static final class RssItem {
+
+        final String title;
+        final String desc;
+
+        RssItem(String title, String desc) {
+            this.title = title;
+            this.desc = desc;
+        }
+    }
+
+    /** Returns the text of the first descendant element named {@code tag}, or null if none. */
+    static String textOfFirst(Element parent, String tag) {
+        var nl = parent.getElementsByTagName(tag);
+        if (nl.getLength() == 0) return null;
+        return nl.item(0).getTextContent();
+    }
+
+    /**
+     * Downloads an RSS feed with curl, sending browser-like headers, and returns up to
+     * {@code limit} items with HTML stripped from title and description.
+     *
+     * @throws IOException if curl exits with a non-zero status
+     * @throws Exception if the response cannot be parsed as XML
+     */
+    static List<RssItem> fetchRssViaCurlBrowser(String url, int limit, int timeoutSeconds) throws Exception {
+        var cmd = curlCommand(timeoutSeconds, 5);
+        cmd.addAll(List.of(
+            "--compressed",
+            "-H", "User-Agent: " + BROWSER_UA,
+            "-H", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "-H", "Accept-Language: nl-NL,nl;q=0.9,en;q=0.7",
+            "-H", "Cache-Control: no-cache",
+            "-H", "Pragma: no-cache",
+            "-H", "Sec-Fetch-Dest: document",
+            "-H", "Sec-Fetch-Mode: navigate",
+            "-H", "Sec-Fetch-Site: none",
+            "-H", "Sec-Fetch-User: ?1",
+            url
+        ));
+
+        var r = runCurl(cmd);
+        if (r.code != 0) {
+            throw new IOException("curl RSS failed (" + r.code + ") url=" + url + " output=" + r.text());
+        }
+
+        // The feed is untrusted network input: refuse external entities and DTD loading (XXE).
+        var dbf = DocumentBuilderFactory.newInstance();
+        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
+        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
+        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+        dbf.setXIncludeAware(false);
+        dbf.setExpandEntityReferences(false);
+
+        var doc = dbf.newDocumentBuilder().parse(new ByteArrayInputStream(r.output));
+        var items = doc.getElementsByTagName("item");
+
+        var out = new ArrayList<RssItem>();
+        for (var i = 0; i < items.getLength() && out.size() < limit; i++) {
+            var item = (Element) items.item(i);
+            // stripHtml maps a missing (null) element to "".
+            out.add(new RssItem(
+                stripHtml(textOfFirst(item, "title")),
+                stripHtml(textOfFirst(item, "description"))));
+        }
+        return out;
+    }
+
+    // ---------------- LM Studio (OpenAI-compatible) ----------------
+
+    /**
+     * Build stable API URLs:
+     * - Accepts endpoint with or without /v1
+     * - Ensures exactly one /v1 prefix
+     */
+    static String apiUrl(String endpointArg, String path) {
+        var base = endpointArg.trim();
+        if (base.endsWith("/")) base = base.substring(0, base.length() - 1);
+        if (base.endsWith("/v1")) base = base.substring(0, base.length() - 3);
+
+        if (!path.startsWith("/")) path = "/" + path;
+        if (!path.startsWith("/v1/")) path = "/v1" + path;
+
+        return base + path;
+    }
+
+    /**
+     * Some systems have a broken Java truststore -> default SSLContext init can crash HttpClient.build().
+     * We try default, else fall back to an "insecure" SSLContext just to allow HttpClient to exist.
+     * (We only use HttpClient against LM Studio over HTTP, so SSL is not used anyway.)
+     *
+     * <p>WARNING: the fallback client accepts ANY server certificate. Never use it for HTTPS
+     * traffic that needs to be trusted.
+     */
+    static HttpClient buildHttpClient(int timeoutSeconds) {
+        var connectTimeout = java.time.Duration.ofSeconds(Math.max(10, timeoutSeconds));
+        try {
+            return HttpClient.newBuilder()
+                .connectTimeout(connectTimeout)
+                .build();
+        } catch (RuntimeException ignored) {
+            // Deliberate: fall through to the trust-all fallback below.
+        }
+
+        try {
+            return HttpClient.newBuilder()
+                .connectTimeout(connectTimeout)
+                .sslContext(insecureSslContext())
+                .build();
+        } catch (Exception e) {
+            throw new RuntimeException("Could not initialize HttpClient. Fix Java truststore or use curl for all HTTP.", e);
+        }
+    }
+
+    /** Creates a TLS context whose trust manager performs no certificate checks at all. */
+    static SSLContext insecureSslContext() throws Exception {
+        var trustAll = new TrustManager[]{
+            new X509TrustManager() {
+                @Override
+                public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
+                @Override
+                public void checkClientTrusted(X509Certificate[] chain, String authType) { }
+                @Override
+                public void checkServerTrusted(X509Certificate[] chain, String authType) { }
+            }
+        };
+        var ssl = SSLContext.getInstance("TLS");
+        ssl.init(null, trustAll, new SecureRandom());
+        return ssl;
+    }
+
+    /**
+     * Sleeps before retry number {@code attempt + 1}: 300 ms doubled per attempt, capped at
+     * 3 s. If the thread is interrupted the sleep ends early and the interrupt flag is restored.
+     */
+    static void sleepBackoff(int attempt) {
+        try {
+            var ms = (long) (300L * Math.pow(2, attempt - 1)); // 300, 600, 1200, ...
+            Thread.sleep(Math.min(ms, 3000));
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+    }
+
+    /**
+     * Runs a prepared curl command up to {@code max(1, o.retries)} times with backoff and
+     * returns its output as UTF-8 text.
+     *
+     * @param verb "GET" or "POST", used in the failure message
+     * @param verbose whether to log each attempt and error to the console
+     * @throws Exception the failure of the last attempt
+     */
+    private static String curlWithRetries(Opts o, List<String> cmd, String verb, String url, boolean verbose)
+            throws Exception {
+        // Always try at least once: with retries <= 0 the loop would be skipped entirely and
+        // "throw last" would throw a null reference.
+        var attempts = Math.max(1, o.retries);
+        Exception last = null;
+        for (var attempt = 1; attempt <= attempts; attempt++) {
+            try {
+                if (verbose) System.out.println(" Attempt " + attempt + "/" + attempts + " via curl...");
+                var r = runCurl(cmd);
+                if (r.code != 0) {
+                    throw new IOException("curl " + verb + " failed (" + r.code + ") url=" + url
+                        + "\nOutput:\n" + r.text());
+                }
+                return r.text();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                throw e;
+            } catch (Exception e) {
+                if (verbose) System.err.println(" Error: " + e.getClass().getName() + ": " + e.getMessage());
+                last = e;
+                if (attempt < attempts) sleepBackoff(attempt);
+                // Stop retrying once the thread has been interrupted (sleepBackoff restores the flag).
+                if (Thread.currentThread().isInterrupted()) break;
+            }
+        }
+        throw last;
+    }
+
+    /**
+     * GET JSON via curl (Java HttpClient has network stack issues on some systems)
+     */
+    static String curlGetJson(Opts o, String url) throws Exception {
+        var cmd = curlCommand(o.timeoutSeconds, 3);
+        cmd.addAll(List.of(
+            "-H", "Accept: application/json",
+            "-H", "User-Agent: " + BROWSER_UA,
+            url
+        ));
+        return curlWithRetries(o, cmd, "GET", url, false);
+    }
+
+    /**
+     * POST JSON via curl (Java HttpClient has network stack issues on some systems)
+     */
+    static String curlPostJson(Opts o, String url, String jsonBody) throws Exception {
+        // The body travels via a temp file so its size never hits argument-length limits.
+        var tempFile = Files.createTempFile("lm-request-", ".json");
+        try {
+            Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
+
+            var cmd = curlCommand(o.timeoutSeconds, 3);
+            cmd.addAll(List.of(
+                "-H", "Content-Type: application/json",
+                "-H", "Accept: application/json",
+                "-H", "User-Agent: " + BROWSER_UA,
+                // --data-binary sends the file verbatim; plain -d strips CR/LF from it.
+                "--data-binary", "@" + tempFile,
+                url
+            ));
+            return curlWithRetries(o, cmd, "POST", url, true);
+        } finally {
+            Files.deleteIfExists(tempFile);
+        }
+    }
+
+    // ---------------- Minimal JSON helpers ----------------
+
+    /** Parses four hex digits starting at {@code from}; returns -1 if they are not all there. */
+    private static int hex4(String s, int from) {
+        if (from + 4 > s.length()) return -1;
+        var value = 0;
+        for (var i = from; i < from + 4; i++) {
+            var d = Character.digit(s.charAt(i), 16);
+            if (d < 0) return -1;
+            value = (value << 4) | d;
+        }
+        return value;
+    }
+
+    /**
+     * Decodes the JSON string literal whose opening quote is at {@code openQuote}, appending
+     * the unescaped text to {@code out}.
+     *
+     * @return the index of the closing quote, or -1 if the input ends before the string does
+     *     (whatever was decoded so far is still in {@code out})
+     */
+    private static int readJsonString(String s, int openQuote, StringBuilder out) {
+        for (var k = openQuote + 1; k < s.length(); k++) {
+            var ch = s.charAt(k);
+            if (ch == '"') return k;
+            if (ch != '\\') {
+                out.append(ch);
+                continue;
+            }
+            if (++k >= s.length()) break;
+            var e = s.charAt(k);
+            switch (e) {
+                case 'n' -> out.append('\n');
+                case 't' -> out.append('\t');
+                case 'r' -> out.append('\r');
+                case 'b' -> out.append('\b');
+                case 'f' -> out.append('\f');
+                case 'u' -> {
+                    var code = hex4(s, k + 1);
+                    if (code < 0) {
+                        out.append(e); // malformed escape: keep the letter, best effort
+                    } else {
+                        out.append((char) code);
+                        k += 4;
+                    }
+                }
+                default -> out.append(e); // escaped quote, backslash or slash
+            }
+        }
+        return -1;
+    }
+
+    // pick first model id from {"data":[{"id":"..."} ...]}
+    static String pickModelId(String modelsJson) {
+        if (modelsJson == null) return null;
+        var data = modelsJson.indexOf("\"data\"");
+        if (data < 0) return null;
+        var id = modelsJson.indexOf("\"id\"", data);
+        if (id < 0) return null;
+        var colon = modelsJson.indexOf(':', id);
+        if (colon < 0) return null;
+        var q1 = modelsJson.indexOf('"', colon + 1);
+        if (q1 < 0) return null;
+        var q2 = modelsJson.indexOf('"', q1 + 1);
+        if (q2 < 0) return null;
+        return modelsJson.substring(q1 + 1, q2);
+    }
+
+    /**
+     * Extracts the assistant "content" string from a chat/completions response (best-effort:
+     * the first "content" key after "choices", decoded as a JSON string).
+     *
+     * @return the decoded content, or {@code null} when there is no such key or its value is
+     *     not a string (for example {@code null})
+     */
+    static String extractChatContent(String json) {
+        if (json == null) return null;
+
+        var choices = json.indexOf("\"choices\"");
+        var i = json.indexOf("\"content\"", Math.max(choices, 0));
+        if (i < 0) return null;
+        var colon = json.indexOf(':', i);
+        if (colon < 0) return null;
+
+        // The value must start with a quote; otherwise we would pick up some later string.
+        var q = colon + 1;
+        while (q < json.length() && Character.isWhitespace(json.charAt(q))) q++;
+        if (q >= json.length() || json.charAt(q) != '"') return null;
+
+        var sb = new StringBuilder();
+        readJsonString(json, q, sb); // a truncated response still yields the decoded prefix
+        return sb.toString();
+    }
+
+    /**
+     * Parses the strings of a JSON array out of model output. Extra text around the array is
+     * accepted: everything between the first '[' and the last ']' is scanned for string
+     * literals. Returns an empty list when no array is found.
+     */
+    static List<String> parseStringArray(String s) {
+        if (s == null) return List.of();
+        var a = s.indexOf('[');
+        var b = s.lastIndexOf(']');
+        if (a < 0 || b <= a) return List.of();
+
+        var body = s.substring(a + 1, b);
+        var out = new ArrayList<String>();
+        for (var i = body.indexOf('"'); i >= 0; i = body.indexOf('"', i + 1)) {
+            var cur = new StringBuilder();
+            var end = readJsonString(body, i, cur);
+            if (end < 0) break; // unterminated last string: drop it
+            out.add(cur.toString());
+            i = end;
+        }
+        return out;
+    }
+
+    /** Renders {@code s} as a JSON string literal ({@code null} becomes the literal null). */
+    static String jsonQuote(String s) {
+        if (s == null) return "null";
+        var sb = new StringBuilder(s.length() + 2);
+        sb.append('"');
+        for (var i = 0; i < s.length(); i++) {
+            var ch = s.charAt(i);
+            if (ch == '\\' || ch == '"') sb.append('\\').append(ch);
+            else if (ch == '\n') sb.append("\\n");
+            else if (ch == '\r') sb.append("\\r");
+            else if (ch == '\t') sb.append("\\t");
+            // JSON forbids raw control characters inside strings.
+            else if (ch < 0x20) sb.append(String.format("\\u%04x", (int) ch));
+            else sb.append(ch);
+        }
+        sb.append('"');
+        return sb.toString();
+    }
+
+    /**
+     * Asks the chat/completions endpoint for theme and related words based on the RSS text.
+     *
+     * @return the strings of the JSON array in the model's reply, not yet normalized
+     * @throws IOException if no chat content can be extracted from the response
+     */
+    static List<String> llmThemeWords(Opts o, String modelId, String rssText) throws Exception {
+        var prompt = """
+            Je genereert woorden voor een Nederlandse kruiswoordpuzzel.
+
+            Regels:
+            - Output MOET exact één JSON array zijn: ["WOORD", ...]
+            - Alleen A-Z, 2-8 letters
+            - Geen spaties, streepjes, cijfers, accenten, apostrofs, punten
+            - Geen duplicaten
+            - Focus op zelfstandige naamwoorden/termen uit het nieuws
+            - Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d).
+            - Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk.
+
+            Nieuws (koppen/samenvattingen):
+            %s
+            """.formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText);
+
+        var body = """
+            {
+              "model": %s,
+              "messages": [
+                {"role":"system","content":"Je bent een strikte JSON generator. Antwoord ALLEEN met een JSON array van strings."},
+                {"role":"user","content": %s}
+              ],
+              "temperature": 0.35,
+              "max_tokens": 2000
+            }
+            """.formatted(jsonQuote(modelId), jsonQuote(prompt));
+
+        var url = apiUrl(o.endpoint, "/chat/completions");
+        System.out.println("LM Studio POST: " + url);
+        System.out.println("Request body length: " + body.getBytes(StandardCharsets.UTF_8).length + " bytes");
+
+        var resp = curlPostJson(o, url, body);
+        var content = extractChatContent(resp);
+        if (content == null) {
+            throw new IOException("Could not extract chat content from LM Studio response.\n--- response ---\n" + resp);
+        }
+        return parseStringArray(content);
+    }
+
+    // ---------------- Pool building ----------------
+
+    /** Returns the ids of the {@code bridgeN} words with the highest crossability score. */
+    static BitSet buildBridgeBitmap(Lexicon lex, int bridgeN) {
+        var n = lex.words.size();
+        var ids = new Integer[n];
+        for (var i = 0; i < n; i++) ids[i] = i;
+
+        // Descending by score; the sort is stable, so ties keep ascending id order.
+        Arrays.sort(ids, (a, b) -> Integer.compare(lex.score[b], lex.score[a]));
+
+        var bs = new BitSet(n);
+        var take = Math.min(bridgeN, n);
+        for (var i = 0; i < take; i++) bs.set(ids[i]);
+        return bs;
+    }
+
+    /** Returns the ids of all {@code words} that, after normalization, occur in the lexicon. */
+    static BitSet bitmapFromWords(Lexicon lex, Collection<String> words) {
+        var bs = new BitSet(lex.words.size());
+        for (var raw : words) {
+            var w = normalizeDutchToken(raw);
+            if (w == null) continue;
+            var id = lex.idOf.get(w);
+            if (id != null) bs.set(id);
+        }
+        return bs;
+    }
+
+    /** Counts, for each word length 2..8, how many ids in {@code bs} have that length. */
+    static Map<Integer, Integer> countsPerLen(Lexicon lex, BitSet bs) {
+        var out = new HashMap<Integer, Integer>();
+        for (var L = 2; L <= 8; L++) {
+            var tmp = (BitSet) bs.clone();
+            tmp.and(lex.byLen[L]);
+            out.put(L, tmp.cardinality());
+        }
+        return out;
+    }
+
+    /** Writes the words selected by {@code bs}, sorted alphabetically, one per line. */
+    static void writeWordList(Path path, Lexicon lex, BitSet bs) throws IOException {
+        var out = new ArrayList<String>(bs.cardinality());
+        for (var i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) {
+            out.add(lex.words.get(i));
+        }
+        out.sort(Comparator.naturalOrder());
+        Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
+    }
+
+    /** Formats the counts for lengths 2..8 as one "L: count" line each. */
+    static String mapToLines(Map<Integer, Integer> m) {
+        var sb = new StringBuilder();
+        for (var L = 2; L <= 8; L++) {
+            sb.append(" ").append(L).append(": ").append(m.getOrDefault(L, 0)).append("\n");
+        }
+        return sb.toString();
+    }
+
+    // ---------------- Main ----------------
+
+    /** Runs the whole pipeline; see the class comment for the steps and output files. */
+    public static void main(String[] args) throws Exception {
+        var o = parseArgs(args);
+
+        var outDir = Path.of(o.outDir);
+        Files.createDirectories(outDir);
+
+        System.out.println("Loading lexicon...");
+        var lex = loadLexicon(o.wordsPath);
+        System.out.println("Master words (2-8, A-Z): " + lex.words.size());
+
+        // RSS via curl (browser-like)
+        var all = new ArrayList<RssItem>();
+        for (var feed : o.feeds) {
+            var f = feed.trim();
+            if (f.isEmpty()) continue;
+            System.out.println("Fetching RSS: " + f);
+            all.addAll(fetchRssViaCurlBrowser(f, o.rssItemsPerFeed, o.timeoutSeconds));
+        }
+
+        var rssText = new StringBuilder();
+        var k = 0;
+        for (var it : all) {
+            k++;
+            rssText.append(k).append(". ").append(it.title).append("\n");
+            if (!it.desc.isBlank()) rssText.append(" ").append(it.desc).append("\n");
+        }
+        Files.writeString(outDir.resolve("rss.txt"), rssText.toString(), StandardCharsets.UTF_8);
+
+        // LM Studio via curl
+        var modelId = o.model;
+        if (modelId == null) {
+            var modelsUrl = apiUrl(o.endpoint, "/models");
+            System.out.println("LM Studio GET: " + modelsUrl);
+            var modelsJson = curlGetJson(o, modelsUrl);
+            modelId = pickModelId(modelsJson);
+            if (modelId == null) {
+                throw new IOException("Could not auto-pick model id from /v1/models. Use --model <id>.\n--- /models ---\n" + modelsJson);
+            }
+        }
+        System.out.println("Using model: " + modelId);
+
+        System.out.println("Generating theme words via LM Studio...");
+        var llmWords = llmThemeWords(o, modelId, rssText.toString());
+
+        // Normalize + keep only those present in master lexicon
+        var themeKept = new LinkedHashSet<String>();
+        for (var wRaw : llmWords) {
+            var w = normalizeDutchToken(wRaw);
+            if (w == null) continue;
+            if (lex.idOf.containsKey(w)) themeKept.add(w);
+        }
+        Files.write(outDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8);
+
+        // BitSets
+        var themeBs = bitmapFromWords(lex, themeKept);
+        var bridgeBs = buildBridgeBitmap(lex, o.bridgeN);
+        var shortBs = bitmapFromWords(lex, DEFAULT_SHORTS);
+
+        var pool = new BitSet(lex.words.size());
+        pool.or(themeBs);
+        pool.or(bridgeBs);
+        pool.or(shortBs);
+
+        // Report
+        var themeCounts = countsPerLen(lex, themeBs);
+        var poolCounts = countsPerLen(lex, pool);
+
+        var report = """
+            Date: %s
+            Feeds: %s
+            Model: %s
+
+            Master size: %d
+            Theme kept (in master): %d
+            Bridge size: %d
+            Shorts kept: %d
+            Pool total: %d
+
+            Counts per length (theme):
+            %s
+
+            Counts per length (pool):
+            %s
+            """.formatted(
+                LocalDate.now(),
+                String.join(", ", o.feeds),
+                modelId,
+                lex.words.size(),
+                themeBs.cardinality(),
+                bridgeBs.cardinality(),
+                shortBs.cardinality(),
+                pool.cardinality(),
+                mapToLines(themeCounts),
+                mapToLines(poolCounts)
+            );
+
+        Files.writeString(outDir.resolve("report.txt"), report, StandardCharsets.UTF_8);
+        System.out.println(report);
+
+        // Output pool list
+        var poolFile = outDir.resolve("pool.txt");
+        writeWordList(poolFile, lex, pool);
+        System.out.println("Wrote: " + poolFile.toAbsolutePath());
+    }
+}