package puzzle; import org.w3c.dom.*; import javax.xml.parsers.DocumentBuilderFactory; import java.io.*; import java.nio.charset.StandardCharsets; import java.nio.file.*; import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; import java.text.Normalizer; import java.time.LocalDate; import java.util.*; public class ThemePoolBuilderLength { private static final List DEFAULT_FEEDS = List.of( "https://feeds.nos.nl/nosnieuwsalgemeen", "https://feeds.nos.nl/nosnieuwstech"); static final String url = "jdbc:postgresql://192.168.1.159:5432/postgres"; static final String user = "puzzle"; static final String pass = "heel-goed-wachtwoord"; // NOTE: normalizeDutchToken strips non A-Z. Keep entries 2-8 after normalization. private static final List DEFAULT_SHORTS = List.of( "EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL", "UUR", "MIN", "TV", "GPS", "AI", "IT", "CPU", "GPU", "ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR", "NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA", "PVV", "VVD", "CDA", "FNV", "EN", "IN", "OP", "OM", "TE", "ER", "DE", "HET", "EEN", "VAN", "MET", "NOG", "OOK", "MAAR", "WEL", "NIET", "HOE", "ALS", "ZO", "DO", "WO", "VR", "MO", "WA", "WE", "TAAL", "LAND", "GEMEENTE", "STAAT", "BUREAU", "HUIS", "SCHOOL", "STR", "BAAN", "WERK", "KLUS", "FONDS", "RAAD", "CONGRESS", "GROEP", "STRAAT", "BRUG", "PARK", "BUURT", "BOUW", "HOTEL", "CAFE", "BAR", "BIJBAAN", "STUDENT", "DOCENT", "WINKEL", "MARKT", "KIOSK", "AUTO", "MOBILE", "FIETS", "SCOOTER", // afkortingen "DHR", "MEVR", "DR", "ST", "CA", "IVM", "MBT", "TAV", "TOV", "DWZ", "MAW", "OA", "TM", "ANWB", "BRP", "CBS", "AL", "NU", "TO", "NA", "BIJ", "TOT", "DAN", "WAT", "DAT", "IK", "JE", "WE", "WIJ", "JIJ", "ZIJ", "HIJ", "HEN", "ONS", "JOU", // romeinse cijfers (2-8) "II", "III", "IV", "VI", "VII", "VIII", "IX", "XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX" ); private static final String BROWSER_UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"; static int MIN_SIMPLICITY = 520, MAX_WORD_LENGTH = 7; static final class Opts { String endpoint = "https://jarvis-lan.appmodel.nl/api/ollama/"; List feeds = new ArrayList<>(DEFAULT_FEEDS); String outDir = System.getenv("OUT_DIR") != null ? System.getenv("OUT_DIR") : "/data/puzzle"; int bridgeN = 30000; int themeN = 800; int relatedN = 2200; int rssItemsPerFeed = 10; String model = "/models/Hadiseh-Mhd/Mixtral-8x7B-Instruct-v0.1-Q4_K_M-GGUF/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"; int timeoutSeconds = 180; int retries = 2; int minLen2 = 1000; int minLen3 = 1000; int minLen4 = 1000; int minLen5 = 1000; // set if you also want to force 5-letter words, etc. int minLen6 = 1000; int minLen7 = 1000; int minLen8 = MAX_WORD_LENGTH >= 8 ? 1000 : 0; } public static void main(String[] args) throws Exception { var o = parseArgs(args); var outDir = Path.of(o.outDir); Files.createDirectories(outDir); System.out.println("Loading lexicon..."); Lexicon lex; Class.forName("org.postgresql.Driver"); try (var c = DriverManager.getConnection(url, user, pass);) { lex = loadLexicon(c); } System.out.println("Master words (2-" + MAX_WORD_LENGTH + ", A-Z): " + lex.words.size()); // RSS via curl (browser-like) var all = new ArrayList(); for (var feed : o.feeds) { var f = feed.trim(); if (f.isEmpty()) continue; System.out.println("Fetching RSS: " + f); all.addAll(fetchRssViaCurlBrowser(f, o.rssItemsPerFeed, o.timeoutSeconds)); } var rssText = new StringBuilder(); var k = 0; for (var it : all) { k++; rssText.append(k).append(". ").append(it.title).append("\n"); if (!it.desc.isBlank()) rssText.append(" ").append(it.desc).append("\n"); } Files.writeString(outDir.resolve("rss.txt"), rssText.toString(), StandardCharsets.UTF_8); // LM Studio via curl var modelId = o.model; if (modelId == null) { var modelsUrl = apiUrl(o.endpoint, "/models"); System.out.println("Ollama GET: " + modelsUrl); var modelsJson = curlGetJson(o, modelsUrl); modelId = pickModelId(modelsJson); if (modelId == null) { throw new IOException("Could not auto-pick model id from /v1/models. Use --model .\n--- /models ---\n" + modelsJson); } } System.out.println("Using model: " + modelId); System.out.println("Generating theme words via LM Studio..."); var llmWords = List.of();//llmThemeWords(o, modelId, rssText.toString()); var themeKept = new LinkedHashSet(); for (var wRaw : llmWords) { var w = normalizeDutchToken(wRaw); if (w == null) continue; if (lex.idOf.containsKey(w)) themeKept.add(w); } Files.write(outDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8); // BitSets var themeBs = bitmapFromWords(lex, themeKept); var bridgeBs = buildBridgeBitmap(lex, o.bridgeN); var shortBs = bitmapFromWords(lex, DEFAULT_SHORTS); var pool = new BitSet(lex.words.size()); pool.or(themeBs); pool.or(bridgeBs); pool.or(shortBs); // ---- NEW: enforce minimum counts per length ---- enforceMinima(o, lex, pool); // Report var themeCounts = countsPerLen(lex, themeBs); var poolCounts = countsPerLen(lex, pool); var report = """ Date: %s Feeds: %s Model: %s Master size: %d Theme kept (in master): %d Bridge size: %d Shorts kept: %d Pool total: %d Enforced minima: 2: %d 3: %d 4: %d 5: %d 6: %d 7: %d 8: %d Counts per length (theme): %s Counts per length (pool): %s """.formatted( LocalDate.now(), String.join(", ", o.feeds), modelId, lex.words.size(), themeBs.cardinality(), bridgeBs.cardinality(), shortBs.cardinality(), pool.cardinality(), o.minLen2, o.minLen3, o.minLen4, o.minLen5, o.minLen6, o.minLen7, o.minLen8, mapToLines(themeCounts), mapToLines(poolCounts) ); Files.writeString(outDir.resolve("report.txt"), report, StandardCharsets.UTF_8); System.out.println(report); // Output pool list var poolFile = outDir.resolve("pool.txt"); writeWordList(poolFile, lex, pool); System.out.println("Wrote: " + poolFile.toAbsolutePath()); } static Opts parseArgs(String[] args) { var o = new Opts(); for (var i = 0; i < args.length; i++) { var a = args[i]; var v = (i + 1 < args.length) ? args[i + 1] : null; switch (a) { case "--endpoint" -> { o.endpoint = v; i++; } case "--feeds" -> { o.feeds = Arrays.asList(v.split(",")); i++; } case "--out" -> { o.outDir = v; i++; } case "--bridge" -> { o.bridgeN = Integer.parseInt(v); i++; } case "--theme" -> { o.themeN = Integer.parseInt(v); i++; } case "--related" -> { o.relatedN = Integer.parseInt(v); i++; } case "--items" -> { o.rssItemsPerFeed = Integer.parseInt(v); i++; } case "--model" -> { o.model = v; i++; } case "--timeout" -> { o.timeoutSeconds = Integer.parseInt(v); i++; } case "--retries" -> { o.retries = Integer.parseInt(v); i++; } // ---- NEW: minima per length ---- case "--min2" -> { o.minLen2 = Integer.parseInt(v); i++; } case "--min3" -> { o.minLen3 = Integer.parseInt(v); i++; } case "--min4" -> { o.minLen4 = Integer.parseInt(v); i++; } case "--min5" -> { o.minLen5 = Integer.parseInt(v); i++; } case "--min6" -> { o.minLen6 = Integer.parseInt(v); i++; } case "--min7" -> { o.minLen7 = Integer.parseInt(v); i++; } case "--min8" -> { o.minLen8 = Integer.parseInt(v); i++; } case "-h", "--help" -> { System.out.println(""" Usage: java puzzle.ThemePoolBuilder --words WORDS.txt [options] Options: --endpoint http://HOST:1234/v1 (LM Studio) --feeds url1,url2 --out ./out --bridge 5000 --theme 300 --related 1200 --items 20 (per feed) --model (recommended; skips /v1/models) --timeout 60 (seconds) --retries 4 # enforce minima per length in final pool --min2 4000 --min3 7000 --min4 9000 --min5 0 --min6 0 --min7 0 --min8 0 """); System.exit(0); } default -> throw new IllegalArgumentException("Unknown arg: " + a); } } return o; } static boolean isAZ(String s) { for (var i = 0; i < s.length(); i++) { var ch = s.charAt(i); if (ch < 'A' || ch > 'Z') return false; } return true; } static String normalizeDutchToken(String raw) { if (raw == null) return null; var s = raw.trim(); if (s.isEmpty()) return null; s = Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{M}+", ""); s = s.toUpperCase(Locale.ROOT); s = s.replaceAll("[^A-Z]", ""); if (s.length() < 2 || s.length() > 8) return null; if (!isAZ(s)) return null; return s; } static String stripHtml(String s) { if (s == null) return ""; var x = s.replaceAll("<[^>]+>", " "); x = x.replace("&", "&").replace("<", "<").replace(">", ">"); x = x.replaceAll("\\s+", " ").trim(); return x; } /** * @param words id -> word * @param idOf word -> id * @param score id -> crossability * @param byLen byLen[L] for L 0..8 */ record Lexicon(List words, Map idOf, int[] score, BitSet[] byLen) { } /** * Loads lexicon from PostgreSQL view/table: export_words_with_hints_2_8 * Columns: WOORD, level_1_to_10, hint * * Notes: * - Normalizes words via normalizeDutchToken(...) * - Dedupes on normalized word * - Uses level_1_to_10 as the "LLM score" (fallback 5) * - Ignores hint for scoring (but you can store it elsewhere if needed) */ static Lexicon loadLexicon(Connection conn) throws SQLException { var out = new ArrayList(200_000); var idOf = new HashMap(400_000); // Store level per normalized word while loading so we can compute scores later var levelOf = new HashMap(400_000); final var sql = """ SELECT woord, 10-level_1_to_10, hint FROM export_real_words_with_hints where length(woord)<=7 order by level_1_to_10 asc """ ; try (var ps = conn.prepareStatement(sql); var rs = ps.executeQuery()) { while (rs.next()) { var rawWord = rs.getString(1); var lvlObj = (Integer) rs.getObject(2); // nullable // String hint = rs.getString(3); // available if you want it later var w = normalizeDutchToken(rawWord); if (w == null) continue; if (idOf.containsKey(w)) continue; idOf.put(w, out.size()); out.add(w); var lvl = (lvlObj == null ? 5 : lvlObj.intValue()); levelOf.put(w, lvl); } } catch (SQLException e) { throw new RuntimeException(e); } var n = out.size(); var score = new int[n]; var byLen = new BitSet[9]; for (var L = 0; L <= 8; L++) byLen[L] = new BitSet(n); for (var i = 0; i < n; i++) { var w = out.get(i); var crossScore = HintScores.crossabilityScore(w); var lScore = levelOf.getOrDefault(w, 5); // Prioritize simple words (high lScore) and long words. // lScore (1-10) adds up to 1000 points (weight 100). // Length (2-8) adds up to 160 points (weight 20). score[i] = crossScore + (lScore * 100) + (w.length() * 40); byLen[w.length()].set(i); } return new Lexicon(out, idOf, score, byLen); } // ---------------- RSS via curl (browser-like) ---------------- record RssItem(String title, String desc) { } static String textOfFirst(Element parent, String tag) { var nl = parent.getElementsByTagName(tag); if (nl.getLength() == 0) return null; var n = nl.item(0); return n.getTextContent(); } static List fetchRssViaCurlBrowser(String url, int limit, int timeoutSeconds) throws Exception { var cmd = new ArrayList(); cmd.add("curl"); cmd.add("-fsSL"); cmd.add("-L"); cmd.add("--compressed"); cmd.add("--connect-timeout"); cmd.add("10"); cmd.add("--max-time"); cmd.add(String.valueOf(timeoutSeconds)); cmd.add("--retry"); cmd.add("5"); cmd.add("--retry-all-errors"); cmd.add("--retry-delay"); cmd.add("1"); cmd.add("-H"); cmd.add("User-Agent: " + BROWSER_UA); cmd.add("-H"); cmd.add("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); cmd.add("-H"); cmd.add("Accept-Language: nl-NL,nl;q=0.9,en;q=0.7"); cmd.add("-H"); cmd.add("Cache-Control: no-cache"); cmd.add("-H"); cmd.add("Pragma: no-cache"); cmd.add("-H"); cmd.add("Sec-Fetch-Dest: document"); cmd.add("-H"); cmd.add("Sec-Fetch-Mode: navigate"); cmd.add("-H"); cmd.add("Sec-Fetch-Site: none"); cmd.add("-H"); cmd.add("Sec-Fetch-User: ?1"); cmd.add(url); var p = new ProcessBuilder(cmd) .redirectErrorStream(true) .start(); var bytes = p.getInputStream().readAllBytes(); var code = p.waitFor(); if (code != 0) { throw new IOException("curl RSS failed (" + code + ") url=" + url + " output=" + new String(bytes, StandardCharsets.UTF_8)); } try (InputStream is = new ByteArrayInputStream(bytes)) { var dbf = DocumentBuilderFactory.newInstance(); var doc = dbf.newDocumentBuilder().parse(is); var items = doc.getElementsByTagName("item"); var out = new ArrayList(); for (var i = 0; i < items.getLength() && out.size() < limit; i++) { var item = (Element) items.item(i); var title = textOfFirst(item, "title"); var desc = textOfFirst(item, "description"); if (title == null) title = ""; if (desc == null) desc = ""; out.add(new RssItem(stripHtml(title), stripHtml(desc))); } return out; } } // ---------------- LM Studio (OpenAI-compatible) ---------------- static String apiUrl(String endpointArg, String path) { var base = endpointArg.trim(); if (base.endsWith("/")) base = base.substring(0, base.length() - 1); if (base.endsWith("/v1")) base = base.substring(0, base.length() - 3); if (!path.startsWith("/")) path = "/" + path; if (!path.startsWith("/v1/")) path = "/" + path; return base + path; } static void sleepBackoff(int attempt) { try { var ms = (long) (300L * Math.pow(2, attempt - 1)); // 300, 600, 1200, ... Thread.sleep(Math.min(ms, 3000)); } catch (InterruptedException ignored) { } } static String curlGetJson(Opts o, String url) throws Exception { Exception last = null; for (var attempt = 1; attempt <= o.retries; attempt++) { try { var cmd = new ArrayList(); cmd.add("curl"); cmd.add("-fsSL"); cmd.add("--connect-timeout"); cmd.add("10"); cmd.add("--max-time"); cmd.add(String.valueOf(o.timeoutSeconds)); cmd.add("--retry"); cmd.add("3"); cmd.add("--retry-all-errors"); cmd.add("--retry-delay"); cmd.add("1"); cmd.add("-H"); cmd.add("Accept: application/json"); cmd.add("-H"); cmd.add("User-Agent: " + BROWSER_UA); cmd.add(url); var p = new ProcessBuilder(cmd) .redirectErrorStream(true) .start(); var bytes = p.getInputStream().readAllBytes(); var code = p.waitFor(); if (code != 0) { throw new IOException("curl GET failed (" + code + ") url=" + url + "\nOutput:\n" + new String(bytes, StandardCharsets.UTF_8)); } return new String(bytes, StandardCharsets.UTF_8); } catch (Exception e) { last = e; if (attempt < o.retries) sleepBackoff(attempt); } } throw last; } static String curlPostJson(Opts o, String url, String jsonBody) throws Exception { Exception last = null; for (var attempt = 1; attempt <= o.retries; attempt++) { try { System.out.println(" Attempt " + attempt + "/" + o.retries + " via curl..."); var tempFile = Files.createTempFile("lm-request-", ".json"); try { Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8); List cmd = new ArrayList<>(); cmd.add("curl"); cmd.add("-fsSL"); cmd.add("--connect-timeout"); cmd.add("10"); cmd.add("--max-time"); cmd.add(String.valueOf(o.timeoutSeconds)); cmd.add("--retry"); cmd.add("3"); cmd.add("--retry-all-errors"); cmd.add("--retry-delay"); cmd.add("1"); cmd.add("-H"); cmd.add("Content-Type: application/json"); cmd.add("-H"); cmd.add("Accept: application/json"); cmd.add("-H"); cmd.add("User-Agent: " + BROWSER_UA); cmd.add("-d"); cmd.add("@" + tempFile.toString()); cmd.add(url); var p = new ProcessBuilder(cmd) .redirectErrorStream(true) .start(); var bytes = p.getInputStream().readAllBytes(); var code = p.waitFor(); if (code != 0) { throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" + new String(bytes, StandardCharsets.UTF_8)); } return new String(bytes, StandardCharsets.UTF_8); } finally { Files.deleteIfExists(tempFile); } } catch (Exception e) { System.err.println(" Error: " + e.getClass().getName() + ": " + e.getMessage()); last = e; if (attempt < o.retries) sleepBackoff(attempt); } } throw last; } static String pickModelId(String modelsJson) { if (modelsJson == null) return null; var data = modelsJson.indexOf("\"data\""); if (data < 0) return null; var id = modelsJson.indexOf("\"id\"", data); if (id < 0) return null; var q1 = modelsJson.indexOf('"', modelsJson.indexOf(':', id) + 1); if (q1 < 0) return null; var q2 = modelsJson.indexOf('"', q1 + 1); if (q2 < 0) return null; return modelsJson.substring(q1 + 1, q2); } static String extractChatContent(String json) { if (json == null) return null; var choices = json.indexOf("\"choices\""); var p = (choices >= 0) ? choices : 0; var i = json.indexOf("\"content\"", p); if (i < 0) return null; var colon = json.indexOf(':', i); if (colon < 0) return null; var q = json.indexOf('"', colon + 1); if (q < 0) return null; var sb = new StringBuilder(); var esc = false; for (var k = q + 1; k < json.length(); k++) { var ch = json.charAt(k); if (esc) { if (ch == 'n') sb.append('\n'); else if (ch == 't') sb.append('\t'); else if (ch == 'r') sb.append('\r'); else sb.append(ch); esc = false; } else { if (ch == '\\') esc = true; else if (ch == '"') break; else sb.append(ch); } } return sb.toString(); } static List parseStringArray(String s) { if (s == null) return List.of(); var a = s.indexOf('['); var b = s.lastIndexOf(']'); if (a < 0 || b < 0 || b <= a) return List.of(); var body = s.substring(a + 1, b); var out = new ArrayList(); // If it's a simple comma-separated list without quotes (or with mixed quotes), // let's try a more robust approach. if (!body.contains("\"")) { for (var part : body.split(",")) { var trimmed = part.trim(); if (!trimmed.isEmpty()) out.add(trimmed); } if (!out.isEmpty()) return out; } var cur = new StringBuilder(); boolean in = false, esc = false; for (var i = 0; i < body.length(); i++) { var ch = body.charAt(i); if (!in) { if (ch == '"') { in = true; cur.setLength(0); esc = false; } } else { if (esc) { cur.append(ch); esc = false; } else if (ch == '\\') { esc = true; } else if (ch == '"') { out.add(cur.toString()); in = false; } else { cur.append(ch); } } } return out; } static String jsonQuote(String s) { if (s == null) return "null"; var sb = new StringBuilder(); sb.append('"'); for (var i = 0; i < s.length(); i++) { var ch = s.charAt(i); if (ch == '\\' || ch == '"') sb.append('\\').append(ch); else if (ch == '\n') sb.append("\\n"); else if (ch == '\r') sb.append("\\r"); else if (ch == '\t') sb.append("\\t"); else sb.append(ch); } sb.append('"'); return sb.toString(); } static List llmThemeWords(Opts o, String modelId, String rssText) throws Exception { var prompt = """ Je genereert woorden voor een Nederlandse kruiswoordpuzzel. Regels: - Output MOET exact één JSON array zijn: ["WOORD", ...] - Alleen A-Z, 2-8 letters woorden - Geen spaties, streepjes, cijfers, accenten, apostrofs, punten - Geen duplicaten - Focus op zelfstandige naamwoorden/termen uit het nieuws en relevante Zweedse kruiswoordpuzzel koppelwoorden in het thema. - Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d). - Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk. Nieuws (koppen/samenvattingen): %s """.formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText.substring(0, Math.min(rssText.length(), 8000))); var body = """ { "model": %s, "messages": [ {"role":"system","content":"Je bent een strikte JSON generator. Antwoord ALLEEN met een JSON array van strings."}, {"role":"user","content": %s} ], "temperature": 0.35, "max_tokens": 20000 } """.formatted(jsonQuote(modelId), jsonQuote(prompt)); var url = apiUrl(o.endpoint, "/chat/completions"); System.out.println("LM Studio POST: " + url); System.out.println("Request body length: " + body.length() + " bytes"); var resp = curlPostJson(o, url, body); var content = extractChatContent(resp); if (content == null) { throw new IOException("Could not extract chat content from LM Studio response.\n--- response ---\n" + resp); } return parseStringArray(content); } // ---------------- Pool building ---------------- static BitSet buildBridgeBitmap(Lexicon lex, int bridgeN) { var n = lex.words.size(); var ids = new ArrayList(n); for (var i = 0; i < n; i++) { // Optionally filter out VERY complex words from the bridge (e.g. lScore < 3) // But since we sort by score (which is now dominated by lScore), // they will be at the very bottom anyway. // if (lex.score[i] < 800) continue; ids.add(i); } ids.sort((a, b) -> Integer.compare(lex.score[b], lex.score[a])); var bs = new BitSet(n); var take = Math.min(bridgeN, ids.size()); for (var i = 0; i < take; i++) bs.set(ids.get(i)); return bs; } static BitSet bitmapFromWords(Lexicon lex, Collection words) { var bs = new BitSet(lex.words.size()); for (var raw : words) { var w = normalizeDutchToken(raw); if (w == null) continue; var id = lex.idOf.get(w); if (id != null) bs.set(id); } return bs; } static Map countsPerLen(Lexicon lex, BitSet bs) { var out = new HashMap(); for (var L = 2; L <= 8; L++) { var tmp = (BitSet) bs.clone(); tmp.and(lex.byLen[L]); out.put(L, tmp.cardinality()); } return out; } static void writeWordList(Path path, Lexicon lex, BitSet bs) throws IOException { var ids = new ArrayList(bs.cardinality()); for (var i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i + 1)) { ids.add(i); } // Sort by score descending (higher score is easier/better) ids.sort((a, b) -> Integer.compare(lex.score[b], lex.score[a])); var out = new ArrayList(ids.size()); for (var id : ids) { if (lex.score[id] < MIN_SIMPLICITY) continue; out.add(lex.words.get(id)); } Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING); } static String mapToLines(Map m) { var sb = new StringBuilder(); for (var L = 2; L <= 8; L++) { sb.append(" ").append(L).append(": ").append(m.getOrDefault(L, 0)).append("\n"); } return sb.toString(); } // ---------------- NEW: enforce minima per length ---------------- static int countLen(Lexicon lex, BitSet bs, int L) { var tmp = (BitSet) bs.clone(); tmp.and(lex.byLen[L]); return tmp.cardinality(); } static void ensureMinLen(Lexicon lex, BitSet pool, int L, int minWanted) { if (minWanted <= 0) return; var current = countLen(lex, pool, L); if (current >= minWanted) return; var need = minWanted - current; // Collect candidate ids of exactly length L that are not already in pool. var candidates = new ArrayList(Math.max(need * 2, 1024)); for (var id = lex.byLen[L].nextSetBit(0); id >= 0; id = lex.byLen[L].nextSetBit(id + 1)) { if (!pool.get(id)) candidates.add(id); } if (candidates.isEmpty()) return; // Sort by crossability score (desc) candidates.sort((a, b) -> Integer.compare(lex.score[b], lex.score[a])); var added = 0; for (var id : candidates) { pool.set(id); added++; if (added >= need) break; } } static void enforceMinima(Opts o, Lexicon lex, BitSet pool) { ensureMinLen(lex, pool, 2, o.minLen2); ensureMinLen(lex, pool, 3, o.minLen3); ensureMinLen(lex, pool, 4, o.minLen4); ensureMinLen(lex, pool, 5, o.minLen5); ensureMinLen(lex, pool, 6, o.minLen6); ensureMinLen(lex, pool, 7, o.minLen7); ensureMinLen(lex, pool, 8, o.minLen8); } }