introduce bitloops

This commit is contained in:
mike
2026-01-19 20:45:28 +01:00
parent 5d186ae0ba
commit 5678af332e
20 changed files with 111 additions and 1005 deletions

View File

@@ -0,0 +1,16 @@
package puzzle;
/**
 * Generated constants from pom.xml during build via templating-maven-plugin.
 * <p>
 * All members are compile-time {@code int} constants. The derived values are
 * computed from {@link #PUZZLE_ROWS} and {@link #PUZZLE_COLS}.
 */
public final class Config {
    // Grid dimensions and everything derived from them.
    public static final int PUZZLE_ROWS = 8;
    public static final int PUZZLE_COLS = 9;
    public static final int PUZZLE_SIZE = PUZZLE_COLS * PUZZLE_ROWS;
    public static final int MAX_WORD_LENGTH = PUZZLE_ROWS;
    public static final int MAX_WORD_LENGTH_MIN_1 = MAX_WORD_LENGTH - 1;

    // Remaining build-time settings.
    public static final int CLUE_SIZE = 4;
    public static final int MIN_LEN = 2;
    public static final int MAX_LEN = 8;
    public static final int MAX_TRIES_PER_SLOT = 1000;
}

View File

@@ -1,5 +1,6 @@
package puzzle;
import module java.base;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
@@ -14,11 +15,6 @@ import puzzle.SwedishGenerator.DictEntry;
import puzzle.SwedishGenerator.FillResult;
import puzzle.SwedishGenerator.Grid;
import puzzle.SwedishGenerator.Slotinfo;
import java.util.Arrays;
import java.util.function.IntSupplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import static puzzle.Export.Clue.DOWN;
import static puzzle.Export.Clue.RIGHT;
import static puzzle.Masker.Clues.createEmpty;
@@ -39,12 +35,12 @@ import static puzzle.SwedishGenerator.X;
*/
public record Export() {
public static final ThreadLocal<byte[]> BYTES = ThreadLocal.withInitial(() -> new byte[SwedishGenerator.MAX_WORD_LENGTH]);
static final byte CLUE_DOWN = 0;
static final byte CLUE_RIGHT = 1;
static final byte CLUE_UP = 2;
static final byte CLUE_LEFT = 3;
static final byte CLUE_LEFT_TOP = 4;
public static final ThreadLocal<byte[]> BYTES = ThreadLocal.withInitial(() -> new byte[SwedishGenerator.MAX_WORD_LENGTH]);
static final byte CLUE_DOWN = 0;
static final byte CLUE_RIGHT = 1;
static final byte CLUE_UP = 2;
static final byte CLUE_LEFT = 3;
static final byte CLUE_LEFT_TOP = 4;
static int HI(int in) { return in | 64; }
static char LETTER(int in) { return (char) (in | 64); }
static char CLUE_CHAR(int s) { return (char) (s | 48); }
@@ -242,7 +238,7 @@ public record Export() {
public record WordOut(String word, int[] cell, int startRow, int startCol, char direction, int arrowRow, int arrowCol, boolean isReversed, int complex, String[] clue) {
public WordOut(long l, int startRow, int startCol, char d, int arrowRow, int arrowCol, boolean isReversed, byte[] bytes) {
val meta = Meta.readRecord(Meta.shardKey(l), Lemma.unpackShardIndex(l));
val meta = Meta.readRecord(Meta.shardKey(l), Lemma.unpackShardIndex(l));
this(Lemma.asWord(l, bytes), new int[]{ arrowRow, arrowCol, startRow, startCol }, startRow, startCol, d, arrowRow, arrowCol, isReversed,
meta.simpel(), meta.clues());
}
@@ -379,4 +375,30 @@ public record Export() {
}
int[] toArray() { return Arrays.copyOf(data, size); }
}
/**
 * Minimal growable array of primitive {@code long} values.
 * Capacity doubles whenever an element is added to a full backing array.
 */
static final class LongArrayList {
    long[] a;
    int size;

    /** Creates an empty list with the given backing capacity; negative capacities are rejected. */
    LongArrayList(int initialCapacity) {
        if (initialCapacity < 0) throw new IllegalArgumentException();
        a = new long[initialCapacity];
    }

    /** Number of values added so far. */
    int size() { return size; }

    /** Appends {@code v}, enlarging the backing array first when it is full. */
    void add(long v) {
        if (size == a.length) grow();
        a[size] = v;
        size++;
    }

    /** Doubles the backing array (an empty array grows to capacity 1). */
    void grow() {
        int capacity = a.length;
        a = Arrays.copyOf(a, capacity == 0 ? 1 : capacity * 2);
    }

    /** Returns a copy holding exactly the {@code size} stored values. */
    long[] toArray() { return Arrays.copyOfRange(a, 0, size); }
}
}

View File

@@ -1,8 +1,7 @@
package puzzle;
import java.sql.*;
import java.util.Map;
import java.util.function.ToIntFunction;
import module java.base;
import module java.sql;
public final class HintScores {

View File

@@ -1,28 +0,0 @@
package puzzle;
import java.util.Arrays;
/**
 * Minimal growable array of primitive {@code long} values.
 * <p>
 * Not thread-safe. Capacity doubles whenever an element is added to a full
 * backing array.
 */
public final class LongArrayList {
    private long[] a;
    private int size;

    /**
     * Creates an empty list.
     *
     * @param initialCapacity initial length of the backing array, must be {@code >= 0}
     * @throws IllegalArgumentException if {@code initialCapacity} is negative
     */
    public LongArrayList(int initialCapacity) {
        if (initialCapacity < 0) {
            throw new IllegalArgumentException("initialCapacity must be >= 0, got " + initialCapacity);
        }
        a = new long[initialCapacity];
    }

    /** @return the number of values added so far */
    public int size() { return size; }

    /**
     * Appends a value, growing the backing array when it is full.
     *
     * @param v value to append
     * @throws IllegalStateException if the backing array cannot grow any further
     */
    public void add(long v) {
        if (size == a.length) grow();
        a[size++] = v;
    }

    /** Doubles the capacity; clamps instead of overflowing to a negative array size. */
    private void grow() {
        int oldCap = a.length;
        int newCap;
        if (oldCap == 0) {
            newCap = 1;
        } else if (oldCap > Integer.MAX_VALUE / 2) {
            // oldCap * 2 would wrap around to a negative number here.
            newCap = Integer.MAX_VALUE - 8;
        } else {
            newCap = oldCap * 2;
        }
        if (newCap <= oldCap) {
            throw new IllegalStateException("LongArrayList cannot grow beyond " + oldCap + " elements");
        }
        a = Arrays.copyOf(a, newCap);
    }

    /** @return a copy holding exactly the stored values, in insertion order */
    public long[] toArray() { return Arrays.copyOf(a, this.size); }
}

View File

@@ -1,5 +1,6 @@
package puzzle;
import module java.base;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
@@ -7,17 +8,6 @@ import lombok.val;
import puzzle.Masker.Clues;
import puzzle.SwedishGenerator.Rng;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import static puzzle.Export.*;
import static puzzle.SwedishGenerator.*;

View File

@@ -1,28 +1,25 @@
package puzzle;
import module java.base;
import lombok.AllArgsConstructor;
import lombok.val;
import precomp.Neighbors9x8;
import precomp.Neighbors9x8.rci;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Objects;
import static java.lang.Long.*;
import static puzzle.SwedishGenerator.*;
public final class Masker {
public static final rci[] IT = Neighbors9x8.IT;
private final Rng rng;
private final int[] stack;
private final Clues cache;
private final int[] activeCIdx = new int[SwedishGenerator.SIZE];
private final long[] activeSLo = new long[SwedishGenerator.SIZE];
private final long[] activeSHi = new long[SwedishGenerator.SIZE];
private final long[] adjLo = new long[SwedishGenerator.SIZE];
private final long[] adjHi = new long[SwedishGenerator.SIZE];
public static final rci[] IT = Neighbors9x8.IT;
private final Rng rng;
private final int[] stack;
private final Clues cache;
private final int[] activeCIdx = new int[SwedishGenerator.SIZE];
private final long[] activeSLo = new long[SwedishGenerator.SIZE];
private final long[] activeSHi = new long[SwedishGenerator.SIZE];
private final long[] adjLo = new long[SwedishGenerator.SIZE];
private final long[] adjHi = new long[SwedishGenerator.SIZE];
public Masker(Rng rng, int[] stack, Clues cache) {
this.rng = rng;
@@ -41,7 +38,7 @@ public final class Masker {
for (int dc2 = -2; dc2 <= 2; dc2++) {
val ti = IT[i];
MUTATE_RI[i][k++] = offset(clamp(ti.r() + dr1 + dr2, 0, R - 1),
clamp(ti.c() + dc1 + dc2, 0, C - 1));
clamp(ti.c() + dc1 + dc2, 0, C - 1));
}
}
}
@@ -477,50 +474,55 @@ public final class Masker {
}
}
if (Main.VERBOSE) System.out.println("generateMask init pop: " + popSize + " clueSize: " + clueSize);
var pop = new ArrayList<GridAndFit>();
for (var i = 0; i < popSize; i++) {
GridAndFit[] pop = new GridAndFit[popSize];
for (int i = 0; i < popSize; i++) {
if (Thread.currentThread().isInterrupted()) return null;
pop.add(new GridAndFit(hillclimb(randomMask(clueSize), clueSize, 180)));
pop[i] = new GridAndFit(hillclimb(randomMask(clueSize), clueSize, 180));
}
for (var gen = 0; gen < gens; gen++) {
for (int gen = 0; gen < gens; gen++) {
if (Thread.currentThread().isInterrupted()) break;
var children = new ArrayList<GridAndFit>();
for (var k = 0; k < offspring; k++) {
GridAndFit[] children = new GridAndFit[offspring];
int childCount = 0;
for (int k = 0; k < offspring; k++) {
if (Thread.currentThread().isInterrupted()) break;
var p1 = rng.rand(pop);
var p2 = rng.rand(pop);
var child = crossover(p1.grid, p2.grid);
children.add(new GridAndFit(hillclimb(child, clueSize, 70)));
GridAndFit p1 = rng.rand(pop);
GridAndFit p2 = rng.rand(pop);
Clues child = crossover(p1.grid, p2.grid);
children[k] = new GridAndFit(hillclimb(child, clueSize, 70));
childCount++;
}
pop.addAll(children);
pop.sort(Comparator.comparingLong(GridAndFit::fit));
GridAndFit[] combined = new GridAndFit[pop.length + childCount];
System.arraycopy(pop, 0, combined, 0, pop.length);
System.arraycopy(children, 0, combined, pop.length, childCount);
Arrays.sort(combined, Comparator.comparingLong(GridAndFit::fit));
var next = new ArrayList<GridAndFit>();
for (var cand : pop) {
if (next.size() >= offspring) break;
var ok = true;
for (var kept : next)
if (cand.grid.similarity(kept.grid) > 0.92) {
GridAndFit[] next = new GridAndFit[offspring];
int nextCount = 0;
for (GridAndFit cand : combined) {
if (nextCount >= offspring) break;
boolean ok = true;
for (int i = 0; i < nextCount; i++)
if (cand.grid.similarity(next[i].grid) > 0.92) {
ok = false;
break;
}
if (ok) next.add(cand);
if (ok) next[nextCount++] = cand;
}
pop = next;
pop = nextCount == offspring ? next : Arrays.copyOf(next, nextCount);
if (Main.VERBOSE && (gen & 15) == 15) System.out.println(" gen " + gen + "/" + gens + " bestFitness=" + pop.get(0).fit());
if (Main.VERBOSE && (gen & 15) == 15) System.out.println(" gen " + gen + "/" + gens + " bestFitness=" + pop[0].fit());
}
if (pop.isEmpty()) return null;
GridAndFit best = pop.get(0);
for (int i = 1; i < pop.size(); i++) {
var x = pop.get(i);
if (pop.length == 0) return null;
GridAndFit best = pop[0];
for (int i = 1; i < pop.length; i++) {
GridAndFit x = pop[i];
if (x.fit() < best.fit()) best = x;
}
return best.grid;
}//@formatter:off
}
//@formatter:off
@FunctionalInterface public interface SlotVisitor { void visit(int key, long lo, long hi); }
//@formatter:on
@AllArgsConstructor

View File

@@ -1,15 +1,9 @@
package puzzle;
import module java.base;
import com.google.gson.Gson;
import lombok.val;
import puzzle.SwedishGenerator.Lemma;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.stream.IntStream;
public class Meta {
static final Gson GSON = new Gson();

View File

@@ -4,7 +4,6 @@ import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.val;
import precomp.Neighbors9x8;
import java.util.List;
import static java.lang.Long.*;
import static java.lang.Long.numberOfTrailingZeros;
import static java.nio.charset.StandardCharsets.US_ASCII;
@@ -36,9 +35,9 @@ public record SwedishGenerator() {
public static final int SIZE = C * R;// ~18
public static final int SIZE_MIN_1 = SIZE - 1;// ~18
public static final double SIZED = (double) SIZE;// ~18
public static final long MASK_LO = (SIZE >= 64) ? -1L : (1L << SIZE) - 1;
public static final long MASK_HI = (SIZE <= 64) ? 0L : (SIZE >= 128 ? -1L : (1L << (SIZE - 64)) - 1);
public static final int MAX_WORD_LENGTH = C <= R ? C : R;
public static final long MASK_LO = -1L;
public static final long MASK_HI = (1L << (SIZE - 64)) - 1;
public static final int MAX_WORD_LENGTH = Config.PUZZLE_ROWS;
public static final int MAX_WORD_LENGTH_PLUS_ONE = MAX_WORD_LENGTH + 1;
public static final int MIN_LEN = 3;//Config.MIN_LEN;
public static final int MAX_TRIES_PER_SLOT = 700;//Config.MAX_TRIES_PER_SLOT;
@@ -92,7 +91,6 @@ public record SwedishGenerator() {
return (byte) (r & 3);
}
public <T> T rand(T[] p) { return p[(int) (((nextU32() & 0xFFFFFFFFL) % ((long) p.length)))]; }
public <T> T rand(List<T> p) { return p.get((int) (((nextU32() & 0xFFFFFFFFL) % ((long) p.size())))); }
public int randint(int max) { return (int) (((nextU32() & 0xFFFFFFFFL) % ((long) max))); }
public int randint0_SIZE() { return (int) (((nextU32() & 0xFFFFFFFFL) % RANGE_0_SIZE)); }
public int randint0_624() { return (int) (((nextU32() & 0xFFFFFFFFL) % RANGE_0_624)); }

View File

@@ -1,6 +0,0 @@
package puzzle;
import precomp.Neighbors9x8;
// Placeholder: this class declares no fields, constructors or methods.
public class TestGen {
}

View File

@@ -1,876 +0,0 @@
package puzzle;
import org.w3c.dom.*;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.text.Normalizer;
import java.time.LocalDate;
import java.util.*;
public class ThemePoolBuilderLength {
/** RSS feeds used as the initial value of {@code Opts.feeds}. */
private static final List<String> DEFAULT_FEEDS = List.of(
        "https://feeds.nos.nl/nosnieuwsalgemeen",
        "https://feeds.nos.nl/nosnieuwstech");
// JDBC connection settings for the lexicon database. Each value can be overridden through an
// environment variable so that deployments do not depend on credentials kept in source control.
// NOTE(review): the literal fallbacks are kept only for backward compatibility; the password
// should be rotated and the fallback removed once every environment sets PUZZLE_DB_PASS.
static final String url = System.getenv().getOrDefault("PUZZLE_DB_URL", "jdbc:postgresql://192.168.1.159:5432/postgres");
static final String user = System.getenv().getOrDefault("PUZZLE_DB_USER", "puzzle");
static final String pass = System.getenv().getOrDefault("PUZZLE_DB_PASS", "heel-goed-wachtwoord");
// Short words and abbreviations that main() always merges into the pool (when present in the lexicon).
// NOTE: normalizeDutchToken strips non A-Z. Keep entries 2-8 after normalization.
private static final List<String> DEFAULT_SHORTS = List.of(
"EU", "VS", "UK", "NAVO", "NOS", "NS", "ANP", "VN", "NPO", "RTL",
"UUR", "MIN", "TV", "GPS", "AI", "IT", "CPU", "GPU",
"ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR",
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
"PVV", "VVD", "CDA", "FNV",
"EN", "IN", "OP", "OM", "TE", "ER", "DE", "HET", "EEN", "VAN", "MET", "NOG", "OOK", "MAAR", "WEL", "NIET",
"HOE", "ALS",
"ZO", "DO", "WO", "VR", "MO", "WA", "WE", "TAAL",
"LAND", "GEMEENTE", "STAAT", "BUREAU", "HUIS", "SCHOOL", "STR", "BAAN",
"WERK", "KLUS",
"FONDS", "RAAD", "CONGRESS", "GROEP", "STRAAT", "BRUG", "PARK",
"BUURT",
"BOUW", "HOTEL", "CAFE", "BAR",
"BIJBAAN", "STUDENT", "DOCENT",
"WINKEL", "MARKT", "KIOSK", "AUTO", "MOBILE", "FIETS", "SCOOTER",
// abbreviations
"DHR", "MEVR", "DR", "ST", "CA", "IVM", "MBT", "TAV", "TOV", "DWZ", "MAW", "OA", "TM",
"ANWB", "BRP", "CBS",
"AL", "NU", "TO", "NA", "BIJ", "TOT", "DAN", "WAT", "DAT",
"IK", "JE", "WE", "WIJ", "JIJ", "ZIJ", "HIJ", "HEN", "ONS", "JOU",
// Roman numerals (2-8)
"II", "III", "IV", "VI", "VII", "VIII", "IX",
"XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"
);
// User-Agent header value sent with every curl request built in this class.
private static final String BROWSER_UA =
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36";
// MIN_SIMPLICITY: writeWordList skips words whose score is below this value.
// MAX_WORD_LENGTH: upper bound printed by main(); Opts enables the length-8 minimum only when it is >= 8.
// Both are mutable statics (not final).
static int MIN_SIMPLICITY = 520,
MAX_WORD_LENGTH = 7;
/**
 * Mutable holder for the run options. Every field starts at the default shown here
 * and can be overwritten by {@code parseArgs}.
 */
static final class Opts {
    // Remote model endpoint and the model id requested from it.
    String endpoint = "https://jarvis-lan.appmodel.nl/api/ollama/";
    String model = "/models/Hadiseh-Mhd/Mixtral-8x7B-Instruct-v0.1-Q4_K_M-GGUF/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf";
    int timeoutSeconds = 180;
    int retries = 2;

    // RSS input.
    List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
    int rssItemsPerFeed = 10;

    // Output directory: the OUT_DIR environment variable wins when it is set.
    String outDir = Objects.requireNonNullElse(System.getenv("OUT_DIR"), "/data/puzzle");

    // Requested sizes of the individual word groups.
    int bridgeN = 30000;
    int themeN = 800;
    int relatedN = 2200;

    // Minimum number of pool words per word length.
    int minLen2 = 1000;
    int minLen3 = 1000;
    int minLen4 = 1000;
    int minLen5 = 1000; // set if you also want to force 5-letter words, etc.
    int minLen6 = 1000;
    int minLen7 = 1000;
    // Length 8 is only enforced when the configured maximum word length allows it.
    int minLen8 = MAX_WORD_LENGTH >= 8 ? 1000 : 0;
}
/**
 * Command-line entry point: builds the word pool and writes it, together with auxiliary files,
 * to the output directory.
 * <p>
 * Steps, in order: parse the options, load the lexicon from PostgreSQL, fetch the RSS feeds and
 * write {@code rss.txt}, resolve the model id, write {@code theme.txt}, merge the theme, bridge and
 * short-word bitmaps into one pool, top the pool up to the per-length minima, then write
 * {@code report.txt} and {@code pool.txt}.
 *
 * @param args command-line options as accepted by {@code parseArgs}
 * @throws Exception if the database access, a curl invocation or a file write fails
 */
public static void main(String[] args) throws Exception {
var o = parseArgs(args);
var outDir = Path.of(o.outDir);
Files.createDirectories(outDir);
System.out.println("Loading lexicon...");
Lexicon lex;
// Load the JDBC driver class explicitly before opening the connection.
Class.forName("org.postgresql.Driver");
try (var c = DriverManager.getConnection(url, user, pass);) {
lex = loadLexicon(c);
}
System.out.println("Master words (2-" + MAX_WORD_LENGTH + ", A-Z): " + lex.words.size());
// RSS via curl (browser-like)
var all = new ArrayList<RssItem>();
for (var feed : o.feeds) {
var f = feed.trim();
if (f.isEmpty()) continue;
System.out.println("Fetching RSS: " + f);
all.addAll(fetchRssViaCurlBrowser(f, o.rssItemsPerFeed, o.timeoutSeconds));
}
// Flatten all items into a numbered plain-text digest: title line, then an optional description line.
var rssText = new StringBuilder();
var k = 0;
for (var it : all) {
k++;
rssText.append(k).append(". ").append(it.title).append("\n");
if (!it.desc.isBlank()) rssText.append(" ").append(it.desc).append("\n");
}
Files.writeString(outDir.resolve("rss.txt"), rssText.toString(), StandardCharsets.UTF_8);
// LM Studio via curl
var modelId = o.model;
// Only reached when o.model is null; Opts initialises it to a non-null default.
if (modelId == null) {
var modelsUrl = apiUrl(o.endpoint, "/models");
System.out.println("Ollama GET: " + modelsUrl);
var modelsJson = curlGetJson(o, modelsUrl);
modelId = pickModelId(modelsJson);
if (modelId == null) {
throw new IOException("Could not auto-pick model id from /v1/models. Use --model <id>.\n--- /models ---\n" + modelsJson);
}
}
System.out.println("Using model: " + modelId);
System.out.println("Generating theme words via LM Studio...");
// NOTE(review): the llmThemeWords call is commented out, so llmWords is always empty here
// and theme.txt is written without any words.
var llmWords = List.<String>of();//llmThemeWords(o, modelId, rssText.toString());
// Keep only normalized words that exist in the lexicon; LinkedHashSet drops duplicates and keeps order.
var themeKept = new LinkedHashSet<String>();
for (var wRaw : llmWords) {
var w = normalizeDutchToken(wRaw);
if (w == null) continue;
if (lex.idOf.containsKey(w)) themeKept.add(w);
}
Files.write(outDir.resolve("theme.txt"), themeKept, StandardCharsets.UTF_8);
// BitSets
var themeBs = bitmapFromWords(lex, themeKept);
var bridgeBs = buildBridgeBitmap(lex, o.bridgeN);
var shortBs = bitmapFromWords(lex, DEFAULT_SHORTS);
// The pool is the union of the three bitmaps (bit index = lexicon word id).
var pool = new BitSet(lex.words.size());
pool.or(themeBs);
pool.or(bridgeBs);
pool.or(shortBs);
// ---- NEW: enforce minimum counts per length ----
enforceMinima(o, lex, pool);
// Report
var themeCounts = countsPerLen(lex, themeBs);
var poolCounts = countsPerLen(lex, pool);
// The "Enforced minima" section lists the requested minima, not the counts actually reached.
var report = """
Date: %s
Feeds: %s
Model: %s
Master size: %d
Theme kept (in master): %d
Bridge size: %d
Shorts kept: %d
Pool total: %d
Enforced minima:
2: %d
3: %d
4: %d
5: %d
6: %d
7: %d
8: %d
Counts per length (theme):
%s
Counts per length (pool):
%s
""".formatted(
LocalDate.now(),
String.join(", ", o.feeds),
modelId,
lex.words.size(),
themeBs.cardinality(),
bridgeBs.cardinality(),
shortBs.cardinality(),
pool.cardinality(),
o.minLen2, o.minLen3, o.minLen4, o.minLen5, o.minLen6, o.minLen7, o.minLen8,
mapToLines(themeCounts),
mapToLines(poolCounts)
);
Files.writeString(outDir.resolve("report.txt"), report, StandardCharsets.UTF_8);
System.out.println(report);
// Output pool list
var poolFile = outDir.resolve("pool.txt");
writeWordList(poolFile, lex, pool);
System.out.println("Wrote: " + poolFile.toAbsolutePath());
}
/**
 * Parses the command line into an {@code Opts} instance that starts from the defaults.
 * <p>
 * Every option except {@code -h}/{@code --help} takes exactly one value, given as the next
 * argument. {@code --help} prints the usage text and terminates the JVM with exit code 0.
 *
 * @param args raw command-line arguments
 * @return the populated options
 * @throws IllegalArgumentException if an option is unknown or its value is missing
 *         ({@link NumberFormatException} for numeric options)
 */
static Opts parseArgs(String[] args) {
    var o = new Opts();
    for (var i = 0; i < args.length; i++) {
        var a = args[i];
        var v = (i + 1 < args.length) ? args[i + 1] : null;
        switch (a) {
            case "--endpoint" -> o.endpoint = requireOptionValue(a, v);
            case "--feeds" -> o.feeds = Arrays.asList(requireOptionValue(a, v).split(","));
            case "--out" -> o.outDir = requireOptionValue(a, v);
            case "--bridge" -> o.bridgeN = intOptionValue(a, v);
            case "--theme" -> o.themeN = intOptionValue(a, v);
            case "--related" -> o.relatedN = intOptionValue(a, v);
            case "--items" -> o.rssItemsPerFeed = intOptionValue(a, v);
            // Deliberately unchecked: a trailing "--model" without a value leaves the model null,
            // which main() treats as "pick a model id from the server".
            case "--model" -> o.model = v;
            case "--timeout" -> o.timeoutSeconds = intOptionValue(a, v);
            case "--retries" -> o.retries = intOptionValue(a, v);
            // ---- NEW: minima per length ----
            case "--min2" -> o.minLen2 = intOptionValue(a, v);
            case "--min3" -> o.minLen3 = intOptionValue(a, v);
            case "--min4" -> o.minLen4 = intOptionValue(a, v);
            case "--min5" -> o.minLen5 = intOptionValue(a, v);
            case "--min6" -> o.minLen6 = intOptionValue(a, v);
            case "--min7" -> o.minLen7 = intOptionValue(a, v);
            case "--min8" -> o.minLen8 = intOptionValue(a, v);
            case "-h", "--help" -> {
                System.out.println("""
                        Usage:
                        java puzzle.ThemePoolBuilderLength [options]
                        Options:
                        --endpoint http://HOST:1234/v1 (LM Studio)
                        --feeds url1,url2
                        --out ./out
                        --bridge 5000
                        --theme 300
                        --related 1200
                        --items 20 (per feed)
                        --model <id> (recommended; skips /v1/models)
                        --timeout 60 (seconds)
                        --retries 4
                        # enforce minima per length in final pool
                        --min2 4000
                        --min3 7000
                        --min4 9000
                        --min5 0
                        --min6 0
                        --min7 0
                        --min8 0
                        """);
                System.exit(0);
            }
            default -> throw new IllegalArgumentException("Unknown arg: " + a);
        }
        // Every option that gets here consumed the following argument as its value.
        i++;
    }
    return o;
}

/** Returns {@code value}, or fails with a message naming the option when the value is absent. */
private static String requireOptionValue(String flag, String value) {
    if (value == null) throw new IllegalArgumentException("Missing value for option " + flag);
    return value;
}

/** Parses the integer value of a numeric option; an absent value fails with a message naming the option. */
private static int intOptionValue(String flag, String value) {
    if (value == null) throw new NumberFormatException("Missing integer value for option " + flag);
    return Integer.parseInt(value);
}
/**
 * Reports whether every character of {@code s} is an upper-case ASCII letter ({@code A}-{@code Z}).
 * An empty string yields {@code true}.
 */
static boolean isAZ(String s) {
    return s.chars().allMatch(c -> 'A' <= c && c <= 'Z');
}
/**
 * Normalizes a raw token to the upper-case A-Z form used as lexicon key.
 * <p>
 * The token is trimmed, decomposed (NFD) so that combining marks can be removed, upper-cased
 * with {@link Locale#ROOT}, and stripped of every character outside {@code A}-{@code Z}.
 *
 * @param raw token to normalize, may be {@code null}
 * @return the normalized token, or {@code null} when {@code raw} is {@code null} or blank, or when
 *         the result is shorter than 2 or longer than 8 letters
 */
static String normalizeDutchToken(String raw) {
    if (raw == null) return null;
    var s = raw.trim();
    if (s.isEmpty()) return null;
    s = Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{M}+", "");
    // After this step only A-Z can remain, so no separate character check is needed.
    s = s.toUpperCase(Locale.ROOT).replaceAll("[^A-Z]", "");
    if (s.length() < 2 || s.length() > 8) return null;
    return s;
}
/**
 * Removes markup from an HTML fragment and collapses whitespace.
 * <p>
 * Tags are replaced by a space, the entities {@code &lt;}, {@code &gt;} and {@code &amp;} are
 * decoded, and runs of whitespace become a single space.
 *
 * @param s HTML fragment, may be {@code null}
 * @return the plain text, or an empty string when {@code s} is {@code null}
 */
static String stripHtml(String s) {
    if (s == null) return "";
    var x = s.replaceAll("<[^>]+>", " ");
    // Decode "&amp;" last: decoding it first would turn "&amp;lt;" into "&lt;" and then into "<",
    // unescaping the text twice.
    x = x.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&");
    x = x.replaceAll("\\s+", " ").trim();
    return x;
}
/**
 * In-memory word list with its lookup structures, as built by {@code loadLexicon}.
 * A word's id is its index in {@code words}.
 *
 * @param words id -> word
 * @param idOf word -> id
 * @param score id -> ranking score; {@code loadLexicon} fills it with the crossability score
 *              plus a level bonus and a length bonus
 * @param byLen byLen[L] holds the ids of all words of length L, for L 0..8
 */
record Lexicon(List<String> words, Map<String, Integer> idOf, int[] score, BitSet[] byLen) { }
/**
 * Loads the lexicon from the PostgreSQL relation {@code export_real_words_with_hints}.
 * Selected columns: {@code woord}, {@code 10 - level_1_to_10}, {@code hint}; only rows whose
 * word has at most 7 characters are read, ordered by {@code level_1_to_10} ascending.
 *
 * Notes:
 * - Normalizes words via normalizeDutchToken(...); rows that normalize to null are skipped
 * - Dedupes on the normalized word (the first row wins)
 * - Uses {@code 10 - level_1_to_10} as the level score (fallback 5 when the value is NULL)
 * - Ignores hint for scoring (but you can store it elsewhere if needed)
 *
 * @param conn open database connection; it is not closed by this method
 * @return the loaded lexicon
 * @throws SQLException if the query fails or a row cannot be read
 */
static Lexicon loadLexicon(Connection conn) throws SQLException {
    var out = new ArrayList<String>(200_000);
    var idOf = new HashMap<String, Integer>(400_000);
    // levels.get(id) is the level score of word id; it grows in step with `out`.
    var levels = new ArrayList<Integer>(200_000);
    final var sql = """
            SELECT woord, 10-level_1_to_10, hint
            FROM export_real_words_with_hints
            where length(woord)<=7
            order by level_1_to_10 asc
            """;
    // SQLException propagates unchanged, as declared in the signature.
    try (var ps = conn.prepareStatement(sql);
         var rs = ps.executeQuery()) {
        while (rs.next()) {
            var w = normalizeDutchToken(rs.getString(1));
            // getInt works for any numeric column type; wasNull must be asked right after the read.
            var lvl = rs.getInt(2);
            if (rs.wasNull()) lvl = 5;
            // String hint = rs.getString(3); // available if you want it later
            if (w == null) continue;
            if (idOf.putIfAbsent(w, out.size()) != null) continue;
            out.add(w);
            levels.add(lvl);
        }
    }
    var n = out.size();
    var score = new int[n];
    var byLen = new BitSet[9];
    for (var L = 0; L <= 8; L++) byLen[L] = new BitSet(n);
    for (var i = 0; i < n; i++) {
        var w = out.get(i);
        var crossScore = HintScores.crossabilityScore(w);
        int lScore = levels.get(i);
        // Prioritize simple words (high lScore) and long words:
        // the level score is weighted by 100, the word length by 40.
        score[i] = crossScore + (lScore * 100) + (w.length() * 40);
        byLen[w.length()].set(i);
    }
    return new Lexicon(out, idOf, score, byLen);
}
// ---------------- RSS via curl (browser-like) ----------------
/**
 * One RSS entry as produced by {@code fetchRssViaCurlBrowser}: both values have been passed
 * through {@code stripHtml} and are never {@code null} there (missing elements become "").
 */
record RssItem(String title, String desc) { }
/**
 * Returns the text content of the first descendant element of {@code parent} named {@code tag},
 * or {@code null} when there is no such element.
 */
static String textOfFirst(Element parent, String tag) {
    var matches = parent.getElementsByTagName(tag);
    return matches.getLength() > 0 ? matches.item(0).getTextContent() : null;
}
/**
 * Downloads an RSS feed with the external {@code curl} program, sending browser-like headers,
 * and returns its first items.
 * <p>
 * The downloaded document comes from the network, so the XML parser is configured not to load
 * external entities or external DTDs.
 *
 * @param url feed URL
 * @param limit maximum number of items to return
 * @param timeoutSeconds value passed to curl's {@code --max-time}
 * @return up to {@code limit} items in document order, with HTML stripped from title and description
 * @throws IOException if curl exits with a non-zero status
 * @throws Exception if the process cannot be started or the output is not well-formed XML
 */
static List<RssItem> fetchRssViaCurlBrowser(String url, int limit, int timeoutSeconds) throws Exception {
    var cmd = new ArrayList<String>();
    cmd.add("curl");
    cmd.add("-fsSL");
    cmd.add("-L");
    cmd.add("--compressed");
    cmd.add("--connect-timeout");
    cmd.add("10");
    cmd.add("--max-time");
    cmd.add(String.valueOf(timeoutSeconds));
    cmd.add("--retry");
    cmd.add("5");
    cmd.add("--retry-all-errors");
    cmd.add("--retry-delay");
    cmd.add("1");
    cmd.add("-H");
    cmd.add("User-Agent: " + BROWSER_UA);
    cmd.add("-H");
    cmd.add("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    cmd.add("-H");
    cmd.add("Accept-Language: nl-NL,nl;q=0.9,en;q=0.7");
    cmd.add("-H");
    cmd.add("Cache-Control: no-cache");
    cmd.add("-H");
    cmd.add("Pragma: no-cache");
    cmd.add("-H");
    cmd.add("Sec-Fetch-Dest: document");
    cmd.add("-H");
    cmd.add("Sec-Fetch-Mode: navigate");
    cmd.add("-H");
    cmd.add("Sec-Fetch-Site: none");
    cmd.add("-H");
    cmd.add("Sec-Fetch-User: ?1");
    cmd.add(url);
    // stderr is merged into stdout so that curl's error text ends up in the exception message.
    var p = new ProcessBuilder(cmd)
            .redirectErrorStream(true)
            .start();
    var bytes = p.getInputStream().readAllBytes();
    var code = p.waitFor();
    if (code != 0) {
        throw new IOException("curl RSS failed (" + code + ") url=" + url + " output=" +
                new String(bytes, StandardCharsets.UTF_8));
    }
    try (InputStream is = new ByteArrayInputStream(bytes)) {
        var dbf = DocumentBuilderFactory.newInstance();
        // XXE hardening: never resolve external entities or fetch external DTDs for remote content.
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbf.setXIncludeAware(false);
        dbf.setExpandEntityReferences(false);
        var doc = dbf.newDocumentBuilder().parse(is);
        var items = doc.getElementsByTagName("item");
        var out = new ArrayList<RssItem>();
        for (var i = 0; i < items.getLength() && out.size() < limit; i++) {
            var item = (Element) items.item(i);
            var title = textOfFirst(item, "title");
            var desc = textOfFirst(item, "description");
            if (title == null) title = "";
            if (desc == null) desc = "";
            out.add(new RssItem(stripHtml(title), stripHtml(desc)));
        }
        return out;
    }
}
// ---------------- LM Studio (OpenAI-compatible) ----------------
/**
 * Builds the URL of an OpenAI-compatible API call from an endpoint and a path.
 * <p>
 * The endpoint may be given with or without a trailing slash and with or without a trailing
 * {@code /v1}; the path may be given with or without a leading slash and with or without the
 * {@code /v1} prefix. The result always has the form {@code <base>/v1/<path>}.
 *
 * @param endpointArg endpoint as given on the command line
 * @param path API path such as {@code /models} or {@code /chat/completions}
 * @return the absolute URL
 */
static String apiUrl(String endpointArg, String path) {
    var base = endpointArg.trim();
    if (base.endsWith("/")) base = base.substring(0, base.length() - 1);
    if (base.endsWith("/v1")) base = base.substring(0, base.length() - 3);
    if (!path.startsWith("/")) path = "/" + path;
    // The base no longer carries "/v1", so the path has to supply it exactly once.
    if (!path.startsWith("/v1/")) path = "/v1" + path;
    return base + path;
}
/**
 * Sleeps before a retry, doubling the delay with every attempt and capping it at 3 seconds.
 * <p>
 * If the thread is interrupted the method returns early and restores the interrupt flag, so that
 * callers can still notice the interruption.
 *
 * @param attempt 1-based number of the attempt that just failed
 */
static void sleepBackoff(int attempt) {
    var ms = (long) (300L * Math.pow(2, attempt - 1)); // 300, 600, 1200, ...
    try {
        Thread.sleep(Math.min(ms, 3000));
    } catch (InterruptedException e) {
        // Do not swallow the interruption: hand it back to the caller through the flag.
        Thread.currentThread().interrupt();
    }
}
/**
 * Performs an HTTP GET with the external {@code curl} program and returns the response body.
 * <p>
 * The call is attempted {@code o.retries} times (at least once), sleeping via
 * {@code sleepBackoff} between attempts. An interruption is not retried.
 *
 * @param o options supplying {@code timeoutSeconds} and {@code retries}
 * @param url URL to fetch
 * @return the output of curl decoded as UTF-8
 * @throws InterruptedException if the thread is interrupted while waiting for curl
 * @throws Exception the failure of the last attempt when all attempts fail
 */
static String curlGetJson(Opts o, String url) throws Exception {
    // Guarantee one attempt so that a non-positive retry count cannot end in "throw null".
    final var attempts = Math.max(1, o.retries);
    Exception last = null;
    for (var attempt = 1; attempt <= attempts; attempt++) {
        try {
            var cmd = new ArrayList<String>();
            cmd.add("curl");
            cmd.add("-fsSL");
            cmd.add("--connect-timeout");
            cmd.add("10");
            cmd.add("--max-time");
            cmd.add(String.valueOf(o.timeoutSeconds));
            cmd.add("--retry");
            cmd.add("3");
            cmd.add("--retry-all-errors");
            cmd.add("--retry-delay");
            cmd.add("1");
            cmd.add("-H");
            cmd.add("Accept: application/json");
            cmd.add("-H");
            cmd.add("User-Agent: " + BROWSER_UA);
            cmd.add(url);
            var p = new ProcessBuilder(cmd)
                    .redirectErrorStream(true)
                    .start();
            var bytes = p.getInputStream().readAllBytes();
            var code = p.waitFor();
            if (code != 0) {
                throw new IOException("curl GET failed (" + code + ") url=" + url + "\nOutput:\n" +
                        new String(bytes, StandardCharsets.UTF_8));
            }
            return new String(bytes, StandardCharsets.UTF_8);
        } catch (InterruptedException e) {
            // Stop retrying and keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            throw e;
        } catch (Exception e) {
            last = e;
            if (attempt < attempts) sleepBackoff(attempt);
        }
    }
    throw last;
}
/**
 * Performs an HTTP POST of a JSON document with the external {@code curl} program and returns
 * the response body.
 * <p>
 * The request body is written to a temporary file that is deleted after every attempt. The call
 * is attempted {@code o.retries} times (at least once), sleeping via {@code sleepBackoff}
 * between attempts. An interruption is not retried.
 *
 * @param o options supplying {@code timeoutSeconds} and {@code retries}
 * @param url URL to post to
 * @param jsonBody request body
 * @return the output of curl decoded as UTF-8
 * @throws InterruptedException if the thread is interrupted while waiting for curl
 * @throws Exception the failure of the last attempt when all attempts fail
 */
static String curlPostJson(Opts o, String url, String jsonBody) throws Exception {
    // Guarantee one attempt so that a non-positive retry count cannot end in "throw null".
    final var attempts = Math.max(1, o.retries);
    Exception last = null;
    for (var attempt = 1; attempt <= attempts; attempt++) {
        try {
            System.out.println(" Attempt " + attempt + "/" + o.retries + " via curl...");
            var tempFile = Files.createTempFile("lm-request-", ".json");
            try {
                Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
                List<String> cmd = new ArrayList<>();
                cmd.add("curl");
                cmd.add("-fsSL");
                cmd.add("--connect-timeout");
                cmd.add("10");
                cmd.add("--max-time");
                cmd.add(String.valueOf(o.timeoutSeconds));
                cmd.add("--retry");
                cmd.add("3");
                cmd.add("--retry-all-errors");
                cmd.add("--retry-delay");
                cmd.add("1");
                cmd.add("-H");
                cmd.add("Content-Type: application/json");
                cmd.add("-H");
                cmd.add("Accept: application/json");
                cmd.add("-H");
                cmd.add("User-Agent: " + BROWSER_UA);
                cmd.add("-d");
                // "@file" makes curl read the request body from the temporary file.
                cmd.add("@" + tempFile.toString());
                cmd.add(url);
                var p = new ProcessBuilder(cmd)
                        .redirectErrorStream(true)
                        .start();
                var bytes = p.getInputStream().readAllBytes();
                var code = p.waitFor();
                if (code != 0) {
                    throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" +
                            new String(bytes, StandardCharsets.UTF_8));
                }
                return new String(bytes, StandardCharsets.UTF_8);
            } finally {
                Files.deleteIfExists(tempFile);
            }
        } catch (InterruptedException e) {
            // Stop retrying and keep the interrupt visible to the caller.
            Thread.currentThread().interrupt();
            throw e;
        } catch (Exception e) {
            System.err.println(" Error: " + e.getClass().getName() + ": " + e.getMessage());
            last = e;
            if (attempt < attempts) sleepBackoff(attempt);
        }
    }
    throw last;
}
/**
 * Extracts the first model id from a models listing by plain text search (no JSON parser):
 * the first {@code "id"} key after the {@code "data"} key, and the quoted string after its colon.
 *
 * @param modelsJson response body of the models endpoint, may be {@code null}
 * @return the id, or {@code null} when the expected keys or quotes are not found
 */
static String pickModelId(String modelsJson) {
    if (modelsJson == null) return null;
    var data = modelsJson.indexOf("\"data\"");
    if (data < 0) return null;
    var id = modelsJson.indexOf("\"id\"", data);
    if (id < 0) return null;
    // Without this check a missing colon (-1) would restart the quote search at index 0
    // and return an unrelated string.
    var colon = modelsJson.indexOf(':', id);
    if (colon < 0) return null;
    var q1 = modelsJson.indexOf('"', colon + 1);
    if (q1 < 0) return null;
    var q2 = modelsJson.indexOf('"', q1 + 1);
    if (q2 < 0) return null;
    return modelsJson.substring(q1 + 1, q2);
}
/**
 * Extracts the first {@code "content"} string of a chat-completion response by plain text
 * search (no JSON parser) and decodes its JSON escape sequences.
 * <p>
 * The search starts at the {@code "choices"} key when present, otherwise at the beginning.
 * Decoded escapes: {@code \n \t \r \b \f}, {@code \}{@code uXXXX}, and any other escaped
 * character (such as {@code \"}, {@code \\}, {@code \/}) as the character itself.
 *
 * @param json response body, may be {@code null}
 * @return the decoded content, or {@code null} when no {@code "content"} string is found
 */
static String extractChatContent(String json) {
    if (json == null) return null;
    var choices = json.indexOf("\"choices\"");
    var p = (choices >= 0) ? choices : 0;
    var i = json.indexOf("\"content\"", p);
    if (i < 0) return null;
    var colon = json.indexOf(':', i);
    if (colon < 0) return null;
    var q = json.indexOf('"', colon + 1);
    if (q < 0) return null;
    var sb = new StringBuilder();
    var esc = false;
    for (var k = q + 1; k < json.length(); k++) {
        var ch = json.charAt(k);
        if (esc) {
            switch (ch) {
                case 'n' -> sb.append('\n');
                case 't' -> sb.append('\t');
                case 'r' -> sb.append('\r');
                case 'b' -> sb.append('\b');
                case 'f' -> sb.append('\f');
                case 'u' -> {
                    // Four hex digits follow; a malformed sequence is kept as the literal 'u'.
                    var unit = (k + 4 < json.length()) ? hex4At(json, k + 1) : -1;
                    if (unit >= 0) {
                        sb.append((char) unit);
                        k += 4;
                    } else {
                        sb.append(ch);
                    }
                }
                default -> sb.append(ch);
            }
            esc = false;
        } else {
            if (ch == '\\') esc = true;
            else if (ch == '"') break;
            else sb.append(ch);
        }
    }
    return sb.toString();
}

/** Parses the four hex digits starting at {@code from}; returns -1 if any of them is not a hex digit. */
private static int hex4At(String text, int from) {
    var value = 0;
    for (var offset = 0; offset < 4; offset++) {
        var digit = Character.digit(text.charAt(from + offset), 16);
        if (digit < 0) return -1;
        value = (value << 4) | digit;
    }
    return value;
}
/**
 * Leniently extracts the strings of the array literal found between the first {@code [} and the
 * last {@code ]} of {@code s}.
 * <p>
 * When the bracketed text contains no double quote it is treated as a comma-separated list of
 * bare tokens. Otherwise every double-quoted string is collected; inside a string a backslash
 * makes the following character literal.
 *
 * @param s text containing the array, may be {@code null}
 * @return the extracted strings; empty when {@code s} is {@code null} or holds no bracket pair
 */
static List<String> parseStringArray(String s) {
    if (s == null) return List.of();
    final int open = s.indexOf('[');
    final int close = s.lastIndexOf(']');
    if (open < 0 || close <= open) return List.of();
    final String body = s.substring(open + 1, close);
    var items = new ArrayList<String>();

    // Unquoted variant: split on commas and keep the non-empty trimmed pieces.
    if (body.indexOf('"') == -1) {
        for (var piece : body.split(",")) {
            var bare = piece.trim();
            if (bare.isEmpty()) continue;
            items.add(bare);
        }
        if (items.size() > 0) return items;
    }

    // Quoted variant: `token` is non-null exactly while the scan is inside a string.
    StringBuilder token = null;
    boolean escaped = false;
    for (char ch : body.toCharArray()) {
        if (token == null) {
            if (ch == '"') {
                token = new StringBuilder();
                escaped = false;
            }
            continue;
        }
        if (escaped) {
            token.append(ch);
            escaped = false;
        } else if (ch == '\\') {
            escaped = true;
        } else if (ch == '"') {
            items.add(token.toString());
            token = null;
        } else {
            token.append(ch);
        }
    }
    return items;
}
/**
 * Renders a string as a JSON string literal, including the surrounding double quotes.
 * <p>
 * Backslash and double quote are escaped, and every control character below U+0020 is written
 * as an escape sequence, as JSON requires.
 *
 * @param s text to quote, may be {@code null}
 * @return the JSON literal, or the JSON keyword {@code null} when {@code s} is {@code null}
 */
static String jsonQuote(String s) {
    if (s == null) return "null";
    var sb = new StringBuilder(s.length() + 2);
    sb.append('"');
    for (var i = 0; i < s.length(); i++) {
        var ch = s.charAt(i);
        switch (ch) {
            case '\\', '"' -> sb.append('\\').append(ch);
            case '\n' -> sb.append("\\n");
            case '\r' -> sb.append("\\r");
            case '\t' -> sb.append("\\t");
            case '\b' -> sb.append("\\b");
            case '\f' -> sb.append("\\f");
            default -> {
                // Remaining control characters have no short form and must be \\u-escaped.
                if (ch < 0x20) sb.append(String.format("\\u%04x", (int) ch));
                else sb.append(ch);
            }
        }
    }
    sb.append('"');
    return sb.toString();
}
/**
 * Asks the chat-completions endpoint for theme words based on the RSS digest.
 * <p>
 * Builds a Dutch prompt containing the requested word counts and at most the first 8000
 * characters of {@code rssText}, posts it with a fixed system message via {@code curlPostJson},
 * and parses the returned message content as an array of strings.
 *
 * @param o options supplying the endpoint, the word counts and the curl settings
 * @param modelId model identifier placed in the request
 * @param rssText plain-text news digest
 * @return the strings found in the model's answer (not yet normalized or filtered)
 * @throws IOException if no message content can be extracted from the response
 * @throws Exception if the POST fails
 */
static List<String> llmThemeWords(Opts o, String modelId, String rssText) throws Exception {
// Prompt text is runtime data sent to the model; placeholders: theme count, related count, total, digest.
var prompt = """
Je genereert woorden voor een Nederlandse kruiswoordpuzzel.
Regels:
- Output MOET exact één JSON array zijn: ["WOORD", ...]
- Alleen A-Z, 2-8 letters woorden
- Geen spaties, streepjes, cijfers, accenten, apostrofs, punten
- Geen duplicaten
- Focus op zelfstandige naamwoorden/termen uit het nieuws en relevante Zweedse kruiswoordpuzzel koppelwoorden in het thema.
- Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d).
- Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk.
Nieuws (koppen/samenvattingen):
%s
""".formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText.substring(0, Math.min(rssText.length(), 8000)));
// Request body: model id and prompt are inserted as JSON string literals produced by jsonQuote.
var body = """
{
"model": %s,
"messages": [
{"role":"system","content":"Je bent een strikte JSON generator. Antwoord ALLEEN met een JSON array van strings."},
{"role":"user","content": %s}
],
"temperature": 0.35,
"max_tokens": 20000
}
""".formatted(jsonQuote(modelId), jsonQuote(prompt));
var url = apiUrl(o.endpoint, "/chat/completions");
System.out.println("LM Studio POST: " + url);
// NOTE: body.length() counts UTF-16 chars, although the message says "bytes".
System.out.println("Request body length: " + body.length() + " bytes");
var resp = curlPostJson(o, url, body);
var content = extractChatContent(resp);
if (content == null) {
throw new IOException("Could not extract chat content from LM Studio response.\n--- response ---\n" + resp);
}
return parseStringArray(content);
}
// ---------------- Pool building ----------------
/**
 * Selects the {@code bridgeN} highest-scoring words of the lexicon.
 * Words with equal score are taken in ascending id order.
 *
 * @param lex lexicon supplying the words and their scores
 * @param bridgeN number of words wanted; values above the lexicon size select every word
 * @return bitmap with one bit set per selected word id
 */
static BitSet buildBridgeBitmap(Lexicon lex, int bridgeN) {
    final int total = lex.words.size();
    // No pre-filtering of low scores: after the descending sort they sit at the end anyway.
    Integer[] ranked = new Integer[total];
    Arrays.setAll(ranked, id -> id);
    // Stable sort, highest score first.
    Arrays.sort(ranked, Comparator.comparingInt((Integer id) -> lex.score[id]).reversed());
    var bridge = new BitSet(total);
    final int limit = Math.min(bridgeN, total);
    for (int rank = 0; rank < limit; rank++) {
        bridge.set(ranked[rank]);
    }
    return bridge;
}
/**
 * Maps words to lexicon ids and returns them as a bitmap.
 * Each word is normalized first; words that cannot be normalized or are not in the lexicon are ignored.
 *
 * @param lex lexicon supplying the word-to-id map
 * @param words raw words
 * @return bitmap with one bit set per word id found
 */
static BitSet bitmapFromWords(Lexicon lex, Collection<String> words) {
    var bits = new BitSet(lex.words.size());
    words.stream()
            .map(raw -> normalizeDutchToken(raw))
            .filter(Objects::nonNull)
            .map(token -> lex.idOf.get(token))
            .filter(Objects::nonNull)
            .forEach(id -> bits.set(id));
    return bits;
}
/**
 * Counts, for every word length from 2 to 8, how many ids set in {@code bs} belong to words of
 * that length.
 *
 * @param lex lexicon supplying the per-length bitmaps
 * @param bs ids to count; left unmodified
 * @return map from word length (2..8) to count
 */
static Map<Integer, Integer> countsPerLen(Lexicon lex, BitSet bs) {
    Map<Integer, Integer> perLength = new HashMap<>();
    int len = 2;
    while (len <= 8) {
        // Intersect on a copy so that neither input bitmap is changed.
        var overlap = (BitSet) lex.byLen[len].clone();
        overlap.and(bs);
        perLength.put(len, overlap.cardinality());
        len++;
    }
    return perLength;
}
/**
 * Writes the selected words to a UTF-8 file, one per line, highest score first.
 * Words scoring below {@code MIN_SIMPLICITY} are left out. An existing file is replaced.
 *
 * @param path file to write
 * @param lex lexicon supplying the words and their scores
 * @param bs ids of the words to write
 * @throws IOException if the file cannot be written
 */
static void writeWordList(Path path, Lexicon lex, BitSet bs) throws IOException {
    // bs.stream() yields the ids in ascending order; the stable sort keeps that order among equal scores.
    var lines = bs.stream().boxed()
            .sorted((x, y) -> Integer.compare(lex.score[y], lex.score[x]))
            .filter(id -> lex.score[id] >= MIN_SIMPLICITY)
            .map(id -> lex.words.get(id))
            .toList();
    Files.write(path, lines, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
}
/**
 * Formats per-length counts as one line per word length from 2 to 8, in the form
 * {@code " <length>: <count>"}; lengths missing from the map are reported as 0.
 *
 * @param m map from word length to count
 * @return the seven lines, each terminated by a newline
 */
static String mapToLines(Map<Integer, Integer> m) {
    var text = new StringBuilder();
    int len = 2;
    while (len < 9) {
        text.append(" " + len + ": ");
        text.append(m.getOrDefault(len, 0));
        text.append('\n');
        len++;
    }
    return text.toString();
}
// ---------------- NEW: enforce minima per length ----------------
/**
 * Counts the ids set in {@code bs} that belong to words of length {@code L}.
 * Neither bitmap is modified.
 */
static int countLen(Lexicon lex, BitSet bs, int L) {
    var ofLength = (BitSet) lex.byLen[L].clone();
    ofLength.and(bs);
    return ofLength.cardinality();
}
/**
 * Tops the pool up so that it holds at least {@code minWanted} words of length {@code L},
 * as far as the lexicon has that many.
 * <p>
 * Missing words are taken from the lexicon words of that length that are not yet in the pool,
 * highest score first.
 *
 * @param lex lexicon supplying the per-length bitmaps and scores
 * @param pool pool bitmap, modified in place
 * @param L word length to top up
 * @param minWanted wanted minimum; values {@code <= 0} leave the pool untouched
 */
static void ensureMinLen(Lexicon lex, BitSet pool, int L, int minWanted) {
    if (minWanted <= 0) return;
    final int shortfall = minWanted - countLen(lex, pool, L);
    if (shortfall <= 0) return;
    // Candidates: ids of exactly length L that are not already in the pool.
    var missing = (BitSet) lex.byLen[L].clone();
    missing.andNot(pool);
    missing.stream().boxed()
            .sorted((x, y) -> Integer.compare(lex.score[y], lex.score[x]))
            .limit(shortfall)
            .forEach(id -> pool.set(id));
}
/**
 * Applies the configured per-length minima ({@code minLen2} .. {@code minLen8}) to the pool,
 * from the shortest length to the longest.
 */
static void enforceMinima(Opts o, Lexicon lex, BitSet pool) {
    // minima[k] is the minimum for word length k + 2.
    final int[] minima = { o.minLen2, o.minLen3, o.minLen4, o.minLen5, o.minLen6, o.minLen7, o.minLen8 };
    for (int k = 0; k < minima.length; k++) {
        ensureMinLen(lex, pool, k + 2, minima[k]);
    }
}
}