update them

This commit is contained in:
mike
2025-12-22 01:03:18 +01:00
parent 83d1a907c1
commit f9f0f31d12
56 changed files with 37953 additions and 37322 deletions

View File

@@ -49,8 +49,9 @@ public class DailyGenerator {
// Load word list
SwedishGenerator.Dict dict;
var llmScores = SwedishGenerator.loadScores();
try {
dict = SwedishGenerator.loadWords(wordsPath);
dict = SwedishGenerator.loadWords(wordsPath, llmScores);
System.out.println("Loaded " + dict.words.size() + " words");
} catch (Exception e) {
System.err.println("Failed to load words: " + e.getMessage());
@@ -102,8 +103,9 @@ public class DailyGenerator {
opts.gens = 100;
opts.tries = 50;
opts.wordsPath = wordsPath;
opts.minSimplicity = 0; // default
var result = generateWithFilteredDict(opts, themedDict);
var result = generateWithFilteredDict(opts, themedDict, llmScores);
if (result == null) {
System.out.println("Failed to generate puzzle " + i);
@@ -172,12 +174,12 @@ public class DailyGenerator {
return new SwedishGenerator.Dict(new ArrayList<>(allowed), newIndex, newLenCounts);
}
private static SwedishGenerator.PuzzleResult generateWithFilteredDict(Main.Opts opts, SwedishGenerator.Dict dict) {
private static SwedishGenerator.PuzzleResult generateWithFilteredDict(Main.Opts opts, SwedishGenerator.Dict dict, Map<String, Integer> llmScores) {
var rng = new SwedishGenerator.Rng(opts.seed);
for (var attempt = 1; attempt <= opts.tries; attempt++) {
var mask = SwedishGenerator.generateMask(rng, dict.lenCounts, opts.pop, opts.gens);
var filled = SwedishGenerator.fillMask(rng, mask, dict.index, 200, 30000);
var filled = SwedishGenerator.fillMask(rng, mask, dict.index, llmScores, 200, 30000);
if (filled.ok) {
return new SwedishGenerator.PuzzleResult(mask, filled);

View File

@@ -5,6 +5,7 @@ import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.util.Date;
import java.util.Locale;
public class Main {
// ---------------- CLI ----------------
@@ -13,21 +14,23 @@ public class Main {
public int seed = 1;
public int pop = 18;
public int gens = 100;
public int tries = 50;
public int tries = 5;
public String wordsPath = "./out/pool.txt";
public double minSimplicity = 0; // 0 means no limit
}
static void usage() {
System.out.println("""
Usage:
java SwedishGenerator [--seed N] [--pop N] [--gens N] [--tries N] [--words word-list.txt]
java SwedishGenerator [--seed N] [--pop N] [--gens N] [--tries N] [--words word-list.txt] [--min-simplicity N.N]
Defaults:
--seed 1
--pop 18
--gens 100
--tries 50
--words ./word-list.txt
--words ./out/pool.txt
--min-simplicity 0 (no limit)
""");
}
@@ -45,6 +48,7 @@ public class Main {
else if (a.equals("--gens")) { out.gens = Integer.parseInt(v); i++; }
else if (a.equals("--tries")) { out.tries = Integer.parseInt(v); i++; }
else if (a.equals("--words")) { out.wordsPath = v; i++; }
else if (a.equals("--min-simplicity")) { out.minSimplicity = Double.parseDouble(v); i++; }
else throw new IllegalArgumentException("Unknown arg: " + a);
}
return out;
@@ -66,11 +70,13 @@ public class Main {
System.out.println("\n=== FILLED PUZZLE (HUMAN) ===");
System.out.println(SwedishGenerator.renderHuman(res.filled().grid));
System.out.printf(Locale.ROOT, "Puzzle Simplicity: %.2f%n", res.filled().simplicity);
var out = ExportFormat.exportFormatFromFilled(res, 1, new ExportFormat.Rewards(50, 2, 1));
System.out.println("gridv2:");
for (String row : out.gridv2()) System.out.println(row);
System.out.println("words: " + out.words().size());
for (var w : out.words()) {
var simplicityOfWord =
System.out.printf("%s %s start=(%d,%d) arrow=(%d,%d)%n",
w.word(), w.direction(), w.startRow(), w.startCol(), w.arrowRow(), w.arrowCol());
}

View File

@@ -14,6 +14,7 @@ import java.util.stream.IntStream;
* javac SwedishGenerator.java
* java SwedishGenerator [--seed N] [--pop N] [--gens N] [--tries N] [--words word-list.txt]
*/
@SuppressWarnings("ALL")
public class SwedishGenerator {
static final int W = 9, H = 8;
@@ -136,7 +137,7 @@ public class SwedishGenerator {
final int score;
public WordDifficulty(String word, int score) {
this.word = word;
this.word = word;
this.score = score;
// We want LONGER and SIMPLER words to be tried earlier (lower difficulty value).
// word.length() is 2 to 8.
@@ -144,7 +145,8 @@ public class SwedishGenerator {
// Difficulty starts at 0 and grows as words get shorter or score lower.
// Length impact: up to (8 - 2) * 30 = 180
// Score impact: up to 10 * 15 = 150
this.difficulty = 250 - (word.length() * 10) - (score * 15);
var difficulty1 = 0 + ((8 - word.length()) * 30) + ((10-score) * 15);
this.difficulty = difficulty1;
}
}
@@ -154,17 +156,26 @@ public class SwedishGenerator {
var lines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
var first = true;
for (var line : lines) {
if (first) { first = false; continue; }
var parts = line.split(",", 3);
if (first) {
first = false;
continue;
}
var parts = line.split("," );
if (parts.length >= 3) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scores.put(word, score);
} else {
System.err.println("Skipping:" +Arrays.toString( parts));
}
} catch (NumberFormatException ignored) {}
} catch (NumberFormatException ignored) {
System.err.println("Illegal number format: " + line);
}
} else {
System.err.println("Illegal word: " + line);
}
}
} catch (IOException e) {
@@ -184,7 +195,7 @@ public class SwedishGenerator {
this.lenCounts = lenCounts;
}
}
static Dict loadWords(String wordsPath) {
static Dict loadWords(String wordsPath, Map<String, Integer> llmScores) {
String raw;
try {
raw = Files.readString(Path.of(wordsPath), StandardCharsets.UTF_8);
@@ -192,8 +203,7 @@ public class SwedishGenerator {
raw = "EU\nUUR\nAUTO\nBOOM\nHUIS\nKAT\nZEE\nRODE\nDRAAD\nKENNIS\nNETWERK\nPAKTE\n";
}
var llmScores = loadScores();
var words = new ArrayList<WordDifficulty>();
var words = new ArrayList<WordDifficulty>();
for (var line : raw.split("\\R")) {
var s = line.trim().toUpperCase(Locale.ROOT);
if (s.matches("^[A-Z]{2,8}$")) {
@@ -289,11 +299,10 @@ public class SwedishGenerator {
return ci;
}
static int indexToDifficulty(DictEntry entry, int index, Map<String, Integer> llmScores) {
var word = entry.words.get(index);
var word = entry.words.get(index);
var score = llmScores.getOrDefault(word, 5);
return new WordDifficulty(word, score).difficulty;
}
// ---------------- Slots ----------------
@@ -617,6 +626,7 @@ public class SwedishGenerator {
public char[][] grid;
public HashMap<String, String> clueMap;
public FillStats stats;
public double simplicity;
}
record Undo(int[] rs, int[] cs, char[] prev, int n) {
@@ -667,6 +677,7 @@ public class SwedishGenerator {
}
static FillResult fillMask(Rng rng, char[][] mask, HashMap<Integer, DictEntry> dictIndex,
Map<String, Integer> llmScores,
int logEveryMs, int timeLimitMs) {
var grid = deepCopyGrid(mask);
@@ -819,10 +830,10 @@ public class SwedishGenerator {
// (lower difficulty) but still have some randomness.
for (var t = 0; t < tries; t++) {
// Bias strongly towards lower indices (simpler words) using r^3
double r = rng.nextFloat();
int idxInArray = (int) (r * r * r * L);
var idx = idxs[idxInArray];
var w = entry.words.get(idx);
double r = rng.nextFloat();
int idxInArray = (int) (r * r * r * L);
var idx = idxs[idxInArray];
var w = entry.words.get(idx);
if (tryWord.apply(w)) return true;
}
stats.backtracks++;
@@ -837,9 +848,9 @@ public class SwedishGenerator {
var tries = Math.min(MAX_TRIES_PER_SLOT, N);
for (var t = 0; t < tries; t++) {
double r = rng.nextFloat();
int idxInArray = (int) (r * r * r * N);
var w = entry.words.get(idxInArray);
double r = rng.nextFloat();
int idxInArray = (int) (r * r * r * N);
var w = entry.words.get(idxInArray);
if (tryWord.apply(w)) return true;
}
@@ -862,6 +873,14 @@ public class SwedishGenerator {
stats.seconds = (System.currentTimeMillis() - t0) / 1000.0;
res.stats = stats;
if (ok) {
double totalSimplicity = 0;
for (var w : assigned.values()) {
totalSimplicity += llmScores.getOrDefault(w, 5);
}
res.simplicity = assigned.isEmpty() ? 0 : totalSimplicity / assigned.size();
}
// print a final progress line
System.out.println(
String.format(Locale.ROOT,
@@ -882,11 +901,12 @@ public class SwedishGenerator {
public record PuzzleResult(char[][] mask, FillResult filled) { }
public static PuzzleResult generatePuzzle(Main.Opts opts) {
var rng = new Rng(opts.seed);
var tLoad0 = System.nanoTime();
var dict = loadWords(opts.wordsPath);
var tLoad1 = System.nanoTime();
System.out.printf(Locale.ROOT, "LOAD_WORDS: %.3fs%n", (tLoad1 - tLoad0) / 1e9);
var rng = new Rng(opts.seed);
var llmScores = loadScores();
var tLoad0 = System.nanoTime();
var dict = loadWords(opts.wordsPath, llmScores);
var tLoad1 = System.nanoTime();
System.out.printf(Locale.ROOT, "LOAD_WORDS: %.3fs%n %s words", (tLoad1 - tLoad0) / 1e9,dict.words.size());
for (var attempt = 1; attempt <= opts.tries; attempt++) {
System.out.println("\nAttempt " + attempt + "/" + opts.tries);
@@ -897,14 +917,19 @@ public class SwedishGenerator {
System.out.printf(Locale.ROOT, "MASK: %.3fs%n", (tMask1 - tMask0) / 1e9);
var tFill0 = System.nanoTime();
var filled = fillMask(rng, mask, dict.index, 200, 60000);
var filled = fillMask(rng, mask, dict.index, llmScores, 200, 60000);
var tFill1 = System.nanoTime();
System.out.printf(Locale.ROOT, "FILL: %.3fms%n", (tFill1 - tFill0) / 1e6);
System.out.printf(Locale.ROOT, "FILL: %.3fms | Simplicity: %.2f%n", (tFill1 - tFill0) / 1e6, filled.simplicity);
if (filled.ok) {
if (filled.ok && (opts.minSimplicity <= 0 || filled.simplicity >= opts.minSimplicity)) {
return new PuzzleResult(mask, filled);
}
if (filled.ok) {
System.out.printf(Locale.ROOT, "Puzzle simplicity %.2f is below min %.2f, retrying...%n",
filled.simplicity, opts.minSimplicity);
}
}
return null;
}
}

26
src/puzzle/TestSort.java Normal file
View File

@@ -0,0 +1,26 @@
package puzzle;
import puzzle.ThemePoolBuilderLength.Lexicon;
import java.nio.file.*;
import java.util.*;
/**
 * Stand-alone smoke test for {@code ThemePoolBuilderLength.writeWordList}.
 *
 * <p>Builds a three-word lexicon ("A", "B", "C") with the score array
 * {10, 30, 20}, writes all three words to a file and expects them back in
 * the order B, C, A (i.e. highest score first). Prints {@code SUCCESS} or
 * prints {@code FAILURE} and exits with status 1.
 */
public class TestSort {
    /**
     * Runs the check.
     *
     * @param args unused
     * @throws Exception if creating, writing or reading the scratch file fails
     */
    public static void main(String[] args) throws Exception {
        // NOTE(review): constructor argument order (words, id map, scores, per-length bitsets)
        // is taken from the original call; confirm against Lexicon's declaration.
        Lexicon lex = new Lexicon(
                Arrays.asList("A", "B", "C"),
                new HashMap<>(),
                new int[]{10, 30, 20},
                new BitSet[9]
        );

        // Select all three word ids (0, 1 and 2).
        BitSet bs = new BitSet();
        bs.set(0, 3);

        // Use a scratch file in the temp directory so the test does not leave
        // "test_pool.txt" behind in the working directory.
        Path p = Files.createTempFile("test_pool", ".txt");
        List<String> lines;
        try {
            ThemePoolBuilderLength.writeWordList(p, lex, bs);
            lines = Files.readAllLines(p);
        } finally {
            // Clean up before any System.exit below (exit would skip a later finally).
            Files.deleteIfExists(p);
        }

        System.out.println("Sorted words: " + lines);

        // Guard the size first: a short file must be reported as FAILURE,
        // not surface as an IndexOutOfBoundsException.
        boolean sorted = lines.size() >= 3
                && lines.get(0).equals("B")
                && lines.get(1).equals("C")
                && lines.get(2).equals("A");

        if (sorted) {
            System.out.println("SUCCESS");
        } else {
            System.out.println("FAILURE");
            System.exit(1);
        }
    }
}

View File

@@ -14,6 +14,7 @@ import java.text.Normalizer;
import java.time.LocalDate;
import java.util.*;
@SuppressWarnings("ALL")
public class ThemePoolBuilderLength {
private static final List<String> DEFAULT_FEEDS = List.of(
@@ -28,8 +29,8 @@ public class ThemePoolBuilderLength {
"ING", "KPN", "KVK", "RIVM", "GGD", "AIVD", "MIVD", "CEO", "CFO", "HR",
"NL", "BE", "BRU", "EUR", "EURO", "WET", "ART", "BTW", "DI", "MA",
"PVV", "VVD", "CDA", "FNV",
"EN","IN","OP","OM","TE","ER","DE","HET","EEN","VAN","MET","NOG","OOK","MAAR","WEL","NIET",
"HOE","ALS",
"EN", "IN", "OP", "OM", "TE", "ER", "DE", "HET", "EEN", "VAN", "MET", "NOG", "OOK", "MAAR", "WEL", "NIET",
"HOE", "ALS",
"ZO", "DO", "WO", "VR", "MO", "WA", "WE", "TAAL",
"LAND", "GEMEENTE", "STAAT", "BUREAU", "HUIS", "SCHOOL", "STR", "BAAN",
@@ -41,12 +42,13 @@ public class ThemePoolBuilderLength {
"WINKEL", "MARKT", "KIOSK", "AUTO", "MOBILE", "FIETS", "SCOOTER",
// afkortingen
"DHR","MEVR","DR","ST","CA","IVM","MBT","TAV","TOV","DWZ","MAW","OA","TM",
"ANWB","BRP","CBS",
"DHR", "MEVR", "DR", "ST", "CA", "IVM", "MBT", "TAV", "TOV", "DWZ", "MAW", "OA", "TM",
"ANWB", "BRP", "CBS",
"AL", "NU", "TO", "NA", "BIJ", "TOT", "DAN", "WAT", "DAT",
"IK", "JE", "WE", "WIJ", "JIJ", "ZIJ", "HIJ", "HEN", "ONS", "JOU",
// romeinse cijfers (2-8)
"II","III","IV","VI","VII","VIII","IX",
"XI","XII","XIII","XIV","XV","XVI","XVII","XVIII","XIX","XX"
"II", "III", "IV", "VI", "VII", "VIII", "IX",
"XI", "XII", "XIII", "XIV", "XV", "XVI", "XVII", "XVIII", "XIX", "XX"
);
private static final String BROWSER_UA =
@@ -59,7 +61,7 @@ public class ThemePoolBuilderLength {
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
String outDir = "./out";
int bridgeN = 32000;
int bridgeN = 40000;
int themeN = 800;
int relatedN = 2200;
int rssItemsPerFeed = 10;
@@ -74,9 +76,9 @@ public class ThemePoolBuilderLength {
int minLen3 = 1000;
int minLen4 = 1000;
int minLen5 = 1000; // set if you also want to force 5-letter words, etc.
int minLen6 = 1000;
int minLen7 = 1000;
int minLen8 = 1000;
int minLen6 = 2000;
int minLen7 = 2000;
int minLen8 = 2000;
}
static Opts parseArgs(String[] args) {
@@ -85,53 +87,107 @@ public class ThemePoolBuilderLength {
var a = args[i];
var v = (i + 1 < args.length) ? args[i + 1] : null;
switch (a) {
case "--words" -> { o.wordsPath = v; i++; }
case "--endpoint" -> { o.endpoint = v; i++; }
case "--feeds" -> { o.feeds = Arrays.asList(v.split(",")); i++; }
case "--out" -> { o.outDir = v; i++; }
case "--bridge" -> { o.bridgeN = Integer.parseInt(v); i++; }
case "--theme" -> { o.themeN = Integer.parseInt(v); i++; }
case "--related" -> { o.relatedN = Integer.parseInt(v); i++; }
case "--items" -> { o.rssItemsPerFeed = Integer.parseInt(v); i++; }
case "--model" -> { o.model = v; i++; }
case "--timeout" -> { o.timeoutSeconds = Integer.parseInt(v); i++; }
case "--retries" -> { o.retries = Integer.parseInt(v); i++; }
case "--words" -> {
o.wordsPath = v;
i++;
}
case "--endpoint" -> {
o.endpoint = v;
i++;
}
case "--feeds" -> {
o.feeds = Arrays.asList(v.split(","));
i++;
}
case "--out" -> {
o.outDir = v;
i++;
}
case "--bridge" -> {
o.bridgeN = Integer.parseInt(v);
i++;
}
case "--theme" -> {
o.themeN = Integer.parseInt(v);
i++;
}
case "--related" -> {
o.relatedN = Integer.parseInt(v);
i++;
}
case "--items" -> {
o.rssItemsPerFeed = Integer.parseInt(v);
i++;
}
case "--model" -> {
o.model = v;
i++;
}
case "--timeout" -> {
o.timeoutSeconds = Integer.parseInt(v);
i++;
}
case "--retries" -> {
o.retries = Integer.parseInt(v);
i++;
}
// ---- NEW: minima per length ----
case "--min2" -> { o.minLen2 = Integer.parseInt(v); i++; }
case "--min3" -> { o.minLen3 = Integer.parseInt(v); i++; }
case "--min4" -> { o.minLen4 = Integer.parseInt(v); i++; }
case "--min5" -> { o.minLen5 = Integer.parseInt(v); i++; }
case "--min6" -> { o.minLen6 = Integer.parseInt(v); i++; }
case "--min7" -> { o.minLen7 = Integer.parseInt(v); i++; }
case "--min8" -> { o.minLen8 = Integer.parseInt(v); i++; }
case "--min2" -> {
o.minLen2 = Integer.parseInt(v);
i++;
}
case "--min3" -> {
o.minLen3 = Integer.parseInt(v);
i++;
}
case "--min4" -> {
o.minLen4 = Integer.parseInt(v);
i++;
}
case "--min5" -> {
o.minLen5 = Integer.parseInt(v);
i++;
}
case "--min6" -> {
o.minLen6 = Integer.parseInt(v);
i++;
}
case "--min7" -> {
o.minLen7 = Integer.parseInt(v);
i++;
}
case "--min8" -> {
o.minLen8 = Integer.parseInt(v);
i++;
}
case "-h", "--help" -> {
System.out.println("""
Usage:
java puzzle.ThemePoolBuilder --words WORDS.txt [options]
Options:
--endpoint http://HOST:1234/v1 (LM Studio)
--feeds url1,url2
--out ./out
--bridge 5000
--theme 300
--related 1200
--items 20 (per feed)
--model <id> (recommended; skips /v1/models)
--timeout 60 (seconds)
--retries 4
# enforce minima per length in final pool
--min2 4000
--min3 7000
--min4 9000
--min5 0
--min6 0
--min7 0
--min8 0
""");
Usage:
java puzzle.ThemePoolBuilder --words WORDS.txt [options]
Options:
--endpoint http://HOST:1234/v1 (LM Studio)
--feeds url1,url2
--out ./out
--bridge 5000
--theme 300
--related 1200
--items 20 (per feed)
--model <id> (recommended; skips /v1/models)
--timeout 60 (seconds)
--retries 4
# enforce minima per length in final pool
--min2 4000
--min3 7000
--min4 9000
--min5 0
--min6 0
--min7 0
--min8 0
""");
System.exit(0);
}
default -> throw new IllegalArgumentException("Unknown arg: " + a);
@@ -223,64 +279,54 @@ public class ThemePoolBuilderLength {
out.add(w);
}
// 2) ensure DEFAULT_SHORTS are present even if absent in word-list.txt
/* // 2) ensure DEFAULT_SHORTS are present even if absent in word-list.txt
for (var raw : DEFAULT_SHORTS) {
var w = normalizeDutchToken(raw);
if (w == null) continue;
if (idOf.containsKey(w)) continue;
idOf.put(w, out.size());
out.add(w);
}
// 3) small extra injects (optional)
var extraShorts = List.of(
"AL","NU","TO","NA","BIJ","TOT","DAN","WAT","DAT",
"IK","JE","WE","WIJ","JIJ","ZIJ","HIJ","HEN","ONS","JOU"
);
for (var wRaw : extraShorts) {
var w = normalizeDutchToken(wRaw);
if (w == null) continue;
if (idOf.containsKey(w)) continue;
idOf.put(w, out.size());
out.add(w);
}
}*/
// Load LLM scores
var llmScores = new HashMap<String, Integer>();
try {
var scoreLines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
var first = true;
var first = true;
for (var line : scoreLines) {
if (first) { first = false; continue; }
if (first) {
first = false;
continue;
}
var parts = line.split(",", 3);
if (parts.length >= 3) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
}
} catch (NumberFormatException ignored) {}
} catch (NumberFormatException ignored) { }
}
}
} catch (IOException e) {
System.err.println("Warning: word_scores.csv not found, using default scores.");
}
var n = out.size();
var score = new int[n];
var byLen = new BitSet[9];
for (var L = 0; L <= 8; L++) byLen[L] = new BitSet(n);
for (var i = 0; i < n; i++) {
var w = out.get(i);
var w = out.get(i);
var crossScore = crossabilityScore(w);
var lScore = llmScores.getOrDefault(w, 5);
var lScore = llmScores.getOrDefault(w, 5);
// Prioritize simple words (high lScore) and long words.
// lScore (1-10) adds up to 300 points (weight 30).
// lScore (1-10) adds up to 1000 points (weight 100).
// Length (2-8) adds up to 240 points (weight 30).
score[i] = crossScore + (lScore * 30) + (w.length() * 20);
score[i] = crossScore + (lScore * 100) + (w.length() * 30);
byLen[w.length()].set(i);
}
@@ -290,6 +336,7 @@ public class ThemePoolBuilderLength {
// ---------------- RSS via curl (browser-like) ----------------
static final class RssItem {
final String title;
final String desc;
RssItem(String title, String desc) {
@@ -407,6 +454,7 @@ public class ThemePoolBuilderLength {
static SSLContext insecureSslContext() throws Exception {
var trustAll = new TrustManager[]{
new X509TrustManager() {
public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
public void checkClientTrusted(X509Certificate[] chain, String authType) { }
public void checkServerTrusted(X509Certificate[] chain, String authType) { }
@@ -579,6 +627,16 @@ public class ThemePoolBuilderLength {
var body = s.substring(a + 1, b);
var out = new ArrayList<String>();
// If it's a simple comma-separated list without quotes (or with mixed quotes),
// let's try a more robust approach.
if (!body.contains("\"")) {
for (var part : body.split(",")) {
var trimmed = part.trim();
if (!trimmed.isEmpty()) out.add(trimmed);
}
if (!out.isEmpty()) return out;
}
var cur = new StringBuilder();
boolean in = false, esc = false;
@@ -626,7 +684,7 @@ public class ThemePoolBuilderLength {
static List<String> llmThemeWords(Opts o, String modelId, String rssText) throws Exception {
var prompt = """
Je genereert woorden voor een Nederlandse kruiswoordpuzzel.
Regels:
- Output MOET exact één JSON array zijn: ["WOORD", ...]
- Alleen A-Z, 2-8 letters woorden
@@ -635,10 +693,10 @@ public class ThemePoolBuilderLength {
- Focus op zelfstandige naamwoorden/termen uit het nieuws en relevante Zweedse kruiswoordpuzzel koppelwoorden in het thema.
- Lever %d THEMA-woorden en daarna %d GERELATEERDE woorden (totaal %d).
- Voeg ook wat korte woorden/afkortingen toe (2-4 letters), maar houd het totaal gelijk.
Nieuws (koppen/samenvattingen):
%s
""".formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText.substring(0, 8000));
""".formatted(o.themeN, o.relatedN, (o.themeN + o.relatedN), rssText.substring(0, Math.min(rssText.length(), 8000)));
var body = """
{
@@ -668,14 +726,20 @@ public class ThemePoolBuilderLength {
static BitSet buildBridgeBitmap(Lexicon lex, int bridgeN) {
var n = lex.words.size();
var ids = new Integer[n];
for (var i = 0; i < n; i++) ids[i] = i;
var ids = new ArrayList<Integer>(n);
for (var i = 0; i < n; i++) {
// Optionally filter out VERY complex words from the bridge (e.g. lScore < 3)
// But since we sort by score (which is now dominated by lScore),
// they will be at the very bottom anyway.
if (lex.score[i] < 800) continue;
ids.add(i);
}
Arrays.sort(ids, (a, b) -> Integer.compare(lex.score[b], lex.score[a]));
ids.sort((a, b) -> Integer.compare(lex.score[b], lex.score[a]));
var bs = new BitSet(n);
var take = Math.min(bridgeN, n);
for (var i = 0; i < take; i++) bs.set(ids[i]);
var take = Math.min(bridgeN, ids.size());
for (var i = 0; i < take; i++) bs.set(ids.get(i));
return bs;
}
@@ -710,6 +774,8 @@ public class ThemePoolBuilderLength {
var out = new ArrayList<String>(ids.size());
for (var id : ids) {
/* if (lex.score[id] < 680)
continue;*/
out.add(lex.words.get(id));
}
Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
@@ -811,7 +877,7 @@ public class ThemePoolBuilderLength {
System.out.println("Using model: " + modelId);
System.out.println("Generating theme words via LM Studio...");
var llmWords = llmThemeWords(o, modelId, rssText.toString());
List<String> llmWords = Arrays.asList();//llmThemeWords(o, modelId, rssText.toString());
// Normalize + keep only those present in master lexicon
var themeKept = new LinkedHashSet<String>();
@@ -843,13 +909,13 @@ public class ThemePoolBuilderLength {
Date: %s
Feeds: %s
Model: %s
Master size: %d
Theme kept (in master): %d
Bridge size: %d
Shorts kept: %d
Pool total: %d
Enforced minima:
2: %d
3: %d
@@ -858,10 +924,10 @@ public class ThemePoolBuilderLength {
6: %d
7: %d
8: %d
Counts per length (theme):
%s
Counts per length (pool):
%s
""".formatted(