update them

This commit is contained in:
mike
2025-12-21 19:42:20 +01:00
parent e2bad52d1f
commit 22133e86f0
11 changed files with 17859 additions and 7169 deletions

View File

@@ -133,14 +133,40 @@ public class SwedishGenerator {
final String word;
final int difficulty;
final int score;
public WordDifficulty(String word) {
public WordDifficulty(String word, int score) {
this.word = word;
this.score = score;
// Simple heuristic for difficulty: shorter words have lower difficulty
this.difficulty = -Math.min(40,word.length() * 5);
// We combine this with the score (10 = common/simple, 1 = rare/hard)
// Lower difficulty value means it is tried EARLIER.
// We want LONGER and SIMPLER words to be tried earlier.
// Increasing simplicity weight: score (1-10) now has max impact of 50.
this.difficulty = -Math.min(40, word.length() * 5) - (score * 5);
}
}
/**
 * Loads per-word scores from {@code word_scores.csv} in the current working directory.
 *
 * <p>The first line is treated as a header and skipped. Each remaining line is split on
 * commas into at most three fields: field 0 is the word (trimmed and upper-cased with
 * {@link Locale#ROOT}), field 1 is the integer score. Lines with fewer than two fields,
 * or whose score is not a valid integer, are skipped.
 *
 * <p>Loading is best-effort: if the file is missing or cannot be read, a warning is
 * written to {@code System.err} and an empty map is returned.
 *
 * @return a mutable map from upper-cased word to score; never {@code null}
 */
static Map<String, Integer> loadScores() {
    var scores = new HashMap<String, Integer>();
    try {
        var lines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
        // Start at 1: line 0 is the CSV header.
        for (var i = 1; i < lines.size(); i++) {
            // Limit 3 keeps any commas in a trailing third column from producing extra fields.
            var parts = lines.get(i).split(",", 3);
            if (parts.length < 2) {
                continue;
            }
            try {
                var word = parts[0].trim().toUpperCase(Locale.ROOT);
                scores.put(word, Integer.parseInt(parts[1].trim()));
            } catch (NumberFormatException ignored) {
                // Deliberately skipped: a row with a non-numeric score is simply not loaded.
            }
        }
    } catch (java.nio.file.NoSuchFileException e) {
        System.err.println("Warning: word_scores.csv not found, using default scores.");
    } catch (IOException e) {
        // Any other I/O failure (unreadable file, malformed UTF-8, ...) is not a
        // "not found" condition; report the actual cause instead.
        System.err.println("Warning: could not read word_scores.csv (" + e + "), using default scores.");
    }
    return scores;
}
static final class Dict {
final ArrayList<String> words;
@@ -160,11 +186,13 @@ public class SwedishGenerator {
raw = "EU\nUUR\nAUTO\nBOOM\nHUIS\nKAT\nZEE\nRODE\nDRAAD\nKENNIS\nNETWERK\nPAKTE\n";
}
var words = new ArrayList<WordDifficulty>();
var llmScores = loadScores();
var words = new ArrayList<WordDifficulty>();
for (var line : raw.split("\\R")) {
var s = line.trim().toUpperCase(Locale.ROOT);
if (s.matches("^[A-Z]{2,8}$")) {
words.add(new WordDifficulty(s));
var score = llmScores.getOrDefault(s, 5); // Default to middle
words.add(new WordDifficulty(s, score));
}
}
@@ -254,9 +282,10 @@ public class SwedishGenerator {
ci.count = curLen;
return ci;
}
static int indexToDifficulty(DictEntry entry, int index) {
static int indexToDifficulty(DictEntry entry, int index, Map<String, Integer> llmScores) {
var word = entry.words.get(index);
return new WordDifficulty(word).difficulty;
var score = llmScores.getOrDefault(word, 5);
return new WordDifficulty(word, score).difficulty;
}
@@ -729,7 +758,7 @@ public class SwedishGenerator {
return p;
};
final var MAX_TRIES_PER_SLOT = 500;
final var MAX_TRIES_PER_SLOT = 2000;
class Solver {
@@ -783,10 +812,9 @@ public class SwedishGenerator {
// When picking words from sorted indices, we want to favor the beginning
// (lower difficulty) but still have some randomness.
for (var t = 0; t < tries; t++) {
// Power law or similar to favor lower indices:
// pick a random double in [0, 1), square it to bias towards 0.
// Bias strongly towards lower indices (simpler words) using r^3
double r = rng.nextFloat();
int idxInArray = (int) (r * r * L);
int idxInArray = (int) (r * r * r * L);
var idx = idxs[idxInArray];
var w = entry.words.get(idx);
if (tryWord.apply(w)) return true;
@@ -804,7 +832,7 @@ public class SwedishGenerator {
var tries = Math.min(MAX_TRIES_PER_SLOT, N);
for (var t = 0; t < tries; t++) {
double r = rng.nextFloat();
int idxInArray = (int) (r * r * N);
int idxInArray = (int) (r * r * r * N);
var w = entry.words.get(idxInArray);
if (tryWord.apply(w)) return true;
}
@@ -863,7 +891,7 @@ public class SwedishGenerator {
System.out.printf(Locale.ROOT, "MASK: %.3fs%n", (tMask1 - tMask0) / 1e9);
var tFill0 = System.nanoTime();
var filled = fillMask(rng, mask, dict.index, 200, 30000);
var filled = fillMask(rng, mask, dict.index, 200, 60000);
var tFill1 = System.nanoTime();
System.out.printf(Locale.ROOT, "FILL: %.3fms%n", (tFill1 - tFill0) / 1e6);

View File

@@ -245,6 +245,24 @@ public class ThemePoolBuilderLength {
out.add(w);
}
// Load LLM scores
var llmScores = new HashMap<String, Integer>();
try {
var scoreLines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
var first = true;
for (var line : scoreLines) {
if (first) { first = false; continue; }
var parts = line.split(",", 3);
if (parts.length >= 2) {
try {
llmScores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
} catch (NumberFormatException ignored) {}
}
}
} catch (IOException e) {
System.err.println("Warning: word_scores.csv not found, using default scores.");
}
var n = out.size();
var score = new int[n];
var byLen = new BitSet[9];
@@ -252,7 +270,11 @@ public class ThemePoolBuilderLength {
for (var i = 0; i < n; i++) {
var w = out.get(i);
score[i] = crossabilityScore(w);
var crossScore = crossabilityScore(w);
var lScore = llmScores.getOrDefault(w, 5);
// Prioritize simple words (high lScore) and high crossability
// Increased simplicity weight: lScore (1-10) now adds up to 200 points.
score[i] = crossScore + (lScore * 20);
byLen[w.length()].set(i);
}