update them

This commit is contained in:
mike
2025-12-21 21:42:47 +01:00
parent ee7b1925f2
commit 2e34efbde3
8 changed files with 53532 additions and 15773 deletions

20956
out/pool.txt

File diff suppressed because it is too large Load Diff

View File

@@ -3,10 +3,10 @@ Feeds: https://feeds.nos.nl/nosnieuwsalgemeen, https://feeds.nos.nl/nosnieuwstec
Model: mistralai/mistral-nemo-instruct-2407 Model: mistralai/mistral-nemo-instruct-2407
Master size: 91892 Master size: 91892
Theme kept (in master): 0 Theme kept (in master): 35
Bridge size: 32000 Bridge size: 32000
Shorts kept: 133 Shorts kept: 133
Pool total: 38391 Pool total: 38785
Enforced minima: Enforced minima:
2: 4000 2: 4000
@@ -19,20 +19,20 @@ Enforced minima:
Counts per length (theme): Counts per length (theme):
2: 0 2: 0
3: 0 3: 2
4: 0 4: 3
5: 0 5: 8
6: 0 6: 7
7: 0 7: 5
8: 0 8: 10
Counts per length (pool): Counts per length (pool):
2: 248 2: 248
3: 1666 3: 1666
4: 4850 4: 4850
5: 795 5: 57
6: 1987 6: 136
7: 8550 7: 1183
8: 20295 8: 30645

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,35 @@
SCHOTEN
GETUIGEN
GEWONDEN
AUTORUIT
GAT
MELDING
RWANDA
CONGO
CONFLICT
STRIJD
MACHT
LAND
GRENS
BURUNDI
VLUCHT
INKOMEN
MINISTER
ZAKEN
DRUK
OPVOEREN
TROPEN
DAGEN
LANGER
ZON
HALFROND
DAGLICHT
BEWOLKT
ZONNIG
VERSCHIL
BELEVING
WATT
METER
GRIJS
KOUDER
TREND

View File

@@ -18,7 +18,7 @@ public class ConcurrentWordScorer {
// ===== CONFIGURATION ===== // ===== CONFIGURATION =====
private static final String INPUT_WORDLIST = "word-list.txt"; private static final String INPUT_WORDLIST = "word-list.txt";
private static final String OUTPUT_SCORES = "word_scores.csv"; private static final String OUTPUT_SCORES = "word_scores.csv";
private static final int BATCH_SIZE = 30; // Smaller for better distribution private static final int BATCH_SIZE = 10; // Even smaller for the difficult remaining words
private static final int MAX_RETRIES = 3; private static final int MAX_RETRIES = 3;
// Define all three endpoints // Define all three endpoints
@@ -72,7 +72,7 @@ public class ConcurrentWordScorer {
throw new IOException("[" + name + "] Empty response content"); throw new IOException("[" + name + "] Empty response content");
} }
return parseScoresFromReply(batch, content); return parseScoresFromReply(batch, content, name);
} }
} }
@@ -81,7 +81,7 @@ public class ConcurrentWordScorer {
OllamaEndpoint() { OllamaEndpoint() {
super("Ollama", "http://localhost:11434/api/chat", super("Ollama", "http://localhost:11434/api/chat",
"qwen2.5:14b", 2); // 2 concurrent requests "qwen2.5:14b", 1); // 1 concurrent request
} }
@Override String buildRequestJson(String prompt) { @Override String buildRequestJson(String prompt) {
@@ -147,6 +147,8 @@ public class ConcurrentWordScorer {
} }
System.out.println(); System.out.println();
cleanupOutputFile();
// Load work queue // Load work queue
var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST)); var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
var scoredWords = loadAlreadyScoredWords(); var scoredWords = loadAlreadyScoredWords();
@@ -342,12 +344,50 @@ public class ConcurrentWordScorer {
first = false; first = false;
continue; continue;
} }
var parts = line.split(",", 2); var parts = line.split(",");
if (parts.length >= 1) scored.add(parts[0].trim().toLowerCase()); if (parts.length >= 3) {
var word = parts[0].trim().toLowerCase();
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scored.add(word);
}
}
} }
return scored; return scored;
} }
/**
 * Compacts the score file named by {@code OUTPUT_SCORES}.
 *
 * <p>The first line is kept verbatim as the header. Of the remaining rows, only
 * those with at least three comma-separated columns whose third column equals
 * "OK" (case-insensitive) survive. When a word (first column, trimmed and
 * lower-cased) occurs more than once, the last OK row wins; it is written at
 * the position where the word first appeared, because re-putting a key into a
 * {@link LinkedHashMap} does not change its insertion order.
 *
 * <p>The result is written to a sibling temp file and then moved over the
 * original, so an interrupted run cannot leave a truncated score file behind.
 * Does nothing if the file does not exist or is empty.
 *
 * @throws IOException if the file cannot be read, written or replaced
 */
private static void cleanupOutputFile() throws IOException {
    var path = Paths.get(OUTPUT_SCORES);
    if (!Files.exists(path)) return;

    System.out.println("Cleaning up " + OUTPUT_SCORES + "...");
    var lines = Files.readAllLines(path);
    if (lines.isEmpty()) return;

    var header = lines.get(0);
    Map<String, String> latestOkEntries = new LinkedHashMap<>();

    for (var line : lines.subList(1, lines.size())) {
        var parts = line.split(",");
        if (parts.length < 3) continue;

        // Locale.ROOT keeps the dedupe key independent of the JVM's default locale.
        var word = parts[0].trim().toLowerCase(java.util.Locale.ROOT);
        var status = parts[2].trim();
        if ("OK".equalsIgnoreCase(status)) {
            latestOkEntries.put(word, line);
        }
    }

    var cleanedLines = new ArrayList<String>(latestOkEntries.size() + 1);
    cleanedLines.add(header);
    cleanedLines.addAll(latestOkEntries.values());

    // Write next to the target, then swap: Files.write on the target itself would
    // truncate it first and lose all scores if the process died mid-write.
    var tmp = path.resolveSibling(path.getFileName() + ".tmp");
    try {
        Files.write(tmp, cleanedLines, StandardCharsets.UTF_8);
        try {
            Files.move(tmp, path,
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING,
                    java.nio.file.StandardCopyOption.ATOMIC_MOVE);
        } catch (java.nio.file.AtomicMoveNotSupportedException notSupported) {
            // Some file systems cannot move atomically; a plain replace is still
            // safer than rewriting the target in place.
            Files.move(tmp, path, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    } finally {
        // No-op after a successful move; removes the leftover temp file on failure.
        Files.deleteIfExists(tmp);
    }

    System.out.printf("Cleanup complete. Kept %d unique OK entries. Removed %d non-OK or duplicate entries.%n",
            latestOkEntries.size(), lines.size() - cleanedLines.size());
}
private static List<WordScore> createFailedScores(List<String> words, String endpoint) { private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
List<WordScore> failed = new ArrayList<>(); List<WordScore> failed = new ArrayList<>();
for (var word : words) { for (var word : words) {
@@ -357,16 +397,25 @@ public class ConcurrentWordScorer {
} }
// Parsing logic // Parsing logic
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) { private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply, String endpointName) {
Map<String, Integer> wordScoreMap = new HashMap<>(); Map<String, Integer> wordScoreMap = new HashMap<>();
var lines = reply.split("\n"); var lines = reply.split("\n");
for (var line : lines) { for (var line : lines) {
line = line.trim(); line = line.trim();
if (line.contains(":")) { // Handle formats like "1. word:score", "word: score", "word - score"
var parts = line.split(":", 2); String sep = null;
if (line.contains(":")) sep = ":";
else if (line.contains("-")) sep = "-";
if (sep != null) {
var parts = line.split(sep, 2);
if (parts.length == 2) { if (parts.length == 2) {
var word = parts[0].trim().toLowerCase(); var wordPart = parts[0].trim();
// Remove leading numbering like "1. " or bullets like "* ", "- "
wordPart = wordPart.replaceAll("^[\\d+.)*\\-\\s]+", "");
var word = wordPart.toLowerCase();
try { try {
var scoreStr = parts[1].trim(); var scoreStr = parts[1].trim();
// Handle potential non-numeric junk after the number // Handle potential non-numeric junk after the number
@@ -398,14 +447,12 @@ public class ConcurrentWordScorer {
// Prompt creation // Prompt creation
private static String createScoringPrompt(List<String> words) { private static String createScoringPrompt(List<String> words) {
return "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.\n\n" + return "Je bent een Nederlandse taalexpert. Geef elk van de " + words.size() + " onderstaande woorden een populariteitsscore van 1 (zeer zeldzaam) tot 10 (zeer algemeen).\n\n" +
"Score criteria:\n" + "Output ALLEEN in dit formaat:\n" +
"- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon\n" +
"- 10 = Zeer algemeen, dagelijks gebruikt door iedereen\n\n" +
"Geef ALLEEN een lijst in dit exacte formaat, niets anders:\n" +
"woord1:score\n" + "woord1:score\n" +
"woord2:score\n\n" + "woord2:score\n\n" +
"Woorden om te scoren:\n" + "GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
"Lijst:\n" +
String.join("\n", words); String.join("\n", words);
} }

View File

@@ -138,12 +138,13 @@ public class SwedishGenerator {
public WordDifficulty(String word, int score) { public WordDifficulty(String word, int score) {
this.word = word; this.word = word;
this.score = score; this.score = score;
// Simple heuristic for difficulty: shorter words have lower difficulty // We want LONGER and SIMPLER words to be tried earlier (lower difficulty value).
// We combine this with the score (10 = common/simple, 1 = rare/hard) // word.length() is 2 to 8.
// Lower difficulty value means it is tried EARLIER. // score is 1 to 10.
// We want LONGER and SIMPLER words to be tried earlier. // Base difficulty starts high and decreases with length and score.
// Increasing simplicity weight: score (1-10) now has max impact of 50. // Length impact: up to 8 * 10 = 80
this.difficulty = /*Math.min(40, (10-word.length()) * 5)*/ - (score * 5); // Score impact: up to 10 * 15 = 150
this.difficulty = 250 - (word.length() * 10) - (score * 15);
} }
} }
@@ -155,9 +156,14 @@ public class SwedishGenerator {
for (var line : lines) { for (var line : lines) {
if (first) { first = false; continue; } if (first) { first = false; continue; }
var parts = line.split(",", 3); var parts = line.split(",", 3);
if (parts.length >= 2) { if (parts.length >= 3) {
try { try {
scores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim())); var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scores.put(word, score);
}
} catch (NumberFormatException ignored) {} } catch (NumberFormatException ignored) {}
} }
} }

View File

@@ -253,9 +253,14 @@ public class ThemePoolBuilderLength {
for (var line : scoreLines) { for (var line : scoreLines) {
if (first) { first = false; continue; } if (first) { first = false; continue; }
var parts = line.split(",", 3); var parts = line.split(",", 3);
if (parts.length >= 2) { if (parts.length >= 3) {
try { try {
llmScores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim())); var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
}
} catch (NumberFormatException ignored) {} } catch (NumberFormatException ignored) {}
} }
} }
@@ -272,9 +277,10 @@ public class ThemePoolBuilderLength {
var w = out.get(i); var w = out.get(i);
var crossScore = crossabilityScore(w); var crossScore = crossabilityScore(w);
var lScore = llmScores.getOrDefault(w, 5); var lScore = llmScores.getOrDefault(w, 5);
// Prioritize simple words (high lScore) and high crossability // Prioritize simple words (high lScore) and long words.
// Increased simplicity weight: lScore (1-10) now adds up to 200 points. // lScore (1-10) adds up to 300 points (weight 30).
score[i] = crossScore + (lScore * 20); // Length (2-8) adds up to 160 points (weight 20).
score[i] = crossScore + (lScore * 30) + (w.length() * 20);
byLen[w.length()].set(i); byLen[w.length()].set(i);
} }

File diff suppressed because it is too large Load Diff