update them

This commit is contained in:
mike
2025-12-21 21:42:47 +01:00
parent ee7b1925f2
commit 2e34efbde3
8 changed files with 53532 additions and 15773 deletions

20956
out/pool.txt

File diff suppressed because it is too large Load Diff

View File

@@ -3,10 +3,10 @@ Feeds: https://feeds.nos.nl/nosnieuwsalgemeen, https://feeds.nos.nl/nosnieuwstec
Model: mistralai/mistral-nemo-instruct-2407
Master size: 91892
Theme kept (in master): 0
Theme kept (in master): 35
Bridge size: 32000
Shorts kept: 133
Pool total: 38391
Pool total: 38785
Enforced minima:
2: 4000
@@ -19,20 +19,20 @@ Enforced minima:
Counts per length (theme):
2: 0
3: 0
4: 0
5: 0
6: 0
7: 0
8: 0
3: 2
4: 3
5: 8
6: 7
7: 5
8: 10
Counts per length (pool):
2: 248
3: 1666
4: 4850
5: 795
6: 1987
7: 8550
8: 20295
5: 57
6: 136
7: 1183
8: 30645

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,35 @@
SCHOTEN
GETUIGEN
GEWONDEN
AUTORUIT
GAT
MELDING
RWANDA
CONGO
CONFLICT
STRIJD
MACHT
LAND
GRENS
BURUNDI
VLUCHT
INKOMEN
MINISTER
ZAKEN
DRUK
OPVOEREN
TROPEN
DAGEN
LANGER
ZON
HALFROND
DAGLICHT
BEWOLKT
ZONNIG
VERSCHIL
BELEVING
WATT
METER
GRIJS
KOUDER
TREND

View File

@@ -18,7 +18,7 @@ public class ConcurrentWordScorer {
// ===== CONFIGURATION =====
private static final String INPUT_WORDLIST = "word-list.txt";
private static final String OUTPUT_SCORES = "word_scores.csv";
private static final int BATCH_SIZE = 30; // Smaller for better distribution
private static final int BATCH_SIZE = 10; // Even smaller for the difficult remaining words
private static final int MAX_RETRIES = 3;
// Define all three endpoints
@@ -72,7 +72,7 @@ public class ConcurrentWordScorer {
throw new IOException("[" + name + "] Empty response content");
}
return parseScoresFromReply(batch, content);
return parseScoresFromReply(batch, content, name);
}
}
@@ -81,7 +81,7 @@ public class ConcurrentWordScorer {
OllamaEndpoint() {
super("Ollama", "http://localhost:11434/api/chat",
"qwen2.5:14b", 2); // 2 concurrent requests
"qwen2.5:14b", 1); // 1 concurrent request
}
@Override String buildRequestJson(String prompt) {
@@ -147,6 +147,8 @@ public class ConcurrentWordScorer {
}
System.out.println();
cleanupOutputFile();
// Load work queue
var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
var scoredWords = loadAlreadyScoredWords();
@@ -342,12 +344,50 @@ public class ConcurrentWordScorer {
first = false;
continue;
}
var parts = line.split(",", 2);
if (parts.length >= 1) scored.add(parts[0].trim().toLowerCase());
var parts = line.split(",");
if (parts.length >= 3) {
var word = parts[0].trim().toLowerCase();
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scored.add(word);
}
}
}
return scored;
}
/**
 * Compacts the scores file in place so it holds only the header row plus one
 * OK row per word.
 *
 * <p>Does nothing when the file is missing or has no lines. Rows with fewer than
 * three comma-separated fields, or whose third field is not "OK" (case-insensitive),
 * are dropped. Words are compared on their trimmed, lower-cased first field.
 *
 * @throws IOException if the scores file cannot be read or rewritten
 */
private static void cleanupOutputFile() throws IOException {
    var scoresFile = Paths.get(OUTPUT_SCORES);
    if (!Files.exists(scoresFile)) {
        return;
    }
    System.out.println("Cleaning up " + OUTPUT_SCORES + "...");

    var rows = Files.readAllLines(scoresFile);
    if (rows.isEmpty()) {
        return;
    }

    // A later OK row for the same word replaces the stored row text, while the
    // word keeps the position of its first OK occurrence in the output order.
    Map<String, String> okRowByWord = new LinkedHashMap<>();
    for (var row : rows.subList(1, rows.size())) {
        var fields = row.split(",");
        if (fields.length < 3) {
            continue;
        }
        if (!"OK".equalsIgnoreCase(fields[2].trim())) {
            continue;
        }
        okRowByWord.put(fields[0].trim().toLowerCase(), row);
    }

    // The first line is always carried over unchanged as the header.
    var output = new ArrayList<String>(okRowByWord.size() + 1);
    output.add(rows.get(0));
    output.addAll(okRowByWord.values());
    Files.write(scoresFile, output, StandardCharsets.UTF_8);

    System.out.printf("Cleanup complete. Kept %d unique OK entries. Removed %d non-OK or duplicate entries.%n",
            okRowByWord.size(), rows.size() - output.size());
}
private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
List<WordScore> failed = new ArrayList<>();
for (var word : words) {
@@ -357,16 +397,25 @@ public class ConcurrentWordScorer {
}
// Parsing logic
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply, String endpointName) {
Map<String, Integer> wordScoreMap = new HashMap<>();
var lines = reply.split("\n");
for (var line : lines) {
line = line.trim();
if (line.contains(":")) {
var parts = line.split(":", 2);
// Handle formats like "1. word:score", "word: score", "word - score"
String sep = null;
if (line.contains(":")) sep = ":";
else if (line.contains("-")) sep = "-";
if (sep != null) {
var parts = line.split(sep, 2);
if (parts.length == 2) {
var word = parts[0].trim().toLowerCase();
var wordPart = parts[0].trim();
// Remove leading numbering like "1. " or bullets like "* ", "- "
wordPart = wordPart.replaceAll("^[\\d+.)*\\-\\s]+", "");
var word = wordPart.toLowerCase();
try {
var scoreStr = parts[1].trim();
// Handle potential non-numeric junk after the number
@@ -398,14 +447,12 @@ public class ConcurrentWordScorer {
// Prompt creation
private static String createScoringPrompt(List<String> words) {
return "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.\n\n" +
"Score criteria:\n" +
"- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon\n" +
"- 10 = Zeer algemeen, dagelijks gebruikt door iedereen\n\n" +
"Geef ALLEEN een lijst in dit exacte formaat, niets anders:\n" +
return "Je bent een Nederlandse taalexpert. Geef elk van de " + words.size() + " onderstaande woorden een populariteitsscore van 1 (zeer zeldzaam) tot 10 (zeer algemeen).\n\n" +
"Output ALLEEN in dit formaat:\n" +
"woord1:score\n" +
"woord2:score\n\n" +
"Woorden om te scoren:\n" +
"GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
"Lijst:\n" +
String.join("\n", words);
}

View File

@@ -138,12 +138,13 @@ public class SwedishGenerator {
public WordDifficulty(String word, int score) {
this.word = word;
this.score = score;
// Simple heuristic for difficulty: shorter words have lower difficulty
// We combine this with the score (10 = common/simple, 1 = rare/hard)
// Lower difficulty value means it is tried EARLIER.
// We want LONGER and SIMPLER words to be tried earlier.
// Increasing simplicity weight: score (1-10) now has max impact of 50.
this.difficulty = /*Math.min(40, (10-word.length()) * 5)*/ - (score * 5);
// We want LONGER and SIMPLER words to be tried earlier (lower difficulty value).
// word.length() is 2 to 8.
// score is 1 to 10.
// Base difficulty starts high and decreases with length and score.
// Length impact: up to 8 * 10 = 80
// Score impact: up to 10 * 15 = 150
this.difficulty = 250 - (word.length() * 10) - (score * 15);
}
}
@@ -155,9 +156,14 @@ public class SwedishGenerator {
for (var line : lines) {
if (first) { first = false; continue; }
var parts = line.split(",", 3);
if (parts.length >= 2) {
if (parts.length >= 3) {
try {
scores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scores.put(word, score);
}
} catch (NumberFormatException ignored) {}
}
}

View File

@@ -253,9 +253,14 @@ public class ThemePoolBuilderLength {
for (var line : scoreLines) {
if (first) { first = false; continue; }
var parts = line.split(",", 3);
if (parts.length >= 2) {
if (parts.length >= 3) {
try {
llmScores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
}
} catch (NumberFormatException ignored) {}
}
}
@@ -272,9 +277,10 @@ public class ThemePoolBuilderLength {
var w = out.get(i);
var crossScore = crossabilityScore(w);
var lScore = llmScores.getOrDefault(w, 5);
// Prioritize simple words (high lScore) and high crossability
// Increased simplicity weight: lScore (1-10) now adds up to 200 points.
score[i] = crossScore + (lScore * 20);
// Prioritize simple words (high lScore) and long words.
// lScore (1-10) adds up to 300 points (weight 30).
// Length (2-8) adds up to 160 points (weight 20).
score[i] = crossScore + (lScore * 30) + (w.length() * 20);
byLen[w.length()].set(i);
}

File diff suppressed because it is too large Load Diff