update them
This commit is contained in:
20956
out/pool.txt
20956
out/pool.txt
File diff suppressed because it is too large
Load Diff
@@ -3,10 +3,10 @@ Feeds: https://feeds.nos.nl/nosnieuwsalgemeen, https://feeds.nos.nl/nosnieuwstec
|
|||||||
Model: mistralai/mistral-nemo-instruct-2407
|
Model: mistralai/mistral-nemo-instruct-2407
|
||||||
|
|
||||||
Master size: 91892
|
Master size: 91892
|
||||||
Theme kept (in master): 0
|
Theme kept (in master): 35
|
||||||
Bridge size: 32000
|
Bridge size: 32000
|
||||||
Shorts kept: 133
|
Shorts kept: 133
|
||||||
Pool total: 38391
|
Pool total: 38785
|
||||||
|
|
||||||
Enforced minima:
|
Enforced minima:
|
||||||
2: 4000
|
2: 4000
|
||||||
@@ -19,20 +19,20 @@ Enforced minima:
|
|||||||
|
|
||||||
Counts per length (theme):
|
Counts per length (theme):
|
||||||
2: 0
|
2: 0
|
||||||
3: 0
|
3: 2
|
||||||
4: 0
|
4: 3
|
||||||
5: 0
|
5: 8
|
||||||
6: 0
|
6: 7
|
||||||
7: 0
|
7: 5
|
||||||
8: 0
|
8: 10
|
||||||
|
|
||||||
|
|
||||||
Counts per length (pool):
|
Counts per length (pool):
|
||||||
2: 248
|
2: 248
|
||||||
3: 1666
|
3: 1666
|
||||||
4: 4850
|
4: 4850
|
||||||
5: 795
|
5: 57
|
||||||
6: 1987
|
6: 136
|
||||||
7: 8550
|
7: 1183
|
||||||
8: 20295
|
8: 30645
|
||||||
|
|
||||||
|
|||||||
32
out/rss.txt
32
out/rss.txt
File diff suppressed because one or more lines are too long
@@ -0,0 +1,35 @@
|
|||||||
|
SCHOTEN
|
||||||
|
GETUIGEN
|
||||||
|
GEWONDEN
|
||||||
|
AUTORUIT
|
||||||
|
GAT
|
||||||
|
MELDING
|
||||||
|
RWANDA
|
||||||
|
CONGO
|
||||||
|
CONFLICT
|
||||||
|
STRIJD
|
||||||
|
MACHT
|
||||||
|
LAND
|
||||||
|
GRENS
|
||||||
|
BURUNDI
|
||||||
|
VLUCHT
|
||||||
|
INKOMEN
|
||||||
|
MINISTER
|
||||||
|
ZAKEN
|
||||||
|
DRUK
|
||||||
|
OPVOEREN
|
||||||
|
TROPEN
|
||||||
|
DAGEN
|
||||||
|
LANGER
|
||||||
|
ZON
|
||||||
|
HALFROND
|
||||||
|
DAGLICHT
|
||||||
|
BEWOLKT
|
||||||
|
ZONNIG
|
||||||
|
VERSCHIL
|
||||||
|
BELEVING
|
||||||
|
WATT
|
||||||
|
METER
|
||||||
|
GRIJS
|
||||||
|
KOUDER
|
||||||
|
TREND
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ public class ConcurrentWordScorer {
|
|||||||
// ===== CONFIGURATION =====
|
// ===== CONFIGURATION =====
|
||||||
private static final String INPUT_WORDLIST = "word-list.txt";
|
private static final String INPUT_WORDLIST = "word-list.txt";
|
||||||
private static final String OUTPUT_SCORES = "word_scores.csv";
|
private static final String OUTPUT_SCORES = "word_scores.csv";
|
||||||
private static final int BATCH_SIZE = 30; // Smaller for better distribution
|
private static final int BATCH_SIZE = 10; // Even smaller for the difficult remaining words
|
||||||
private static final int MAX_RETRIES = 3;
|
private static final int MAX_RETRIES = 3;
|
||||||
|
|
||||||
// Define all three endpoints
|
// Define all three endpoints
|
||||||
@@ -72,7 +72,7 @@ public class ConcurrentWordScorer {
|
|||||||
throw new IOException("[" + name + "] Empty response content");
|
throw new IOException("[" + name + "] Empty response content");
|
||||||
}
|
}
|
||||||
|
|
||||||
return parseScoresFromReply(batch, content);
|
return parseScoresFromReply(batch, content, name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -81,7 +81,7 @@ public class ConcurrentWordScorer {
|
|||||||
|
|
||||||
OllamaEndpoint() {
|
OllamaEndpoint() {
|
||||||
super("Ollama", "http://localhost:11434/api/chat",
|
super("Ollama", "http://localhost:11434/api/chat",
|
||||||
"qwen2.5:14b", 2); // 2 concurrent requests
|
"qwen2.5:14b", 1); // 2 concurrent requests
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override String buildRequestJson(String prompt) {
|
@Override String buildRequestJson(String prompt) {
|
||||||
@@ -147,6 +147,8 @@ public class ConcurrentWordScorer {
|
|||||||
}
|
}
|
||||||
System.out.println();
|
System.out.println();
|
||||||
|
|
||||||
|
cleanupOutputFile();
|
||||||
|
|
||||||
// Load work queue
|
// Load work queue
|
||||||
var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
|
var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
|
||||||
var scoredWords = loadAlreadyScoredWords();
|
var scoredWords = loadAlreadyScoredWords();
|
||||||
@@ -342,12 +344,50 @@ public class ConcurrentWordScorer {
|
|||||||
first = false;
|
first = false;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
var parts = line.split(",", 2);
|
var parts = line.split(",");
|
||||||
if (parts.length >= 1) scored.add(parts[0].trim().toLowerCase());
|
if (parts.length >= 3) {
|
||||||
|
var word = parts[0].trim().toLowerCase();
|
||||||
|
var status = parts[2].trim();
|
||||||
|
if ("OK".equalsIgnoreCase(status)) {
|
||||||
|
scored.add(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return scored;
|
return scored;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void cleanupOutputFile() throws IOException {
|
||||||
|
var path = Paths.get(OUTPUT_SCORES);
|
||||||
|
if (!Files.exists(path)) return;
|
||||||
|
|
||||||
|
System.out.println("Cleaning up " + OUTPUT_SCORES + "...");
|
||||||
|
var lines = Files.readAllLines(path);
|
||||||
|
if (lines.isEmpty()) return;
|
||||||
|
|
||||||
|
var header = lines.get(0);
|
||||||
|
Map<String, String> latestOkEntries = new LinkedHashMap<>();
|
||||||
|
|
||||||
|
for (int i = 1; i < lines.size(); i++) {
|
||||||
|
var line = lines.get(i);
|
||||||
|
var parts = line.split(",");
|
||||||
|
if (parts.length >= 3) {
|
||||||
|
var word = parts[0].trim().toLowerCase();
|
||||||
|
var status = parts[2].trim();
|
||||||
|
if ("OK".equalsIgnoreCase(status)) {
|
||||||
|
latestOkEntries.put(word, line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var cleanedLines = new ArrayList<String>();
|
||||||
|
cleanedLines.add(header);
|
||||||
|
cleanedLines.addAll(latestOkEntries.values());
|
||||||
|
|
||||||
|
Files.write(path, cleanedLines, StandardCharsets.UTF_8);
|
||||||
|
System.out.printf("Cleanup complete. Kept %d unique OK entries. Removed %d non-OK or duplicate entries.%n",
|
||||||
|
latestOkEntries.size(), lines.size() - cleanedLines.size());
|
||||||
|
}
|
||||||
|
|
||||||
private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
|
private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
|
||||||
List<WordScore> failed = new ArrayList<>();
|
List<WordScore> failed = new ArrayList<>();
|
||||||
for (var word : words) {
|
for (var word : words) {
|
||||||
@@ -357,16 +397,25 @@ public class ConcurrentWordScorer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Parsing logic
|
// Parsing logic
|
||||||
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
|
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply, String endpointName) {
|
||||||
Map<String, Integer> wordScoreMap = new HashMap<>();
|
Map<String, Integer> wordScoreMap = new HashMap<>();
|
||||||
var lines = reply.split("\n");
|
var lines = reply.split("\n");
|
||||||
|
|
||||||
for (var line : lines) {
|
for (var line : lines) {
|
||||||
line = line.trim();
|
line = line.trim();
|
||||||
if (line.contains(":")) {
|
// Handle formats like "1. word:score", "word: score", "word - score"
|
||||||
var parts = line.split(":", 2);
|
String sep = null;
|
||||||
|
if (line.contains(":")) sep = ":";
|
||||||
|
else if (line.contains("-")) sep = "-";
|
||||||
|
|
||||||
|
if (sep != null) {
|
||||||
|
var parts = line.split(sep, 2);
|
||||||
if (parts.length == 2) {
|
if (parts.length == 2) {
|
||||||
var word = parts[0].trim().toLowerCase();
|
var wordPart = parts[0].trim();
|
||||||
|
// Remove leading numbering like "1. " or bullets like "* ", "- "
|
||||||
|
wordPart = wordPart.replaceAll("^[\\d+.)*\\-\\s]+", "");
|
||||||
|
var word = wordPart.toLowerCase();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
var scoreStr = parts[1].trim();
|
var scoreStr = parts[1].trim();
|
||||||
// Handle potential non-numeric junk after the number
|
// Handle potential non-numeric junk after the number
|
||||||
@@ -398,14 +447,12 @@ public class ConcurrentWordScorer {
|
|||||||
|
|
||||||
// Prompt creation
|
// Prompt creation
|
||||||
private static String createScoringPrompt(List<String> words) {
|
private static String createScoringPrompt(List<String> words) {
|
||||||
return "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.\n\n" +
|
return "Je bent een Nederlandse taalexpert. Geef elk van de " + words.size() + " onderstaande woorden een populariteitsscore van 1 (zeer zeldzaam) tot 10 (zeer algemeen).\n\n" +
|
||||||
"Score criteria:\n" +
|
"Output ALLEEN in dit formaat:\n" +
|
||||||
"- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon\n" +
|
|
||||||
"- 10 = Zeer algemeen, dagelijks gebruikt door iedereen\n\n" +
|
|
||||||
"Geef ALLEEN een lijst in dit exacte formaat, niets anders:\n" +
|
|
||||||
"woord1:score\n" +
|
"woord1:score\n" +
|
||||||
"woord2:score\n\n" +
|
"woord2:score\n\n" +
|
||||||
"Woorden om te scoren:\n" +
|
"GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
|
||||||
|
"Lijst:\n" +
|
||||||
String.join("\n", words);
|
String.join("\n", words);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -138,12 +138,13 @@ public class SwedishGenerator {
|
|||||||
public WordDifficulty(String word, int score) {
|
public WordDifficulty(String word, int score) {
|
||||||
this.word = word;
|
this.word = word;
|
||||||
this.score = score;
|
this.score = score;
|
||||||
// Simple heuristic for difficulty: shorter words have lower difficulty
|
// We want LONGER and SIMPLER words to be tried earlier (lower difficulty value).
|
||||||
// We combine this with the score (10 = common/simple, 1 = rare/hard)
|
// word.length() is 2 to 8.
|
||||||
// Lower difficulty value means it is tried EARLIER.
|
// score is 1 to 10.
|
||||||
// We want LONGER and SIMPLER words to be tried earlier.
|
// Base difficulty starts high and decreases with length and score.
|
||||||
// Increasing simplicity weight: score (1-10) now has max impact of 50.
|
// Length impact: up to 8 * 10 = 80
|
||||||
this.difficulty = /*Math.min(40, (10-word.length()) * 5)*/ - (score * 5);
|
// Score impact: up to 10 * 15 = 150
|
||||||
|
this.difficulty = 250 - (word.length() * 10) - (score * 15);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -155,9 +156,14 @@ public class SwedishGenerator {
|
|||||||
for (var line : lines) {
|
for (var line : lines) {
|
||||||
if (first) { first = false; continue; }
|
if (first) { first = false; continue; }
|
||||||
var parts = line.split(",", 3);
|
var parts = line.split(",", 3);
|
||||||
if (parts.length >= 2) {
|
if (parts.length >= 3) {
|
||||||
try {
|
try {
|
||||||
scores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
|
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||||
|
var score = Integer.parseInt(parts[1].trim());
|
||||||
|
var status = parts[2].trim();
|
||||||
|
if ("OK".equalsIgnoreCase(status)) {
|
||||||
|
scores.put(word, score);
|
||||||
|
}
|
||||||
} catch (NumberFormatException ignored) {}
|
} catch (NumberFormatException ignored) {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -253,9 +253,14 @@ public class ThemePoolBuilderLength {
|
|||||||
for (var line : scoreLines) {
|
for (var line : scoreLines) {
|
||||||
if (first) { first = false; continue; }
|
if (first) { first = false; continue; }
|
||||||
var parts = line.split(",", 3);
|
var parts = line.split(",", 3);
|
||||||
if (parts.length >= 2) {
|
if (parts.length >= 3) {
|
||||||
try {
|
try {
|
||||||
llmScores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
|
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||||
|
var score = Integer.parseInt(parts[1].trim());
|
||||||
|
var status = parts[2].trim();
|
||||||
|
if ("OK".equalsIgnoreCase(status)) {
|
||||||
|
llmScores.put(word, score);
|
||||||
|
}
|
||||||
} catch (NumberFormatException ignored) {}
|
} catch (NumberFormatException ignored) {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -272,9 +277,10 @@ public class ThemePoolBuilderLength {
|
|||||||
var w = out.get(i);
|
var w = out.get(i);
|
||||||
var crossScore = crossabilityScore(w);
|
var crossScore = crossabilityScore(w);
|
||||||
var lScore = llmScores.getOrDefault(w, 5);
|
var lScore = llmScores.getOrDefault(w, 5);
|
||||||
// Prioritize simple words (high lScore) and high crossability
|
// Prioritize simple words (high lScore) and long words.
|
||||||
// Increased simplicity weight: lScore (1-10) now adds up to 200 points.
|
// lScore (1-10) adds up to 300 points (weight 30).
|
||||||
score[i] = crossScore + (lScore * 20);
|
// Length (2-8) adds up to 160 points (weight 20).
|
||||||
|
score[i] = crossScore + (lScore * 30) + (w.length() * 20);
|
||||||
byLen[w.length()].set(i);
|
byLen[w.length()].set(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
48143
word_scores.csv
48143
word_scores.csv
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user