update them
This commit is contained in:
20956
out/pool.txt
20956
out/pool.txt
File diff suppressed because it is too large
Load Diff
@@ -3,10 +3,10 @@ Feeds: https://feeds.nos.nl/nosnieuwsalgemeen, https://feeds.nos.nl/nosnieuwstec
|
||||
Model: mistralai/mistral-nemo-instruct-2407
|
||||
|
||||
Master size: 91892
|
||||
Theme kept (in master): 0
|
||||
Theme kept (in master): 35
|
||||
Bridge size: 32000
|
||||
Shorts kept: 133
|
||||
Pool total: 38391
|
||||
Pool total: 38785
|
||||
|
||||
Enforced minima:
|
||||
2: 4000
|
||||
@@ -19,20 +19,20 @@ Enforced minima:
|
||||
|
||||
Counts per length (theme):
|
||||
2: 0
|
||||
3: 0
|
||||
4: 0
|
||||
5: 0
|
||||
6: 0
|
||||
7: 0
|
||||
8: 0
|
||||
3: 2
|
||||
4: 3
|
||||
5: 8
|
||||
6: 7
|
||||
7: 5
|
||||
8: 10
|
||||
|
||||
|
||||
Counts per length (pool):
|
||||
2: 248
|
||||
3: 1666
|
||||
4: 4850
|
||||
5: 795
|
||||
6: 1987
|
||||
7: 8550
|
||||
8: 20295
|
||||
5: 57
|
||||
6: 136
|
||||
7: 1183
|
||||
8: 30645
|
||||
|
||||
|
||||
32
out/rss.txt
32
out/rss.txt
File diff suppressed because one or more lines are too long
@@ -0,0 +1,35 @@
|
||||
SCHOTEN
|
||||
GETUIGEN
|
||||
GEWONDEN
|
||||
AUTORUIT
|
||||
GAT
|
||||
MELDING
|
||||
RWANDA
|
||||
CONGO
|
||||
CONFLICT
|
||||
STRIJD
|
||||
MACHT
|
||||
LAND
|
||||
GRENS
|
||||
BURUNDI
|
||||
VLUCHT
|
||||
INKOMEN
|
||||
MINISTER
|
||||
ZAKEN
|
||||
DRUK
|
||||
OPVOEREN
|
||||
TROPEN
|
||||
DAGEN
|
||||
LANGER
|
||||
ZON
|
||||
HALFROND
|
||||
DAGLICHT
|
||||
BEWOLKT
|
||||
ZONNIG
|
||||
VERSCHIL
|
||||
BELEVING
|
||||
WATT
|
||||
METER
|
||||
GRIJS
|
||||
KOUDER
|
||||
TREND
|
||||
|
||||
@@ -18,7 +18,7 @@ public class ConcurrentWordScorer {
|
||||
// ===== CONFIGURATION =====
|
||||
private static final String INPUT_WORDLIST = "word-list.txt";
|
||||
private static final String OUTPUT_SCORES = "word_scores.csv";
|
||||
private static final int BATCH_SIZE = 30; // Smaller for better distribution
|
||||
private static final int BATCH_SIZE = 10; // Even smaller for the difficult remaining words
|
||||
private static final int MAX_RETRIES = 3;
|
||||
|
||||
// Define all three endpoints
|
||||
@@ -72,7 +72,7 @@ public class ConcurrentWordScorer {
|
||||
throw new IOException("[" + name + "] Empty response content");
|
||||
}
|
||||
|
||||
return parseScoresFromReply(batch, content);
|
||||
return parseScoresFromReply(batch, content, name);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -81,7 +81,7 @@ public class ConcurrentWordScorer {
|
||||
|
||||
OllamaEndpoint() {
|
||||
super("Ollama", "http://localhost:11434/api/chat",
|
||||
"qwen2.5:14b", 2); // 2 concurrent requests
|
||||
"qwen2.5:14b", 1); // 1 concurrent request
|
||||
}
|
||||
|
||||
@Override String buildRequestJson(String prompt) {
|
||||
@@ -147,6 +147,8 @@ public class ConcurrentWordScorer {
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
cleanupOutputFile();
|
||||
|
||||
// Load work queue
|
||||
var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
|
||||
var scoredWords = loadAlreadyScoredWords();
|
||||
@@ -342,12 +344,50 @@ public class ConcurrentWordScorer {
|
||||
first = false;
|
||||
continue;
|
||||
}
|
||||
var parts = line.split(",", 2);
|
||||
if (parts.length >= 1) scored.add(parts[0].trim().toLowerCase());
|
||||
var parts = line.split(",");
|
||||
if (parts.length >= 3) {
|
||||
var word = parts[0].trim().toLowerCase();
|
||||
var status = parts[2].trim();
|
||||
if ("OK".equalsIgnoreCase(status)) {
|
||||
scored.add(word);
|
||||
}
|
||||
}
|
||||
}
|
||||
return scored;
|
||||
}
|
||||
|
||||
private static void cleanupOutputFile() throws IOException {
|
||||
var path = Paths.get(OUTPUT_SCORES);
|
||||
if (!Files.exists(path)) return;
|
||||
|
||||
System.out.println("Cleaning up " + OUTPUT_SCORES + "...");
|
||||
var lines = Files.readAllLines(path);
|
||||
if (lines.isEmpty()) return;
|
||||
|
||||
var header = lines.get(0);
|
||||
Map<String, String> latestOkEntries = new LinkedHashMap<>();
|
||||
|
||||
for (int i = 1; i < lines.size(); i++) {
|
||||
var line = lines.get(i);
|
||||
var parts = line.split(",");
|
||||
if (parts.length >= 3) {
|
||||
var word = parts[0].trim().toLowerCase();
|
||||
var status = parts[2].trim();
|
||||
if ("OK".equalsIgnoreCase(status)) {
|
||||
latestOkEntries.put(word, line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var cleanedLines = new ArrayList<String>();
|
||||
cleanedLines.add(header);
|
||||
cleanedLines.addAll(latestOkEntries.values());
|
||||
|
||||
Files.write(path, cleanedLines, StandardCharsets.UTF_8);
|
||||
System.out.printf("Cleanup complete. Kept %d unique OK entries. Removed %d non-OK or duplicate entries.%n",
|
||||
latestOkEntries.size(), lines.size() - cleanedLines.size());
|
||||
}
|
||||
|
||||
private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
|
||||
List<WordScore> failed = new ArrayList<>();
|
||||
for (var word : words) {
|
||||
@@ -357,16 +397,25 @@ public class ConcurrentWordScorer {
|
||||
}
|
||||
|
||||
// Parsing logic
|
||||
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
|
||||
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply, String endpointName) {
|
||||
Map<String, Integer> wordScoreMap = new HashMap<>();
|
||||
var lines = reply.split("\n");
|
||||
|
||||
for (var line : lines) {
|
||||
line = line.trim();
|
||||
if (line.contains(":")) {
|
||||
var parts = line.split(":", 2);
|
||||
// Handle formats like "1. word:score", "word: score", "word - score"
|
||||
String sep = null;
|
||||
if (line.contains(":")) sep = ":";
|
||||
else if (line.contains("-")) sep = "-";
|
||||
|
||||
if (sep != null) {
|
||||
var parts = line.split(sep, 2);
|
||||
if (parts.length == 2) {
|
||||
var word = parts[0].trim().toLowerCase();
|
||||
var wordPart = parts[0].trim();
|
||||
// Remove leading numbering like "1. " or bullets like "* ", "- "
|
||||
wordPart = wordPart.replaceAll("^[\\d+.)*\\-\\s]+", "");
|
||||
var word = wordPart.toLowerCase();
|
||||
|
||||
try {
|
||||
var scoreStr = parts[1].trim();
|
||||
// Handle potential non-numeric junk after the number
|
||||
@@ -398,14 +447,12 @@ public class ConcurrentWordScorer {
|
||||
|
||||
// Prompt creation
|
||||
private static String createScoringPrompt(List<String> words) {
|
||||
return "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.\n\n" +
|
||||
"Score criteria:\n" +
|
||||
"- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon\n" +
|
||||
"- 10 = Zeer algemeen, dagelijks gebruikt door iedereen\n\n" +
|
||||
"Geef ALLEEN een lijst in dit exacte formaat, niets anders:\n" +
|
||||
return "Je bent een Nederlandse taalexpert. Geef elk van de " + words.size() + " onderstaande woorden een populariteitsscore van 1 (zeer zeldzaam) tot 10 (zeer algemeen).\n\n" +
|
||||
"Output ALLEEN in dit formaat:\n" +
|
||||
"woord1:score\n" +
|
||||
"woord2:score\n\n" +
|
||||
"Woorden om te scoren:\n" +
|
||||
"GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
|
||||
"Lijst:\n" +
|
||||
String.join("\n", words);
|
||||
}
|
||||
|
||||
|
||||
@@ -138,12 +138,13 @@ public class SwedishGenerator {
|
||||
public WordDifficulty(String word, int score) {
|
||||
this.word = word;
|
||||
this.score = score;
|
||||
// Simple heuristic for difficulty: shorter words have lower difficulty
|
||||
// We combine this with the score (10 = common/simple, 1 = rare/hard)
|
||||
// Lower difficulty value means it is tried EARLIER.
|
||||
// We want LONGER and SIMPLER words to be tried earlier.
|
||||
// Increasing simplicity weight: score (1-10) now has max impact of 50.
|
||||
this.difficulty = /*Math.min(40, (10-word.length()) * 5)*/ - (score * 5);
|
||||
// We want LONGER and SIMPLER words to be tried earlier (lower difficulty value).
|
||||
// word.length() is 2 to 8.
|
||||
// score is 1 to 10.
|
||||
// Base difficulty starts high and decreases with length and score.
|
||||
// Length impact: up to 8 * 10 = 80
|
||||
// Score impact: up to 10 * 15 = 150
|
||||
this.difficulty = 250 - (word.length() * 10) - (score * 15);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,9 +156,14 @@ public class SwedishGenerator {
|
||||
for (var line : lines) {
|
||||
if (first) { first = false; continue; }
|
||||
var parts = line.split(",", 3);
|
||||
if (parts.length >= 2) {
|
||||
if (parts.length >= 3) {
|
||||
try {
|
||||
scores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
|
||||
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||
var score = Integer.parseInt(parts[1].trim());
|
||||
var status = parts[2].trim();
|
||||
if ("OK".equalsIgnoreCase(status)) {
|
||||
scores.put(word, score);
|
||||
}
|
||||
} catch (NumberFormatException ignored) {}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -253,9 +253,14 @@ public class ThemePoolBuilderLength {
|
||||
for (var line : scoreLines) {
|
||||
if (first) { first = false; continue; }
|
||||
var parts = line.split(",", 3);
|
||||
if (parts.length >= 2) {
|
||||
if (parts.length >= 3) {
|
||||
try {
|
||||
llmScores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
|
||||
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||
var score = Integer.parseInt(parts[1].trim());
|
||||
var status = parts[2].trim();
|
||||
if ("OK".equalsIgnoreCase(status)) {
|
||||
llmScores.put(word, score);
|
||||
}
|
||||
} catch (NumberFormatException ignored) {}
|
||||
}
|
||||
}
|
||||
@@ -272,9 +277,10 @@ public class ThemePoolBuilderLength {
|
||||
var w = out.get(i);
|
||||
var crossScore = crossabilityScore(w);
|
||||
var lScore = llmScores.getOrDefault(w, 5);
|
||||
// Prioritize simple words (high lScore) and high crossability
|
||||
// Increased simplicity weight: lScore (1-10) now adds up to 200 points.
|
||||
score[i] = crossScore + (lScore * 20);
|
||||
// Prioritize simple words (high lScore) and long words.
|
||||
// lScore (1-10) adds up to 300 points (weight 30).
|
||||
// Length (2-8) adds up to 160 points (weight 20).
|
||||
score[i] = crossScore + (lScore * 30) + (w.length() * 20);
|
||||
byLen[w.length()].set(i);
|
||||
}
|
||||
|
||||
|
||||
48143
word_scores.csv
48143
word_scores.csv
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user