update them

This commit is contained in:
mike
2025-12-21 21:42:47 +01:00
parent ee7b1925f2
commit 2e34efbde3
8 changed files with 53532 additions and 15773 deletions

View File

@@ -18,7 +18,7 @@ public class ConcurrentWordScorer {
// ===== CONFIGURATION =====
private static final String INPUT_WORDLIST = "word-list.txt";
private static final String OUTPUT_SCORES = "word_scores.csv";
private static final int BATCH_SIZE = 30; // Smaller for better distribution
private static final int BATCH_SIZE = 10; // Even smaller for the difficult remaining words
private static final int MAX_RETRIES = 3;
// Define all three endpoints
@@ -72,7 +72,7 @@ public class ConcurrentWordScorer {
throw new IOException("[" + name + "] Empty response content");
}
return parseScoresFromReply(batch, content);
return parseScoresFromReply(batch, content, name);
}
}
@@ -81,7 +81,7 @@ public class ConcurrentWordScorer {
OllamaEndpoint() {
super("Ollama", "http://localhost:11434/api/chat",
"qwen2.5:14b", 2); // 2 concurrent requests
"qwen2.5:14b", 1); // 1 concurrent request
}
@Override String buildRequestJson(String prompt) {
@@ -147,6 +147,8 @@ public class ConcurrentWordScorer {
}
System.out.println();
cleanupOutputFile();
// Load work queue
var allWords = Files.readAllLines(Paths.get(INPUT_WORDLIST));
var scoredWords = loadAlreadyScoredWords();
@@ -342,12 +344,50 @@ public class ConcurrentWordScorer {
first = false;
continue;
}
var parts = line.split(",", 2);
if (parts.length >= 1) scored.add(parts[0].trim().toLowerCase());
var parts = line.split(",");
if (parts.length >= 3) {
var word = parts[0].trim().toLowerCase();
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scored.add(word);
}
}
}
return scored;
}
/**
 * Compacts the scores file named by {@code OUTPUT_SCORES}.
 *
 * <p>The first line is kept verbatim as the header. Every following line is split on
 * commas; a line is kept only when it has at least three columns and the third column
 * (trimmed) equals "OK" ignoring case. Lines are keyed by the trimmed, lower-cased first
 * column, so when a word occurs several times the last matching line wins while the
 * position of its first occurrence is preserved.
 *
 * <p>Does nothing when the file does not exist or is empty. The cleaned content is first
 * written to a sibling {@code .tmp} file and then moved over the original, so a failure
 * while writing does not truncate the existing scores.
 *
 * @throws IOException if reading, writing or replacing the file fails
 */
private static void cleanupOutputFile() throws IOException {
    var path = Paths.get(OUTPUT_SCORES);
    if (!Files.exists(path)) return;

    System.out.println("Cleaning up " + OUTPUT_SCORES + "...");
    var lines = Files.readAllLines(path, StandardCharsets.UTF_8);
    if (lines.isEmpty()) return;

    var header = lines.get(0);

    // LinkedHashMap: re-putting an existing key replaces the line but keeps first-seen order.
    Map<String, String> latestOkEntries = new LinkedHashMap<>();
    for (var line : lines.subList(1, lines.size())) {
        var parts = line.split(",");
        if (parts.length < 3) continue;
        if (!"OK".equalsIgnoreCase(parts[2].trim())) continue;
        latestOkEntries.put(parts[0].trim().toLowerCase(), line);
    }

    var cleanedLines = new ArrayList<String>(latestOkEntries.size() + 1);
    cleanedLines.add(header);
    cleanedLines.addAll(latestOkEntries.values());

    // Write next to the target and swap it in afterwards; writing in place would truncate
    // the only copy of the scores before the new content is safely on disk.
    var tmp = path.resolveSibling(path.getFileName() + ".tmp");
    try {
        Files.write(tmp, cleanedLines, StandardCharsets.UTF_8);
        Files.move(tmp, path, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
    } finally {
        // No-op after a successful move; removes the leftover temp file after a failure.
        Files.deleteIfExists(tmp);
    }

    System.out.printf("Cleanup complete. Kept %d unique OK entries. Removed %d non-OK or duplicate entries.%n",
        latestOkEntries.size(), lines.size() - cleanedLines.size());
}
private static List<WordScore> createFailedScores(List<String> words, String endpoint) {
List<WordScore> failed = new ArrayList<>();
for (var word : words) {
@@ -357,16 +397,25 @@ public class ConcurrentWordScorer {
}
// Parsing logic
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply, String endpointName) {
Map<String, Integer> wordScoreMap = new HashMap<>();
var lines = reply.split("\n");
for (var line : lines) {
line = line.trim();
if (line.contains(":")) {
var parts = line.split(":", 2);
// Handle formats like "1. word:score", "word: score", "word - score"
String sep = null;
if (line.contains(":")) sep = ":";
else if (line.contains("-")) sep = "-";
if (sep != null) {
var parts = line.split(sep, 2);
if (parts.length == 2) {
var word = parts[0].trim().toLowerCase();
var wordPart = parts[0].trim();
// Remove leading numbering like "1. " or bullets like "* ", "- "
wordPart = wordPart.replaceAll("^[\\d+.)*\\-\\s]+", "");
var word = wordPart.toLowerCase();
try {
var scoreStr = parts[1].trim();
// Handle potential non-numeric junk after the number
@@ -398,14 +447,12 @@ public class ConcurrentWordScorer {
// Prompt creation
private static String createScoringPrompt(List<String> words) {
return "Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.\n\n" +
"Score criteria:\n" +
"- 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon\n" +
"- 10 = Zeer algemeen, dagelijks gebruikt door iedereen\n\n" +
"Geef ALLEEN een lijst in dit exacte formaat, niets anders:\n" +
return "Je bent een Nederlandse taalexpert. Geef elk van de " + words.size() + " onderstaande woorden een populariteitsscore van 1 (zeer zeldzaam) tot 10 (zeer algemeen).\n\n" +
"Output ALLEEN in dit formaat:\n" +
"woord1:score\n" +
"woord2:score\n\n" +
"Woorden om te scoren:\n" +
"GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
"Lijst:\n" +
String.join("\n", words);
}

View File

@@ -138,12 +138,13 @@ public class SwedishGenerator {
public WordDifficulty(String word, int score) {
this.word = word;
this.score = score;
// Simple heuristic for difficulty: shorter words have lower difficulty
// We combine this with the score (10 = common/simple, 1 = rare/hard)
// Lower difficulty value means it is tried EARLIER.
// We want LONGER and SIMPLER words to be tried earlier.
// Increasing simplicity weight: score (1-10) now has max impact of 50.
this.difficulty = /*Math.min(40, (10-word.length()) * 5)*/ - (score * 5);
// We want LONGER and SIMPLER words to be tried earlier (lower difficulty value).
// word.length() is 2 to 8.
// score is 1 to 10.
// Base difficulty starts high and decreases with length and score.
// Length impact: up to 8 * 10 = 80
// Score impact: up to 10 * 15 = 150
this.difficulty = 250 - (word.length() * 10) - (score * 15);
}
}
@@ -155,9 +156,14 @@ public class SwedishGenerator {
for (var line : lines) {
if (first) { first = false; continue; }
var parts = line.split(",", 3);
if (parts.length >= 2) {
if (parts.length >= 3) {
try {
scores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scores.put(word, score);
}
} catch (NumberFormatException ignored) {}
}
}

View File

@@ -253,9 +253,14 @@ public class ThemePoolBuilderLength {
for (var line : scoreLines) {
if (first) { first = false; continue; }
var parts = line.split(",", 3);
if (parts.length >= 2) {
if (parts.length >= 3) {
try {
llmScores.put(parts[0].trim().toUpperCase(Locale.ROOT), Integer.parseInt(parts[1].trim()));
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
}
} catch (NumberFormatException ignored) {}
}
}
@@ -272,9 +277,10 @@ public class ThemePoolBuilderLength {
var w = out.get(i);
var crossScore = crossabilityScore(w);
var lScore = llmScores.getOrDefault(w, 5);
// Prioritize simple words (high lScore) and high crossability
// Increased simplicity weight: lScore (1-10) now adds up to 200 points.
score[i] = crossScore + (lScore * 20);
// Prioritize simple words (high lScore) and long words.
// lScore (1-10) adds up to 300 points (weight 30).
// Length (2-8) adds up to 160 points (weight 20).
score[i] = crossScore + (lScore * 30) + (w.length() * 20);
byLen[w.length()].set(i);
}