Files
puzzle-generator/src/puzzle/DutchWordScorer.java
2025-12-21 19:29:14 +01:00

229 lines
8.6 KiB
Java

package puzzle;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Ollama Dutch wordlist scorer.
 *
 * <p>Reads a word list (one word per line), sends the words in batches to a local
 * Ollama chat endpoint asking for a 1-10 popularity score per word, and writes the
 * results to a CSV file with the columns {@code word,score,status}.
 *
 * <p>Status values written by this class:
 * <ul>
 *   <li>{@code OK} - the model returned a score (clamped to 1..10)</li>
 *   <li>{@code MISSING} - the batch succeeded but no score line matched the word (score -1)</li>
 *   <li>{@code FAILED} - the whole batch failed after all retries (score -1)</li>
 * </ul>
 */
public class DutchWordScorer {

    // ----- Configuration -----
    private static final String OLLAMA_ENDPOINT = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b"; // or "llama3.1:latest"
    private static final int BATCH_SIZE = 50; // Words per API call
    private static final int RATE_LIMIT_DELAY_MS = 500; // Be nice to local Ollama
    private static final int MAX_RETRIES = 3;
    private static final long RETRY_BASE_DELAY_MS = 2000; // Doubled after every failed attempt
    // Upper bound for one chat call; without it a hung model blocks send() forever.
    private static final Duration REQUEST_TIMEOUT = Duration.ofMinutes(5);

    // ----- Input/output files -----
    private static final String INPUT_WORDLIST = "word-list.txt";
    private static final String OUTPUT_SCORES = "word_scores.csv";

    /**
     * One reply line: optional list marker ("- ", "* ", "1. ", "1) "), the word, a colon,
     * then an optionally signed integer. Anything after the integer (for example "/10")
     * is ignored.
     */
    private static final Pattern SCORE_LINE =
            Pattern.compile("^(?:[-*]\\s+|\\d+[.)]\\s+)?(.+?)\\s*:\\s*([+-]?\\d+)");

    /**
     * Entry point: loads the word list, scores it batch by batch and writes the CSV.
     *
     * @param args unused
     * @throws Exception if the word list cannot be read, the CSV cannot be written,
     *                   or the thread is interrupted while waiting
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting Dutch wordlist scoring...");

        // Strip surrounding whitespace and drop blank lines so that no empty "word"
        // is sent to the model or written to the CSV.
        List<String> words = new ArrayList<>();
        for (String line : Files.readAllLines(Paths.get(INPUT_WORDLIST))) {
            String word = line.strip();
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
        System.out.printf("Loaded %d words from %s%n", words.size(), INPUT_WORDLIST);

        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(Duration.ofSeconds(30))
                .build();

        List<WordScore> allScores = new ArrayList<>(words.size());
        for (int start = 0; start < words.size(); start += BATCH_SIZE) {
            int end = Math.min(start + BATCH_SIZE, words.size());
            List<String> batch = words.subList(start, end);
            System.out.printf("Processing batch %d-%d...%n", start + 1, end);

            allScores.addAll(scoreBatchWithRetries(client, batch, start + 1, end));

            // Rate limiting between batches. Kept outside the retry logic so that an
            // interrupt during the pause is not misreported as a batch failure.
            if (end < words.size()) {
                Thread.sleep(RATE_LIMIT_DELAY_MS);
            }
        }

        writeScoresToCsv(allScores);
        System.out.printf("Completed! Scored %d words. Results saved to %s%n",
                allScores.size(), OUTPUT_SCORES);
    }

    /**
     * Scores one batch, retrying up to {@link #MAX_RETRIES} times with exponential backoff.
     *
     * @param client HTTP client used for the call
     * @param batch  words to score
     * @param first  1-based index of the first word of the batch (for log output only)
     * @param last   1-based index of the last word of the batch (for log output only)
     * @return one entry per word; if every attempt failed, each word gets score -1 and
     *         status {@code FAILED} so the output still contains every input word
     * @throws InterruptedException if the thread is interrupted during the call or a backoff pause
     */
    private static List<WordScore> scoreBatchWithRetries(
            HttpClient client, List<String> batch, int first, int last) throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                return processBatch(client, batch);
            } catch (InterruptedException e) {
                // An interrupt is a request to stop, not a failed attempt: propagate it.
                throw e;
            } catch (Exception e) {
                System.err.printf("Batch %d-%d failed (attempt %d/%d): %s%n",
                        first, last, attempt, MAX_RETRIES, e);
                if (attempt < MAX_RETRIES) {
                    // Exponential backoff: base, 2 * base, 4 * base, ...
                    Thread.sleep(RETRY_BASE_DELAY_MS << (attempt - 1));
                }
            }
        }

        System.err.println("Max retries reached, skipping batch");
        List<WordScore> failed = new ArrayList<>(batch.size());
        for (String word : batch) {
            failed.add(new WordScore(word, -1, "FAILED"));
        }
        return failed;
    }

    /**
     * Sends one batch of words to the Ollama chat endpoint and parses the scores.
     *
     * @param client HTTP client used for the call
     * @param batch  words to score; an empty batch returns an empty list without any request
     * @return one {@link WordScore} per word of {@code batch}, in the same order
     * @throws Exception on I/O failure, timeout, interruption, a non-200 HTTP status
     *                   ({@link RuntimeException}) or an unparseable response body
     *                   ({@link IllegalStateException})
     */
    public static List<WordScore> processBatch(HttpClient client, List<String> batch) throws Exception {
        if (batch.isEmpty()) {
            return new ArrayList<>();
        }

        String jsonRequest = buildChatRequestJson(createScoringPrompt(batch));
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(OLLAMA_ENDPOINT))
                .timeout(REQUEST_TIMEOUT)
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(jsonRequest))
                .build();

        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() != 200) {
            throw new RuntimeException("HTTP " + response.statusCode() + ": " + response.body());
        }

        String assistantReply = extractMessageContent(response.body());
        return parseScoresFromReply(batch, assistantReply);
    }

    /**
     * Builds the (Dutch) instruction prompt followed by the words, one per line.
     *
     * @param words words to include in the prompt
     * @return the complete prompt text
     */
    private static String createScoringPrompt(List<String> words) {
        // The i-diaeresis in "archaisch" is written as a unicode escape so the source
        // compiles to the same prompt regardless of the compiler's -encoding setting.
        return """
                Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.
                Score criteria:
                - 1 = Zeer zeldzaam, archa\u00efsch, of extreem specifiek vakjargon
                - 10 = Zeer algemeen, dagelijks gebruikt door iedereen
                Geef ALLEEN een lijst in dit exacte formaat, niets anders:
                woord1:score
                woord2:score
                enz.
                Woorden om te scoren:
                """ + String.join("\n", words);
    }

    /**
     * Builds the JSON body for a non-streaming Ollama chat request.
     *
     * <p>Sampling parameters such as the temperature are placed under {@code options},
     * which is where the Ollama API documents them; a top-level {@code temperature}
     * key is not part of the documented request schema.
     *
     * @param prompt user message text (escaped here)
     * @return the JSON request body
     */
    private static String buildChatRequestJson(String prompt) {
        // Simple JSON building (in production use a library like Jackson)
        return """
                {
                  "model": "%s",
                  "messages": [
                    {
                      "role": "user",
                      "content": "%s"
                    }
                  ],
                  "stream": false,
                  "options": {
                    "temperature": 0.1
                  }
                }
                """.formatted(escapeJson(MODEL), escapeJson(prompt));
    }

    /**
     * Escapes a string for use inside a JSON string literal: quote, backslash and
     * every control character below 0x20 (short forms where JSON defines them,
     * six-character unicode escapes otherwise).
     *
     * @param str raw text
     * @return text that is safe between JSON double quotes
     */
    private static String escapeJson(String str) {
        StringBuilder out = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '"' -> out.append("\\\"");
                case '\\' -> out.append("\\\\");
                case '\n' -> out.append("\\n");
                case '\r' -> out.append("\\r");
                case '\t' -> out.append("\\t");
                case '\b' -> out.append("\\b");
                case '\f' -> out.append("\\f");
                default -> {
                    if (c < 0x20) {
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
                }
            }
        }
        return out.toString();
    }

    /**
     * Extracts and decodes the assistant text from a chat response of the form
     * {@code "message":{"role":"assistant","content":"..."}}.
     *
     * <p>The {@code "content"} key is searched after the {@code "message"} key when
     * that key is present, otherwise from the start of the body.
     *
     * @param responseBody raw JSON response body
     * @return the decoded content string
     * @throws IllegalStateException if no well-formed {@code "content"} string is found
     */
    private static String extractMessageContent(String responseBody) {
        final String key = "\"content\"";
        int searchFrom = Math.max(0, responseBody.indexOf("\"message\""));
        int keyPos = responseBody.indexOf(key, searchFrom);
        if (keyPos < 0) {
            throw new IllegalStateException("No \"content\" field in Ollama response: "
                    + responseBody.substring(0, Math.min(200, responseBody.length())));
        }

        int pos = skipWhitespace(responseBody, keyPos + key.length());
        if (pos >= responseBody.length() || responseBody.charAt(pos) != ':') {
            throw new IllegalStateException("Malformed \"content\" field in Ollama response");
        }
        pos = skipWhitespace(responseBody, pos + 1);
        if (pos >= responseBody.length() || responseBody.charAt(pos) != '"') {
            throw new IllegalStateException("\"content\" in Ollama response is not a string");
        }
        return decodeJsonString(responseBody, pos + 1);
    }

    /** Returns the index of the first non-whitespace character at or after {@code pos}. */
    private static int skipWhitespace(String s, int pos) {
        int i = pos;
        while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
            i++;
        }
        return i;
    }

    /**
     * Decodes a JSON string literal whose first character (after the opening quote)
     * is at {@code start}, stopping at the first unescaped closing quote.
     *
     * @throws IllegalStateException if the string is unterminated or contains a broken escape
     */
    private static String decodeJsonString(String json, int start) {
        StringBuilder out = new StringBuilder();
        int i = start;
        while (i < json.length()) {
            char c = json.charAt(i++);
            if (c == '"') {
                return out.toString();
            }
            if (c != '\\') {
                out.append(c);
                continue;
            }
            if (i >= json.length()) {
                break; // lone backslash at end of input: reported as unterminated below
            }
            char esc = json.charAt(i++);
            switch (esc) {
                case 'n' -> out.append('\n');
                case 'r' -> out.append('\r');
                case 't' -> out.append('\t');
                case 'b' -> out.append('\b');
                case 'f' -> out.append('\f');
                case 'u' -> {
                    if (i + 4 > json.length()) {
                        throw new IllegalStateException("Truncated unicode escape in Ollama response");
                    }
                    try {
                        out.append((char) Integer.parseInt(json.substring(i, i + 4), 16));
                    } catch (NumberFormatException e) {
                        throw new IllegalStateException("Invalid unicode escape in Ollama response", e);
                    }
                    i += 4;
                }
                // Escaped quote, backslash and slash all decode to the character itself.
                default -> out.append(esc);
            }
        }
        throw new IllegalStateException("Unterminated \"content\" string in Ollama response");
    }

    /**
     * Parses {@code word:score} lines from the model reply and matches them to the
     * expected words (case-insensitively, order of {@code expectedWords} preserved).
     *
     * <p>Lines may carry a list marker and trailing text after the number; lines that
     * do not match are skipped. Scores are clamped to 1..10. If a word appears more
     * than once in the reply, the last score wins.
     *
     * @param expectedWords the words that were sent in the request
     * @param reply         decoded assistant reply
     * @return one entry per expected word; words without a score get -1 / {@code MISSING}
     */
    private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
        Map<String, Integer> scoreByWord = new HashMap<>();
        for (String rawLine : reply.split("\n")) {
            Matcher m = SCORE_LINE.matcher(rawLine.strip());
            if (!m.lookingAt()) {
                continue; // not a "word:score" line
            }
            try {
                int score = Integer.parseInt(m.group(2));
                String word = m.group(1).strip().toLowerCase(Locale.ROOT);
                scoreByWord.put(word, Math.max(1, Math.min(10, score)));
            } catch (NumberFormatException ignored) {
                // Digit run too large for an int: treat like any other invalid line.
            }
        }

        List<WordScore> results = new ArrayList<>(expectedWords.size());
        for (String word : expectedWords) {
            Integer score = scoreByWord.get(word.strip().toLowerCase(Locale.ROOT));
            if (score != null) {
                results.add(new WordScore(word, score, "OK"));
            } else {
                System.err.printf("Warning: No score found for '%s'%n", word);
                results.add(new WordScore(word, -1, "MISSING"));
            }
        }
        return results;
    }

    /**
     * Writes all scores to {@link #OUTPUT_SCORES} as CSV with a header row.
     *
     * @param scores entries to write, in output order
     * @throws Exception if the file cannot be written
     */
    private static void writeScoresToCsv(List<WordScore> scores) throws Exception {
        List<String> lines = new ArrayList<>(scores.size() + 1);
        lines.add("word,score,status");
        for (WordScore ws : scores) {
            // Plain concatenation keeps the number in ASCII digits regardless of locale.
            lines.add(csvField(ws.word) + "," + ws.score + "," + csvField(ws.status));
        }
        Files.write(Paths.get(OUTPUT_SCORES), lines);
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line break;
     * embedded quotes are doubled. {@code null} becomes an empty field.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    // ===== DATA CLASS =====

    /**
     * Result for a single word. Only {@code word}, {@code score} and {@code status}
     * are written to the CSV by this class; {@code endpoint} and {@code batchId} are
     * carried along but not used here.
     */
    static class WordScore {
        String word;
        int score; // 1..10, or -1 when no score is available
        String status; // "OK", "MISSING" or "FAILED"
        String endpoint;
        int batchId;

        WordScore(String word, int score, String status, String endpoint, int batchId) {
            this.word = word;
            this.score = score;
            this.status = status;
            this.endpoint = endpoint;
            this.batchId = batchId;
        }

        /** Convenience constructor: {@code endpoint} is {@code null} and {@code batchId} is 0. */
        WordScore(String word, int score, String status) {
            this(word, score, status, null, 0);
        }
    }
}