package puzzle;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * Ollama Dutch Wordlist Scorer.
 *
 * <p>Scores 90k Dutch words on popularity/complexity using the local Ollama chat API.
 * Words are read from {@link #INPUT_WORDLIST} (one per line), sent to the model in
 * batches of {@link #BATCH_SIZE}, and the resulting scores are written to
 * {@link #OUTPUT_SCORES} as CSV with one row per input line, in input order.
 */
public class DutchWordScorer {

    // Configuration
    private static final String OLLAMA_ENDPOINT = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b"; // or "llama3.1:latest"
    private static final int BATCH_SIZE = 50; // Words per API call
    private static final int RATE_LIMIT_DELAY_MS = 500; // Be nice to local Ollama
    private static final int MAX_RETRIES = 3;
    private static final long RETRY_BACKOFF_MS = 2000; // Multiplied by the attempt number
    // Upper bound for a single chat request so a hung server cannot stall the whole run.
    private static final Duration REQUEST_TIMEOUT = Duration.ofMinutes(10);

    // Input/output files
    private static final String INPUT_WORDLIST = "word-list.txt";
    private static final String OUTPUT_SCORES = "word_scores.csv";

    /**
     * Reads the word list, scores it batch by batch and writes the CSV report.
     *
     * @param args unused
     * @throws Exception if the word list cannot be read, the CSV cannot be written,
     *                   or the thread is interrupted while waiting between requests
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting Dutch wordlist scoring...");

        // Read all words
        List<String> words = Files.readAllLines(Paths.get(INPUT_WORDLIST));
        System.out.printf("Loaded %d words from %s%n", words.size(), INPUT_WORDLIST);

        // Process in batches
        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(Duration.ofSeconds(30))
                .build();

        List<WordScore> allScores = new ArrayList<>(words.size());
        for (int i = 0; i < words.size(); i += BATCH_SIZE) {
            int end = Math.min(i + BATCH_SIZE, words.size());
            List<String> batch = words.subList(i, end);
            System.out.printf("Processing batch %d-%d...%n", i + 1, end);

            allScores.addAll(scoreBatchWithRetries(client, batch, i + 1, end));

            // Rate limiting. Kept outside the retry logic so that an interrupted
            // pause can never be mistaken for a failed batch.
            if (end < words.size()) {
                Thread.sleep(RATE_LIMIT_DELAY_MS);
            }
        }

        // Write results
        writeScoresToCsv(allScores);
        System.out.printf("Completed! Scored %d words. Results saved to %s%n",
                allScores.size(), OUTPUT_SCORES);
    }

    /**
     * Scores one batch, retrying up to {@link #MAX_RETRIES} times with a linearly
     * growing pause between attempts.
     *
     * @param client    HTTP client used for the request
     * @param batch     words to score
     * @param firstWord 1-based index of the first word of the batch (for log messages)
     * @param lastWord  1-based index of the last word of the batch (for log messages)
     * @return one entry per word of {@code batch}, in order; when every attempt failed
     *         each entry has score {@code -1} and status {@code "FAILED"} so the output
     *         stays aligned with the input
     * @throws InterruptedException if the thread is interrupted during a request or a pause
     */
    private static List<WordScore> scoreBatchWithRetries(
            HttpClient client, List<String> batch, int firstWord, int lastWord)
            throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                return processBatch(client, batch);
            } catch (InterruptedException e) {
                // An interrupt is a request to stop, not a batch failure.
                throw e;
            } catch (Exception e) {
                // Log the exception itself: getMessage() may be null.
                System.err.printf("Batch %d-%d failed (attempt %d/%d): %s%n",
                        firstWord, lastWord, attempt, MAX_RETRIES, e);
                if (attempt < MAX_RETRIES) {
                    Thread.sleep(RETRY_BACKOFF_MS * attempt); // Linear backoff: 2s, 4s, ...
                }
            }
        }

        System.err.println("Max retries reached, skipping batch");
        // Add placeholder scores for the failed batch to maintain alignment
        List<WordScore> failed = new ArrayList<>(batch.size());
        for (String word : batch) {
            failed.add(new WordScore(word, -1, "FAILED"));
        }
        return failed;
    }

    /**
     * Sends one batch of words to the Ollama chat endpoint and parses the scores.
     *
     * @param client HTTP client used for the request
     * @param batch  words to score
     * @return one {@link WordScore} per word of {@code batch}, in the same order; words the
     *         model did not score get score {@code -1} and status {@code "MISSING"}.
     *         An empty batch yields an empty list without contacting the server.
     * @throws Exception if the request fails, the server answers with a non-200 status,
     *                   or the response body is not in the expected shape
     */
    public static List<WordScore> processBatch(HttpClient client, List<String> batch)
            throws Exception {
        if (batch.isEmpty()) {
            return new ArrayList<>();
        }

        String prompt = createScoringPrompt(batch);

        // Build JSON request
        String jsonRequest = buildChatRequestJson(prompt);

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(OLLAMA_ENDPOINT))
                .timeout(REQUEST_TIMEOUT)
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(jsonRequest))
                .build();

        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

        if (response.statusCode() != 200) {
            throw new RuntimeException("HTTP " + response.statusCode() + ": " + response.body());
        }

        // Parse response
        String assistantReply = extractMessageContent(response.body());
        return parseScoresFromReply(batch, assistantReply);
    }

    /**
     * Builds the (Dutch) instruction prompt followed by the words, one per line.
     *
     * @param words words to include in the prompt
     * @return the complete prompt text
     */
    private static String createScoringPrompt(List<String> words) {
        return """
                Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.

                Score criteria:
                - 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon
                - 10 = Zeer algemeen, dagelijks gebruikt door iedereen

                Geef ALLEEN een lijst in dit exacte formaat, niets anders:
                woord1:score
                woord2:score
                enz.

                Woorden om te scoren:
                """ + String.join("\n", words);
    }

    /**
     * Builds the JSON body of a non-streaming chat request containing a single user message.
     *
     * @param prompt message text; escaped before being embedded
     * @return the request body
     */
    private static String buildChatRequestJson(String prompt) {
        // Simple JSON building (in production use a library like Jackson).
        // Sampling parameters such as temperature belong in the "options" object.
        return String.format("""
                {
                  "model": "%s",
                  "messages": [
                    {
                      "role": "user",
                      "content": "%s"
                    }
                  ],
                  "stream": false,
                  "options": {
                    "temperature": 0.1
                  }
                }
                """, escapeJson(MODEL), escapeJson(prompt));
    }

    /**
     * Escapes a string for use inside a JSON string literal (without the surrounding quotes).
     *
     * @param str text to escape
     * @return {@code str} with backslash, double quote and every character below U+0020 escaped
     */
    private static String escapeJson(String str) {
        StringBuilder out = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '\\' -> out.append("\\\\");
                case '"' -> out.append("\\\"");
                case '\n' -> out.append("\\n");
                case '\r' -> out.append("\\r");
                case '\t' -> out.append("\\t");
                case '\b' -> out.append("\\b");
                case '\f' -> out.append("\\f");
                default -> {
                    if (c < 0x20) {
                        // Remaining control characters are not allowed raw in JSON strings.
                        out.append("\\u").append(String.format("%04x", (int) c));
                    } else {
                        out.append(c);
                    }
                }
            }
        }
        return out.toString();
    }

    /**
     * Extracts and unescapes the assistant's message text from a chat response of the form
     * {@code "message":{"role":"assistant","content":"..."}}.
     *
     * <p>This is a minimal scanner, not a JSON parser: it takes the first {@code "content"}
     * key at or after the {@code "message"} key (or in the whole body when there is no
     * {@code "message"} key) and decodes the JSON string that follows it.
     *
     * @param responseBody raw JSON response body
     * @return the decoded message text
     * @throws IllegalStateException if no well-formed {@code "content"} string is found
     */
    private static String extractMessageContent(String responseBody) {
        final String contentKey = "\"content\"";
        int searchFrom = Math.max(0, responseBody.indexOf("\"message\""));
        int keyPos = responseBody.indexOf(contentKey, searchFrom);
        if (keyPos < 0) {
            throw malformedResponse("no \"content\" field", responseBody);
        }

        int len = responseBody.length();
        int pos = skipWhitespace(responseBody, keyPos + contentKey.length());
        if (pos >= len || responseBody.charAt(pos) != ':') {
            throw malformedResponse("\"content\" is not followed by ':'", responseBody);
        }
        pos = skipWhitespace(responseBody, pos + 1);
        if (pos >= len || responseBody.charAt(pos) != '"') {
            throw malformedResponse("\"content\" is not a string", responseBody);
        }
        pos++; // step over the opening quote

        StringBuilder content = new StringBuilder();
        while (pos < len) {
            char c = responseBody.charAt(pos++);
            if (c == '"') {
                return content.toString(); // unescaped quote closes the string
            }
            if (c != '\\') {
                content.append(c);
                continue;
            }
            if (pos >= len) {
                break; // dangling backslash at end of input
            }
            char escaped = responseBody.charAt(pos++);
            switch (escaped) {
                case 'n' -> content.append('\n');
                case 'r' -> content.append('\r');
                case 't' -> content.append('\t');
                case 'b' -> content.append('\b');
                case 'f' -> content.append('\f');
                case 'u' -> {
                    if (pos + 4 > len) {
                        throw malformedResponse("truncated \\u escape", responseBody);
                    }
                    content.append((char) Integer.parseInt(responseBody.substring(pos, pos + 4), 16));
                    pos += 4;
                }
                default -> content.append(escaped); // \" \\ \/
            }
        }
        throw malformedResponse("unterminated \"content\" string", responseBody);
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String text, int from) {
        int pos = from;
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /** Creates the exception for an unparseable response, quoting at most 200 characters of it. */
    private static IllegalStateException malformedResponse(String problem, String responseBody) {
        String excerpt = responseBody.length() > 200
                ? responseBody.substring(0, 200) + "..."
                : responseBody;
        return new IllegalStateException("Malformed Ollama response (" + problem + "): " + excerpt);
    }

    /**
     * Parses {@code word:score} lines from the model's reply and matches them to the
     * requested words.
     *
     * <p>Each line is split at its <em>last</em> colon so that words containing a colon
     * still parse. Matching is case-insensitive and ignores surrounding whitespace.
     * Scores are clamped to the range 1..10; lines without a numeric score are ignored.
     *
     * @param expectedWords words that were sent to the model
     * @param reply         the model's reply text
     * @return one entry per expected word, in order, with status {@code "OK"} or,
     *         when the reply had no score for it, score {@code -1} and status {@code "MISSING"}
     */
    private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
        Map<String, Integer> scoresByWord = new HashMap<>();

        for (String rawLine : reply.split("\n")) {
            String line = rawLine.trim();
            int separator = line.lastIndexOf(':');
            if (separator < 0) {
                continue;
            }
            String word = line.substring(0, separator).trim().toLowerCase(Locale.ROOT);
            try {
                int score = Integer.parseInt(line.substring(separator + 1).trim());
                scoresByWord.put(word, Math.max(1, Math.min(10, score)));
            } catch (NumberFormatException ignored) {
                // Not a "word:score" line (e.g. explanatory text from the model); skip it.
            }
        }

        // Match scores to original words (maintaining order)
        List<WordScore> results = new ArrayList<>(expectedWords.size());
        for (String word : expectedWords) {
            Integer score = scoresByWord.get(word.trim().toLowerCase(Locale.ROOT));
            if (score != null) {
                results.add(new WordScore(word, score, "OK"));
            } else {
                System.err.printf("Warning: No score found for '%s'%n", word);
                results.add(new WordScore(word, -1, "MISSING"));
            }
        }
        return results;
    }

    /**
     * Writes all scores to {@link #OUTPUT_SCORES} as CSV with the header
     * {@code word,score,status}.
     *
     * @param scores rows to write, in output order
     * @throws Exception if the file cannot be written
     */
    private static void writeScoresToCsv(List<WordScore> scores) throws Exception {
        List<String> lines = new ArrayList<>(scores.size() + 1);
        lines.add("word,score,status");
        for (WordScore ws : scores) {
            // Plain concatenation keeps the number formatting independent of the default locale.
            lines.add(csvField(ws.word) + "," + ws.score + "," + csvField(ws.status));
        }
        Files.write(Paths.get(OUTPUT_SCORES), lines);
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line break;
     * embedded double quotes are doubled. {@code null} becomes an empty field.
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuoting = value.indexOf(',') >= 0
                || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0
                || value.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    // ===== DATA CLASS =====

    /**
     * Score assigned to a single word.
     *
     * <p>{@code score} is 1..10 for scored words and {@code -1} otherwise; {@code status} is
     * {@code "OK"}, {@code "MISSING"} or {@code "FAILED"} for entries created by this class.
     * {@code endpoint} and {@code batchId} are only set by the five-argument constructor.
     */
    static class WordScore {
        String word;
        int score;
        String status;
        String endpoint;
        int batchId;

        WordScore(String word, int score, String status, String endpoint, int batchId) {
            this.word = word;
            this.score = score;
            this.status = status;
            this.endpoint = endpoint;
            this.batchId = batchId;
        }

        /** Creates a score without endpoint information ({@code endpoint} null, {@code batchId} 0). */
        WordScore(String word, int score, String status) {
            this(word, score, status, null, 0);
        }
    }
}