update them
This commit is contained in:
229
src/puzzle/DutchWordScorer.java
Normal file
229
src/puzzle/DutchWordScorer.java
Normal file
@@ -0,0 +1,229 @@
|
||||
package puzzle;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
 * Ollama Dutch wordlist scorer.
 *
 * <p>Reads a word list (one word per line), asks a local Ollama chat model to rate each word's
 * popularity on a 1-10 scale in batches of {@code BATCH_SIZE}, and writes one CSV row per input
 * word ({@code word,score,status}). Words the model did not score get score {@code -1} with
 * status {@code MISSING}; words in a batch that failed every retry get {@code -1} / {@code FAILED}.
 */
public class DutchWordScorer {

    // Configuration
    private static final String OLLAMA_ENDPOINT = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b"; // or "llama3.1:latest"
    private static final int BATCH_SIZE = 50; // Words per API call
    private static final int RATE_LIMIT_DELAY_MS = 500; // Be nice to local Ollama
    private static final int MAX_RETRIES = 3;
    private static final long RETRY_BASE_DELAY_MS = 2000; // Doubled after every failed attempt
    // Upper bound for one chat call; without it a stalled response would block forever
    // and the retry logic would never get a chance to run.
    private static final java.time.Duration REQUEST_TIMEOUT = java.time.Duration.ofMinutes(10);

    // Valid score range requested from the model; out-of-range answers are clamped into it
    private static final int MIN_SCORE = 1;
    private static final int MAX_SCORE = 10;

    // Input/output files
    private static final String INPUT_WORDLIST = "word-list.txt";
    private static final String OUTPUT_SCORES = "word_scores.csv";

    /**
     * Entry point: loads the word list, scores it batch by batch and writes the CSV.
     *
     * @param args unused
     * @throws Exception if the word list cannot be read, the CSV cannot be written, or the
     *                   thread is interrupted while waiting
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting Dutch wordlist scoring...");

        // Read all words
        List<String> words = Files.readAllLines(Paths.get(INPUT_WORDLIST));
        System.out.printf("Loaded %d words from %s%n", words.size(), INPUT_WORDLIST);

        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(java.time.Duration.ofSeconds(30))
                .build();

        List<WordScore> allScores = new ArrayList<>(words.size());

        // Process in batches
        for (int i = 0; i < words.size(); i += BATCH_SIZE) {
            int end = Math.min(i + BATCH_SIZE, words.size());
            List<String> batch = words.subList(i, end);

            System.out.printf("Processing batch %d-%d...%n", i + 1, end);
            allScores.addAll(scoreBatchWithRetries(client, batch, i + 1, end));

            // Rate limiting. Kept outside the retry logic so that an interrupted pause is
            // never reported as a failed batch.
            if (end < words.size()) {
                Thread.sleep(RATE_LIMIT_DELAY_MS);
            }
        }

        // Write results
        writeScoresToCsv(allScores);
        System.out.printf("Completed! Scored %d words. Results saved to %s%n",
                allScores.size(), OUTPUT_SCORES);
    }

    /**
     * Scores one batch, retrying up to {@code MAX_RETRIES} times with exponential backoff.
     *
     * @param client HTTP client used for the Ollama call
     * @param batch  words of this batch
     * @param first  1-based index of the first word of the batch (for log messages only)
     * @param last   1-based index of the last word of the batch (for log messages only)
     * @return one entry per word of {@code batch}, in order; all entries are
     *         {@code -1} / {@code FAILED} when every attempt failed
     * @throws InterruptedException if the thread is interrupted during the call or a backoff wait
     */
    private static List<WordScore> scoreBatchWithRetries(
            HttpClient client, List<String> batch, int first, int last) throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                return processBatch(client, batch);
            } catch (InterruptedException e) {
                // An interrupt is a request to stop, not a transient failure: do not retry.
                throw e;
            } catch (Exception e) {
                System.err.printf("Batch %d-%d failed (attempt %d/%d): %s%n",
                        first, last, attempt, MAX_RETRIES, e.getMessage());
                if (attempt < MAX_RETRIES) {
                    // Exponential backoff: base, 2*base, 4*base, ...
                    Thread.sleep(RETRY_BASE_DELAY_MS << (attempt - 1));
                }
            }
        }

        System.err.println("Max retries reached, skipping batch");
        // Add placeholder scores for the failed batch to maintain alignment with the input
        List<WordScore> failed = new ArrayList<>(batch.size());
        for (String word : batch) {
            failed.add(new WordScore(word, -1, "FAILED"));
        }
        return failed;
    }

    /**
     * Sends one batch of words to the Ollama chat endpoint and parses the scores from the reply.
     *
     * @param client HTTP client used for the request
     * @param batch  words to score
     * @return one entry per word of {@code batch}, in the same order
     * @throws Exception if the request fails, times out, returns a non-200 status, or the
     *                   response does not contain a message content string
     */
    public static List<WordScore> processBatch(HttpClient client, List<String> batch) throws Exception {
        String prompt = createScoringPrompt(batch);

        // Build JSON request
        String jsonRequest = buildChatRequestJson(prompt);

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(OLLAMA_ENDPOINT))
                .timeout(REQUEST_TIMEOUT)
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(jsonRequest))
                .build();

        HttpResponse<String> response = client.send(request,
                HttpResponse.BodyHandlers.ofString());

        if (response.statusCode() != 200) {
            throw new RuntimeException("HTTP " + response.statusCode() + ": " + response.body());
        }

        // Parse response
        String assistantReply = extractMessageContent(response.body());
        return parseScoresFromReply(batch, assistantReply);
    }

    /** Builds the (Dutch) instruction prompt followed by the words, one per line. */
    private static String createScoringPrompt(List<String> words) {
        return """
                Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.

                Score criteria:
                - 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon
                - 10 = Zeer algemeen, dagelijks gebruikt door iedereen

                Geef ALLEEN een lijst in dit exacte formaat, niets anders:
                woord1:score
                woord2:score
                enz.

                Woorden om te scoren:
                """ + String.join("\n", words);
    }

    /**
     * Builds the JSON body for a non-streaming chat request with a single user message.
     *
     * <p>The temperature is sent inside the {@code options} object, which is where the Ollama
     * API expects sampling parameters; a top-level {@code temperature} key is not applied.
     */
    private static String buildChatRequestJson(String prompt) {
        // Simple JSON building (in production use a library like Jackson)
        return String.format("""
                {
                  "model": "%s",
                  "messages": [
                    {
                      "role": "user",
                      "content": "%s"
                    }
                  ],
                  "stream": false,
                  "options": {
                    "temperature": 0.1
                  }
                }
                """, escapeJson(MODEL), escapeJson(prompt));
    }

    /**
     * Escapes a string for use inside a JSON string literal: backslash, double quote, the
     * short escapes for newline, carriage return, tab, backspace and form feed, and every other
     * control character below U+0020 as a four-hex-digit Unicode escape.
     */
    private static String escapeJson(String str) {
        StringBuilder out = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '\\' -> out.append("\\\\");
                case '"' -> out.append("\\\"");
                case '\n' -> out.append("\\n");
                case '\r' -> out.append("\\r");
                case '\t' -> out.append("\\t");
                case '\b' -> out.append("\\b");
                case '\f' -> out.append("\\f");
                default -> {
                    if (c < 0x20) {
                        // Raw control characters are not allowed inside JSON strings
                        out.append(String.format(Locale.ROOT, "\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
                }
            }
        }
        return out.toString();
    }

    /**
     * Extracts and unescapes the first {@code "content"} string value of the response body.
     *
     * <p>Expected shape: {@code "message":{"role":"assistant","content":"..."}}. Whitespace
     * around the colon is tolerated and all JSON string escapes are decoded, so an escaped
     * double quote inside the reply no longer truncates it.
     *
     * @throws IllegalStateException if there is no {@code "content"} string or it is unterminated
     */
    private static String extractMessageContent(String responseBody) {
        final String key = "\"content\"";
        final int length = responseBody.length();

        int keyIndex = responseBody.indexOf(key);
        if (keyIndex < 0) {
            throw new IllegalStateException("No \"content\" field in Ollama response: " + responseBody);
        }

        int pos = skipWhitespace(responseBody, keyIndex + key.length());
        if (pos >= length || responseBody.charAt(pos) != ':') {
            throw new IllegalStateException("Malformed \"content\" field in Ollama response: " + responseBody);
        }
        pos = skipWhitespace(responseBody, pos + 1);
        if (pos >= length || responseBody.charAt(pos) != '"') {
            throw new IllegalStateException("\"content\" is not a string in Ollama response: " + responseBody);
        }
        pos++; // first character after the opening quote

        StringBuilder content = new StringBuilder();
        while (pos < length) {
            char c = responseBody.charAt(pos++);
            if (c == '"') {
                return content.toString(); // unescaped quote terminates the value
            }
            if (c != '\\') {
                content.append(c);
                continue;
            }
            if (pos >= length) {
                break; // dangling backslash at end of input
            }
            char escape = responseBody.charAt(pos++);
            switch (escape) {
                case 'n' -> content.append('\n');
                case 'r' -> content.append('\r');
                case 't' -> content.append('\t');
                case 'b' -> content.append('\b');
                case 'f' -> content.append('\f');
                case 'u' -> {
                    if (pos + 4 > length) {
                        throw new IllegalStateException("Truncated unicode escape in Ollama response");
                    }
                    content.append((char) Integer.parseInt(responseBody.substring(pos, pos + 4), 16));
                    pos += 4;
                }
                // Covers the escaped quote, backslash and slash: the character stands for itself
                default -> content.append(escape);
            }
        }
        throw new IllegalStateException("Unterminated \"content\" string in Ollama response");
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String text, int from) {
        int pos = from;
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /**
     * Parses {@code word:score} lines from the model reply and matches them to the expected words.
     *
     * <p>Lines without a colon or with a non-integer score are ignored. Scores are clamped to
     * 1-10. Matching is case-insensitive (locale-independent) and ignores surrounding
     * whitespace; if a word appears several times in the reply, the last score wins.
     *
     * @param expectedWords words that were sent to the model
     * @param reply         raw assistant reply text
     * @return one entry per expected word, in the same order; unmatched words get
     *         score {@code -1} and status {@code MISSING}
     */
    private static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
        Map<String, Integer> scoresByWord = new HashMap<>();

        for (String rawLine : reply.split("\n")) {
            String line = rawLine.strip();
            int colon = line.indexOf(':');
            if (colon < 0) {
                continue;
            }
            String word = normalizeWord(line.substring(0, colon));
            try {
                int score = Integer.parseInt(line.substring(colon + 1).strip());
                scoresByWord.put(word, Math.max(MIN_SCORE, Math.min(MAX_SCORE, score)));
            } catch (NumberFormatException ignored) {
                // Not a score line (e.g. an introductory sentence containing a colon): skip it
            }
        }

        // Match scores to original words (maintaining order)
        List<WordScore> results = new ArrayList<>(expectedWords.size());
        for (String word : expectedWords) {
            Integer score = scoresByWord.get(normalizeWord(word));
            if (score != null) {
                results.add(new WordScore(word, score, "OK"));
            } else {
                System.err.printf("Warning: No score found for '%s'%n", word);
                results.add(new WordScore(word, -1, "MISSING"));
            }
        }

        return results;
    }

    /** Lookup key for a word: surrounding whitespace removed, lower-cased independent of locale. */
    private static String normalizeWord(String word) {
        return word.strip().toLowerCase(Locale.ROOT);
    }

    /**
     * Writes all scores to the output CSV with a {@code word,score,status} header.
     *
     * @throws Exception if the file cannot be written
     */
    private static void writeScoresToCsv(List<WordScore> scores) throws Exception {
        List<String> lines = new ArrayList<>(scores.size() + 1);
        lines.add("word,score,status");

        for (WordScore ws : scores) {
            // Plain concatenation keeps the score digits independent of the default locale
            lines.add(csvEscape(ws.word) + "," + ws.score + "," + csvEscape(ws.status));
        }

        Files.write(Paths.get(OUTPUT_SCORES), lines);
    }

    /**
     * Quotes a CSV field when it contains a comma, double quote or line break, doubling embedded
     * quotes; other values are returned unchanged. {@code null} becomes an empty field.
     */
    private static String csvEscape(String field) {
        if (field == null) {
            return "";
        }
        boolean needsQuoting = field.indexOf(',') >= 0 || field.indexOf('"') >= 0
                || field.indexOf('\n') >= 0 || field.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return field;
        }
        return "\"" + field.replace("\"", "\"\"") + "\"";
    }

    // ===== DATA CLASS =====

    /** Score result for a single word. */
    static class WordScore {

        /** The word exactly as it appeared in the input list. */
        String word;
        /** Popularity score 1-10, or -1 when no score is available. */
        int score;
        /** "OK", "MISSING" (model gave no score) or "FAILED" (batch failed every retry). */
        String status;
        /** Endpoint associated with this score; {@code null} when created via the 3-argument constructor. */
        String endpoint;
        /** Batch identifier; {@code 0} when created via the 3-argument constructor. */
        int batchId;

        WordScore(String word, int score, String status, String endpoint, int batchId) {
            this.word = word;
            this.score = score;
            this.status = status;
            this.endpoint = endpoint;
            this.batchId = batchId;
        }

        /** Creates a score without endpoint/batch information. */
        WordScore(String word, int score, String status) {
            this(word, score, status, null, 0);
        }
    }
}
|
||||
Reference in New Issue
Block a user