// Listing metadata (from the original file viewer): 229 lines, 8.6 KiB, Java
package puzzle;

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.TimeUnit;

/**
 * Ollama Dutch Wordlist Scorer.
 *
 * <p>Scores a large Dutch word list (about 90k words) on popularity/complexity using a local
 * Ollama chat API. Words are read from {@code word-list.txt}, sent to the model in batches, and
 * the resulting scores are written to {@code word_scores.csv} with one row per input word.
 *
 * <p>Every input word produces exactly one output row, in input order. The {@code status}
 * column is {@code OK} (score 1-10), {@code MISSING} (the model replied but did not score this
 * word) or {@code FAILED} (the whole batch failed after all retries); the latter two carry a
 * score of {@code -1}.
 */
public class DutchWordScorer {

    // Configuration
    private static final String OLLAMA_ENDPOINT = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b"; // or "llama3.1:latest"
    private static final int BATCH_SIZE = 50; // Words per API call
    private static final int RATE_LIMIT_DELAY_MS = 500; // Be nice to local Ollama
    private static final int MAX_RETRIES = 3;
    private static final long BASE_BACKOFF_MS = 2000L; // Doubled after every failed attempt
    // Upper bound for one chat call, so a stalled model cannot hang the run forever.
    private static final Duration REQUEST_TIMEOUT = Duration.ofMinutes(5);

    // Input/output files
    private static final String INPUT_WORDLIST = "word-list.txt";
    private static final String OUTPUT_SCORES = "word_scores.csv";

    /**
     * Entry point: loads the word list, scores it batch by batch and writes the CSV.
     *
     * @param args unused
     * @throws Exception if the word list cannot be read, the CSV cannot be written, or the
     *     thread is interrupted while waiting between requests
     */
    public static void main(String[] args) throws Exception {
        System.out.println("Starting Dutch wordlist scoring...");

        // Read all words
        List<String> words = loadWords(INPUT_WORDLIST);
        System.out.printf("Loaded %d words from %s%n", words.size(), INPUT_WORDLIST);

        // Process in batches
        HttpClient client = HttpClient.newBuilder()
                .connectTimeout(Duration.ofSeconds(30))
                .build();

        List<WordScore> allScores = new ArrayList<>(words.size());

        for (int i = 0; i < words.size(); i += BATCH_SIZE) {
            int end = Math.min(i + BATCH_SIZE, words.size());
            List<String> batch = words.subList(i, end);

            System.out.printf("Processing batch %d-%d...%n", i + 1, end);
            allScores.addAll(scoreBatchWithRetry(client, batch, i + 1, end));

            // Rate limiting. Kept outside the retry logic so that an interrupt during this
            // pause is never mistaken for a failed batch.
            if (end < words.size()) {
                Thread.sleep(RATE_LIMIT_DELAY_MS);
            }
        }

        // Write results
        writeScoresToCsv(allScores);
        System.out.printf("Completed! Scored %d words. Results saved to %s%n",
                allScores.size(), OUTPUT_SCORES);
    }

    /**
     * Reads the word list, one word per line, dropping surrounding whitespace and blank lines
     * so that empty entries are never sent to the model.
     */
    private static List<String> loadWords(String fileName) throws IOException {
        List<String> words = new ArrayList<>();
        for (String line : Files.readAllLines(Paths.get(fileName), StandardCharsets.UTF_8)) {
            String word = line.strip();
            if (!word.isEmpty()) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Scores one batch, retrying up to {@code MAX_RETRIES} times with exponential backoff.
     * If every attempt fails, returns a {@code FAILED} placeholder for each word so the output
     * stays aligned with the input.
     *
     * @param firstIndex 1-based index of the first word of the batch (for log messages only)
     * @param lastIndex 1-based index of the last word of the batch (for log messages only)
     * @throws InterruptedException if the thread is interrupted during a request or a backoff
     */
    private static List<WordScore> scoreBatchWithRetry(
            HttpClient client, List<String> batch, int firstIndex, int lastIndex)
            throws InterruptedException {
        for (int attempt = 1; attempt <= MAX_RETRIES; attempt++) {
            try {
                return processBatch(client, batch);
            } catch (InterruptedException e) {
                // An interrupt is a request to stop, not a transient failure: do not retry.
                throw e;
            } catch (Exception e) {
                // Print the exception itself: getMessage() is null for many exception types.
                System.err.printf("Batch %d-%d failed (attempt %d/%d): %s%n",
                        firstIndex, lastIndex, attempt, MAX_RETRIES, e);
                if (attempt < MAX_RETRIES) {
                    Thread.sleep(BASE_BACKOFF_MS << (attempt - 1)); // 2s, 4s, 8s, ...
                }
            }
        }

        System.err.println("Max retries reached, skipping batch");
        List<WordScore> failed = new ArrayList<>(batch.size());
        for (String word : batch) {
            failed.add(new WordScore(word, -1, "FAILED"));
        }
        return failed;
    }

    /**
     * Sends one batch of words to the Ollama chat endpoint and parses the reply.
     *
     * @param client HTTP client used for the request
     * @param batch words to score
     * @return one {@link WordScore} per word of {@code batch}, in the same order; words the
     *     model did not score are returned with score {@code -1} and status {@code MISSING}
     * @throws Exception if the request fails or times out, the server answers with a non-200
     *     status, or the response does not contain a message content string
     */
    public static List<WordScore> processBatch(HttpClient client, List<String> batch) throws Exception {
        String jsonRequest = buildChatRequestJson(createScoringPrompt(batch));

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(OLLAMA_ENDPOINT))
                .timeout(REQUEST_TIMEOUT)
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(jsonRequest, StandardCharsets.UTF_8))
                .build();

        HttpResponse<String> response = client.send(request,
                HttpResponse.BodyHandlers.ofString(StandardCharsets.UTF_8));

        if (response.statusCode() != 200) {
            throw new RuntimeException("HTTP " + response.statusCode() + ": " + response.body());
        }

        String assistantReply = extractMessageContent(response.body());
        return parseScoresFromReply(batch, assistantReply);
    }

    /** Builds the (Dutch) instruction prompt followed by the words, one per line. */
    private static String createScoringPrompt(List<String> words) {
        return """
                Je bent een Nederlandse taalexpert. Geef elk woord een populariteitsscore van 1-10.

                Score criteria:
                - 1 = Zeer zeldzaam, archaïsch, of extreem specifiek vakjargon
                - 10 = Zeer algemeen, dagelijks gebruikt door iedereen

                Geef ALLEEN een lijst in dit exacte formaat, niets anders:
                woord1:score
                woord2:score
                enz.

                Woorden om te scoren:
                """ + String.join("\n", words);
    }

    /**
     * Builds the JSON body for a non-streaming chat request with a single user message.
     * Package-private so it can be unit-tested.
     *
     * <p>The temperature is sent inside {@code options}: that is where the Ollama API reads
     * sampling parameters from.
     */
    static String buildChatRequestJson(String prompt) {
        // Simple JSON building (in production use a library like Jackson)
        return String.format("""
                {
                  "model": "%s",
                  "messages": [
                    {
                      "role": "user",
                      "content": "%s"
                    }
                  ],
                  "stream": false,
                  "options": {
                    "temperature": 0.1
                  }
                }
                """, escapeJson(MODEL), escapeJson(prompt));
    }

    /**
     * Escapes a string for use inside a JSON string literal: backslash, double quote and every
     * control character below U+0020. Package-private so it can be unit-tested.
     */
    static String escapeJson(String str) {
        StringBuilder out = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '\\' -> out.append("\\\\");
                case '"' -> out.append("\\\"");
                case '\n' -> out.append("\\n");
                case '\r' -> out.append("\\r");
                case '\t' -> out.append("\\t");
                case '\b' -> out.append("\\b");
                case '\f' -> out.append("\\f");
                default -> {
                    if (c < 0x20) {
                        // Remaining control characters need the four-hex-digit escape form.
                        out.append("\\u00")
                                .append(Character.forDigit(c >> 4, 16))
                                .append(Character.forDigit(c & 0xF, 16));
                    } else {
                        out.append(c);
                    }
                }
            }
        }
        return out.toString();
    }

    /**
     * Extracts and decodes the first {@code "content"} string value of the response, i.e. the
     * assistant reply in {@code "message":{"role":"assistant","content":"..."}}.
     * Package-private so it can be unit-tested.
     *
     * @throws IllegalStateException if no such string is present or it is not terminated
     */
    static String extractMessageContent(String responseBody) {
        final String key = "\"content\"";
        int searchFrom = 0;
        while (true) {
            int keyAt = responseBody.indexOf(key, searchFrom);
            if (keyAt < 0) {
                throw new IllegalStateException("No \"content\" string in response: " + responseBody);
            }
            // Accept optional whitespace around the colon; only a string value qualifies.
            int pos = skipWhitespace(responseBody, keyAt + key.length());
            if (pos < responseBody.length() && responseBody.charAt(pos) == ':') {
                pos = skipWhitespace(responseBody, pos + 1);
                if (pos < responseBody.length() && responseBody.charAt(pos) == '"') {
                    return decodeJsonString(responseBody, pos + 1);
                }
            }
            searchFrom = keyAt + 1;
        }
    }

    /** Returns the index of the first non-whitespace character at or after {@code from}. */
    private static int skipWhitespace(String text, int from) {
        int pos = from;
        while (pos < text.length() && Character.isWhitespace(text.charAt(pos))) {
            pos++;
        }
        return pos;
    }

    /**
     * Decodes a JSON string literal whose first character (just after the opening quote) is
     * at {@code start}, resolving all escape sequences, up to the closing unescaped quote.
     */
    private static String decodeJsonString(String json, int start) {
        StringBuilder out = new StringBuilder();
        int i = start;
        while (i < json.length()) {
            char c = json.charAt(i++);
            if (c == '"') {
                return out.toString();
            }
            if (c != '\\') {
                out.append(c);
                continue;
            }
            if (i >= json.length()) {
                break; // dangling backslash at end of input
            }
            char escape = json.charAt(i++);
            switch (escape) {
                case 'n' -> out.append('\n');
                case 'r' -> out.append('\r');
                case 't' -> out.append('\t');
                case 'b' -> out.append('\b');
                case 'f' -> out.append('\f');
                case 'u' -> {
                    if (i + 4 > json.length()) {
                        throw new IllegalStateException("Truncated unicode escape in response");
                    }
                    out.append((char) Integer.parseInt(json.substring(i, i + 4), 16));
                    i += 4;
                }
                default -> out.append(escape); // escaped quote, backslash or slash
            }
        }
        throw new IllegalStateException("Unterminated JSON string in response");
    }

    /**
     * Parses {@code word:score} lines from the model reply and matches them, case-insensitively,
     * to the expected words. Package-private so it can be unit-tested.
     *
     * <p>Lines without a colon or with a non-integer score are ignored; scores are clamped to
     * the range 1-10.
     *
     * @return one entry per expected word, in the order of {@code expectedWords}
     */
    static List<WordScore> parseScoresFromReply(List<String> expectedWords, String reply) {
        Map<String, Integer> scoresByWord = new HashMap<>();

        for (String rawLine : reply.split("\n")) {
            String line = rawLine.strip();
            int colon = line.indexOf(':');
            if (colon < 0) {
                continue;
            }
            // Locale.ROOT keeps the lookup key independent of the JVM's default locale.
            String word = line.substring(0, colon).strip().toLowerCase(Locale.ROOT);
            try {
                int score = Integer.parseInt(line.substring(colon + 1).strip());
                scoresByWord.put(word, Math.max(1, Math.min(10, score)));
            } catch (NumberFormatException ignored) {
                // Not a score line (explanatory text from the model); skip it.
            }
        }

        // Match scores to original words (maintaining order)
        List<WordScore> results = new ArrayList<>(expectedWords.size());
        for (String word : expectedWords) {
            Integer score = scoresByWord.get(word.toLowerCase(Locale.ROOT));
            if (score != null) {
                results.add(new WordScore(word, score, "OK"));
            } else {
                System.err.printf("Warning: No score found for '%s'%n", word);
                results.add(new WordScore(word, -1, "MISSING"));
            }
        }

        return results;
    }

    /** Writes a header row plus one {@code word,score,status} row per score to the CSV file. */
    private static void writeScoresToCsv(List<WordScore> scores) throws Exception {
        List<String> lines = new ArrayList<>(scores.size() + 1);
        lines.add("word,score,status");

        for (WordScore ws : scores) {
            lines.add(toCsvLine(ws));
        }

        Files.write(Paths.get(OUTPUT_SCORES), lines, StandardCharsets.UTF_8);
    }

    /**
     * Formats one CSV row. Plain concatenation is used for the score so the digits do not
     * depend on the default locale. Package-private so it can be unit-tested.
     */
    static String toCsvLine(WordScore ws) {
        return csvField(ws.word) + ',' + ws.score + ',' + csvField(ws.status);
    }

    /** Quotes a CSV field when it contains a comma, a double quote or a line break. */
    private static String csvField(String value) {
        String text = String.valueOf(value);
        boolean needsQuoting = text.indexOf(',') >= 0
                || text.indexOf('"') >= 0
                || text.indexOf('\n') >= 0
                || text.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return text;
        }
        return '"' + text.replace("\"", "\"\"") + '"';
    }

    // ===== DATA CLASS =====

    /** Score of a single word, plus optional bookkeeping about where it was scored. */
    static class WordScore {

        /** The word exactly as it appeared in the input list. */
        String word;
        /** Popularity score 1-10, or -1 when no score is available. */
        int score;
        /** "OK", "MISSING" or "FAILED". */
        String status;
        /** Endpoint that produced the score; null when created via the 3-argument constructor. */
        String endpoint;
        /** Batch identifier; 0 when created via the 3-argument constructor. */
        int batchId;

        WordScore(String word, int score, String status, String endpoint, int batchId) {
            this.word = word;
            this.score = score;
            this.status = status;
            this.endpoint = endpoint;
            this.batchId = batchId;
        }

        WordScore(String word, int score, String status) {
            this(word, score, status, null, 0);
        }
    }
}