Files
puzzle-generator/tools/hint/src/HintJob.java
2025-12-29 22:52:47 +01:00

582 lines
19 KiB
Java

import java.sql.DriverManager;
// Runtime configuration. Every value is looked up through env(...): the environment
// variable with the same name wins, and the literal below is used only when that
// variable is unset or blank.
// NOTE(review): the fallbacks embed a LAN address and a database password in source;
// consider requiring these variables outside local development.

// JDBC connection string handed to DriverManager.getConnection in main.
static final String JDBC_URL = env("JDBC_URL", "jdbc:postgresql://192.168.1.159:5432/postgres");
// Database user for that connection.
static final String JDBC_USER = env("JDBC_USER", "puzzle");
// Database password for that connection.
static final String JDBC_PASS = env("JDBC_PASS", "heel-goed-wachtwoord");
// Chat-completions endpoint that the hint generation and scoring requests are POSTed to.
static final String OLLAMA_URL = env("OLLAMA_URL", "http://192.168.1.159:8081/v1/chat/completions");
// Value sent as the "model" field of every request payload.
static final String MODEL = env("MODEL", "/models/TheBloke/Llama-2-13B-Chat-Dutch-GGUF/llama-2-13b-chat-dutch.Q8_0.gguf");
/**
 * Looks up an environment variable, substituting a default when it is missing.
 *
 * @param k   name of the environment variable
 * @param def value to use when the variable is unset or contains only whitespace
 * @return the variable's value, or {@code def}
 */
static String env(String k, String def) {
    String value = System.getenv(k);
    if (value != null && !value.isBlank()) {
        return value;
    }
    return def;
}
/**
 * Batch entry point: selects up to {@code limit} rows from
 * {@code export_real_words_with_hints} (oldest {@code updated_at} first), generates and
 * scores a hint for each word, and writes the result back one row per transaction.
 *
 * <p>Rows that already carry a non-blank hint keep it; for those only
 * {@code suggested_hint}, {@code hint_score} and {@code guessed_word} are refreshed.
 * Rows without a hint get the generated text in both {@code hint} and
 * {@code suggested_hint}.
 *
 * @param args optional first element: maximum number of rows to process (default 3000)
 * @throws Exception on JDBC failures or when hint generation/scoring throws
 */
void main(String[] args) throws Exception {
    Class.forName("org.postgresql.Driver");
    var limit = args.length > 0 ? Integer.parseInt(args[0]) : 3000;
    try (var c = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASS)) {
        c.setAutoCommit(false);
        // All three statements are prepared once and closed together; the suggestion
        // update used to be re-prepared for every row and never closed.
        try (var sel = c.prepareStatement(
                     "select ctid::text, woord, hint, hint_score "
                             + "from export_real_words_with_hints "
                             + "order by updated_at nulls first "
                             + "limit ? for update skip locked");
             var upd = c.prepareStatement(
                     "update export_real_words_with_hints set hint = ?, hint_score = ?, guessed_word = ?, suggested_hint = ?, updated_at = now() where ctid::text = ?");
             var updSug = c.prepareStatement(
                     "update export_real_words_with_hints set suggested_hint = ?, hint_score = ?, guessed_word = ?, updated_at = now() where ctid::text = ?")) {
            sel.setInt(1, limit);
            var done = 0;
            try (var rs = sel.executeQuery()) {
                // NOTE(review): the per-row commit below presumably releases the row locks
                // taken by "for update skip locked" after the first row - confirm this is
                // acceptable when several jobs run concurrently.
                while (rs.next()) {
                    var ctid = rs.getString(1);
                    var woord = rs.getString(2);
                    var oldHint = rs.getString(3);
                    if (woord == null || woord.isBlank()) {
                        continue; // nothing to describe; scoring would fail on a null word
                    }

                    var newHint = generateHint(woord);
                    if (newHint == null || newHint.isBlank()) {
                        continue;
                    }
                    newHint = sanitizeHint(newHint);
                    var scoreRes = scoreHint(newHint, woord);
                    var newScore = scoreRes.score;

                    // The main goal is to add hints to records that have none, and to keep
                    // an existing hint because the LLM output is sometimes disappointing.
                    if (oldHint != null && !oldHint.isBlank()) {
                        // A hint already exists: store the new text as a suggestion only
                        // and leave the original 'hint' column untouched.
                        updSug.setString(1, newHint);
                        updSug.setInt(2, newScore);
                        updSug.setString(3, scoreRes.guessedWord);
                        updSug.setString(4, ctid);
                        updSug.executeUpdate();
                    } else {
                        // No existing hint, so fill it in now.
                        upd.setString(1, newHint);
                        upd.setInt(2, newScore);
                        upd.setString(3, scoreRes.guessedWord);
                        upd.setString(4, newHint);
                        upd.setString(5, ctid);
                        upd.executeUpdate();
                    }
                    c.commit();
                    done++;
                    // Report progress only after an actual commit, so skipped rows do not
                    // repeat the same line.
                    if (done % 10 == 0) {
                        IO.println("Committed " + done);
                    }
                }
            }
            IO.println("Done. Updated " + done + " rows.");
        }
    }
}
/**
 * Reduces raw model output to a single-line hint.
 *
 * <p>The text is trimmed; when it still spans several lines only the last line is kept
 * (and trimmed again), which drops any lead-in text placed before the hint.
 *
 * @param hint raw hint text, may be {@code null}
 * @return the cleaned hint, or {@code null} when {@code hint} is {@code null}
 */
static String sanitizeHint(String hint) {
    if (hint == null) {
        return null;
    }
    String trimmed = hint.trim();
    int lastBreak = trimmed.lastIndexOf('\n');
    if (lastBreak < 0) {
        return trimmed;
    }
    return trimmed.substring(lastBreak + 1).trim();
}
/**
 * Outcome of scoring one hint, as returned by scoreHint.
 *
 * @param guessedWord normalized candidate word picked from the model's answer
 *                    (empty when no candidate was usable)
 * @param score       integer score computed by scoreHint; main stores it in hint_score
 */
record ScoreResult(String guessedWord, int score) {
}
/**
 * One candidate answer parsed from the model's JSON reply.
 *
 * @param w candidate word (parseCandidates stores it already normalized)
 * @param p weight the model reported for this candidate; the request asks for 0..1
 */
record Candidate(String w, double p) { }
/**
 * Scores a hint by asking the model which words it would guess from it.
 *
 * <p>The model is asked for five candidate words (with the same letter count as
 * {@code woord}) plus a weight {@code p} per candidate. Scoring:
 * <ul>
 *   <li>target found at 1-based rank r: {@code round(100 * p * (1 - 0.15 * (r - 1)))},
 *       with {@code p} clamped to 0..1;</li>
 *   <li>target not found: {@code max(0, 60 - 10 * d)} where d is the smallest Levenshtein
 *       distance between any non-empty candidate and the target;</li>
 *   <li>no usable candidate at all (request failed, empty or unparsable reply): 0.</li>
 * </ul>
 *
 * @param hint  hint text to evaluate
 * @param woord the word the hint is supposed to describe
 * @return the candidate taken as the model's guess together with the score
 * @throws Exception when the HTTP helper or the JSON extraction throws
 */
static ScoreResult scoreHint(String hint, String woord) throws Exception {
    int letterCount = woord.replaceAll("[^A-Za-z]", "").length();
    // JSON Schema for the reply; "object" is the schema type, the response_format
    // wrapper below carries the "json_object" marker (same shape as generateHint).
    String schema = """
            {
            "type": "object",
            "additionalProperties": false,
            "properties":{
            "candidates":{
            "type":"array",
            "minItems":5,
            "maxItems":5,
            "items":{
            "type":"object",
            "additionalProperties": false,
            "properties":{"w":{"type":"string","minLength":%d,"maxLength":%d},"p":{"type":"number","minimum":0,"maximum":1}},
            "required":["w","p"]
            }
            }
            },
            "required":["candidates"]
            }
            """.formatted(letterCount, letterCount);
    String prompt = """
            Je bent een Nederlandse kruiswoord-hulp.
            Geef EXACT 5 kandidaatwoorden (allemaal %d letters) die passen bij de hint.
            Geef ook p (0..1) per kandidaat, som p ≈ 1.0.
            Alleen JSON volgens dit schema, geen extra tekst.
            Hint: %s
            Schema: %s
            """.formatted(letterCount, hint, schema);
    String payload = "{"
            + "\"model\":\"" + jsonEscape(MODEL) + "\","
            + "\"messages\":["
            + "{\"role\":\"system\",\"content\":\"Je antwoordt uitsluitend met geldige JSON volgens het schema.\"},"
            + "{\"role\":\"user\",\"content\":\"" + jsonEscape(prompt) + "\"}"
            + "],"
            + "\"stream\":false,"
            + "\"max_tokens\":100,"
            + "\"response_format\":{"
            + "\"type\":\"json_object\","
            + "\"schema\":" + schema
            + "},"
            + "\"options\":{"
            + "\"temperature\":0.4,"
            + "\"seed\":1,"
            + "\"num_predict\":400,"
            + "\"stop\":[\"\\n\"]"
            + "},"
            + "\"keep_alive\":\"5m\""
            + "}";

    // Shared HTTP helper: sends the body over stdin and yields null when curl fails.
    String out = postCurl(OLLAMA_URL, payload);
    // The assistant message's "content" field holds the JSON document as a string.
    String content = (out == null) ? null : extractJsonString(out, "content");
    if (content == null) {
        content = "";
    }
    var candidates = parseCandidates(content);

    String original = normalizeWord(woord);
    String guessed = candidates.isEmpty() ? "" : normalizeWord(candidates.get(0).w);
    int rank = findRank(candidates, original);
    int score;
    if (rank > 0) {
        var candidate = candidates.get(rank - 1);
        guessed = normalizeWord(candidate.w);
        // The model is not guaranteed to respect the 0..1 range it was asked for.
        double pCorrect = Math.max(0.0, Math.min(1.0, candidate.p));
        score = (int) Math.round(100 * pCorrect * (1.0 - 0.15 * (rank - 1)));
    } else {
        int bestDist = Integer.MAX_VALUE;
        for (var c : candidates) {
            String cw = normalizeWord(c.w);
            if (cw.isEmpty()) {
                continue;
            }
            int distance = levenshtein(cw, original);
            if (distance < bestDist) {
                bestDist = distance;
                guessed = cw;
            }
        }
        // "Near miss" score; tune the factor to taste. Without any usable candidate
        // bestDist is still MAX_VALUE, and multiplying that by 10 would overflow into a
        // bogus positive score, so that case is pinned to 0.
        score = (bestDist == Integer.MAX_VALUE) ? 0 : Math.max(0, 60 - bestDist * 10);
    }
    return new ScoreResult(guessed, score);
}
/**
 * Finds the position of the target word within the candidate list.
 *
 * <p>Both the target and every candidate word are passed through normalizeWord before
 * they are compared.
 *
 * @param candidates candidate list in the model's order, may be {@code null} or empty
 * @param targetWord word to look for
 * @return 1-based position of the first matching candidate, or -1 when the list is
 *         null/empty, the target normalizes to an empty string, or nothing matches
 */
static int findRank(List<Candidate> candidates, String targetWord) {
    if (candidates == null || candidates.isEmpty()) {
        return -1;
    }
    String wanted = normalizeWord(targetWord);
    if (wanted.isEmpty()) {
        return -1;
    }
    int position = 0;
    for (Candidate candidate : candidates) {
        position++;
        if (wanted.equals(normalizeWord(candidate.w))) {
            return position;
        }
    }
    return -1;
}
/**
 * Canonical form used when comparing words: upper-cased, with every character outside
 * A-Z removed.
 *
 * @param s word to normalize, may be {@code null}
 * @return the normalized word; empty for {@code null} input
 */
private static String normalizeWord(String s) {
    if (s == null) {
        return "";
    }
    String upper = s.trim().toUpperCase();
    StringBuilder letters = new StringBuilder(upper.length());
    for (int idx = 0; idx < upper.length(); idx++) {
        char ch = upper.charAt(idx);
        if (ch >= 'A' && ch <= 'Z') {
            letters.append(ch);
        }
    }
    return letters.toString();
}
/**
 * Advances past JSON whitespace (space, line feed, carriage return, tab).
 *
 * @param s text being scanned
 * @param i index to start from
 * @return index of the first non-whitespace character at or after {@code i}, or the
 *         end of the scan when only whitespace remains
 */
private static int skipWs(String s, int i) {
    int pos = i;
    while (pos < s.length() && " \n\r\t".indexOf(s.charAt(pos)) >= 0) {
        pos++;
    }
    return pos;
}
/**
 * Locates the closing bracket that balances the brackets opened from {@code start} on.
 *
 * <p>Bracket characters inside double-quoted strings are ignored, and a backslash inside
 * a string protects the character that follows it.
 *
 * @param s     text to scan
 * @param start index to begin scanning at (normally the position of an opening bracket)
 * @param open  opening bracket character
 * @param close closing bracket character
 * @return index of the closing bracket that brings the nesting depth back to zero, or
 *         -1 when the text ends first (e.g. a truncated reply)
 */
private static int findMatching(String s, int start, char open, char close) {
    final int length = s.length();
    int depth = 0;
    int pos = start;
    while (pos < length) {
        char c = s.charAt(pos);
        if (c == '"') {
            // Jump over the whole string literal, honouring backslash escapes.
            pos++;
            while (pos < length) {
                char inner = s.charAt(pos);
                if (inner == '\\') {
                    pos += 2; // the escaped character can never end the string
                    continue;
                }
                if (inner == '"') {
                    break;
                }
                pos++;
            }
            pos++; // step past the closing quote (or past the end when unterminated)
            continue;
        }
        if (c == open) {
            depth++;
        } else if (c == close && --depth == 0) {
            return pos;
        }
        pos++;
    }
    return -1;
}
/**
 * Pulls at most five {@code {"w": ..., "p": ...}} entries out of the model's reply.
 *
 * <p>The parser is deliberately tolerant of truncated output: a missing closing
 * {@code ]} is treated as "array runs to the end of the text", and parsing stops at the
 * first object whose closing brace is missing. Entries lacking a string {@code w} or a
 * numeric {@code p} are skipped.
 *
 * @param contentJson JSON text containing a {@code "candidates"} array, may be null
 * @return the parsed candidates in reply order with normalized words; empty when the
 *         input is null or no candidates array is present
 * @throws Exception when a string value contains a truncated unicode escape
 */
static List<Candidate> parseCandidates(String contentJson) throws Exception {
    if (contentJson == null) {
        return List.of();
    }
    // Ignore anything the model wrote in front of the first '{'.
    int firstBrace = contentJson.indexOf('{');
    String json = (firstBrace < 0) ? contentJson : contentJson.substring(firstBrace);

    // "candidates" key -> following ':' -> following '['; give up if any step fails.
    int keyAt = json.indexOf("\"candidates\"");
    int colonAt = (keyAt < 0) ? -1 : json.indexOf(':', keyAt);
    int open = (colonAt < 0) ? -1 : json.indexOf('[', colonAt);
    if (open < 0) {
        return List.of();
    }
    int close = findMatching(json, open, '[', ']');
    // Truncation-tolerant: without a closing ']' use everything that is left.
    String items = json.substring(open + 1, (close < 0) ? json.length() : close);

    List<Candidate> parsed = new ArrayList<>(5);
    int cursor = 0;
    while (cursor < items.length() && parsed.size() < 5) {
        cursor = skipWs(items, cursor);
        if (cursor >= items.length()) {
            break;
        }
        if (items.charAt(cursor) == ',') {
            cursor++;
            continue;
        }
        int objOpen = items.indexOf('{', cursor);
        int objClose = (objOpen < 0) ? -1 : findMatching(items, objOpen, '{', '}');
        if (objClose < 0) {
            // No further object, or the object was cut off - everything after it is
            // suspect as well, so stop here.
            break;
        }
        String element = items.substring(objOpen, objClose + 1);
        String word = extractJsonString(element, "w");
        Double weight = extractJsonNumber(element, "p");
        if (word != null && weight != null) {
            parsed.add(new Candidate(normalizeWord(word), weight));
        }
        cursor = objClose + 1;
    }
    return parsed;
}
/**
 * Computes the Levenshtein edit distance (insertions, deletions and substitutions, each
 * costing 1) between two strings.
 *
 * @param s1 first string
 * @param s2 second string
 * @return minimum number of single-character edits turning {@code s1} into {@code s2}
 */
static int levenshtein(String s1, String s2) {
    final int cols = s2.length();
    // Only the previous and the current row of the distance table are kept.
    int[] previous = new int[cols + 1];
    int[] current = new int[cols + 1];
    for (int j = 0; j <= cols; j++) {
        previous[j] = j;
    }
    for (int i = 1; i <= s1.length(); i++) {
        char left = s1.charAt(i - 1);
        current[0] = i;
        for (int j = 1; j <= cols; j++) {
            int substitution = previous[j - 1] + (left == s2.charAt(j - 1) ? 0 : 1);
            int deletion = previous[j] + 1;
            int insertion = current[j - 1] + 1;
            current[j] = Math.min(Math.min(deletion, insertion), substitution);
        }
        int[] swap = previous;
        previous = current;
        current = swap;
    }
    return previous[cols];
}
/**
 * Asks the model for one crossword-style hint describing {@code woord}.
 *
 * <p>The request constrains the reply to a JSON object with a single {@code hint}
 * string; the hint is read from the {@code content} field of the HTTP response and then
 * from the {@code hint} field of that embedded JSON.
 *
 * @param woord the word to describe
 * @return the trimmed hint, or {@code null} when the request failed or the reply did
 *         not contain the expected fields
 * @throws Exception when the HTTP helper throws
 */
static String generateHint(String woord) throws Exception {
    String jsonSchema = """
            {
            "type":"object",
            "additionalProperties": false,
            "properties":{
            "hint":{"type":"string","minLength":3,"maxLength":120}
            },
            "required":["hint"]
            }
            """;
    String system =
            "Je antwoordt uitsluitend met geldige JSON volgens response_format. "
                    + "Geen extra tekst.";
    String user = """
            Je bent een kruiswoordpuzzelmaker.
            Maak precies 1 hint voor "<woord:%s>".
            Regels:
            - 4 t/m 8 woorden.
            - Vermijd "<woord:%s>" en elk deel ervan.
            - Geen inleiding, geen aanhalingstekens, geen punt.
            - Neutrale omschrijving, niet cryptisch.
            """.formatted(woord, woord);
    String payload = "{"
            + "\"model\":\"" + jsonEscape(MODEL) + "\","
            + "\"messages\":["
            + "{\"role\":\"system\",\"content\":\"" + jsonEscape(system) + "\"},"
            + "{\"role\":\"user\",\"content\":\"" + jsonEscape(user) + "\"}"
            + "],"
            + "\"stream\":false,"
            + "\"max_tokens\":100,"
            + "\"response_format\":{"
            + "\"type\":\"json_object\","
            + "\"schema\":" + jsonSchema
            + "},"
            + "\"options\":{"
            + "\"temperature\":0.35,"
            + "\"seed\":1,"
            + "\"num_predict\":120"
            + "}"
            + "}";
    String out = postCurl(OLLAMA_URL, payload);
    if (out == null) {
        // postCurl reports a failed request as null; the caller skips the row in that
        // case, so pass the "no hint" signal on instead of dereferencing null below.
        return null;
    }
    String content = jsonGetString(out, "content"); // -> {"hint":"..."}
    if (content == null) {
        return null;
    }
    String hint = jsonGetString(content, "hint"); // -> the actual hint text
    return hint == null ? null : hint.trim();
}
/**
 * Tells whether a hint gives away any of the forbidden word forms.
 *
 * <p>Hint and forbidden forms are compared in their normalize(...) form (lower case,
 * letters a-z only), so spacing, punctuation and case differences do not hide a leak.
 *
 * @param hint      hint text to inspect, may be {@code null}
 * @param forbidden word forms that must not occur in the hint; a {@code null} array and
 *                  entries that are null, blank or contain no letters are ignored
 * @return {@code true} when at least one usable forbidden form occurs in the hint
 */
static boolean leaksHint(String hint, String[] forbidden) {
    if (forbidden == null) {
        return false;
    }
    String normalizedHint = normalize(hint);
    for (String form : forbidden) {
        String needle = normalize(form);
        // An empty needle (null, blank, or e.g. digits only) would match every hint,
        // because contains("") is always true - such entries carry no information.
        if (needle.isEmpty()) {
            continue;
        }
        if (normalizedHint.contains(needle)) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the list of word forms a hint must not contain: the normalized word itself
 * plus a few simple Dutch stem variants.
 *
 * <p>For words longer than four letters the suffixes "je" (diminutive, e.g.
 * radijsje -> radijs), "en" (plural) and "s" (possessive/plural) are stripped when
 * present, each yielding one extra form.
 *
 * @param woord the word to protect
 * @return the distinct forms, the normalized word first
 */
static String[] forbiddenForms(String woord) {
    String base = normalize(woord).toLowerCase();
    List<String> forms = new ArrayList<>();
    forms.add(base);
    if (base.length() > 4) {
        for (String suffix : new String[] {"je", "en", "s"}) {
            if (base.endsWith(suffix)) {
                forms.add(base.substring(0, base.length() - suffix.length()));
            }
        }
    }
    // Hand back each form once, in the order it was produced.
    List<String> unique = new ArrayList<>(forms.size());
    for (String form : forms) {
        if (!unique.contains(form)) {
            unique.add(form);
        }
    }
    return unique.toArray(new String[0]);
}
/**
 * Lower-cases the text and keeps only the letters a-z.
 *
 * @param s text to normalize, may be {@code null}
 * @return the normalized text; empty for {@code null} input
 */
static String normalize(String s) {
    if (s == null) {
        return "";
    }
    String lower = s.toLowerCase();
    StringBuilder kept = new StringBuilder(lower.length());
    for (int idx = 0; idx < lower.length(); idx++) {
        char ch = lower.charAt(idx);
        if (ch >= 'a' && ch <= 'z') {
            kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Escapes text so it can be placed between the double quotes of a JSON string.
 *
 * <p>Backslash and double quote are escaped, the common control characters get their
 * short forms (n, r, t, b, f), and every other character below U+0020 is written as a
 * six-character hexadecimal escape, since JSON forbids raw control characters inside
 * strings.
 *
 * @param s text to escape (must not be {@code null})
 * @return the escaped text, without surrounding quotes
 */
static String jsonEscape(String s) {
    var b = new StringBuilder(s.length() + 16);
    for (var i = 0; i < s.length(); i++) {
        var ch = s.charAt(i);
        switch (ch) {
            case '\\' -> b.append("\\\\");
            case '"' -> b.append("\\\"");
            case '\n' -> b.append("\\n");
            case '\r' -> b.append("\\r");
            case '\t' -> b.append("\\t");
            case '\b' -> b.append("\\b");
            case '\f' -> b.append("\\f");
            default -> {
                if (ch < 0x20) {
                    // Remaining control characters have no short form.
                    b.append("\\u%04x".formatted((int) ch));
                } else {
                    b.append(ch);
                }
            }
        }
    }
    return b.toString();
}
/**
 * Reads the string value that follows the first occurrence of {@code "key"} in a piece
 * of JSON text, decoding backslash escapes.
 *
 * <p>This is a lenient scanner, not a full parser: it takes the first colon after the
 * key and expects a double-quoted value right after optional whitespace. A value that
 * is cut off before its closing quote is returned as far as it goes.
 *
 * @param obj JSON text to search
 * @param key property name to look for (without quotes)
 * @return the decoded value, or {@code null} when the key is absent or its value is not
 *         a string
 * @throws Exception when a unicode escape has fewer than four characters left
 */
private static String extractJsonString(String obj, String key) throws Exception {
    int keyAt = obj.indexOf("\"" + key + "\"");
    if (keyAt < 0) {
        return null;
    }
    int colonAt = obj.indexOf(':', keyAt);
    if (colonAt < 0) {
        return null;
    }
    int pos = skipWs(obj, colonAt + 1);
    if (pos >= obj.length() || obj.charAt(pos) != '"') {
        return null;
    }
    pos++; // step over the opening quote

    StringBuilder value = new StringBuilder();
    while (pos < obj.length()) {
        char c = obj.charAt(pos++);
        if (c == '"') {
            break; // end of the string
        }
        if (c != '\\') {
            value.append(c);
            continue;
        }
        if (pos >= obj.length()) {
            break; // text ends right after a backslash
        }
        char escaped = obj.charAt(pos++);
        switch (escaped) {
            case 'b' -> value.append('\b');
            case 'f' -> value.append('\f');
            case 'n' -> value.append('\n');
            case 'r' -> value.append('\r');
            case 't' -> value.append('\t');
            case 'u' -> {
                if (pos + 4 > obj.length()) {
                    throw new Exception("Bad unicode escape");
                }
                value.append((char) Integer.parseInt(obj.substring(pos, pos + 4), 16));
                pos += 4;
            }
            // quote, backslash, slash and any unknown escape stand for themselves
            default -> value.append(escaped);
        }
    }
    return value.toString();
}
/**
 * Minimal JSON string extractor for {@code {"key":"value"}} style text.
 *
 * <p>Finds the first occurrence of {@code "key"}, expects a colon and a double-quoted
 * value after it, and decodes the escapes n, r, t, b, f, quote, backslash, slash and
 * four-hex-digit unicode escapes. A unicode escape that is not followed by four hex
 * digits is kept as literal text rather than rejected.
 *
 * @param json JSON text to search, may be {@code null}
 * @param key  property name to look for (without quotes)
 * @return the decoded value, or {@code null} when {@code json} is null, the key is
 *         absent, the value is not a string, or the string is not terminated
 */
static String jsonGetString(String json, String key) {
    if (json == null) {
        return null;
    }
    var needle = "\"" + key + "\"";
    var i = json.indexOf(needle);
    if (i < 0) return null;
    i = json.indexOf(':', i + needle.length());
    if (i < 0) return null;
    i++;
    while (i < json.length() && Character.isWhitespace(json.charAt(i))) i++;
    if (i >= json.length() || json.charAt(i) != '"') return null;
    i++; // after opening quote
    var b = new StringBuilder();
    var esc = false;
    for (; i < json.length(); i++) {
        var ch = json.charAt(i);
        if (esc) {
            switch (ch) {
                case 'n' -> b.append('\n');
                case 'r' -> b.append('\r');
                case 't' -> b.append('\t');
                case 'b' -> b.append('\b');
                case 'f' -> b.append('\f');
                case '"' -> b.append('"');
                case '\\' -> b.append('\\');
                case 'u' -> {
                    int code = decodeHex4(json, i + 1);
                    if (code >= 0) {
                        b.append((char) code);
                        i += 4; // the loop increment then steps past the last digit
                    } else {
                        b.append(ch); // malformed escape: keep the text as it is
                    }
                }
                default -> b.append(ch);
            }
            esc = false;
        } else if (ch == '\\') esc = true;
        else if (ch == '"') return b.toString();
        else b.append(ch);
    }
    return null;
}

/** Parses four hex digits starting at {@code from}; returns -1 if they are not all there. */
private static int decodeHex4(String s, int from) {
    if (from + 4 > s.length()) {
        return -1;
    }
    int value = 0;
    for (int k = from; k < from + 4; k++) {
        int digit = Character.digit(s.charAt(k), 16);
        if (digit < 0) {
            return -1;
        }
        value = (value << 4) | digit;
    }
    return value;
}
/**
 * Reads the numeric value that follows the first occurrence of {@code "key"} in a piece
 * of JSON text.
 *
 * <p>After the first colon behind the key and optional whitespace, the longest run of
 * digits, signs, dots and exponent markers is taken and parsed as a double.
 *
 * @param obj JSON text to search
 * @param key property name to look for (without quotes)
 * @return the parsed number, or {@code null} when the key is absent, no number follows,
 *         or the characters found do not form a valid number
 */
private static Double extractJsonNumber(String obj, String key) {
    int keyAt = obj.indexOf("\"" + key + "\"");
    if (keyAt < 0) {
        return null;
    }
    int colonAt = obj.indexOf(':', keyAt);
    if (colonAt < 0) {
        return null;
    }
    int begin = skipWs(obj, colonAt + 1);
    int end = begin;
    while (end < obj.length() && "0123456789-+.eE".indexOf(obj.charAt(end)) >= 0) {
        end++;
    }
    if (end == begin) {
        return null; // nothing number-like here (also covers running off the end)
    }
    try {
        return Double.parseDouble(obj.substring(begin, end));
    } catch (NumberFormatException e) {
        return null;
    }
}
/**
 * POSTs a JSON body to {@code url} by running the {@code curl} command.
 *
 * <p>The body is written to curl's standard input, and curl's error output is merged
 * into the captured output. curl is asked to append a status marker line, which is cut
 * off again before the response is returned.
 *
 * @param url     target URL
 * @param payload request body, sent as UTF-8
 * @return the trimmed response body, or {@code null} when curl exits with a non-zero
 *         code (the captured output is then written to standard error)
 * @throws Exception when the process cannot be started, its streams fail, or the wait
 *                   is interrupted
 */
static String postCurl(String url, String payload) throws Exception {
    var curl = new ProcessBuilder(
            "curl", "-sS", "-f",
            "-H", "Content-Type: application/json",
            "-X", "POST", url,
            "--data-binary", "@-",
            "-w", "\n__HTTP_STATUS__:%{http_code}\n"
    ).redirectErrorStream(true).start();

    try (var stdin = curl.getOutputStream()) {
        stdin.write(payload.getBytes(StandardCharsets.UTF_8));
    }
    String response;
    try (var stdout = curl.getInputStream()) {
        response = new String(stdout.readAllBytes(), StandardCharsets.UTF_8);
    }

    int exitCode = curl.waitFor();
    if (exitCode != 0) {
        System.err.println("curl failed (" + exitCode + "): " + response);
        return null;
    }
    // Cut the status marker (and whatever follows it) off the end of the output.
    int markerAt = response.lastIndexOf("__HTTP_STATUS__:");
    if (markerAt < 0) {
        return response;
    }
    return response.substring(0, markerAt).trim();
}