Gather data

This commit is contained in:
mike
2025-12-25 00:21:58 +01:00
parent 85ebfd3013
commit 49a1aa4152
12 changed files with 494 additions and 27933 deletions

View File

@@ -0,0 +1,220 @@
package puzzle;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
 * Attaches a clue to every word of an exported puzzle.
 *
 * <p>Clues are taken from a pre-built CSV file ({@link #HINTS_FILE}) when the word is listed
 * there; all remaining words are sent in one request to a local Ollama chat endpoint and the
 * reply is parsed line by line. Words for which no clue could be obtained keep the word itself
 * as their clue.
 */
public class ClueGenerator {
    private static final String OLLAMA_URL = "http://localhost:11434/api/chat";
    private static final String MODEL = "qwen2.5:14b";
    private static final String HINTS_FILE = "export_with_hints.csv";

    /** Upper-cased word -> clue, lazily filled by {@link #ensurePrebuiltCluesLoaded()}. */
    private static Map<String, String> prebuiltClues = null;

    /**
     * Loads {@link #HINTS_FILE} once. Each line is split into at most three comma-separated
     * fields; the first is the word and the third (which may itself contain commas) is the clue.
     * A missing or unreadable file only produces a warning and leaves the map empty.
     */
    private static synchronized void ensurePrebuiltCluesLoaded() {
        if (prebuiltClues != null) {
            return;
        }
        Map<String, String> loaded = new HashMap<>();
        try {
            List<String> lines = Files.readAllLines(Path.of(HINTS_FILE), StandardCharsets.UTF_8);
            for (String line : lines) {
                String[] parts = line.split(",", 3);
                if (parts.length < 3) {
                    continue;
                }
                String word = parts[0].trim().toUpperCase(Locale.ROOT);
                String clue = unquoteCsvField(parts[2].trim());
                if (!word.isEmpty() && !clue.isEmpty()) {
                    loaded.put(word, clue);
                }
            }
        } catch (IOException e) {
            // Best effort: clue generation still works without the pre-built file.
            System.err.println("Warning: " + HINTS_FILE + " not found or could not be read: " + e);
        }
        prebuiltClues = loaded;
    }

    /**
     * Removes one pair of surrounding double quotes and collapses doubled quotes inside.
     * The length check matters: a field consisting of a single {@code "} both starts and ends
     * with a quote, and stripping it would call {@code substring(1, 0)} and throw.
     */
    private static String unquoteCsvField(String field) {
        if (field.length() >= 2 && field.startsWith("\"") && field.endsWith("\"")) {
            return field.substring(1, field.length() - 1).replace("\"\"", "\"");
        }
        return field;
    }

    /**
     * Returns a copy of {@code puzzle} whose words carry clues.
     *
     * @param puzzle the exported puzzle; {@code null} or a puzzle without words is returned as is
     * @return a new puzzle with the same grid, difficulty and rewards, and one output word per
     *     input word; a word without any clue gets its own text as clue
     */
    public static ExportFormat.ExportedPuzzle applyClues(ExportFormat.ExportedPuzzle puzzle) {
        if (puzzle == null || puzzle.words().isEmpty()) {
            return puzzle;
        }
        ensurePrebuiltCluesLoaded();

        Map<String, String> finalClueMap = new HashMap<>();
        List<String> wordsMissingClues = new ArrayList<>();
        for (var w : puzzle.words()) {
            // Pre-built clues are keyed by upper-cased word; the result map by the word as given.
            String prebuilt = prebuiltClues.get(w.word().toUpperCase(Locale.ROOT));
            if (prebuilt != null) {
                finalClueMap.put(w.word(), prebuilt);
            } else {
                wordsMissingClues.add(w.word());
            }
        }
        if (!wordsMissingClues.isEmpty()) {
            finalClueMap.putAll(generateClues(wordsMissingClues));
        }

        List<ExportFormat.WordOut> wordsWithClues = new ArrayList<>();
        for (var w : puzzle.words()) {
            String clue = finalClueMap.getOrDefault(w.word(), w.word());
            wordsWithClues.add(new ExportFormat.WordOut(
                    w.word(),
                    clue,
                    w.startRow(),
                    w.startCol(),
                    w.direction(),
                    w.answer(),
                    w.arrowRow(),
                    w.arrowCol(),
                    w.isReversed()
            ));
        }
        return new ExportFormat.ExportedPuzzle(puzzle.gridv2(), wordsWithClues, puzzle.difficulty(), puzzle.rewards());
    }

    /**
     * Asks the chat model for clues for the given words.
     *
     * @param words the words to describe; {@code null} or empty yields an empty map
     * @return word (as passed in) -> clue for every word found in the reply; empty when the
     *     request fails or the reply has no content
     */
    public static Map<String, String> generateClues(List<String> words) {
        if (words == null || words.isEmpty()) {
            return Collections.emptyMap();
        }
        String prompt = createCluePrompt(words);
        try {
            // Ollama reads sampling parameters from the "options" object; a top-level
            // "temperature" field is not one of the documented request fields.
            String jsonRequest = String.format(
                    "{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"%s\"}],\"stream\":false,\"options\":{\"temperature\":0.7}}",
                    escapeJson(MODEL), escapeJson(prompt)
            );
            String responseBody = curlPostJson(OLLAMA_URL, jsonRequest, 120);
            String content = extractChatContent(responseBody);
            if (content == null || content.isEmpty()) {
                return Collections.emptyMap();
            }
            return parseCluesFromReply(words, content);
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of silently consuming it.
            Thread.currentThread().interrupt();
            System.err.println("Failed to generate clues: " + e.getMessage());
            return Collections.emptyMap();
        } catch (Exception e) {
            System.err.println("Failed to generate clues: " + e.getMessage());
            return Collections.emptyMap();
        }
    }

    /** Builds the (Dutch) instruction prompt followed by the words, one per line. */
    private static String createCluePrompt(List<String> words) {
        return "Je bent een expert in het maken van kruiswoordpuzzels. Geef voor elk van de onderstaande woorden een korte, uitdagende maar duidelijke cryptische of beschrijvende aanwijzing in het Nederlands.\n\n" +
                "Output ALLEEN in dit formaat:\n" +
                "woord1:aanwijzing\n" +
                "woord2:aanwijzing\n\n" +
                "GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
                "Lijst:\n" +
                String.join("\n", words);
    }

    /**
     * Parses {@code word:clue} lines from the model reply.
     *
     * <p>Leading list decoration (digits, {@code + . ) * -} and whitespace) is stripped from the
     * word part. Matching against the expected words is case-insensitive; when a word occurs on
     * several lines the last one wins.
     *
     * @return expected word -> clue, only for expected words that appear in the reply
     */
    private static Map<String, String> parseCluesFromReply(List<String> expectedWords, String reply) {
        Map<String, String> wordClueMap = new HashMap<>();
        for (String rawLine : reply.split("\n")) {
            String line = rawLine.trim();
            int sep = line.indexOf(':');
            if (sep < 0) {
                continue;
            }
            // Locale.ROOT keeps the key stable regardless of the JVM default locale.
            String wordPart = line.substring(0, sep).trim()
                    .replaceAll("^[\\d+.)*\\-\\s]+", "")
                    .toLowerCase(Locale.ROOT);
            String clue = line.substring(sep + 1).trim();
            if (!clue.isEmpty()) {
                wordClueMap.put(wordPart, clue);
            }
        }
        Map<String, String> results = new HashMap<>();
        for (String word : expectedWords) {
            String clue = wordClueMap.get(word.toLowerCase(Locale.ROOT));
            if (clue != null) {
                results.put(word, clue);
            }
        }
        return results;
    }

    /**
     * POSTs {@code jsonBody} to {@code url} by running the {@code curl} executable and returns
     * whatever curl printed. The body is passed through a temporary file that is always deleted.
     *
     * @param timeoutSeconds value for curl's {@code --max-time}
     * @throws IOException if curl exits with a non-zero code (its output is in the message)
     * @throws Exception for failures starting the process or waiting for it
     */
    private static String curlPostJson(String url, String jsonBody, int timeoutSeconds) throws Exception {
        var tempFile = Files.createTempFile("clue-request-", ".json");
        try {
            Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
            List<String> cmd = new ArrayList<>();
            cmd.add("curl");
            cmd.add("-fsSL");
            cmd.add("--connect-timeout");
            cmd.add("10");
            cmd.add("--max-time");
            cmd.add(String.valueOf(timeoutSeconds));
            cmd.add("-H");
            cmd.add("Content-Type: application/json");
            // --data-binary sends the file verbatim; -d strips CR/LF when reading from a file.
            cmd.add("--data-binary");
            cmd.add("@" + tempFile);
            cmd.add(url);
            Process p = new ProcessBuilder(cmd)
                    .redirectErrorStream(true)
                    .start();
            try {
                byte[] bytes = p.getInputStream().readAllBytes();
                int code = p.waitFor();
                String output = new String(bytes, StandardCharsets.UTF_8);
                if (code != 0) {
                    throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" + output);
                }
                return output;
            } finally {
                // Only relevant when reading or waiting failed; a finished process is untouched.
                if (p.isAlive()) {
                    p.destroyForcibly();
                }
            }
        } finally {
            Files.deleteIfExists(tempFile);
        }
    }

    /**
     * Extracts the first string-valued {@code "content"} field from a chat response without a
     * JSON library. If a {@code "choices"} key exists the search starts there; otherwise (and as
     * a fallback) it starts at the beginning, which matches the Ollama /api/chat shape
     * {@code {"model":"...","message":{"role":"assistant","content":"..."}}}.
     *
     * @return the decoded string, or {@code null} when {@code json} is {@code null}, has no
     *     {@code "content"} key, or the value after the key is not a JSON string
     */
    private static String extractChatContent(String json) {
        if (json == null) {
            return null;
        }
        int choices = json.indexOf("\"choices\"");
        int key = json.indexOf("\"content\"", Math.max(choices, 0));
        if (key < 0) {
            key = json.indexOf("\"content\"");
            if (key < 0) {
                return null;
            }
        }
        int colon = json.indexOf(':', key);
        if (colon < 0) {
            return null;
        }
        // The value must start with a quote right after optional whitespace; scanning ahead for
        // "the next quote" would pick up an unrelated string when the value is e.g. null.
        int q = colon + 1;
        while (q < json.length() && Character.isWhitespace(json.charAt(q))) {
            q++;
        }
        if (q >= json.length() || json.charAt(q) != '"') {
            return null;
        }

        StringBuilder sb = new StringBuilder();
        for (int k = q + 1; k < json.length(); k++) {
            char ch = json.charAt(k);
            if (ch == '"') {
                break;
            }
            if (ch != '\\') {
                sb.append(ch);
                continue;
            }
            if (++k >= json.length()) {
                break; // dangling backslash at end of input
            }
            char esc = json.charAt(k);
            switch (esc) {
                case 'n': sb.append('\n'); break;
                case 't': sb.append('\t'); break;
                case 'r': sb.append('\r'); break;
                case 'b': sb.append('\b'); break;
                case 'f': sb.append('\f'); break;
                case 'u': {
                    int unit = parseHex4(json, k + 1);
                    if (unit >= 0) {
                        // Surrogate pairs arrive as two consecutive escapes and recombine here.
                        sb.append((char) unit);
                        k += 4;
                    } else {
                        sb.append(esc); // malformed escape: keep the character as it is
                    }
                    break;
                }
                default: sb.append(esc); // covers \" \\ \/
            }
        }
        return sb.toString();
    }

    /** Reads four hex digits starting at {@code from}; returns -1 if they are missing or invalid. */
    private static int parseHex4(String s, int from) {
        if (from + 4 > s.length()) {
            return -1;
        }
        int value = 0;
        for (int i = from; i < from + 4; i++) {
            int digit = Character.digit(s.charAt(i), 16);
            if (digit < 0) {
                return -1;
            }
            value = (value << 4) | digit;
        }
        return value;
    }

    /**
     * Escapes {@code str} for use inside a JSON string literal: backslash, double quote and
     * every control character below U+0020 (JSON forbids those unescaped).
     */
    private static String escapeJson(String str) {
        StringBuilder sb = new StringBuilder(str.length() + 16);
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            switch (c) {
                case '\\': sb.append("\\\\"); break;
                case '"': sb.append("\\\""); break;
                case '\n': sb.append("\\n"); break;
                case '\r': sb.append("\\r"); break;
                case '\t': sb.append("\\t"); break;
                case '\b': sb.append("\\b"); break;
                case '\f': sb.append("\\f"); break;
                default:
                    if (c < 0x20) {
                        sb.append(String.format("\\u%04x", (int) c));
                    } else {
                        sb.append(c);
                    }
            }
        }
        return sb.toString();
    }
}

View File

@@ -31,7 +31,7 @@ public class DailyGenerator {
public static void main(String[] args) {
var outDir = env("OUT_DIR", "/home/mike/dev/puzzle-generator/data/");
var wordsPath = env("WORDS_PATH", "./word-list.txt");
var wordsPath = env("WORDS_PATH", "./export_words_only.txt");
var puzzlesPerDay = envInt("PUZZLES_PER_DAY", 3);
var seed = envInt("SEED", (int) System.currentTimeMillis());
var themeFilter = envBool("THEME_FILTER", true);
@@ -119,6 +119,10 @@ public class DailyGenerator {
result, 1, new ExportFormat.Rewards(50, 2, 1)
);
// Generate clues via LLM
System.out.println("Generating clues for " + exported.words().size() + " words...");
exported = ClueGenerator.applyClues(exported);
// Write to JSON file
var filename = String.format("crossword_%s_%02d_%s.json", dateStr, i, safeSlug(theme));
var outputPath = Paths.get(outDir, filename);

View File

@@ -7,7 +7,10 @@ import java.nio.file.Paths;
import java.time.OffsetDateTime;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
public class Main {
// ---------------- CLI ----------------
@@ -15,7 +18,7 @@ public class Main {
public static class Opts {
public int seed = 1;
public int pop = 18;
public int gens = 200;
public int gens = 1000;
public int tries = 5;
public String wordsPath = "./out/pool.txt";
public double minSimplicity = 0; // 0 means no limit
@@ -74,6 +77,11 @@ public class Main {
System.out.println(SwedishGenerator.renderHuman(res.filled().grid));
System.out.printf(Locale.ROOT, "Puzzle Simplicity: %.2f%n", res.filled().simplicity);
var out = ExportFormat.exportFormatFromFilled(res, 1, new ExportFormat.Rewards(50, 2, 1));
// Generate clues via LLM
System.out.println("Generating clues for " + out.words().size() + " words...");
out = ClueGenerator.applyClues(out);
System.out.println("gridv2:");
for (String row : out.gridv2()) System.out.println(row);
System.out.println("words: " + out.words().size());

View File

@@ -145,7 +145,7 @@ public class SwedishGenerator {
// Base difficulty starts high and decreases with length and score.
// Length impact: (8 - length) * 30
// Score impact: up to 10 * 15 = 150
var difficulty1 = 0 + ((8 - word.length()) * 30) + ((10-score) * 15);
var difficulty1 = 0 + ((8 - word.length()) * 30) + ((10 - score) * 15);
this.difficulty = difficulty1;
}
}
@@ -153,24 +153,19 @@ public class SwedishGenerator {
static Map<String, Integer> loadScores() {
var scores = new HashMap<String, Integer>();
try {
var lines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
var lines = Files.readAllLines(Path.of("export_words.csv"), StandardCharsets.UTF_8);
var first = true;
for (var line : lines) {
if (first) {
first = false;
continue;
}
var parts = line.split("," );
if (parts.length >= 3) {
var parts = line.split(",");
if (parts.length >= 2) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
scores.put(word, score);
} else {
System.err.println("Skipping:" +Arrays.toString( parts));
}
scores.put(word, score);
} catch (NumberFormatException ignored) {
System.err.println("Illegal number format: " + line);
}
@@ -906,7 +901,7 @@ public class SwedishGenerator {
var tLoad0 = System.nanoTime();
var dict = loadWords(opts.wordsPath, llmScores);
var tLoad1 = System.nanoTime();
System.out.printf(Locale.ROOT, "LOAD_WORDS: %.3fs%n %s words", (tLoad1 - tLoad0) / 1e9,dict.words.size());
System.out.printf(Locale.ROOT, "LOAD_WORDS: %.3fs%n %s words", (tLoad1 - tLoad0) / 1e9, dict.words.size());
for (var attempt = 1; attempt <= opts.tries; attempt++) {
System.out.println("\nAttempt " + attempt + "/" + opts.tries);

View File

@@ -56,7 +56,7 @@ public class ThemePoolBuilderLength {
static final class Opts {
String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
String wordsPath = "/home/mike/dev/puzzle-generator/export_words_only.txt";
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
String outDir = "./out";
@@ -301,12 +301,12 @@ public class ThemePoolBuilderLength {
var parts = line.split(",", 3);
if (parts.length >= 3) {
try {
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
var status = parts[2].trim();
if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
}
var word = parts[0].trim().toUpperCase(Locale.ROOT);
var score = Integer.parseInt(parts[1].trim());
// var status = parts[2].trim();
// if ("OK".equalsIgnoreCase(status)) {
llmScores.put(word, score);
//}
} catch (NumberFormatException ignored) { }
}
}
@@ -731,7 +731,7 @@ public class ThemePoolBuilderLength {
// Optionally filter out VERY complex words from the bridge (e.g. lScore < 3)
// But since we sort by score (which is now dominated by lScore),
// they will be at the very bottom anyway.
if (lex.score[i] < 800) continue;
// if (lex.score[i] < 800) continue;
ids.add(i);
}
@@ -774,8 +774,8 @@ public class ThemePoolBuilderLength {
var out = new ArrayList<String>(ids.size());
for (var id : ids) {
/* if (lex.score[id] < 680)
continue;*/
if (lex.score[id] < 680)
continue;
out.add(lex.words.get(id));
}
Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);