Gather data
This commit is contained in:
100
.db/init.sql
Normal file
100
.db/init.sql
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
-- 0) Optional: index on frequency(Word) (handy)
CREATE INDEX IF NOT EXISTS idx_frequency_word ON frequency(Word);
|
||||||
|
|
||||||
|
-- 1) View: normalisation of Zipf and length (0..1)
-- Exposes one row per frequency row whose Zipf value is usable (non-NULL after
-- parsing), with the parsed Zipf value, the word length and both values
-- min-max scaled against the extremes found in the table.
DROP VIEW IF EXISTS word_norm;
CREATE VIEW word_norm AS
WITH base AS (
    SELECT
        Word,
        -- Replace a decimal comma by a dot and map '' to NULL before casting to REAL.
        CAST(NULLIF(REPLACE(Zipf, ',', '.'), '') AS REAL) AS zipf_num,
        LENGTH(Word) AS len
    FROM frequency
    WHERE Word IS NOT NULL
),
stats AS (
    -- Single-row summary: extremes computed only over rows with a usable Zipf value.
    SELECT
        MIN(zipf_num) AS min_zipf,
        MAX(zipf_num) AS max_zipf,
        MIN(len) AS min_len,
        MAX(len) AS max_len
    FROM base
    WHERE zipf_num IS NOT NULL
)
SELECT
    b.Word,
    b.zipf_num,
    b.len,

    -- freq_norm: 0 = very frequent, 1 = very rare
    -- NULLIF turns a zero range (max_zipf = min_zipf) into NULL, so the result
    -- is NULL instead of a division by zero.
    (1.0 - (b.zipf_num - s.min_zipf) / NULLIF(s.max_zipf - s.min_zipf, 0.0)) AS freq_norm,

    -- len_norm: 0..1
    -- The CAST to REAL forces floating-point division; a zero range yields NULL.
    ( (b.len - s.min_len) / NULLIF(CAST(s.max_len - s.min_len AS REAL), 0.0) ) AS len_norm
FROM base b
CROSS JOIN stats s
WHERE b.zipf_num IS NOT NULL;
|
||||||
|
|
||||||
|
|
||||||
|
-- 2) Build the letter frequencies (letter_freq)
CREATE TABLE IF NOT EXISTS letter_freq (
    ch TEXT PRIMARY KEY,   -- one lower-cased character
    cnt INTEGER NOT NULL   -- number of occurrences of that character over all words
);

-- Empty the table first so that re-running the script rebuilds the counts from scratch.
DELETE FROM letter_freq;

-- Recursive CTE producing one row per (word, position): the anchor takes the
-- first character of every non-empty word, the recursive step advances one
-- position until the end of the word. Every character is counted, no filter
-- restricts the rows to letters.
WITH RECURSIVE chars(word, pos, ch) AS (
    SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1))
    FROM frequency
    WHERE Word IS NOT NULL AND LENGTH(Word) > 0
    UNION ALL
    SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1))
    FROM chars
    WHERE pos < LENGTH(word)
)
INSERT INTO letter_freq(ch, cnt)
SELECT ch, COUNT(*) AS cnt
FROM chars
WHERE ch <> '' -- basic guard
GROUP BY ch;
|
||||||
|
|
||||||
|
|
||||||
|
-- 3) View: letterRarity per word (0..1)
-- For every word: the average over its characters of (1 - share of that
-- character in letter_freq). The value lies between 0 and 1 but is not
-- min-max scaled. Characters that have no row in letter_freq are dropped by
-- the inner join and do not contribute to the average.
DROP VIEW IF EXISTS letter_rarity;
CREATE VIEW letter_rarity AS
WITH total AS (
    -- Total number of counted characters, as REAL so the division below is floating point.
    SELECT CAST(SUM(cnt) AS REAL) AS total_cnt
    FROM letter_freq
),
-- Same per-character expansion as used to fill letter_freq: one row per (word, position).
word_chars(word, pos, ch) AS (
    SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1))
    FROM frequency
    WHERE Word IS NOT NULL AND LENGTH(Word) > 0
    UNION ALL
    SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1))
    FROM word_chars
    WHERE pos < LENGTH(word)
)
SELECT
    wc.word AS Word,
    AVG(1.0 - (lf.cnt / t.total_cnt)) AS letter_rarity_norm
FROM word_chars wc
JOIN letter_freq lf ON lf.ch = wc.ch
CROSS JOIN total t
GROUP BY wc.word;
|
||||||
|
|
||||||
|
|
||||||
|
-- 4) View: complexity v1 (ngramProb + morphology = placeholder 0.5)
-- Weighted sum per word:
--   0.35 * frequency rarity + 0.20 * length + 0.20 * n-gram placeholder
--   + 0.15 * letter rarity + 0.10 * morphology placeholder.
-- word_norm yields NULL for freq_norm / len_norm when all words share the same
-- Zipf value or the same length (it divides by NULLIF(range, 0)). A single NULL
-- term would make the whole sum NULL, so every variable term falls back to the
-- neutral value 0.5, the same fallback already used for a missing letter rarity.
DROP VIEW IF EXISTS word_complexity_v1;
CREATE VIEW word_complexity_v1 AS
SELECT
    n.Word,
    (0.35 * COALESCE(n.freq_norm, 0.5)) +
    (0.20 * COALESCE(n.len_norm, 0.5)) +
    (0.20 * 0.5) + -- ngramProb placeholder
    (0.15 * COALESCE(r.letter_rarity_norm, 0.5)) +
    (0.10 * 0.5) -- morphology placeholder
    AS complexity
FROM word_norm n
LEFT JOIN letter_rarity r USING (Word);
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -6,3 +6,4 @@ target/
|
|||||||
out/puzzle/
|
out/puzzle/
|
||||||
*.log
|
*.log
|
||||||
.output.txt
|
.output.txt
|
||||||
|
out/
|
||||||
|
|||||||
142
import.py
Normal file
142
import.py
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$")
RE_SPACE = re.compile(r"\s+")
RE_PARENS = re.compile(r"\s*\([^)]*\)\s*")  # remove (labels)
RE_BRACKETS = re.compile(r"\s*\[[^]]*]\s*")  # remove [labels]


def clean_hint(s: str) -> str:
    """Turn a raw gloss into a compact hint.

    Bracketed and parenthesised labels are replaced by a single space, the
    typographic apostrophe becomes a plain one, whitespace runs collapse to
    one space, and surrounding spaces, tabs and ``-;:,.`` are trimmed.
    """
    # Square brackets are handled before parentheses, as both may occur together.
    unlabelled = RE_PARENS.sub(" ", RE_BRACKETS.sub(" ", s.strip()))
    plain_quotes = unlabelled.replace("’", "'")
    single_spaced = RE_SPACE.sub(" ", plain_quotes)
    return single_spaced.strip(" -;:,.\t")
|
||||||
|
|
||||||
|
def pick_gloss(obj: dict) -> tuple[str | None, str | None]:
|
||||||
|
"""Return (hint, pos) from a Wiktextract JSON line."""
|
||||||
|
pos = obj.get("pos")
|
||||||
|
senses = obj.get("senses") or []
|
||||||
|
best = None
|
||||||
|
|
||||||
|
for s in senses:
|
||||||
|
glosses = s.get("glosses") or []
|
||||||
|
if not glosses:
|
||||||
|
continue
|
||||||
|
# Neem de eerste gloss die "normaal" oogt
|
||||||
|
for g in glosses:
|
||||||
|
if not isinstance(g, str):
|
||||||
|
continue
|
||||||
|
g2 = clean_hint(g)
|
||||||
|
if len(g2) < 3:
|
||||||
|
continue
|
||||||
|
best = g2
|
||||||
|
break
|
||||||
|
if best:
|
||||||
|
break
|
||||||
|
|
||||||
|
return best, pos
|
||||||
|
|
||||||
|
def main(argv=None):
    """Import Dutch Wiktextract glosses into the ``hints`` table of a SQLite db.

    Reads the JSONL file line by line, keeps entries whose word is a pure
    ASCII word within the length limits and that have a usable gloss, and
    inserts them in batches with ``INSERT OR IGNORE``.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse fall back to ``sys.argv[1:]``.

    The connection is always closed, also when the import fails; rows that
    were not committed yet are then discarded together with the connection.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="pad naar jouw sqlite db")
    ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl")
    ap.add_argument("--minlen", type=int, default=2)
    ap.add_argument("--maxlen", type=int, default=8)
    ap.add_argument("--maxhint", type=int, default=80)
    args = ap.parse_args(argv)

    db_path = Path(args.db)
    jsonl_path = Path(args.jsonl)

    insert_sql = "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)"
    batch_size = 2000

    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()

        # Speed pragmas for the import.
        # NOTE: journal_mode=WAL is recorded in the database file, so it stays
        # in effect for later connections too, not only during this import.
        cur.execute("PRAGMA journal_mode=WAL;")
        cur.execute("PRAGMA synchronous=NORMAL;")
        cur.execute("PRAGMA temp_store=MEMORY;")

        cur.execute("""
            CREATE TABLE IF NOT EXISTS hints (
                word TEXT NOT NULL,
                hint TEXT NOT NULL,
                source TEXT NOT NULL DEFAULT 'wiktionary',
                pos TEXT,
                quality INTEGER NOT NULL DEFAULT 80,
                PRIMARY KEY (word, hint, source)
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);")
        con.commit()

        batch = []
        inserted = 0
        seen = 0  # number of entries accepted for insertion

        def flush() -> int:
            """Write the pending batch; return how many rows were really inserted."""
            if not batch:
                return 0
            cur.executemany(insert_sql, batch)
            written = cur.rowcount if cur.rowcount != -1 else 0
            batch.clear()
            return written

        con.execute("BEGIN;")
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A line can be valid JSON without being an object (list,
                # number, string); such lines carry no entry and are skipped.
                if not isinstance(obj, dict):
                    continue

                # Kaikki/Wiktextract: often lang_code = "nl" and lang = "Dutch".
                # Entries without a lang_code are kept.
                lang_code = obj.get("lang_code")
                if lang_code and lang_code != "nl":
                    continue

                word = obj.get("word")
                if not word or not isinstance(word, str):
                    continue

                word_up = word.upper().strip()
                if not (args.minlen <= len(word_up) <= args.maxlen):
                    continue
                if not RE_ASCII_WORD.match(word_up):
                    continue

                hint, pos = pick_gloss(obj)
                if not hint:
                    continue

                # Keep the hint short.
                hint = hint[: args.maxhint].rstrip()

                # Simple quality score: slightly higher when the POS is known.
                quality = 85 if pos else 80

                batch.append((word_up, hint, "wiktionary", pos, quality))
                seen += 1

                if len(batch) >= batch_size:
                    inserted += flush()

        inserted += flush()
        con.commit()
    finally:
        con.close()

    print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).")


if __name__ == "__main__":
    main()
|
||||||
27831
out/pool.txt
27831
out/pool.txt
File diff suppressed because it is too large
Load Diff
@@ -1,38 +0,0 @@
|
|||||||
Date: 2025-12-22
|
|
||||||
Feeds: https://feeds.nos.nl/nosnieuwsalgemeen, https://feeds.nos.nl/nosnieuwstech
|
|
||||||
Model: mistralai/mistral-nemo-instruct-2407
|
|
||||||
|
|
||||||
Master size: 91871
|
|
||||||
Theme kept (in master): 0
|
|
||||||
Bridge size: 21125
|
|
||||||
Shorts kept: 130
|
|
||||||
Pool total: 27831
|
|
||||||
|
|
||||||
Enforced minima:
|
|
||||||
2: 1000
|
|
||||||
3: 1000
|
|
||||||
4: 1000
|
|
||||||
5: 1000
|
|
||||||
6: 2000
|
|
||||||
7: 2000
|
|
||||||
8: 2000
|
|
||||||
|
|
||||||
Counts per length (theme):
|
|
||||||
2: 0
|
|
||||||
3: 0
|
|
||||||
4: 0
|
|
||||||
5: 0
|
|
||||||
6: 0
|
|
||||||
7: 0
|
|
||||||
8: 0
|
|
||||||
|
|
||||||
|
|
||||||
Counts per length (pool):
|
|
||||||
2: 242
|
|
||||||
3: 1000
|
|
||||||
4: 1000
|
|
||||||
5: 1000
|
|
||||||
6: 2000
|
|
||||||
7: 2000
|
|
||||||
8: 20589
|
|
||||||
|
|
||||||
40
out/rss.txt
40
out/rss.txt
File diff suppressed because one or more lines are too long
220
src/puzzle/ClueGenerator.java
Normal file
220
src/puzzle/ClueGenerator.java
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
package puzzle;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class ClueGenerator {
|
||||||
|
|
||||||
|
private static final String OLLAMA_URL = "http://localhost:11434/api/chat";
|
||||||
|
private static final String MODEL = "qwen2.5:14b";
|
||||||
|
private static final String HINTS_FILE = "export_with_hints.csv";
|
||||||
|
private static Map<String, String> prebuiltClues = null;
|
||||||
|
|
||||||
|
private static synchronized void ensurePrebuiltCluesLoaded() {
|
||||||
|
if (prebuiltClues != null) return;
|
||||||
|
prebuiltClues = new HashMap<>();
|
||||||
|
try {
|
||||||
|
List<String> lines = Files.readAllLines(Path.of(HINTS_FILE), StandardCharsets.UTF_8);
|
||||||
|
for (String line : lines) {
|
||||||
|
String[] parts = line.split(",", 3);
|
||||||
|
if (parts.length >= 3) {
|
||||||
|
String word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||||
|
String rawClue = parts[2].trim();
|
||||||
|
if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) {
|
||||||
|
rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\"");
|
||||||
|
}
|
||||||
|
if (!word.isEmpty() && !rawClue.isEmpty()) {
|
||||||
|
prebuiltClues.put(word, rawClue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
System.err.println("Warning: " + HINTS_FILE + " not found or could not be read.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ExportFormat.ExportedPuzzle applyClues(ExportFormat.ExportedPuzzle puzzle) {
|
||||||
|
if (puzzle == null || puzzle.words().isEmpty()) {
|
||||||
|
return puzzle;
|
||||||
|
}
|
||||||
|
|
||||||
|
ensurePrebuiltCluesLoaded();
|
||||||
|
|
||||||
|
Map<String, String> finalClueMap = new HashMap<>();
|
||||||
|
List<String> wordsMissingClues = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var w : puzzle.words()) {
|
||||||
|
String wordUpper = w.word().toUpperCase(Locale.ROOT);
|
||||||
|
if (prebuiltClues.containsKey(wordUpper)) {
|
||||||
|
finalClueMap.put(w.word(), prebuiltClues.get(wordUpper));
|
||||||
|
} else {
|
||||||
|
wordsMissingClues.add(w.word());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!wordsMissingClues.isEmpty()) {
|
||||||
|
Map<String, String> generatedClues = generateClues(wordsMissingClues);
|
||||||
|
finalClueMap.putAll(generatedClues);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<ExportFormat.WordOut> wordsWithClues = new ArrayList<>();
|
||||||
|
for (var w : puzzle.words()) {
|
||||||
|
String clue = finalClueMap.getOrDefault(w.word(), w.word());
|
||||||
|
wordsWithClues.add(new ExportFormat.WordOut(
|
||||||
|
w.word(),
|
||||||
|
clue,
|
||||||
|
w.startRow(),
|
||||||
|
w.startCol(),
|
||||||
|
w.direction(),
|
||||||
|
w.answer(),
|
||||||
|
w.arrowRow(),
|
||||||
|
w.arrowCol(),
|
||||||
|
w.isReversed()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ExportFormat.ExportedPuzzle(puzzle.gridv2(), wordsWithClues, puzzle.difficulty(), puzzle.rewards());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Map<String, String> generateClues(List<String> words) {
|
||||||
|
if (words == null || words.isEmpty()) {
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
String prompt = createCluePrompt(words);
|
||||||
|
try {
|
||||||
|
String jsonRequest = String.format(
|
||||||
|
"{\"model\":\"%s\",\"messages\":[{\"role\":\"user\",\"content\":\"%s\"}],\"stream\":false,\"temperature\":0.7}",
|
||||||
|
MODEL, escapeJson(prompt)
|
||||||
|
);
|
||||||
|
|
||||||
|
String responseBody = curlPostJson(OLLAMA_URL, jsonRequest, 120);
|
||||||
|
String content = extractChatContent(responseBody);
|
||||||
|
|
||||||
|
if (content == null || content.isEmpty()) {
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
|
||||||
|
return parseCluesFromReply(words, content);
|
||||||
|
} catch (Exception e) {
|
||||||
|
System.err.println("Failed to generate clues: " + e.getMessage());
|
||||||
|
return Collections.emptyMap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String createCluePrompt(List<String> words) {
|
||||||
|
return "Je bent een expert in het maken van kruiswoordpuzzels. Geef voor elk van de onderstaande woorden een korte, uitdagende maar duidelijke cryptische of beschrijvende aanwijzing in het Nederlands.\n\n" +
|
||||||
|
"Output ALLEEN in dit formaat:\n" +
|
||||||
|
"woord1:aanwijzing\n" +
|
||||||
|
"woord2:aanwijzing\n\n" +
|
||||||
|
"GEEN andere tekst of uitleg. Sla GEEN woorden over.\n\n" +
|
||||||
|
"Lijst:\n" +
|
||||||
|
String.join("\n", words);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Map<String, String> parseCluesFromReply(List<String> expectedWords, String reply) {
|
||||||
|
Map<String, String> wordClueMap = new HashMap<>();
|
||||||
|
String[] lines = reply.split("\n");
|
||||||
|
|
||||||
|
for (String line : lines) {
|
||||||
|
line = line.trim();
|
||||||
|
if (line.contains(":")) {
|
||||||
|
String[] parts = line.split(":", 2);
|
||||||
|
if (parts.length == 2) {
|
||||||
|
String wordPart = parts[0].trim().replaceAll("^[\\d+.)*\\-\\s]+", "").toLowerCase();
|
||||||
|
String clue = parts[1].trim();
|
||||||
|
if (!clue.isEmpty()) {
|
||||||
|
wordClueMap.put(wordPart, clue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, String> results = new HashMap<>();
|
||||||
|
for (String word : expectedWords) {
|
||||||
|
String clue = wordClueMap.get(word.toLowerCase());
|
||||||
|
if (clue != null) {
|
||||||
|
results.put(word, clue);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String curlPostJson(String url, String jsonBody, int timeoutSeconds) throws Exception {
|
||||||
|
var tempFile = Files.createTempFile("clue-request-", ".json");
|
||||||
|
try {
|
||||||
|
Files.writeString(tempFile, jsonBody, StandardCharsets.UTF_8);
|
||||||
|
List<String> cmd = new ArrayList<>();
|
||||||
|
cmd.add("curl");
|
||||||
|
cmd.add("-fsSL");
|
||||||
|
cmd.add("--connect-timeout");
|
||||||
|
cmd.add("10");
|
||||||
|
cmd.add("--max-time");
|
||||||
|
cmd.add(String.valueOf(timeoutSeconds));
|
||||||
|
cmd.add("-H");
|
||||||
|
cmd.add("Content-Type: application/json");
|
||||||
|
cmd.add("-d");
|
||||||
|
cmd.add("@" + tempFile);
|
||||||
|
cmd.add(url);
|
||||||
|
|
||||||
|
var p = new ProcessBuilder(cmd)
|
||||||
|
.redirectErrorStream(true)
|
||||||
|
.start();
|
||||||
|
|
||||||
|
var bytes = p.getInputStream().readAllBytes();
|
||||||
|
var code = p.waitFor();
|
||||||
|
|
||||||
|
if (code != 0) {
|
||||||
|
throw new IOException("curl POST failed (" + code + ") url=" + url + "\nOutput:\n" +
|
||||||
|
new String(bytes, StandardCharsets.UTF_8));
|
||||||
|
}
|
||||||
|
|
||||||
|
return new String(bytes, StandardCharsets.UTF_8);
|
||||||
|
} finally {
|
||||||
|
Files.deleteIfExists(tempFile);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String extractChatContent(String json) {
|
||||||
|
if (json == null) return null;
|
||||||
|
var choices = json.indexOf("\"choices\"");
|
||||||
|
var p = (choices >= 0) ? choices : 0;
|
||||||
|
var i = json.indexOf("\"content\"", p);
|
||||||
|
if (i < 0) {
|
||||||
|
// Fallback for Ollama non-chat format if needed, but we used /api/chat
|
||||||
|
// Ollama /api/chat returns {"model":"...","message":{"role":"assistant","content":"..."}}
|
||||||
|
i = json.indexOf("\"content\"");
|
||||||
|
if (i < 0) return null;
|
||||||
|
}
|
||||||
|
var colon = json.indexOf(':', i);
|
||||||
|
if (colon < 0) return null;
|
||||||
|
var q = json.indexOf('"', colon + 1);
|
||||||
|
if (q < 0) return null;
|
||||||
|
var sb = new StringBuilder();
|
||||||
|
var esc = false;
|
||||||
|
for (var k = q + 1; k < json.length(); k++) {
|
||||||
|
var ch = json.charAt(k);
|
||||||
|
if (esc) {
|
||||||
|
if (ch == 'n') sb.append('\n');
|
||||||
|
else if (ch == 't') sb.append('\t');
|
||||||
|
else if (ch == 'r') sb.append('\r');
|
||||||
|
else sb.append(ch);
|
||||||
|
esc = false;
|
||||||
|
} else {
|
||||||
|
if (ch == '\\') esc = true;
|
||||||
|
else if (ch == '"') break;
|
||||||
|
else sb.append(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String escapeJson(String str) {
|
||||||
|
return str.replace("\\", "\\\\")
|
||||||
|
.replace("\"", "\\\"")
|
||||||
|
.replace("\n", "\\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -31,7 +31,7 @@ public class DailyGenerator {
|
|||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
var outDir = env("OUT_DIR", "/home/mike/dev/puzzle-generator/data/");
|
var outDir = env("OUT_DIR", "/home/mike/dev/puzzle-generator/data/");
|
||||||
var wordsPath = env("WORDS_PATH", "./word-list.txt");
|
var wordsPath = env("WORDS_PATH", "./export_words_only.txt");
|
||||||
var puzzlesPerDay = envInt("PUZZLES_PER_DAY", 3);
|
var puzzlesPerDay = envInt("PUZZLES_PER_DAY", 3);
|
||||||
var seed = envInt("SEED", (int) System.currentTimeMillis());
|
var seed = envInt("SEED", (int) System.currentTimeMillis());
|
||||||
var themeFilter = envBool("THEME_FILTER", true);
|
var themeFilter = envBool("THEME_FILTER", true);
|
||||||
@@ -119,6 +119,10 @@ public class DailyGenerator {
|
|||||||
result, 1, new ExportFormat.Rewards(50, 2, 1)
|
result, 1, new ExportFormat.Rewards(50, 2, 1)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Generate clues via LLM
|
||||||
|
System.out.println("Generating clues for " + exported.words().size() + " words...");
|
||||||
|
exported = ClueGenerator.applyClues(exported);
|
||||||
|
|
||||||
// Write to JSON file
|
// Write to JSON file
|
||||||
var filename = String.format("crossword_%s_%02d_%s.json", dateStr, i, safeSlug(theme));
|
var filename = String.format("crossword_%s_%02d_%s.json", dateStr, i, safeSlug(theme));
|
||||||
var outputPath = Paths.get(outDir, filename);
|
var outputPath = Paths.get(outDir, filename);
|
||||||
|
|||||||
@@ -7,7 +7,10 @@ import java.nio.file.Paths;
|
|||||||
import java.time.OffsetDateTime;
|
import java.time.OffsetDateTime;
|
||||||
import java.time.ZoneOffset;
|
import java.time.ZoneOffset;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
public class Main {
|
public class Main {
|
||||||
// ---------------- CLI ----------------
|
// ---------------- CLI ----------------
|
||||||
@@ -15,7 +18,7 @@ public class Main {
|
|||||||
public static class Opts {
|
public static class Opts {
|
||||||
public int seed = 1;
|
public int seed = 1;
|
||||||
public int pop = 18;
|
public int pop = 18;
|
||||||
public int gens = 200;
|
public int gens = 1000;
|
||||||
public int tries = 5;
|
public int tries = 5;
|
||||||
public String wordsPath = "./out/pool.txt";
|
public String wordsPath = "./out/pool.txt";
|
||||||
public double minSimplicity = 0; // 0 means no limit
|
public double minSimplicity = 0; // 0 means no limit
|
||||||
@@ -74,6 +77,11 @@ public class Main {
|
|||||||
System.out.println(SwedishGenerator.renderHuman(res.filled().grid));
|
System.out.println(SwedishGenerator.renderHuman(res.filled().grid));
|
||||||
System.out.printf(Locale.ROOT, "Puzzle Simplicity: %.2f%n", res.filled().simplicity);
|
System.out.printf(Locale.ROOT, "Puzzle Simplicity: %.2f%n", res.filled().simplicity);
|
||||||
var out = ExportFormat.exportFormatFromFilled(res, 1, new ExportFormat.Rewards(50, 2, 1));
|
var out = ExportFormat.exportFormatFromFilled(res, 1, new ExportFormat.Rewards(50, 2, 1));
|
||||||
|
|
||||||
|
// Generate clues via LLM
|
||||||
|
System.out.println("Generating clues for " + out.words().size() + " words...");
|
||||||
|
out = ClueGenerator.applyClues(out);
|
||||||
|
|
||||||
System.out.println("gridv2:");
|
System.out.println("gridv2:");
|
||||||
for (String row : out.gridv2()) System.out.println(row);
|
for (String row : out.gridv2()) System.out.println(row);
|
||||||
System.out.println("words: " + out.words().size());
|
System.out.println("words: " + out.words().size());
|
||||||
|
|||||||
@@ -153,7 +153,7 @@ public class SwedishGenerator {
|
|||||||
static Map<String, Integer> loadScores() {
|
static Map<String, Integer> loadScores() {
|
||||||
var scores = new HashMap<String, Integer>();
|
var scores = new HashMap<String, Integer>();
|
||||||
try {
|
try {
|
||||||
var lines = Files.readAllLines(Path.of("word_scores.csv"), StandardCharsets.UTF_8);
|
var lines = Files.readAllLines(Path.of("export_words.csv"), StandardCharsets.UTF_8);
|
||||||
var first = true;
|
var first = true;
|
||||||
for (var line : lines) {
|
for (var line : lines) {
|
||||||
if (first) {
|
if (first) {
|
||||||
@@ -161,16 +161,11 @@ public class SwedishGenerator {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
var parts = line.split(",");
|
var parts = line.split(",");
|
||||||
if (parts.length >= 3) {
|
if (parts.length >= 2) {
|
||||||
try {
|
try {
|
||||||
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||||
var score = Integer.parseInt(parts[1].trim());
|
var score = Integer.parseInt(parts[1].trim());
|
||||||
var status = parts[2].trim();
|
|
||||||
if ("OK".equalsIgnoreCase(status)) {
|
|
||||||
scores.put(word, score);
|
scores.put(word, score);
|
||||||
} else {
|
|
||||||
System.err.println("Skipping:" +Arrays.toString( parts));
|
|
||||||
}
|
|
||||||
} catch (NumberFormatException ignored) {
|
} catch (NumberFormatException ignored) {
|
||||||
System.err.println("Illegal number format: " + line);
|
System.err.println("Illegal number format: " + line);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ public class ThemePoolBuilderLength {
|
|||||||
|
|
||||||
static final class Opts {
|
static final class Opts {
|
||||||
|
|
||||||
String wordsPath = "/home/mike/dev/puzzle-generator/word-list.txt";
|
String wordsPath = "/home/mike/dev/puzzle-generator/export_words_only.txt";
|
||||||
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
|
String endpoint = "https://jarvis-lan.appmodel.nl/api/stoic/";
|
||||||
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
|
List<String> feeds = new ArrayList<>(DEFAULT_FEEDS);
|
||||||
String outDir = "./out";
|
String outDir = "./out";
|
||||||
@@ -303,10 +303,10 @@ public class ThemePoolBuilderLength {
|
|||||||
try {
|
try {
|
||||||
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
var word = parts[0].trim().toUpperCase(Locale.ROOT);
|
||||||
var score = Integer.parseInt(parts[1].trim());
|
var score = Integer.parseInt(parts[1].trim());
|
||||||
var status = parts[2].trim();
|
// var status = parts[2].trim();
|
||||||
if ("OK".equalsIgnoreCase(status)) {
|
// if ("OK".equalsIgnoreCase(status)) {
|
||||||
llmScores.put(word, score);
|
llmScores.put(word, score);
|
||||||
}
|
//}
|
||||||
} catch (NumberFormatException ignored) { }
|
} catch (NumberFormatException ignored) { }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -731,7 +731,7 @@ public class ThemePoolBuilderLength {
|
|||||||
// Optionally filter out VERY complex words from the bridge (e.g. lScore < 3)
|
// Optionally filter out VERY complex words from the bridge (e.g. lScore < 3)
|
||||||
// But since we sort by score (which is now dominated by lScore),
|
// But since we sort by score (which is now dominated by lScore),
|
||||||
// they will be at the very bottom anyway.
|
// they will be at the very bottom anyway.
|
||||||
if (lex.score[i] < 800) continue;
|
// if (lex.score[i] < 800) continue;
|
||||||
ids.add(i);
|
ids.add(i);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -774,8 +774,8 @@ public class ThemePoolBuilderLength {
|
|||||||
|
|
||||||
var out = new ArrayList<String>(ids.size());
|
var out = new ArrayList<String>(ids.size());
|
||||||
for (var id : ids) {
|
for (var id : ids) {
|
||||||
/* if (lex.score[id] < 680)
|
if (lex.score[id] < 680)
|
||||||
continue;*/
|
continue;
|
||||||
out.add(lex.words.get(id));
|
out.add(lex.words.get(id));
|
||||||
}
|
}
|
||||||
Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
Files.write(path, out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
|
||||||
|
|||||||
Reference in New Issue
Block a user