Files
puzzle-generator/import.py
2025-12-25 00:21:58 +01:00

143 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
import argparse
import json
import re
import sqlite3
from pathlib import Path
# A candidate word must consist solely of ASCII letters.
RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$")
# Any run of whitespace; used to collapse spacing down to one space.
RE_SPACE = re.compile(r"\s+")
RE_PARENS = re.compile(r"\s*\([^)]*\)\s*")  # strip (labels)
RE_BRACKETS = re.compile(r"\s*\[[^]]*]\s*")  # strip [labels]


def clean_hint(s: str) -> str:
    """Normalize a raw gloss into a compact hint string.

    Bracketed ``[...]`` and parenthesized ``(...)`` labels are removed, the
    typographic apostrophe (U+2019) is replaced by an ASCII ``'``, runs of
    whitespace are collapsed to a single space, and leading/trailing
    spaces, tabs and ``- ; : , .`` characters are trimmed.

    Args:
        s: The raw gloss text.

    Returns:
        The cleaned hint; may be empty when nothing but labels or
        punctuation was present.
    """
    s = s.strip()
    s = RE_BRACKETS.sub(" ", s)
    s = RE_PARENS.sub(" ", s)
    # Written as an escape on purpose: a literal curly quote is easy to lose
    # or confuse with an ASCII quote, and replacing an *empty* string would
    # insert an apostrophe between every character.
    s = s.replace("\u2019", "'")
    s = RE_SPACE.sub(" ", s).strip(" -;:,.\t")
    return s
def pick_gloss(obj: dict) -> tuple[str | None, str | None]:
    """Extract ``(hint, pos)`` from one parsed Wiktextract JSON entry.

    The senses are scanned in order and, within each sense, its glosses in
    order. The first gloss that is a string and is still at least three
    characters long after ``clean_hint`` becomes the hint.

    Args:
        obj: The decoded JSON object for one entry.

    Returns:
        A ``(hint, pos)`` pair. ``hint`` is ``None`` when no gloss
        qualifies; ``pos`` is whatever ``obj["pos"]`` holds, or ``None``
        when that key is absent.
    """
    part_of_speech = obj.get("pos")
    for sense in obj.get("senses") or []:
        for gloss in sense.get("glosses") or []:
            if not isinstance(gloss, str):
                continue
            cleaned = clean_hint(gloss)
            # Take the first gloss that looks "normal" (not too short).
            if len(cleaned) >= 3:
                return cleaned, part_of_speech
    return None, part_of_speech
_INSERT_HINT_SQL = (
    "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)"
)
_BATCH_SIZE = 2000


def _flush_batch(cur: sqlite3.Cursor, batch: list[tuple]) -> int:
    """Insert the pending rows, empty *batch*, and return the rows added."""
    if not batch:
        return 0
    cur.executemany(_INSERT_HINT_SQL, batch)
    # rowcount is -1 when the driver cannot determine it; count that as 0.
    added = cur.rowcount if cur.rowcount != -1 else 0
    batch.clear()
    return added


def main():
    """Import hints from a Wiktextract JSONL dump into a SQLite database.

    Command-line options:
        --db       path to the SQLite database (required)
        --jsonl    path to the JSONL extract (required)
        --minlen   minimum word length to accept (default 2)
        --maxlen   maximum word length to accept (default 8)
        --maxhint  maximum hint length in characters (default 80)

    The ``hints`` table and its index are created when missing. Each JSONL
    line is decoded; lines that are blank, not valid JSON, not a JSON
    object, tagged with a ``lang_code`` other than ``"nl"``, without a
    usable word, or without a usable gloss are skipped. Accepted rows are
    inserted in batches with ``INSERT OR IGNORE`` inside one transaction,
    and a summary line is printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="pad naar jouw sqlite db")
    ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl")
    ap.add_argument("--minlen", type=int, default=2)
    ap.add_argument("--maxlen", type=int, default=8)
    ap.add_argument("--maxhint", type=int, default=80)
    args = ap.parse_args()

    db_path = Path(args.db)
    jsonl_path = Path(args.jsonl)

    con = sqlite3.connect(db_path)
    # try/finally so the connection is released even when the import fails
    # part-way; closing without a commit discards the unfinished transaction.
    try:
        cur = con.cursor()

        # Speed pragmas for the bulk import.
        # NOTE: per the SQLite documentation, journal_mode=WAL is recorded in
        # the database file and therefore stays in effect after this import;
        # synchronous and temp_store apply to this connection only.
        cur.execute("PRAGMA journal_mode=WAL;")
        cur.execute("PRAGMA synchronous=NORMAL;")
        cur.execute("PRAGMA temp_store=MEMORY;")

        cur.execute("""
            CREATE TABLE IF NOT EXISTS hints (
                word TEXT NOT NULL,
                hint TEXT NOT NULL,
                source TEXT NOT NULL DEFAULT 'wiktionary',
                pos TEXT,
                quality INTEGER NOT NULL DEFAULT 80,
                PRIMARY KEY (word, hint, source)
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);")
        con.commit()

        batch = []
        inserted = 0
        seen = 0  # rows queued for insertion (entries that passed every filter)

        con.execute("BEGIN;")
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line need not be an object; skip anything
                # that cannot be queried with .get().
                if not isinstance(obj, dict):
                    continue

                # Kaikki/Wiktextract: usually lang_code = "nl" and
                # lang = "Dutch". Entries without a lang_code are kept.
                lang_code = obj.get("lang_code")
                if lang_code and lang_code != "nl":
                    continue

                word = obj.get("word")
                if not isinstance(word, str) or not word:
                    continue
                word_up = word.upper().strip()
                if not (args.minlen <= len(word_up) <= args.maxlen):
                    continue
                if not RE_ASCII_WORD.match(word_up):
                    continue

                hint, pos = pick_gloss(obj)
                if not hint:
                    continue
                # Keep the hint short.
                hint = hint[: args.maxhint].rstrip()
                # Simple quality score: slightly higher when the POS is known.
                quality = 85 if pos else 80

                batch.append((word_up, hint, "wiktionary", pos, quality))
                seen += 1
                if len(batch) >= _BATCH_SIZE:
                    inserted += _flush_batch(cur, batch)

        # Insert whatever is left over from the last partial batch.
        inserted += _flush_batch(cur, batch)
        con.commit()
    finally:
        con.close()

    print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).")
# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()