#!/usr/bin/env python3 import argparse import json import re import sqlite3 from pathlib import Path RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$") RE_SPACE = re.compile(r"\s+") RE_PARENS = re.compile(r"\s*\([^)]*\)\s*") # verwijder (labels) RE_BRACKETS = re.compile(r"\s*\[[^]]*]\s*") # verwijder [labels] def clean_hint(s: str) -> str: s = s.strip() s = RE_BRACKETS.sub(" ", s) s = RE_PARENS.sub(" ", s) s = s.replace("’", "'") s = RE_SPACE.sub(" ", s).strip(" -;:,.\t") return s def pick_gloss(obj: dict) -> tuple[str | None, str | None]: """Return (hint, pos) from a Wiktextract JSON line.""" pos = obj.get("pos") senses = obj.get("senses") or [] best = None for s in senses: glosses = s.get("glosses") or [] if not glosses: continue # Neem de eerste gloss die "normaal" oogt for g in glosses: if not isinstance(g, str): continue g2 = clean_hint(g) if len(g2) < 3: continue best = g2 break if best: break return best, pos def main(): ap = argparse.ArgumentParser() ap.add_argument("--db", required=True, help="pad naar jouw sqlite db") ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl") ap.add_argument("--minlen", type=int, default=2) ap.add_argument("--maxlen", type=int, default=8) ap.add_argument("--maxhint", type=int, default=80) args = ap.parse_args() db_path = Path(args.db) jsonl_path = Path(args.jsonl) con = sqlite3.connect(db_path) cur = con.cursor() # speed pragmas (alleen tijdens import) cur.execute("PRAGMA journal_mode=WAL;") cur.execute("PRAGMA synchronous=NORMAL;") cur.execute("PRAGMA temp_store=MEMORY;") cur.execute(""" CREATE TABLE IF NOT EXISTS hints ( word TEXT NOT NULL, hint TEXT NOT NULL, source TEXT NOT NULL DEFAULT 'wiktionary', pos TEXT, quality INTEGER NOT NULL DEFAULT 80, PRIMARY KEY (word, hint, source) ); """) cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);") con.commit() batch = [] inserted = 0 seen = 0 con.execute("BEGIN;") with jsonl_path.open("r", encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue try: obj = json.loads(line) except json.JSONDecodeError: continue # Kaikki/Wiktextract: vaak lang_code = "nl" en lang = "Dutch" lang_code = obj.get("lang_code") if lang_code and lang_code != "nl": continue word = obj.get("word") if not word: continue word_up = word.upper().strip() if not (args.minlen <= len(word_up) <= args.maxlen): continue if not RE_ASCII_WORD.match(word_up): continue hint, pos = pick_gloss(obj) if not hint: continue # Hint kort houden hint = hint[: args.maxhint].rstrip() # Simpele kwaliteit: iets hoger als POS bekend is quality = 85 if pos else 80 batch.append((word_up, hint, "wiktionary", pos, quality)) seen += 1 if len(batch) >= 2000: cur.executemany( "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)", batch ) inserted += cur.rowcount if cur.rowcount != -1 else 0 batch.clear() if batch: cur.executemany( "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)", batch ) inserted += cur.rowcount if cur.rowcount != -1 else 0 con.commit() con.close() print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).") if __name__ == "__main__": main()