143 lines
4.0 KiB
Python
143 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
||
import argparse
|
||
import json
|
||
import re
|
||
import sqlite3
|
||
from pathlib import Path
|
||
|
||
# Accepts words made of ASCII letters only (no diacritics, digits, spaces or hyphens).
RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$")
# Any run of whitespace; used to collapse it to a single space.
RE_SPACE = re.compile(r"\s+")
# An innermost "(label)" group plus the whitespace around it. The content may
# not contain parentheses, so nested groups are peeled from the inside out.
RE_PARENS = re.compile(r"\s*\([^()]*\)\s*")
# An innermost "[label]" group plus the whitespace around it (same idea).
RE_BRACKETS = re.compile(r"\s*\[[^\[\]]*\]\s*")


def _strip_groups(pattern: re.Pattern, s: str) -> str:
    """Replace matches of *pattern* with a space until none are left."""
    # Each substitution removes at least the two delimiters and adds a single
    # space, so the string shrinks on every pass and the loop terminates.
    while True:
        s, replaced = pattern.subn(" ", s)
        if not replaced:
            return s


def clean_hint(s: str) -> str:
    """Normalize a raw gloss into a short hint text.

    Steps, in order:
      1. strip surrounding whitespace;
      2. remove ``[label]`` groups, then ``(label)`` groups, including nested
         ones such as ``(a (b) c)`` (a single pass would leave `` c)`` behind);
      3. replace the typographic apostrophe with a plain ``'``;
      4. collapse whitespace runs to one space and trim leading/trailing
         spaces, tabs and the punctuation ``- ; : , .``.

    Args:
        s: the raw gloss string.

    Returns:
        The cleaned text; may be an empty string.
    """
    s = s.strip()
    s = _strip_groups(RE_BRACKETS, s)
    s = _strip_groups(RE_PARENS, s)
    s = s.replace("’", "'")
    s = RE_SPACE.sub(" ", s).strip(" -;:,.\t")
    return s
|
||
|
||
def pick_gloss(obj: dict) -> tuple[str | None, str | None]:
    """Pick the first usable gloss of a Wiktextract JSON entry.

    Senses are scanned in order, and within each sense its glosses in order.
    The first gloss that is a string and is at least three characters long
    after ``clean_hint`` is returned.

    Args:
        obj: one decoded JSON line; the keys ``"pos"`` and ``"senses"`` are read.

    Returns:
        ``(hint, pos)``: ``hint`` is the cleaned gloss, or ``None`` when no
        gloss qualifies; ``pos`` is the entry's ``"pos"`` value, or ``None``
        when the key is absent.
    """
    part_of_speech = obj.get("pos")

    for sense in obj.get("senses") or ():
        for gloss in sense.get("glosses") or ():
            # Skip anything that is not plain text.
            if not isinstance(gloss, str):
                continue
            candidate = clean_hint(gloss)
            # Take the first gloss that still looks "normal" after cleaning.
            if len(candidate) >= 3:
                return candidate, part_of_speech

    return None, part_of_speech
|
||
|
||
# Parameterized insert; OR IGNORE skips rows that collide with the primary key.
_INSERT_HINT_SQL = (
    "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)"
)
# Number of queued rows that triggers a write to the database.
_BATCH_SIZE = 2000


def _flush_batch(cur: sqlite3.Cursor, batch: list[tuple]) -> int:
    """Insert the queued rows, empty *batch*, and return the reported row count."""
    if not batch:
        return 0
    cur.executemany(_INSERT_HINT_SQL, batch)
    # rowcount is -1 when the number of affected rows is not available.
    added = cur.rowcount if cur.rowcount != -1 else 0
    batch.clear()
    return added


def main():
    """Import Dutch Wiktextract glosses into the ``hints`` table of a SQLite DB.

    Command-line options:
        --db       path to the SQLite database (required)
        --jsonl    path to the Wiktextract JSONL file (required)
        --minlen   minimum word length to accept (default 2)
        --maxlen   maximum word length to accept (default 8)
        --maxhint  maximum hint length in characters (default 80)

    Each JSONL line is decoded; lines that are blank or not valid JSON are
    skipped. Entries whose ``lang_code`` is set to something other than
    ``"nl"`` are skipped, as are words that are not ASCII-letter-only or fall
    outside the length bounds, and entries without a usable gloss. Accepted
    rows are written in batches inside one transaction, and a summary line is
    printed at the end.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="pad naar jouw sqlite db")
    ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl")
    ap.add_argument("--minlen", type=int, default=2)
    ap.add_argument("--maxlen", type=int, default=8)
    ap.add_argument("--maxhint", type=int, default=80)
    args = ap.parse_args()

    db_path = Path(args.db)
    jsonl_path = Path(args.jsonl)

    con = sqlite3.connect(db_path)
    # try/finally so the connection is released even when the import fails;
    # closing without a commit discards the unfinished transaction.
    try:
        cur = con.cursor()

        # Speed pragmas for the import.
        # NOTE: journal_mode=WAL is recorded in the database file and stays in
        # effect for later connections; it is not limited to this run.
        cur.execute("PRAGMA journal_mode=WAL;")
        cur.execute("PRAGMA synchronous=NORMAL;")
        cur.execute("PRAGMA temp_store=MEMORY;")

        cur.execute("""
            CREATE TABLE IF NOT EXISTS hints (
              word TEXT NOT NULL,
              hint TEXT NOT NULL,
              source TEXT NOT NULL DEFAULT 'wiktionary',
              pos TEXT,
              quality INTEGER NOT NULL DEFAULT 80,
              PRIMARY KEY (word, hint, source)
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);")
        con.commit()

        batch = []
        inserted = 0
        # Counts rows queued for insertion (not raw input lines).
        seen = 0

        con.execute("BEGIN;")
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    # Best effort: a malformed line must not abort the import.
                    continue

                # Kaikki/Wiktextract: usually lang_code = "nl" and lang = "Dutch".
                # Entries without a lang_code are let through.
                lang_code = obj.get("lang_code")
                if lang_code and lang_code != "nl":
                    continue

                word = obj.get("word")
                # Skip missing, empty, or non-text words instead of crashing
                # on .upper() below.
                if not word or not isinstance(word, str):
                    continue

                word_up = word.upper().strip()
                if not (args.minlen <= len(word_up) <= args.maxlen):
                    continue
                if not RE_ASCII_WORD.match(word_up):
                    continue

                hint, pos = pick_gloss(obj)
                if not hint:
                    continue

                # Keep the hint short.
                hint = hint[: args.maxhint].rstrip()
                if not hint:
                    # Truncation left nothing (e.g. --maxhint 0); do not
                    # store an empty hint.
                    continue

                # Simple quality score: slightly higher when the POS is known.
                quality = 85 if pos else 80

                batch.append((word_up, hint, "wiktionary", pos, quality))
                seen += 1

                if len(batch) >= _BATCH_SIZE:
                    inserted += _flush_batch(cur, batch)

        # Write whatever is left from the last partial batch.
        inserted += _flush_batch(cur, batch)
        con.commit()
    finally:
        con.close()

    print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).")
|
||
|
||
# Run the import only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|