Gather data
This commit is contained in:
142
py/import.py
Normal file
142
py/import.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
# A word made up solely of unaccented ASCII letters (one or more).
RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$")

# Any run of whitespace; used to collapse it to a single space.
RE_SPACE = re.compile(r"\s+")

# A "(label)" group together with the whitespace around it.
RE_PARENS = re.compile(r"\s*\([^)]*\)\s*")

# A "[label]" group together with the whitespace around it.
RE_BRACKETS = re.compile(r"\s*\[[^]]*]\s*")
|
||||
|
||||
def clean_hint(s: str) -> str:
    """Normalize a raw gloss string into a compact hint.

    Steps, in order: trim surrounding whitespace, replace every
    ``[...]`` group and then every ``(...)`` group (including adjacent
    whitespace) with a single space, turn typographic apostrophes into
    plain ``'``, collapse whitespace runs to one space, and finally strip
    leading/trailing spaces, tabs and the punctuation ``- ; : , .``.
    """
    without_brackets = RE_BRACKETS.sub(" ", s.strip())
    without_labels = RE_PARENS.sub(" ", without_brackets)
    plain_quotes = without_labels.replace("’", "'")
    collapsed = RE_SPACE.sub(" ", plain_quotes)
    return collapsed.strip(" -;:,.\t")
|
||||
|
||||
def pick_gloss(obj: dict) -> tuple[str | None, str | None]:
    """Return ``(hint, pos)`` for one parsed Wiktextract JSON entry.

    ``pos`` is the entry's ``"pos"`` value (``None`` when absent).
    ``hint`` is the first string gloss, scanning senses and their glosses
    in order, whose cleaned form is at least 3 characters long; it is
    ``None`` when no gloss qualifies.
    """
    pos = obj.get("pos")

    for sense in obj.get("senses") or []:
        for gloss in sense.get("glosses") or []:
            if not isinstance(gloss, str):
                continue
            cleaned = clean_hint(gloss)
            # Very short leftovers are not usable as a hint.
            if len(cleaned) >= 3:
                return cleaned, pos

    return None, pos
|
||||
|
||||
def _flush_batch(cur: sqlite3.Cursor, batch: list[tuple]) -> int:
    """Insert the pending rows with INSERT OR IGNORE, empty *batch*, and
    return the number of rows the cursor reports as inserted."""
    if not batch:
        return 0
    cur.executemany(
        "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)",
        batch,
    )
    batch.clear()
    # rowcount is -1 when the driver cannot determine it; count that as 0.
    return max(cur.rowcount, 0)


def main(argv: list[str] | None = None) -> None:
    """Import word hints from a Wiktextract JSONL file into a SQLite DB.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Each non-empty line of ``--jsonl`` is parsed as JSON. An entry is
    kept when its ``lang_code`` is missing or ``"nl"``, its upper-cased
    word consists only of ASCII letters with a length between
    ``--minlen`` and ``--maxlen``, and ``pick_gloss`` yields a hint.
    Hints are truncated to ``--maxhint`` characters and written to the
    ``hints`` table (created if missing) in batches of 2000 rows inside a
    single transaction.

    Lines that are not valid JSON, are not JSON objects, or carry a
    non-string word are skipped. If anything raises, the connection is
    closed without committing, so the pending inserts are rolled back.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="pad naar jouw sqlite db")
    ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl")
    ap.add_argument("--minlen", type=int, default=2)
    ap.add_argument("--maxlen", type=int, default=8)
    ap.add_argument("--maxhint", type=int, default=80)
    args = ap.parse_args(argv)

    db_path = Path(args.db)
    jsonl_path = Path(args.jsonl)
    batch_size = 2000

    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()

        # Speed pragmas for the bulk import. NOTE: synchronous and
        # temp_store only affect this connection, but journal_mode=WAL is
        # stored in the database file and stays in effect afterwards.
        cur.execute("PRAGMA journal_mode=WAL;")
        cur.execute("PRAGMA synchronous=NORMAL;")
        cur.execute("PRAGMA temp_store=MEMORY;")

        cur.execute("""
            CREATE TABLE IF NOT EXISTS hints (
                word TEXT NOT NULL,
                hint TEXT NOT NULL,
                source TEXT NOT NULL DEFAULT 'wiktionary',
                pos TEXT,
                quality INTEGER NOT NULL DEFAULT 80,
                PRIMARY KEY (word, hint, source)
            );
        """)
        cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);")
        con.commit()

        batch: list[tuple] = []
        inserted = 0
        seen = 0  # rows that passed every filter (not raw input lines)

        # One explicit transaction around the whole import.
        con.execute("BEGIN;")
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    # Best-effort import: skip malformed lines.
                    continue
                if not isinstance(obj, dict):
                    # Valid JSON but not an object (e.g. a list or number).
                    continue

                # Kaikki/Wiktextract: usually lang_code = "nl", lang = "Dutch".
                # Entries without a lang_code are accepted as well.
                lang_code = obj.get("lang_code")
                if lang_code and lang_code != "nl":
                    continue

                word = obj.get("word")
                if not word or not isinstance(word, str):
                    continue

                word_up = word.upper().strip()
                if not (args.minlen <= len(word_up) <= args.maxlen):
                    continue
                if not RE_ASCII_WORD.match(word_up):
                    continue

                hint, pos = pick_gloss(obj)
                if not hint:
                    continue

                # Keep the hint short.
                hint = hint[: args.maxhint].rstrip()

                # Simple quality score: slightly higher when the POS is known.
                quality = 85 if pos else 80

                batch.append((word_up, hint, "wiktionary", pos, quality))
                seen += 1

                if len(batch) >= batch_size:
                    inserted += _flush_batch(cur, batch)

        # Write whatever is left after the last full batch.
        inserted += _flush_batch(cur, batch)
        con.commit()
    finally:
        # Closing without a commit discards the open transaction, so a
        # failed run leaves no partial import and no dangling handle.
        con.close()

    print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).")
|
||||
|
||||
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user