Gather data

This commit is contained in:
mike
2025-12-25 00:21:58 +01:00
parent 85ebfd3013
commit 49a1aa4152
12 changed files with 494 additions and 27933 deletions

142
import.py Normal file
View File

@@ -0,0 +1,142 @@
#!/usr/bin/env python3
import argparse
import json
import re
import sqlite3
from pathlib import Path
# One or more ASCII letters and nothing else (no digits, accents, hyphens, spaces).
RE_ASCII_WORD = re.compile(r"^[A-Za-z]+$")
# Any run of whitespace; used to collapse it into a single space.
RE_SPACE = re.compile(r"\s+")
# A parenthesised label such as "(formal)", including surrounding whitespace.
RE_PARENS = re.compile(r"\s*\([^)]*\)\s*")
# A bracketed label such as "[label]", including surrounding whitespace.
RE_BRACKETS = re.compile(r"\s*\[[^]]*]\s*")


def clean_hint(s: str) -> str:
    """Normalise a raw gloss into a short, plain hint string.

    Steps, in order: trim the input, replace bracketed ``[...]`` and
    parenthesised ``(...)`` labels by a single space, turn the typographic
    apostrophe (U+2019) into a plain ``'``, collapse whitespace runs, and
    finally strip leading/trailing spaces, tabs and ``-;:,.`` punctuation.

    Args:
        s: The raw gloss text.

    Returns:
        The cleaned hint; may be an empty string.
    """
    s = s.strip()
    s = RE_BRACKETS.sub(" ", s)
    s = RE_PARENS.sub(" ", s)
    # Use an explicit escape for the curly apostrophe. An empty search string
    # here would make str.replace insert "'" between every character.
    s = s.replace("\u2019", "'")
    s = RE_SPACE.sub(" ", s).strip(" -;:,.\t")
    return s
def pick_gloss(obj: dict) -> tuple[str | None, str | None]:
    """Pick a hint and the part of speech from one parsed JSON entry.

    Walks ``obj["senses"]`` in order and, within each sense, its
    ``"glosses"`` in order. The first gloss that is a string and is at least
    three characters long after ``clean_hint`` is used as the hint.

    Returns:
        ``(hint, pos)`` where ``hint`` is ``None`` when no gloss qualifies and
        ``pos`` is whatever ``obj.get("pos")`` yields.
    """
    part_of_speech = obj.get("pos")
    for sense in obj.get("senses") or []:
        for raw_gloss in sense.get("glosses") or []:
            # Only plain strings can be cleaned; skip anything else.
            if not isinstance(raw_gloss, str):
                continue
            candidate = clean_hint(raw_gloss)
            # Take the first gloss that still looks like real text.
            if len(candidate) >= 3:
                return candidate, part_of_speech
    return None, part_of_speech
def _flush_batch(cur: sqlite3.Cursor, batch: list) -> int:
    """Insert the queued rows and return how many the cursor reports as inserted."""
    cur.executemany(
        "INSERT OR IGNORE INTO hints(word,hint,source,pos,quality) VALUES (?,?,?,?,?)",
        batch,
    )
    # rowcount is -1 when the number of affected rows is unknown; count that as 0.
    return cur.rowcount if cur.rowcount != -1 else 0


def main():
    """Import word hints from a JSONL dump into the ``hints`` SQLite table.

    Command-line arguments:
        --db       path to the SQLite database (required)
        --jsonl    path to the JSONL input file (required)
        --minlen   minimum word length to keep (default 2)
        --maxlen   maximum word length to keep (default 8)
        --maxhint  maximum hint length in characters (default 80)

    An input line is skipped when it is blank, is not valid JSON, is not a
    JSON object, has a ``lang_code`` other than ``"nl"``, has no usable
    ``word``, has a word outside the length limits or containing anything
    but ASCII letters, or yields no usable gloss. Rows are inserted with
    ``INSERT OR IGNORE`` in batches of 2000 inside a single transaction.
    The connection is closed even when the import fails.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="pad naar jouw sqlite db")
    ap.add_argument("--jsonl", required=True, help="pad naar nl-extract.jsonl")
    ap.add_argument("--minlen", type=int, default=2)
    ap.add_argument("--maxlen", type=int, default=8)
    ap.add_argument("--maxhint", type=int, default=80)
    args = ap.parse_args()

    db_path = Path(args.db)
    jsonl_path = Path(args.jsonl)

    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()

        # Speed pragmas for the import.
        cur.execute("PRAGMA journal_mode=WAL;")
        cur.execute("PRAGMA synchronous=NORMAL;")
        cur.execute("PRAGMA temp_store=MEMORY;")

        cur.execute("""
CREATE TABLE IF NOT EXISTS hints (
word TEXT NOT NULL,
hint TEXT NOT NULL,
source TEXT NOT NULL DEFAULT 'wiktionary',
pos TEXT,
quality INTEGER NOT NULL DEFAULT 80,
PRIMARY KEY (word, hint, source)
);
""")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_hints_word ON hints(word);")
        con.commit()

        batch = []
        inserted = 0
        # Counts rows queued for insertion, not raw input lines.
        seen = 0

        con.execute("BEGIN;")
        with jsonl_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A valid JSON line that is not an object has no fields to read.
                if not isinstance(obj, dict):
                    continue

                # Entries carrying a lang_code other than "nl" are skipped;
                # entries without a lang_code are kept.
                lang_code = obj.get("lang_code")
                if lang_code and lang_code != "nl":
                    continue

                word = obj.get("word")
                if not isinstance(word, str) or not word:
                    continue
                word_up = word.upper().strip()
                if not (args.minlen <= len(word_up) <= args.maxlen):
                    continue
                if not RE_ASCII_WORD.match(word_up):
                    continue

                hint, pos = pick_gloss(obj)
                if not hint:
                    continue

                # Keep the hint short.
                hint = hint[: args.maxhint].rstrip()
                if not hint:
                    # Truncation left nothing (e.g. --maxhint 0); do not store it.
                    continue

                # Simple quality score: slightly higher when the POS is known.
                quality = 85 if pos else 80
                batch.append((word_up, hint, "wiktionary", pos, quality))
                seen += 1

                if len(batch) >= 2000:
                    inserted += _flush_batch(cur, batch)
                    batch.clear()

        if batch:
            inserted += _flush_batch(cur, batch)
        con.commit()
    finally:
        # Closing without a commit discards any uncommitted rows.
        con.close()

    print(f"Done. processed_lines≈{seen}, inserted≈{inserted} (OR IGNORE kan inserts verlagen).")
if __name__ == "__main__":
main()