# puzzle-generator/tools/puzzle-gen/generate_daily_puzzles.py

#!/usr/bin/env python3
import datetime as dt
import json
import os
import random
import re
import urllib.request
import xml.etree.ElementTree as ET
import json, re

# --- USER-FRIENDLY CONFIG ---
# Accepted answer words: uppercase A-Z only, 3 to 7 letters.
# The 7-letter cap keeps answers short and favours common words.
WORD_RE = re.compile(r"^[A-Z]{3,7}$")
# Marker stored in grid cells that hold no letter.
EMPTY = " "
# Side length of the square working grid (SIZE x SIZE cells);
# kept small so the puzzles come out denser.
SIZE = 10
# Number of words requested from the LLM per puzzle
# (higher because the individual words are short).
TARGET_WORDS = 15
# Minimum number of valid word/clue pairs required to attempt a puzzle.
MIN_ACCEPT_WORDS = 10

# RSS feeds whose item titles/descriptions are used as puzzle themes.
FEEDS = [
    "https://feeds.nos.nl/nosnieuwsalgemeen",
    "https://feeds.nos.nl/nosnieuwstech",
]


def env(name, default=None):
    """Return environment variable *name*, or *default* if it is unset or empty.

    An empty string is treated the same as a missing variable.
    """
    value = os.environ.get(name)
    if value:
        return value
    return default


def http_get(url, timeout=15):
    """Fetch *url* with a GET request and return the response body as bytes.

    The request identifies itself with a ``puzzle-gen/1.0`` User-Agent.
    *timeout* is passed to ``urlopen`` (seconds).
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "puzzle-gen/1.0")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        body = response.read()
    return body


def http_post_json(url, payload, timeout=45):
    """POST *payload* as JSON to *url* and return the decoded JSON response.

    The body is sent UTF-8 encoded with a fixed bearer token and the
    ``puzzle-gen/1.0`` User-Agent; the response is decoded as UTF-8 JSON.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer lm-studio",
        "User-Agent": "puzzle-gen/1.0",
    }
    body = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(url, data=body, headers=headers, method="POST")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def fetch_rss_items(url, limit=12):
    """Download an RSS feed and return up to *limit* ``(title, description)`` pairs.

    Items without a title are skipped; a missing description becomes ``""``.
    Both values are stripped of surrounding whitespace.

    Returns an empty list when *limit* is not positive or when the document
    is an ``<rss>`` element without a ``<channel>`` child. Download and XML
    parse errors propagate to the caller.
    """
    if limit <= 0:
        return []

    root = ET.fromstring(http_get(url))
    # For <rss> documents the items live under <channel>; for any other
    # root element the items are looked up directly beneath the root.
    channel = root.find("channel") if root.tag.lower().endswith("rss") else root
    if channel is None:
        return []

    items = []
    for node in channel.findall("item"):
        title = (node.findtext("title") or "").strip()
        if not title:
            continue
        desc = (node.findtext("description") or "").strip()
        items.append((title, desc))
        if len(items) >= limit:
            break
    return items


def safe_slug(s, maxlen=50):
    """Turn *s* into a lowercase, hyphen-separated slug of at most *maxlen* chars.

    Every run of characters outside ``a-z0-9`` becomes a single hyphen.
    Hyphens are stripped both before and after truncation so the result
    never starts or ends with a separator. Falls back to ``"news"`` when
    nothing usable remains.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", s.lower()).strip("-")
    # Truncation may cut right after a separator; strip again so file names
    # do not end in a dangling hyphen.
    slug = slug[:maxlen].strip("-")
    return slug or "news"


def extract_first_json(text: str):
    """Return the first parseable JSON object or array embedded in *text*.

    Each ``{`` or ``[`` is tried in order as a possible start of a JSON
    value; the first one that decodes successfully wins. This lets the
    function skip stray brackets in surrounding prose (for example
    ``"see [note] {...}"``). Trailing text after the value is ignored.

    Returns ``None`` for empty input or when no candidate decodes.
    """
    if not text:
        return None
    decoder = json.JSONDecoder()
    for candidate in re.finditer(r"[{\[]", text):
        try:
            value, _end = decoder.raw_decode(text, candidate.start())
        except json.JSONDecodeError:
            # Not valid JSON from this bracket; try the next one.
            continue
        return value
    return None


def normalize_word(raw: str) -> str:
    """Reduce *raw* to an uppercase A-Z-only word.

    Accented letters are folded to their base letter first (``é`` -> ``E``)
    so that words are not silently shortened into non-words; every remaining
    character that is not an ASCII letter (hyphens, digits, spaces, ...)
    is then removed. ``None`` or an empty string yields ``""``.
    """
    import unicodedata  # local import: only needed here

    # NFKD splits accented letters into base letter + combining mark; the
    # combining marks are then dropped by the A-Z filter below.
    decomposed = unicodedata.normalize("NFKD", raw or "")
    return re.sub(r"[^A-Za-z]", "", decomposed).upper()


def sanitize_wordcluemap(obj):
    """Normalize a parsed LLM answer into a clean ``{WORD: clue}`` dict.

    Accepted input shapes:
      - dict: ``{"WORD": "clue", ...}``
      - list of dicts carrying the word under ``word``/``WOORD``/``Word`` and
        the clue under ``clue``/``CLUE``/``hint``/``HINT``; non-dict list
        elements are ignored.
    Any other input yields an empty dict.

    Only entries whose word and clue are both strings are kept. Words are
    passed through ``normalize_word`` and must match ``WORD_RE``; clues are
    stripped and must be non-empty. If a word occurs more than once the last
    clue wins.
    """
    if isinstance(obj, dict):
        pairs = list(obj.items())
    elif isinstance(obj, list):
        pairs = [
            (
                entry.get("word") or entry.get("WOORD") or entry.get("Word"),
                entry.get("clue") or entry.get("CLUE") or entry.get("hint") or entry.get("HINT"),
            )
            for entry in obj
            if isinstance(entry, dict)
        ]
    else:
        return {}

    cleaned = {}
    for raw_word, raw_clue in pairs:
        if not (isinstance(raw_word, str) and isinstance(raw_clue, str)):
            continue
        word = normalize_word(raw_word)
        clue = raw_clue.strip()
        if WORD_RE.fullmatch(word) and clue:
            cleaned[word] = clue
    return cleaned


# ---- generator (no-touch) ----
def make_grid():
    """Return a new SIZE x SIZE grid (list of row lists) filled with EMPTY."""
    grid = []
    for _ in range(SIZE):
        grid.append([EMPTY] * SIZE)
    return grid


def in_bounds(g, r, c):
    """Return True if row *r* and column *c* address a cell inside grid *g*.

    The column range is taken from the first row of *g*.
    """
    if r < 0 or r >= len(g):
        return False
    return 0 <= c < len(g[0])


def can_place_notouch(g, word, r, c, direction):
    """Return True if *word* may be written into *g* starting at (*r*, *c*).

    *direction* is ``"horizontal"``; any other value is treated as vertical.
    A placement is allowed when:
      - the whole word lies inside the grid (a start outside the grid on
        either axis returns False instead of raising),
      - the cell directly before the first letter and the cell directly
        after the last letter are empty or outside the grid,
      - every cell the word covers is either empty or already holds the
        same letter (a crossing),
      - for every newly filled (non-crossing) cell, both perpendicular
        neighbours are empty or outside the grid, so the word does not
        run alongside another word.

    The grid is not modified.
    """
    horizontal = direction == "horizontal"
    # Unit step along the word; swapping the components gives the
    # perpendicular step used for the side-neighbour checks.
    dr, dc = (0, 1) if horizontal else (1, 0)
    n = len(word)
    rows = len(g)
    cols = len(g[0]) if rows else 0

    if r < 0 or c < 0:
        return False
    if horizontal:
        fits = r < rows and c + n <= cols
    else:
        fits = c < cols and r + n <= rows
    if not fits:
        return False

    def occupied(rr, cc):
        """True if (rr, cc) is inside the grid and holds a letter."""
        return in_bounds(g, rr, cc) and g[rr][cc] != EMPTY

    # No "glue": nothing directly before the first or after the last letter.
    if occupied(r - dr, c - dc) or occupied(r + dr * n, c + dc * n):
        return False

    for i, ch in enumerate(word):
        rr, cc = r + dr * i, c + dc * i
        cell = g[rr][cc]
        if cell != EMPTY:
            if cell != ch:
                return False
            # Matching crossing: the side neighbours belong to the crossed
            # word, so they are not checked.
            continue
        if occupied(rr - dc, cc - dr) or occupied(rr + dc, cc + dr):
            return False
    return True


def place_word(g, word, r, c, direction):
    """Write the letters of *word* into grid *g* in place, starting at (*r*, *c*).

    With ``direction == "horizontal"`` the letters run along row *r*;
    with any other value they run down column *c*. Existing cell contents
    are overwritten; no validity checks are performed.
    """
    if direction == "horizontal":
        for offset, letter in enumerate(word):
            g[r][c + offset] = letter
    else:
        for offset, letter in enumerate(word):
            g[r + offset][c] = letter


def find_spots(g, word, placed):
    """List every position where *word* can cross an already placed word.

    For each letter of each entry in *placed*, every equal letter of *word*
    is lined up with it, with *word* running perpendicular to the placed
    word. Positions accepted by ``can_place_notouch`` are returned as
    ``(row, col, direction)`` tuples, in discovery order; the same position
    may appear more than once.
    """
    candidates = []
    for entry in placed:
        across = entry["direction"] == "horizontal"
        # The new word always runs perpendicular to the word it crosses.
        new_direction = "vertical" if across else "horizontal"
        for i, placed_letter in enumerate(entry["word"]):
            if across:
                cell_r, cell_c = entry["row"], entry["col"] + i
            else:
                cell_r, cell_c = entry["row"] + i, entry["col"]
            for j, letter in enumerate(word):
                if letter != placed_letter:
                    continue
                # Shift the start back by j so letter j lands on the crossing cell.
                if across:
                    start_r, start_c = cell_r - j, cell_c
                else:
                    start_r, start_c = cell_r, cell_c - j
                if can_place_notouch(g, word, start_r, start_c, new_direction):
                    candidates.append((start_r, start_c, new_direction))
    return candidates


def generate_puzzle(wordcluemap, rnd):
    """Lay out the words of *wordcluemap* as a crossword on a fresh grid.

    Words are tried longest first. The first word is placed horizontally in
    the middle row, centred; each later word is placed at a randomly chosen
    valid crossing (via ``rnd.shuffle``) or skipped when none exists.

    Args:
        wordcluemap: mapping of answer word to clue.
        rnd: random generator providing ``shuffle`` (e.g. ``random.Random``).

    Returns:
        ``{"grid": grid, "placed": [...]}`` where each placed entry has the
        keys ``word``, ``clue``, ``row``, ``col`` and ``direction``; or
        ``None`` when *wordcluemap* is empty or the first word cannot be
        placed.
    """
    if not wordcluemap:
        # Nothing to lay out; report it the same way as an unplaceable first word.
        return None

    words = sorted(wordcluemap, key=len, reverse=True)
    g = make_grid()
    placed = []

    first = words[0]
    sr = SIZE // 2
    sc = (SIZE - len(first)) // 2
    if not can_place_notouch(g, first, sr, sc, "horizontal"):
        return None
    place_word(g, first, sr, sc, "horizontal")
    placed.append({"word": first, "clue": wordcluemap[first], "row": sr, "col": sc, "direction": "horizontal"})

    for w in words[1:]:
        spots = find_spots(g, w, placed)
        if not spots:
            continue
        # Shuffle-then-take-first is kept (rather than rnd.choice) so a given
        # seed keeps producing the same puzzle.
        rnd.shuffle(spots)
        r, c, d = spots[0]
        place_word(g, w, r, c, d)
        placed.append({"word": w, "clue": wordcluemap[w], "row": r, "col": c, "direction": d})

    return {"grid": g, "placed": placed}


def export_format(puz, difficulty=1, rewards=None):
    """Convert a generated puzzle into the exported dict format.

    The output grid is cropped to the bounding box of all letter cells and
    arrow cells (the cell just before each word's first letter), widened by
    one cell on every side. Empty cells and cells outside the working grid
    are rendered as ``"#"``. Word coordinates are given relative to the
    cropped grid.

    Args:
        puz: dict with ``grid`` and ``placed`` as produced by ``generate_puzzle``.
        difficulty: value copied into the output.
        rewards: value copied into the output; defaults to
            ``{"coins": 50, "stars": 2, "hints": 1}``.

    Returns:
        dict with the keys ``gridv2`` (list of row strings), ``words``,
        ``difficulty`` and ``rewards``.
    """
    if rewards is None:
        rewards = {"coins": 50, "stars": 2, "hints": 1}

    grid = puz["grid"]
    entries = puz["placed"]
    height, width = len(grid), len(grid[0])

    def arrow_pos(entry):
        """Cell directly before the first letter of *entry*."""
        if entry["direction"] == "horizontal":
            return entry["row"], entry["col"] - 1
        return entry["row"] - 1, entry["col"]

    def letter_cells(entry):
        """Yield the (row, col) of every letter of *entry*."""
        across = entry["direction"] == "horizontal"
        for k in range(len(entry["word"])):
            if across:
                yield entry["row"], entry["col"] + k
            else:
                yield entry["row"] + k, entry["col"]

    used = []
    for entry in entries:
        used.extend(letter_cells(entry))
        used.append(arrow_pos(entry))

    used_rows = [r for r, _ in used]
    used_cols = [c for _, c in used]
    # Bounding box plus a one-cell margin on each side.
    top, left = min(used_rows) - 1, min(used_cols) - 1
    bottom, right = max(used_rows) + 1, max(used_cols) + 1

    def render(r, c):
        """Character for the exported grid at working-grid position (r, c)."""
        if not (0 <= r < height and 0 <= c < width):
            return "#"
        ch = grid[r][c]
        return "#" if ch == EMPTY else ch

    gridv2 = [
        "".join(render(r, c) for c in range(left, right + 1))
        for r in range(top, bottom + 1)
    ]

    words_out = []
    for entry in entries:
        arrow_r, arrow_c = arrow_pos(entry)
        words_out.append({
            "word": entry["word"],
            "clue": entry["clue"],
            "startRow": entry["row"] - top,
            "startCol": entry["col"] - left,
            "direction": entry["direction"],
            "answer": entry["word"],
            "arrowRow": arrow_r - top,
            "arrowCol": arrow_c - left,
        })

    return {"gridv2": gridv2, "words": words_out, "difficulty": difficulty, "rewards": rewards}


def list_models(base_url):
    """Return the model ids listed at ``{base_url}/models``.

    Reads the ``data`` list of the JSON response and collects every
    non-empty ``id``. Any failure (request, decoding, unexpected shape)
    results in an empty list.
    """
    try:
        raw = http_get(f"{base_url}/models")
        listing = json.loads(raw.decode("utf-8"))
        ids = []
        for model in listing.get("data", []):
            model_id = model.get("id")
            if model_id:
                ids.append(model_id)
        return ids
    except Exception:
        return []


def llm_make_wordcluemap(base_url, model, title, desc, n_words=12):
    """Ask the chat-completions endpoint for a ``{WORD: clue}`` map on a theme.

    A first request asks for *n_words* short Dutch words about *title*
    (with the first 260 characters of *desc* as context). If fewer than
    ``MIN_ACCEPT_WORDS`` valid entries come back, a second "repair" request
    feeds the raw first answer back and asks for a corrected JSON object;
    the repaired map is used only if it contains more valid entries.

    Returns the sanitized word -> clue dict (possibly smaller than
    ``MIN_ACCEPT_WORDS``). Request and response-shape errors propagate.
    """
    endpoint = f"{base_url}/chat/completions"

    def ask(user_text):
        """Send one chat request; return (raw reply text, sanitized map)."""
        response = http_post_json(endpoint, {
            "model": model,
            "temperature": 0.7,
            "messages": [
                {"role": "system", "content": "Return STRICT JSON object only."},
                {"role": "user", "content": user_text},
            ],
        })
        reply = response["choices"][0]["message"]["content"]
        return reply, sanitize_wordcluemap(extract_first_json(reply))

    prompt = f"""
Geef ALLEEN een JSON object terug (geen array, geen markdown).
Formaat exact:
{{
  "WOORD": "clue",
  ...
}}

REGELS:
- WOORD: alleen letters A-Z, geen streepjes/cijfers, lengte 3..7.
- Gebruik KORTE, GEBRUIKELIJKE Nederlandse woorden (geen jargon, geen moeilijke termen).
- Clue: korte, duidelijke hint in het Nederlands.
- Maak {n_words} items.
Thema: {title}
Context: {desc[:260]}
""".strip()

    content, wc = ask(prompt)
    if len(wc) >= MIN_ACCEPT_WORDS:
        return wc

    # Too few usable words: have the model rewrite its own answer under
    # stricter rules.
    repair = f"""
Zet dit om naar een STRICT JSON OBJECT (geen array) "WOORD":"clue".
KRITIEK:
- WOORD: A-Z only, lengte 3..7. GEEN lange woorden!
- Gebruik ALLEEN korte, bekende Nederlandse woorden bij twijfel.
- Vervang ongeldige/moeilijke woorden door veelvoorkomende synoniemen.
Input:
{content}
""".strip()

    _, repaired = ask(repair)
    return repaired if len(repaired) > len(wc) else wc


def main():
    """Generate today's crossword files and refresh ``index.json``.

    Configuration comes from environment variables (with defaults):
    ``LM_STUDIO_BASE_URL``, ``OUT_DIR``, ``PUZZLES_PER_DAY`` and ``LM_MODEL``
    (defaulting to the first model the server lists).

    For each of the ``PUZZLES_PER_DAY`` attempts a random RSS item is chosen
    (seeded with today's date), a word/clue map is requested from the LLM and
    laid out. Attempts that fail (request error, too few words, too few
    placed words) are skipped. Afterwards ``index.json`` in the output
    directory lists all of today's crossword files.

    Raises:
        SystemExit: if no RSS items could be fetched from any feed.
    """
    import sys  # local import: only used for diagnostics on stderr

    # Minimum number of placed words for a puzzle to be worth exporting.
    min_placed_words = 7

    base_url = env("LM_STUDIO_BASE_URL", "http://192.168.1.159:1234/v1")
    out_dir = env("OUT_DIR", "/data/puzzles")
    per_day = int(env("PUZZLES_PER_DAY", "3"))
    today = dt.date.today().isoformat()
    # Seeding with the date makes the item and layout choices repeatable
    # within a day.
    rnd = random.Random(today)

    os.makedirs(out_dir, exist_ok=True)

    items = []
    for feed in FEEDS:
        try:
            items.extend(fetch_rss_items(feed))
        except Exception as exc:
            # Best effort: one broken feed must not stop the run, but the
            # failure should be visible.
            print(f"warning: could not read feed {feed}: {exc}", file=sys.stderr)
    if not items:
        raise SystemExit("No RSS items found")

    models = list_models(base_url)
    model = env("LM_MODEL", models[0] if models else "model-identifier")

    made = 0
    for idx in range(1, per_day + 1):
        title, desc = rnd.choice(items)
        slug = safe_slug(title)

        try:
            wc = llm_make_wordcluemap(base_url, model, title, desc, n_words=TARGET_WORDS)
        except Exception as exc:
            # A single failed LLM request should not abort the remaining
            # puzzles or prevent index.json from being written.
            print(f"warning: LLM request failed for {title!r}: {exc}", file=sys.stderr)
            continue
        # The words are short, so a reasonable number of them is required.
        if len(wc) < MIN_ACCEPT_WORDS:
            continue

        puz = generate_puzzle(wc, rnd)
        if not puz or len(puz["placed"]) < min_placed_words:
            continue

        exported = export_format(puz, difficulty=1, rewards={"coins": 50, "stars": 2, "hints": 1})
        fn = f"crossword_{today}_{idx:02d}_{slug}.json"
        path = os.path.join(out_dir, fn)
        with open(path, "w", encoding="utf-8") as fp:
            json.dump(exported, fp, ensure_ascii=False, indent=2)
        made += 1

    # index.json (handy for the frontend): lists every crossword file for today.
    prefix = f"crossword_{today}_"
    files = sorted(f for f in os.listdir(out_dir) if f.startswith(prefix) and f.endswith(".json"))
    with open(os.path.join(out_dir, "index.json"), "w", encoding="utf-8") as fp:
        json.dump({"date": today, "files": files}, fp, ensure_ascii=False, indent=2)

    print(f"Generated {made} puzzles for {today}")


if __name__ == "__main__":
    main()