-- 0) Optioneel: index (handig) CREATE INDEX IF NOT EXISTS idx_frequency_word ON frequency(Word); -- 1) View: normalisatie voor Zipf en lengte (0..1) DROP VIEW IF EXISTS word_norm; CREATE VIEW word_norm AS WITH base AS ( SELECT Word, CAST(NULLIF(REPLACE(Zipf, ',', '.'), '') AS REAL) AS zipf_num, LENGTH(Word) AS len FROM frequency WHERE Word IS NOT NULL ), stats AS ( SELECT MIN(zipf_num) AS min_zipf, MAX(zipf_num) AS max_zipf, MIN(len) AS min_len, MAX(len) AS max_len FROM base WHERE zipf_num IS NOT NULL ) SELECT b.Word, b.zipf_num, b.len, -- freq_norm: 0 = heel frequent, 1 = heel zeldzaam (1.0 - (b.zipf_num - s.min_zipf) / NULLIF(s.max_zipf - s.min_zipf, 0.0)) AS freq_norm, -- len_norm: 0..1 ( (b.len - s.min_len) / NULLIF(CAST(s.max_len - s.min_len AS REAL), 0.0) ) AS len_norm FROM base b CROSS JOIN stats s WHERE b.zipf_num IS NOT NULL; -- 2) Letter-frequenties bouwen (letter_freq) CREATE TABLE IF NOT EXISTS letter_freq ( ch TEXT PRIMARY KEY, cnt INTEGER NOT NULL ); DELETE FROM letter_freq; WITH RECURSIVE chars(word, pos, ch) AS ( SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1)) FROM frequency WHERE Word IS NOT NULL AND LENGTH(Word) > 0 UNION ALL SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1)) FROM chars WHERE pos < LENGTH(word) ) INSERT INTO letter_freq(ch, cnt) SELECT ch, COUNT(*) AS cnt FROM chars WHERE ch <> '' -- basic guard GROUP BY ch; -- 3) View: letterRarity per woord (0..1) DROP VIEW IF EXISTS letter_rarity; CREATE VIEW letter_rarity AS WITH total AS ( SELECT CAST(SUM(cnt) AS REAL) AS total_cnt FROM letter_freq ), word_chars(word, pos, ch) AS ( SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1)) FROM frequency WHERE Word IS NOT NULL AND LENGTH(Word) > 0 UNION ALL SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1)) FROM word_chars WHERE pos < LENGTH(word) ) SELECT wc.word AS Word, AVG(1.0 - (lf.cnt / t.total_cnt)) AS letter_rarity_norm FROM word_chars wc JOIN letter_freq lf ON lf.ch = wc.ch CROSS JOIN total t GROUP BY wc.word; -- 4) View: complexiteit v1 (ngramProb + morfologie = placeholder 0.5) DROP VIEW IF EXISTS word_complexity_v1; CREATE VIEW word_complexity_v1 AS SELECT n.Word, (0.35 * n.freq_norm) + (0.20 * n.len_norm) + (0.20 * 0.5) + -- ngramProb placeholder (0.15 * COALESCE(r.letter_rarity_norm, 0.5)) + (0.10 * 0.5) -- morfologie placeholder AS complexity FROM word_norm n LEFT JOIN letter_rarity r USING (Word);