Gather data
This commit is contained in:
100
.db/init.sql
Normal file
100
.db/init.sql
Normal file
@@ -0,0 +1,100 @@
|
||||
-- 0) Optioneel: index (handig)
|
||||
CREATE INDEX IF NOT EXISTS idx_frequency_word ON frequency(Word);
|
||||
|
||||
-- 1) View: normalisatie voor Zipf en lengte (0..1)
|
||||
DROP VIEW IF EXISTS word_norm;
|
||||
CREATE VIEW word_norm AS
|
||||
WITH base AS (
|
||||
SELECT
|
||||
Word,
|
||||
CAST(NULLIF(REPLACE(Zipf, ',', '.'), '') AS REAL) AS zipf_num,
|
||||
LENGTH(Word) AS len
|
||||
FROM frequency
|
||||
WHERE Word IS NOT NULL
|
||||
),
|
||||
stats AS (
|
||||
SELECT
|
||||
MIN(zipf_num) AS min_zipf,
|
||||
MAX(zipf_num) AS max_zipf,
|
||||
MIN(len) AS min_len,
|
||||
MAX(len) AS max_len
|
||||
FROM base
|
||||
WHERE zipf_num IS NOT NULL
|
||||
)
|
||||
SELECT
|
||||
b.Word,
|
||||
b.zipf_num,
|
||||
b.len,
|
||||
|
||||
-- freq_norm: 0 = heel frequent, 1 = heel zeldzaam
|
||||
(1.0 - (b.zipf_num - s.min_zipf) / NULLIF(s.max_zipf - s.min_zipf, 0.0)) AS freq_norm,
|
||||
|
||||
-- len_norm: 0..1
|
||||
( (b.len - s.min_len) / NULLIF(CAST(s.max_len - s.min_len AS REAL), 0.0) ) AS len_norm
|
||||
FROM base b
|
||||
CROSS JOIN stats s
|
||||
WHERE b.zipf_num IS NOT NULL;
|
||||
|
||||
|
||||
-- 2) Letter-frequenties bouwen (letter_freq)
|
||||
CREATE TABLE IF NOT EXISTS letter_freq (
|
||||
ch TEXT PRIMARY KEY,
|
||||
cnt INTEGER NOT NULL
|
||||
);
|
||||
|
||||
DELETE FROM letter_freq;
|
||||
|
||||
WITH RECURSIVE chars(word, pos, ch) AS (
|
||||
SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1))
|
||||
FROM frequency
|
||||
WHERE Word IS NOT NULL AND LENGTH(Word) > 0
|
||||
UNION ALL
|
||||
SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1))
|
||||
FROM chars
|
||||
WHERE pos < LENGTH(word)
|
||||
)
|
||||
INSERT INTO letter_freq(ch, cnt)
|
||||
SELECT ch, COUNT(*) AS cnt
|
||||
FROM chars
|
||||
WHERE ch <> '' -- basic guard
|
||||
GROUP BY ch;
|
||||
|
||||
|
||||
-- 3) View: letterRarity per woord (0..1)
|
||||
DROP VIEW IF EXISTS letter_rarity;
|
||||
CREATE VIEW letter_rarity AS
|
||||
WITH total AS (
|
||||
SELECT CAST(SUM(cnt) AS REAL) AS total_cnt
|
||||
FROM letter_freq
|
||||
),
|
||||
word_chars(word, pos, ch) AS (
|
||||
SELECT Word, 1, LOWER(SUBSTR(Word, 1, 1))
|
||||
FROM frequency
|
||||
WHERE Word IS NOT NULL AND LENGTH(Word) > 0
|
||||
UNION ALL
|
||||
SELECT word, pos + 1, LOWER(SUBSTR(word, pos + 1, 1))
|
||||
FROM word_chars
|
||||
WHERE pos < LENGTH(word)
|
||||
)
|
||||
SELECT
|
||||
wc.word AS Word,
|
||||
AVG(1.0 - (lf.cnt / t.total_cnt)) AS letter_rarity_norm
|
||||
FROM word_chars wc
|
||||
JOIN letter_freq lf ON lf.ch = wc.ch
|
||||
CROSS JOIN total t
|
||||
GROUP BY wc.word;
|
||||
|
||||
|
||||
-- 4) View: complexiteit v1 (ngramProb + morfologie = placeholder 0.5)
|
||||
DROP VIEW IF EXISTS word_complexity_v1;
|
||||
CREATE VIEW word_complexity_v1 AS
|
||||
SELECT
|
||||
n.Word,
|
||||
(0.35 * n.freq_norm) +
|
||||
(0.20 * n.len_norm) +
|
||||
(0.20 * 0.5) + -- ngramProb placeholder
|
||||
(0.15 * COALESCE(r.letter_rarity_norm, 0.5)) +
|
||||
(0.10 * 0.5) -- morfologie placeholder
|
||||
AS complexity
|
||||
FROM word_norm n
|
||||
LEFT JOIN letter_rarity r USING (Word);
|
||||
Reference in New Issue
Block a user