Files
puzzle-generator/src/puzzle/ThemeGraph.java
2025-12-19 16:20:03 +01:00

206 lines
6.6 KiB
Java

package puzzle;
import java.util.*;
/**
 * ThemeGraph - links words to predefined themes so word lists can be filtered by topic.
 *
 * <p>Matching is purely lexical: a word is compared against a fixed set of Dutch keywords
 * per theme using exact match, substring containment and normalized Levenshtein similarity.
 * All comparisons are case-insensitive and independent of the JVM default locale.
 */
public class ThemeGraph {

    /** Theme returned by {@link #detectTheme(List)} when no theme shows any relevance. */
    private static final String DEFAULT_THEME = "algemeen";

    /** Score reported for a theme that has no keyword set (neither match nor mismatch). */
    private static final double UNKNOWN_THEME_SCORE = 0.5;

    /** Score for a word that is exactly one of the theme keywords. */
    private static final double EXACT_MATCH_SCORE = 1.0;

    /** Score for a word that contains a keyword or is contained in one. */
    private static final double SUBSTRING_MATCH_SCORE = 0.7;

    /** Minimum (exclusive) normalized similarity for a fuzzy match to count. */
    private static final double FUZZY_THRESHOLD = 0.8;

    /** Damping applied to fuzzy matches. */
    private static final double FUZZY_WEIGHT = 0.9;

    /**
     * Predefined theme keywords for Dutch word filtering.
     * Keys are lower-case theme names, values are upper-case keywords.
     * The map is unmodifiable and iterates in declaration order, which makes
     * tie-breaking between themes predictable.
     */
    private static final Map<String, Set<String>> THEME_KEYWORDS = buildThemeKeywords();

    /** Builds the immutable, insertion-ordered theme table. */
    private static Map<String, Set<String>> buildThemeKeywords() {
        Map<String, Set<String>> themes = new LinkedHashMap<>();
        // News/Politics
        themes.put("nieuws", Set.of(
            "POLITIEK", "VERKIEZING", "MINISTER", "PARLEMENT", "WET", "BELEID",
            "REGERING", "PARTIJ", "STEM", "KAMER", "RAAD", "STAAT"
        ));
        // Technology
        themes.put("technologie", Set.of(
            "COMPUTER", "INTERNET", "SOFTWARE", "APP", "DATA", "CODE",
            "NETWERK", "SYSTEEM", "DIGITAAL", "TECH", "ROBOT", "AI"
        ));
        // Sports
        themes.put("sport", Set.of(
            "VOETBAL", "TENNIS", "WIELREN", "SPELER", "WEDSTRIJD", "TEAM",
            "GOAL", "BAL", "SPEL", "WINNEN", "COACH", "ATLEET"
        ));
        // Weather/Nature
        themes.put("weer", Set.of(
            "REGEN", "ZON", "WIND", "WOLKEN", "STORM", "SNEEUW",
            "WEER", "KLIMAAT", "NATUUR", "LUCHT", "WARMTE", "KOU"
        ));
        // Economy
        themes.put("economie", Set.of(
            "GELD", "EURO", "MARKT", "PRIJS", "KOPEN", "VERKOOP",
            "BEDRIJF", "BANK", "HANDEL", "WINST", "SCHULD", "BUDGET"
        ));
        // Health
        themes.put("gezondheid", Set.of(
            "ZORG", "DOKTER", "MEDICIJN", "PATIENT", "ZIEKENHUIS", "GEZOND",
            "VIRUS", "VACCIN", "THERAPIE", "BEHANDEL", "ARTS", "KLINIEK"
        ));
        // General/Common
        themes.put("algemeen", Set.of(
            "HUIS", "AUTO", "BOOM", "WATER", "MENS", "TIJD",
            "LEVEN", "WERK", "SCHOOL", "FAMILIE", "STAD", "LAND"
        ));
        return Collections.unmodifiableMap(themes);
    }

    /**
     * Scores a word against a theme.
     *
     * <ul>
     *   <li>{@code 0.5} - the theme is unknown (neutral score)</li>
     *   <li>{@code 1.0} - the word is one of the theme keywords</li>
     *   <li>{@code 0.7} - the word contains a keyword, or a keyword contains the word</li>
     *   <li>{@code similarity * 0.9} - the best normalized edit-distance similarity to any
     *       keyword, when that similarity exceeds {@code 0.8}</li>
     *   <li>{@code 0.0} - no match, or the word is empty</li>
     * </ul>
     *
     * @param word  the word to score; case-insensitive
     * @param theme the theme name; case-insensitive
     * @return a relevance score between 0.0 and 1.0
     * @throws NullPointerException if {@code word} or {@code theme} is null
     */
    public static double scoreWordTheme(String word, String theme) {
        Objects.requireNonNull(word, "word");
        Objects.requireNonNull(theme, "theme");

        // Locale.ROOT keeps case mapping stable (e.g. no dotted/dotless I under a Turkish locale).
        Set<String> keywords = THEME_KEYWORDS.get(theme.toLowerCase(Locale.ROOT));
        if (keywords == null) {
            return UNKNOWN_THEME_SCORE;
        }

        String candidate = word.toUpperCase(Locale.ROOT);
        if (candidate.isEmpty()) {
            // Every string contains "", so an empty word would otherwise match all themes.
            return 0.0;
        }

        if (keywords.contains(candidate)) {
            return EXACT_MATCH_SCORE;
        }

        for (String keyword : keywords) {
            if (candidate.contains(keyword) || keyword.contains(candidate)) {
                return SUBSTRING_MATCH_SCORE;
            }
        }

        // Use the best similarity over all keywords: Set.of has no defined iteration order,
        // so returning on the first hit could yield different scores between JVM runs.
        double bestSimilarity = 0.0;
        for (String keyword : keywords) {
            bestSimilarity = Math.max(bestSimilarity, editDistanceSimilarity(candidate, keyword));
        }
        return bestSimilarity > FUZZY_THRESHOLD ? bestSimilarity * FUZZY_WEIGHT : 0.0;
    }

    /**
     * Filters a word list, keeping the words whose score for the theme reaches the threshold.
     *
     * @param words    the words to filter; input order is preserved
     * @param theme    the theme name; case-insensitive
     * @param minScore the minimum score (inclusive) a word needs to be kept
     * @return a new mutable list with the matching words
     */
    public static List<String> filterByTheme(List<String> words, String theme, double minScore) {
        List<String> filtered = new ArrayList<>();
        for (String word : words) {
            if (scoreWordTheme(word, theme) >= minScore) {
                filtered.add(word);
            }
        }
        return filtered;
    }

    /**
     * Lists the themes that are relevant for a word.
     *
     * @param word the word to look up; case-insensitive
     * @return themes with a score above 0.0, highest score first; equal scores are ordered
     *         by theme name so the result is deterministic
     */
    public static List<ThemeScore> getThemesForWord(String word) {
        List<ThemeScore> scores = new ArrayList<>();
        for (String theme : THEME_KEYWORDS.keySet()) {
            double score = scoreWordTheme(word, theme);
            if (score > 0.0) {
                scores.add(new ThemeScore(theme, score));
            }
        }
        scores.sort(Comparator.comparingDouble(ThemeScore::score)
            .reversed()
            .thenComparing(ThemeScore::theme));
        return scores;
    }

    /**
     * Detects the best fitting theme for a word list by averaging the per-word scores.
     *
     * @param words the words to analyse
     * @return the theme with the highest mean score (the first declared theme wins a tie);
     *         {@code "algemeen"} when the list is null or empty, or when no theme scores
     *         above 0.0
     */
    public static String detectTheme(List<String> words) {
        // Guard against division by zero: 0.0 / 0 is NaN and would make the choice arbitrary.
        if (words == null || words.isEmpty()) {
            return DEFAULT_THEME;
        }

        String bestTheme = DEFAULT_THEME;
        double bestMean = 0.0;
        for (String theme : THEME_KEYWORDS.keySet()) {
            double total = 0.0;
            for (String word : words) {
                total += scoreWordTheme(word, theme);
            }
            double mean = total / words.size();
            if (mean > bestMean) {
                bestMean = mean;
                bestTheme = theme;
            }
        }
        return bestTheme;
    }

    /**
     * Normalized Levenshtein similarity: {@code 1 - distance / max(len(a), len(b))}.
     *
     * @return 1.0 for identical strings (including two empty strings), 0.0 when every
     *         position differs
     */
    private static double editDistanceSimilarity(String a, String b) {
        int longest = Math.max(a.length(), b.length());
        if (longest == 0) {
            return 1.0;
        }
        return 1.0 - ((double) levenshtein(a, b) / longest);
    }

    /**
     * Levenshtein edit distance (insert, delete and substitute all cost 1).
     * Only two rows of the dynamic-programming table are kept at a time.
     */
    private static int levenshtein(String a, String b) {
        int[] previous = new int[b.length() + 1];
        int[] current = new int[b.length() + 1];
        for (int j = 0; j <= b.length(); j++) {
            previous[j] = j;
        }
        for (int i = 1; i <= a.length(); i++) {
            current[0] = i;
            for (int j = 1; j <= b.length(); j++) {
                int cost = (a.charAt(i - 1) == b.charAt(j - 1)) ? 0 : 1;
                current[j] = Math.min(
                    Math.min(previous[j] + 1, current[j - 1] + 1),
                    previous[j - 1] + cost
                );
            }
            int[] swap = previous;
            previous = current;
            current = swap;
        }
        // After the final swap the last computed row lives in 'previous'.
        return previous[b.length()];
    }

    /**
     * A theme together with the score a word achieved for it.
     *
     * @param theme the lower-case theme name
     * @param score the relevance score
     */
    public record ThemeScore(String theme, double score) {
        @Override
        public String toString() {
            return String.format("%s: %.2f", theme, score);
        }
    }

    // ---- Main for testing ----

    /** Small manual smoke test that prints scoring, detection and filtering results. */
    public static void main(String[] args) {
        System.out.println("=== Theme Graph Test ===\n");

        // Test word scoring
        var testWords = new String[]{ "POLITIEK", "VOETBAL", "COMPUTER", "REGEN", "AUTO" };
        for (var word : testWords) {
            System.out.println("Word: " + word);
            for (var themeScore : getThemesForWord(word)) {
                System.out.println(" " + themeScore);
            }
            System.out.println();
        }

        // Test theme detection
        var techWords = List.of("COMPUTER", "INTERNET", "SOFTWARE", "DATA");
        var detected = detectTheme(techWords);
        System.out.println("Detected theme for tech words: " + detected);

        // Test filtering
        var allWords = List.of(
            "POLITIEK", "COMPUTER", "AUTO", "VOETBAL", "INTERNET", "BOOM"
        );
        var filtered = filterByTheme(allWords, "technologie", 0.5);
        System.out.println("\nFiltered for 'technologie' (min 0.5): " + filtered);
    }
}