206 lines
6.6 KiB
Java
206 lines
6.6 KiB
Java
package puzzle;
|
|
|
|
import java.util.*;
|
|
|
|
/**
 * ThemeGraph - scores and filters (Dutch) words against a fixed set of themes.
 *
 * <p>Every theme owns a small set of upper-case keywords. A word is related to a
 * theme purely lexically, in this order: exact keyword match, substring
 * containment, and finally normalized Levenshtein similarity. No embeddings or
 * co-occurrence statistics are involved.
 *
 * <p>All methods are static and only read the immutable keyword table.
 */
public class ThemeGraph {

    /** Theme returned by {@link #detectTheme(List)} when nothing matches. */
    private static final String DEFAULT_THEME = "algemeen";

    /** Neutral score handed out for themes that are not in the keyword table. */
    private static final double UNKNOWN_THEME_SCORE = 0.5;

    /** Score for a word that contains, or is contained in, a keyword. */
    private static final double SUBSTRING_SCORE = 0.7;

    /** Minimum normalized similarity (exclusive) for a fuzzy match to count. */
    private static final double FUZZY_THRESHOLD = 0.8;

    /** Damping factor so a fuzzy match always scores below an exact match. */
    private static final double FUZZY_WEIGHT = 0.9;

    /**
     * Predefined theme keywords for Dutch word filtering. Keys are lower-case
     * theme names, values are upper-case keywords. The map is unmodifiable and
     * iterates in declaration order, which keeps tie-breaking deterministic.
     */
    private static final Map<String, Set<String>> THEME_KEYWORDS = buildThemeKeywords();

    /** Builds the unmodifiable, insertion-ordered theme table. */
    private static Map<String, Set<String>> buildThemeKeywords() {
        Map<String, Set<String>> themes = new LinkedHashMap<>();

        // News/Politics
        themes.put("nieuws", Set.of(
            "POLITIEK", "VERKIEZING", "MINISTER", "PARLEMENT", "WET", "BELEID",
            "REGERING", "PARTIJ", "STEM", "KAMER", "RAAD", "STAAT"
        ));

        // Technology
        themes.put("technologie", Set.of(
            "COMPUTER", "INTERNET", "SOFTWARE", "APP", "DATA", "CODE",
            "NETWERK", "SYSTEEM", "DIGITAAL", "TECH", "ROBOT", "AI"
        ));

        // Sports
        themes.put("sport", Set.of(
            "VOETBAL", "TENNIS", "WIELREN", "SPELER", "WEDSTRIJD", "TEAM",
            "GOAL", "BAL", "SPEL", "WINNEN", "COACH", "ATLEET"
        ));

        // Weather/Nature
        themes.put("weer", Set.of(
            "REGEN", "ZON", "WIND", "WOLKEN", "STORM", "SNEEUW",
            "WEER", "KLIMAAT", "NATUUR", "LUCHT", "WARMTE", "KOU"
        ));

        // Economy
        themes.put("economie", Set.of(
            "GELD", "EURO", "MARKT", "PRIJS", "KOPEN", "VERKOOP",
            "BEDRIJF", "BANK", "HANDEL", "WINST", "SCHULD", "BUDGET"
        ));

        // Health
        themes.put("gezondheid", Set.of(
            "ZORG", "DOKTER", "MEDICIJN", "PATIENT", "ZIEKENHUIS", "GEZOND",
            "VIRUS", "VACCIN", "THERAPIE", "BEHANDEL", "ARTS", "KLINIEK"
        ));

        // General/Common
        themes.put("algemeen", Set.of(
            "HUIS", "AUTO", "BOOM", "WATER", "MENS", "TIJD",
            "LEVEN", "WERK", "SCHOOL", "FAMILIE", "STAD", "LAND"
        ));

        return Collections.unmodifiableMap(themes);
    }

    /**
     * Scores a word against a theme (0.0 = no match, 1.0 = perfect match).
     *
     * <ul>
     *   <li>unknown theme: 0.5 (neutral)</li>
     *   <li>empty word: 0.0</li>
     *   <li>exact keyword: 1.0</li>
     *   <li>word contains a keyword or a keyword contains the word: 0.7</li>
     *   <li>best normalized Levenshtein similarity above 0.8: similarity * 0.9</li>
     *   <li>otherwise: 0.0</li>
     * </ul>
     *
     * @param word  word to score; compared case-insensitively
     * @param theme theme name; looked up case-insensitively
     * @return score in the range [0.0, 1.0]
     * @throws NullPointerException if {@code word} or {@code theme} is null
     */
    public static double scoreWordTheme(String word, String theme) {
        Objects.requireNonNull(word, "word");
        Objects.requireNonNull(theme, "theme");

        // Locale.ROOT: default-locale case mapping (e.g. Turkish dotless i)
        // would otherwise break the lookup of plain ASCII theme names.
        var keywords = THEME_KEYWORDS.get(theme.toLowerCase(Locale.ROOT));
        if (keywords == null) {
            return UNKNOWN_THEME_SCORE; // unknown theme = neutral score
        }

        var candidate = word.toUpperCase(Locale.ROOT);
        if (candidate.isEmpty()) {
            // Every string contains "", so without this guard an empty word
            // would count as a substring match for every theme.
            return 0.0;
        }

        // Direct match
        if (keywords.contains(candidate)) {
            return 1.0;
        }

        // Substring match (partial relevance)
        for (var keyword : keywords) {
            if (candidate.contains(keyword) || keyword.contains(candidate)) {
                return SUBSTRING_SCORE;
            }
        }

        // Edit distance similarity (for typos/variations). Take the best
        // keyword, not the first one over the threshold: Set.of has no
        // defined iteration order, so "first" would vary between runs.
        var best = 0.0;
        for (var keyword : keywords) {
            best = Math.max(best, editDistanceSimilarity(candidate, keyword));
        }
        return best > FUZZY_THRESHOLD ? best * FUZZY_WEIGHT : 0.0;
    }

    /**
     * Filters a word list by theme with a minimum score threshold.
     *
     * @param words    words to filter; input order is preserved
     * @param theme    theme name passed to {@link #scoreWordTheme(String, String)}
     * @param minScore inclusive lower bound a word's score must reach
     * @return new mutable list with the words whose score is at least {@code minScore}
     * @throws NullPointerException if {@code words}, one of its elements, or {@code theme} is null
     */
    public static List<String> filterByTheme(List<String> words, String theme, double minScore) {
        Objects.requireNonNull(words, "words");

        List<String> accepted = new ArrayList<>();
        for (var word : words) {
            if (scoreWordTheme(word, theme) >= minScore) {
                accepted.add(word);
            }
        }
        return accepted;
    }

    /**
     * Returns theme suggestions for a word, best score first.
     *
     * @param word word to classify
     * @return new mutable list of every theme with a score above 0.0, sorted by
     *         descending score; equal scores are ordered by theme name
     * @throws NullPointerException if {@code word} is null
     */
    public static List<ThemeScore> getThemesForWord(String word) {
        List<ThemeScore> matches = new ArrayList<>();
        for (var theme : THEME_KEYWORDS.keySet()) {
            var score = scoreWordTheme(word, theme);
            if (score > 0.0) {
                matches.add(new ThemeScore(theme, score));
            }
        }
        matches.sort(
            Comparator.comparingDouble(ThemeScore::score)
                .reversed()
                .thenComparing(ThemeScore::theme));
        return matches;
    }

    /**
     * Auto-detects the best theme for a word list.
     *
     * <p>The winner is the theme with the highest summed score, which ranks
     * identically to the highest average because every theme sees the same
     * words. Ties go to the theme declared first in the keyword table.
     *
     * @param words words to analyse
     * @return name of the best scoring theme, or {@code "algemeen"} when the
     *         list is empty or no word scores above 0.0 for any theme
     * @throws NullPointerException if {@code words} or one of its elements is null
     */
    public static String detectTheme(List<String> words) {
        Objects.requireNonNull(words, "words");

        var bestTheme = DEFAULT_THEME;
        var bestTotal = 0.0;
        for (var theme : THEME_KEYWORDS.keySet()) {
            var total = 0.0;
            for (var word : words) {
                total += scoreWordTheme(word, theme);
            }
            // Strictly greater: a theme must score above zero to displace the
            // default, and the first theme wins a tie.
            if (total > bestTotal) {
                bestTotal = total;
                bestTheme = theme;
            }
        }
        return bestTheme;
    }

    /**
     * Simple edit distance similarity (normalized Levenshtein).
     *
     * @return 1.0 for identical strings (including two empty strings), down to
     *         0.0 when every position has to be edited
     */
    private static double editDistanceSimilarity(String a, String b) {
        var longest = Math.max(a.length(), b.length());
        if (longest == 0) {
            return 1.0;
        }
        return 1.0 - ((double) levenshtein(a, b) / longest);
    }

    /**
     * Levenshtein distance with unit costs for insert, delete and substitute.
     * Only two rows of the DP table are kept, since each row depends solely on
     * the row before it.
     */
    private static int levenshtein(String a, String b) {
        var previous = new int[b.length() + 1];
        var current = new int[b.length() + 1];

        for (var j = 0; j <= b.length(); j++) {
            previous[j] = j;
        }

        for (var i = 1; i <= a.length(); i++) {
            current[0] = i;
            for (var j = 1; j <= b.length(); j++) {
                var cost = (a.charAt(i - 1) == b.charAt(j - 1)) ? 0 : 1;
                current[j] = Math.min(
                    Math.min(previous[j] + 1, current[j - 1] + 1),
                    previous[j - 1] + cost
                );
            }
            var swap = previous;
            previous = current;
            current = swap;
        }

        // After the final swap the last computed row lives in 'previous'.
        return previous[b.length()];
    }

    /**
     * A theme name paired with the score a word achieved for it.
     *
     * @param theme lower-case theme name
     * @param score score in the range [0.0, 1.0]
     */
    public record ThemeScore(String theme, double score) {

        /** Renders as {@code theme: score} with two decimals (default locale). */
        @Override
        public String toString() {
            return String.format("%s: %.2f", theme, score);
        }
    }

    // ---- Main for testing ----

    /** Small manual smoke test that prints scoring, detection and filtering results. */
    public static void main(String[] args) {
        System.out.println("=== Theme Graph Test ===\n");

        // Test word scoring
        var testWords = new String[]{ "POLITIEK", "VOETBAL", "COMPUTER", "REGEN", "AUTO" };
        for (var word : testWords) {
            System.out.println("Word: " + word);
            var themes = getThemesForWord(word);
            for (var ts : themes) {
                System.out.println(" " + ts);
            }
            System.out.println();
        }

        // Test theme detection
        var techWords = Arrays.asList("COMPUTER", "INTERNET", "SOFTWARE", "DATA");
        var detected = detectTheme(techWords);
        System.out.println("Detected theme for tech words: " + detected);

        // Test filtering
        var allWords = Arrays.asList(
            "POLITIEK", "COMPUTER", "AUTO", "VOETBAL", "INTERNET", "BOOM"
        );
        var filtered = filterByTheme(allWords, "technologie", 0.5);
        System.out.println("\nFiltered for 'technologie' (min 0.5): " + filtered);
    }
}
|