diff --git a/src/main/java/puzzle/CsvIndexService.java b/src/main/java/puzzle/CsvIndexService.java new file mode 100644 index 0000000..44dcddd --- /dev/null +++ b/src/main/java/puzzle/CsvIndexService.java @@ -0,0 +1,244 @@ +package puzzle; + +import com.google.gson.Gson; +import puzzle.SwedishGenerator.Lemma; +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.charset.StandardCharsets; +import java.nio.file.*; +import java.util.Arrays; +import java.util.Locale; +import java.util.function.Consumer; + +public final class CsvIndexService + implements Closeable { + + static final ScopedValue SC = ScopedValue.newInstance(); + static final Gson GSON = new Gson(); + private static final int MAGIC = 0x4C494458; // "LIDX" + private static final int VERSION = 1; + + private final Path csvPath; + private final Path idxPath; + + private volatile long[] offsets; // line-start byte offsets, loaded or rebuilt on first use + private volatile FileChannel csvChannel; // opened on first use, reused until close() + private final Object lock = new Object(); + + public CsvIndexService(Path csvPath, Path idxPath) { + this.csvPath = csvPath; + this.idxPath = idxPath; + } + + public static String[] lineToClue(String line) { + if (line.isBlank()) throw new RuntimeException("Empty line"); + var parts = line.split(",", 5); + var rawClue = parts[4].trim(); + if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) { + rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\""); + } + return GSON.fromJson(rawClue, String[].class); + } + public static void lineToLemma(String line, Consumer ok) { + if (line.isBlank()) { + throw new RuntimeException("Empty line"); + } + var parts = line.split(",", 5); + var id = Integer.parseInt(parts[0].trim()); + var word = parts[1].trim(); + if (!word.matches("^[A-Z]{2,8}$")) { + throw new RuntimeException("Invalid word:" + line); + } + + // CSV has level 1-10. llmScores use 10-level. 
+ int score = Integer.parseInt(parts[2].trim()); + if (score < 1) { + if (Main.VERBOSE) System.err.println("Word too complex: " + line); + return; + } + int simpel = Integer.parseInt(parts[3].trim()); + ok.accept(new Lemma(id, word, simpel)); + } + + public static String[] clues(int index) { + try { + if (SC.isBound()) + return lineToClue(SC.get().getLine(index)); + return new String[0]; + } catch (Exception e) { + throw new RuntimeException("Failed to get clues for index " + index, e); + } + } + /** Fetch a single line (0-based line index), with a self-healing index (one rebuild); throws RuntimeException if the line still does not start with its index after the rebuild. */ + public String getLine(int lineIndex) throws IOException { + ensureLoaded(); + + var line = readLineAt(lineIndex); + + if (startsWithIndex(line, lineIndex)) return line; + + // mismatch => rebuild the index and try one more time + synchronized (lock) { + rebuildIndexLocked(); + line = readLineAt(lineIndex); + if (startsWithIndex(line, lineIndex)) return line; + } + + throw new RuntimeException("Index mismatch after rebuild. 
Requested=" + lineIndex + ", got line=" + preview(line)); + } + + private void ensureLoaded() throws IOException { + if (offsets != null && csvChannel != null && csvChannel.isOpen()) return; + + synchronized (lock) { + if (offsets != null && csvChannel != null && csvChannel.isOpen()) return; + + csvChannel = FileChannel.open(csvPath, StandardOpenOption.READ); + + if (Files.exists(idxPath)) { + try { + offsets = readIndex(idxPath); + return; + } catch (IOException badIndex) { + // fall-through -> rebuild + } + } + + rebuildIndexLocked(); + } + } + + private void rebuildIndexLocked() throws IOException { + var built = buildOffsets(csvPath); + writeIndex(idxPath, built); + offsets = built; + } + + private String readLineAt(int lineIndex) throws IOException { + var local = offsets; + if (lineIndex < 0 || lineIndex >= local.length) { + throw new IndexOutOfBoundsException("lineIndex=" + lineIndex + ", max=" + (local.length - 1)); + } + + var start = local[lineIndex]; + csvChannel.position(start); + + // read in chunks (faster than one byte at a time) until a newline + var buf = new byte[8192]; + var total = 0; + var out = new byte[256]; + + while (true) { + var bb = ByteBuffer.wrap(buf); + var n = csvChannel.read(bb); + if (n < 0) break; // EOF + var end = n; + + for (var i = 0; i < end; i++) { + var b = buf[i]; + + if (b == (byte) '\n') { + // reposition the channel to the byte after the newline + long back = (end - i - 1); + csvChannel.position(csvChannel.position() - back); + return new String(out, 0, total, StandardCharsets.UTF_8); + } + if (b == (byte) '\r') continue; + + if (total == out.length) out = Arrays.copyOf(out, out.length * 2); + out[total++] = b; + } + } + + return new String(out, 0, total, StandardCharsets.UTF_8); + } + + /** Check: does the line start with the decimal lineIndex followed by "," */ + private static boolean startsWithIndex(String line, int lineIndex) { + if (line == null || line.isEmpty()) return false; + + var comma = line.indexOf(','); + if (comma <= 0) return false; + + // fast parse without split + long 
v = 0; + for (var i = 0; i < comma; i++) { + var c = line.charAt(i); + if (c < '0' || c > '9') return false; + v = (v * 10) + (c - '0'); + if (v > Integer.MAX_VALUE) return false; + } + return v == lineIndex; + } + + private static String preview(String s) { + if (s == null) return "null"; + return s.length() <= 120 ? s : s.substring(0, 120) + "..."; + } + + /** Build the offsets by scanning for newlines. The result is trimmed to its exact length. */ + public static long[] buildOffsets(Path path) throws IOException { + try (var ch = FileChannel.open(path, StandardOpenOption.READ)) { + var offs = new long[131072]; // initial capacity, grows as needed + var c = 0; + offs[c++] = 0L; + + var buf = ByteBuffer.allocateDirect(1 << 20); + long pos = 0; + + while (true) { + buf.clear(); + var n = ch.read(buf); + if (n < 0) break; + buf.flip(); + + for (var i = 0; i < n; i++) { + if (buf.get(i) == (byte) '\n') { + if (c == offs.length) offs = Arrays.copyOf(offs, offs.length * 2); + offs[c++] = pos + i + 1; + } + } + pos += n; + } + + return Arrays.copyOf(offs, c); + } + } + + public static void writeIndex(Path out, long[] offsets) throws IOException { + try (var dos = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream( + out, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)))) { + dos.writeInt(MAGIC); + dos.writeInt(VERSION); + dos.writeInt(offsets.length); + for (var v : offsets) dos.writeLong(v); + } + } + + public static long[] readIndex(Path in) throws IOException { + try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(in)))) { + var magic = dis.readInt(); + if (magic != MAGIC) throw new IOException("Not a LIDX file"); + + var version = dis.readInt(); + if (version != VERSION) throw new IOException("Unsupported version: " + version); + + var n = dis.readInt(); + if (n < 0) throw new IOException("Corrupt length: " + n); + + var offsets = new long[n]; + for (var i = 0; i < n; i++) offsets[i] = 
dis.readLong(); + return offsets; + } + } + + @Override + public void close() throws IOException { + synchronized (lock) { + if (csvChannel != null) csvChannel.close(); + csvChannel = null; + offsets = null; + } + } +} diff --git a/src/main/java/puzzle/Main.java b/src/main/java/puzzle/Main.java index 118a2c5..46717bb 100644 --- a/src/main/java/puzzle/Main.java +++ b/src/main/java/puzzle/Main.java @@ -13,9 +13,9 @@ import java.util.*; import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicLong; +import static puzzle.CsvIndexService.SC; import static puzzle.Export.*; import static puzzle.SwedishGenerator.*; -import static puzzle.SwedishGenerator.Dict.GSON; import static puzzle.SwedishGenerator.Dict.loadDict; public class Main { @@ -53,6 +53,11 @@ public class Main { } public void main(String[] args) { + var csv = Paths.get("nl_score_hints_v3.csv"); + var idx = Paths.get("nl_score_hints_v3.idx"); + ScopedValue.where(SC, new CsvIndexService(csv, idx)).run(() -> _main(args)); + } + public void _main(String[] args) { var opts = parseArgs(args); if (opts.reindex) { @@ -69,6 +74,8 @@ public class Main { section("Settings"); printSettings(opts); + var csv = Paths.get("nl_score_hints_v3.csv"); + var idx = Paths.get("nl_score_hints_v3.idx"); var res = generatePuzzle(opts); if (res == null) { @@ -377,7 +384,7 @@ public class Main { record JsonExportedPuzzle(String date, String theme, int difficulty, Rewards rewards, String[] grid, WordOut[] words) { } private static String toJson(ExportedPuzzle puzzle, String date, String theme) { - return GSON.toJson(new JsonExportedPuzzle(date, theme, puzzle.difficulty(), puzzle.rewards(), puzzle.gridv2(), puzzle.words())); + return CsvIndexService.GSON.toJson(new JsonExportedPuzzle(date, theme, puzzle.difficulty(), puzzle.rewards(), puzzle.gridv2(), puzzle.words())); } private static String escapeJson(String s) { diff --git a/src/main/java/puzzle/SwedishGenerator.java b/src/main/java/puzzle/SwedishGenerator.java index 
0c4b834..a9cb7b2 100644 --- a/src/main/java/puzzle/SwedishGenerator.java +++ b/src/main/java/puzzle/SwedishGenerator.java @@ -1,6 +1,5 @@ package puzzle; -import com.google.gson.Gson; import lombok.Getter; import lombok.val; import precomp.Neighbors9x8; @@ -36,6 +35,7 @@ public record SwedishGenerator(Rng rng) { //@formatter:off @FunctionalInterface interface SlotVisitor { void visit(int key, long packedPos, int len); } //@formatter:on + static final long GT_1_OFFSET_53_BIT = 0x3E00000000000000L; static final long X = 0L; static final int LOG_EVERY_MS = 200; static final int BAR_LEN = 22; @@ -208,7 +208,7 @@ public record SwedishGenerator(Rng rng) { if (idx < 64) lo &= ~(1L << idx); else hi &= ~(1L << (idx & 63)); } - static boolean isDigit(byte b) { return (b & 48) == 48; } + static boolean isDigit(byte b) { return (b & B48) == B48; } boolean isDigitAt(int index) { return isDigit(g[index]); } boolean isClue(long index) { if (index < 64) return ((lo >> index) & 1L) != X; @@ -242,9 +242,12 @@ public record SwedishGenerator(Rng rng) { } return true; } - static boolean isLetter(byte b) { return (b & 64) != 0; } + static final byte B0 = (byte) 0; + static final byte B64 = (byte) 64; + static final byte B48 = (byte) 48; + static boolean isLetter(byte b) { return (b & B64) != B0; } public boolean isLetterSet(int idx) { return isLetter(g[idx]); } - static boolean notDigit(byte b) { return (b & 48) != 48; } + static boolean notDigit(byte b) { return (b & B48) != B48; } public boolean isLetterAt(int index) { return notDigit(g[index]); } public double similarity(Grid b) { @@ -252,17 +255,11 @@ public record SwedishGenerator(Rng rng) { for (int i = 0; i < SIZE; i++) if (g[i] == b.g[i]) same++; return same / SIZED; } - int clueCount() { return Long.bitCount(lo) + Long.bitCount(hi); } - /* for (int k = 0, n = Math.min(MAX_WORD_LENGTH7, (int) (packed >>> 56) * 7); k < n; ) { - if (isClue((int) ((packed >>> k) & 0x7F))) break; - k += 7; - if (k >= MIN_LEN7) return true; - } - 
return false;*/ - boolean hasRoomForClue(long packed) { return ((packed >>> 56)) > 1L && notClue(packed & 0x7FL) && notClue((packed >>> 7) & 0x7FL); } + int clueCount() { return Long.bitCount(lo) + Long.bitCount(hi); } + boolean hasRoomForClue(long packed) { return (packed & GT_1_OFFSET_53_BIT) != X && notClue(packed & 0x7FL) && notClue((packed >>> 7) & 0x7FL); } void forEachSlot(SlotVisitor visitor) { for (var l = lo; l != X; l &= l - 1) processSlot(this, visitor, Long.numberOfTrailingZeros(l)); - for (var h = hi; h != X; h &= h - 1) processSlot(this, visitor, 64 + Long.numberOfTrailingZeros(h)); + for (var h = hi; h != X; h &= h - 1) processSlot(this, visitor, 64 | Long.numberOfTrailingZeros(h)); } } @@ -286,21 +283,21 @@ public record SwedishGenerator(Rng rng) { } } - public static record Lemma(int index, byte[] word, int simpel, String[] clue) { + public static record Lemma(int index, byte[] word, int simpel) { static int LEMMA_COUNTER = 0; - public Lemma(int index, String word, int simpel, String[] clu) { this(index, word.getBytes(StandardCharsets.US_ASCII), simpel, clu); } - public Lemma(String word, int simpel, String clue) { this(LEMMA_COUNTER++, word, simpel, new String[]{ clue }); } - byte byteAt(int idx) { return word[idx]; } - @Override public int hashCode() { return index; } - @Override public boolean equals(Object o) { return (o == this) || (o instanceof Lemma l && l.index == index); } + public Lemma(int index, String word, int simpel) { this(index, word.getBytes(StandardCharsets.US_ASCII), simpel); } + public Lemma(String word, int simpel) { this(LEMMA_COUNTER++, word, simpel); } + byte byteAt(int idx) { return word[idx]; } + @Override public int hashCode() { return index; } + @Override public boolean equals(Object o) { return (o == this) || (o instanceof Lemma l && l.index == index); } + String[] clue() { return CsvIndexService.clues(index); } } public static record Dict( DictEntry[] index, int length) { - static final Gson GSON = new Gson(); public 
Dict(Lemma[] wordz) { var index = new DictEntry[MAX_WORD_LENGTH_PLUS_ONE]; Arrays.setAll(index, i -> new DictEntry(i)); @@ -317,59 +314,18 @@ public record SwedishGenerator(Rng rng) { entry.pos[i][letter].add(idx); } } - for (int i = MIN_LEN; i < index.length; i++) { - var len = index[i].words.size(); - if (len <= 0) { - throw new RuntimeException("No words for length " + i); - } - } + for (int i = MIN_LEN; i < index.length; i++) if (index[i].words.size() <= 0) throw new RuntimeException("No words for length " + i); this(index, Arrays.stream(index).mapToInt(i -> i.words.size()).sum()); } static Dict loadDict(String wordsPath) { - String raw; try { - raw = Files.readString(Path.of(wordsPath), StandardCharsets.UTF_8); + var map = new ArrayList(); + Files.lines(Path.of(wordsPath), StandardCharsets.UTF_8).forEach(line -> CsvIndexService.lineToLemma(line, map::add)); + return new Dict(map.toArray(Lemma[]::new)); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException("Failed to load dictionary from " + wordsPath, e); } - - var map = new ArrayList(); - var first = true; - for (var line : raw.split("\\R")) { - if (line.isBlank()) { - System.err.println("Empty line: " + line); - continue; - } - var parts = line.split(",", 5); - var id = Integer.parseInt(parts[0].trim()); - var word = parts[1].trim(); - if (first && word.equalsIgnoreCase("WOORD")) { - first = false; - continue; - } - first = false; - var s = word.toUpperCase(Locale.ROOT); - if (!s.matches("^[A-Z]{2,8}$")) { - System.err.println("Invalid word: " + line); - continue; - } - - // CSV has level 1-10. llmScores use 10-level. 
- int score = Integer.parseInt(parts[2].trim()); - if (score < 1) { - if (Main.VERBOSE) System.err.println("Word too complex: " + line); - continue; - } - int simpel = Integer.parseInt(parts[3].trim()); - var rawClue = parts[4].trim(); - if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) { - rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\""); - } - map.add(new Lemma(id, s, simpel, GSON.fromJson(rawClue, String[].class))); - } - - return new Dict(map.toArray(Lemma[]::new)); } } diff --git a/src/test/java/puzzle/ExportFormatTest.java b/src/test/java/puzzle/ExportFormatTest.java index 8ac1a71..dd6e37b 100644 --- a/src/test/java/puzzle/ExportFormatTest.java +++ b/src/test/java/puzzle/ExportFormatTest.java @@ -10,6 +10,8 @@ import puzzle.SwedishGenerator.Grid; import puzzle.SwedishGenerator.Lemma; import puzzle.SwedishGenerator.Rng; +import java.io.IOException; +import java.nio.file.Paths; import java.util.HashMap; import static org.junit.jupiter.api.Assertions.*; @@ -29,7 +31,7 @@ public class ExportFormatTest { var clueMap = new HashMap(); // key = (cellIndex << 4) | direction var key = (0 << 4) | 2; - var lemma = new Lemma("TEST", 1, "A test word"); + var lemma = new Lemma("TEST", 1); clueMap.put(key, lemma); // Manually fill the grid letters for "TEST" at (0,1), (0,2), (0,3), (0,4) @@ -94,4 +96,16 @@ public class ExportFormatTest { assertTrue(row.matches("#+")); } } + @Test + void testIndex() { + var csv = Paths.get("nl_score_hints_v3.csv"); + var idx = Paths.get("nl_score_hints_v3.idx"); + + try (var svc = new CsvIndexService(csv, idx)) { + System.out.println(svc.getLine(1319)); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } diff --git a/src/test/java/puzzle/SwedishGeneratorTest.java b/src/test/java/puzzle/SwedishGeneratorTest.java index 99ea5c1..de27159 100644 --- a/src/test/java/puzzle/SwedishGeneratorTest.java +++ b/src/test/java/puzzle/SwedishGeneratorTest.java @@ -103,19 +103,19 @@ public class 
SwedishGeneratorTest { @Test void testLemmaAndDict() { - var l2a = new Lemma("IN", 1, "BIJ"); - var l4a = new Lemma("INER", 1, "BIJER"); - var l6a = new Lemma("INEREN", 1, "BIJERE"); - var l7a = new Lemma("INERENA", 1, "BIJERE"); - var l8a = new Lemma("INERENAE", 1, "BIJERE"); + var l2a = new Lemma("IN", 1); + var l4a = new Lemma("INER", 1); + var l6a = new Lemma("INEREN", 1); + var l7a = new Lemma("INERENA", 1); + var l8a = new Lemma("INERENAE", 1); - var l1 = new Lemma("APPLE", 5, "A fruit"); + var l1 = new Lemma("APPLE", 5); Assertions.assertArrayEquals("APPLE".getBytes(StandardCharsets.US_ASCII), l1.word()); assertEquals(5, l1.word().length); assertEquals(5, l1.simpel()); assertEquals((byte) 'A', l1.byteAt(0)); - var l2 = new Lemma("AXE", 2, "A tool"); + var l2 = new Lemma("AXE", 2); var dict = new Dict(new Lemma[]{ l1, l2, l2a, l4a, l6a, l7a, l8a }); assertEquals(1, dict.index()[3].words().size()); @@ -179,15 +179,15 @@ public class SwedishGeneratorTest { @Test void testCandidateInfoForPattern() { - var l0 = new Lemma("IN", 1, "BIJ"); - var l3a = new Lemma("INE", 1, "BIJE"); - var l4a = new Lemma("INER", 1, "BIJER"); - var l6a = new Lemma("INEREN", 1, "BIJERE"); - var l7a = new Lemma("INERENA", 1, "BIJERE"); - var l8a = new Lemma("INERENAE", 1, "BIJERE"); - var l1 = new Lemma("APPLE", 1, "fruit"); - var l2 = new Lemma("APPLY", 1, "verb"); - var l3 = new Lemma("BANAN", 1, "fruit"); + var l0 = new Lemma("IN", 1); + var l3a = new Lemma("INE", 1); + var l4a = new Lemma("INER", 1); + var l6a = new Lemma("INEREN", 1); + var l7a = new Lemma("INERENA", 1); + var l8a = new Lemma("INERENAE", 1); + var l1 = new Lemma("APPLE", 1); + var l2 = new Lemma("APPLY", 1); + var l3 = new Lemma("BANAN", 1); var dict = new Dict(new Lemma[]{ l0, l1, l2, l3, l3a, l4a, l6a, l7a, l8a }); // Pattern "APP--" for length 5 @@ -274,7 +274,7 @@ public class SwedishGeneratorTest { // r(i) and c(i) are used by placeWord. 
var packedPos = ((long) Grid.offset(0, 0)) | (((long) Grid.offset(0, 1)) << 7) | (((long) Grid.offset(0, 2)) << 14); var s = Slot.from(0, packedPos, 3); - var w1 = new Lemma("ABC", 1, "test"); + var w1 = new Lemma("ABC", 1); var undoBuffer = new int[10]; // 1. Successful placement in empty grid @@ -289,7 +289,7 @@ public class SwedishGeneratorTest { assertEquals(0L, undoBuffer[1]); // 0 new characters placed // 3. Conflict: place "ABD" where "ABC" is - var w2 = new Lemma("ABD", 1, "conflict"); + var w2 = new Lemma("ABD", 1); assertFalse(placeWord(grid, s, w2, undoBuffer, 2)); // Verify grid is unchanged (still "ABC") assertEquals('A', grid.byteAt(Grid.offset(0, 0))); @@ -312,7 +312,7 @@ public class SwedishGeneratorTest { // Slot at 0,1 length 2 var packedPos = ((long) Grid.offset(0, 1)) | (((long) Grid.offset(0, 2)) << 7); var s = Slot.from((0 << 8) | (1 << 4) | 2, packedPos, 2); - var w = new Lemma("AZ", 1, "A to Z"); + var w = new Lemma("AZ", 1); var undoBuffer = new int[10]; var placed = placeWord(grid, s, w, undoBuffer, 0);