package puzzle; import module java.base; import lombok.Getter; import lombok.NoArgsConstructor; import lombok.experimental.Accessors; import lombok.val; import puzzle.SwedishGenerator.Dict; import puzzle.SwedishGenerator.DictEntry; import puzzle.SwedishGenerator.Lemma; import static java.nio.charset.StandardCharsets.US_ASCII; public final class DictJavaGeneratorMulti { // Smaller = more files, but safer for javac/class limits. private static final int WORDS_CHUNK = 8_192; private static final int POS_CHUNK = 8_192; public static void main(String[] args) throws Exception { var THRESS = 900; var wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v4.csv"); var outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle/dict" + THRESS); var pkg = "puzzle.dict" + THRESS; var builders = new HashMap(16); var dict = buildDict(wordsFile, builders, THRESS); Files.createDirectories(outDir); // Generate L2..L8 for (var L = 2; L <= 8; L++) { var entry = dict.index()[L]; if (entry == null || entry.words() == null || entry.words().length == 0) { throw new IllegalStateException("No words for length " + L); } writeLengthBundle(outDir, pkg, L, entry); } // Aggregator writeAggregator(outDir, pkg, "DictData", dict.length(), THRESS); System.out.println("Generated sources into: " + outDir.toAbsolutePath()); } static String shardKey(long word) { return "" + Lemma.unpackSize(word) + 1; } private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap builders, int thress) throws IOException { var map = new LongArrayList(100_000); try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) { lines.forEach(line -> { var parts = line.split(",", 4); var word = parts[0].trim(); var w = SwedishGenerator.Lemma.from(word.getBytes(US_ASCII)); if (!word.equals(SwedishGenerator.Lemma.asWord(w, Export.BYTES.get()))) { throw new RuntimeException(); } var score = Integer.parseInt(parts[1].trim()); var simpel = Integer.parseInt(parts[CsvIndexService.SIMPEL_IDX].trim()); if (score < 1 || simpel > thress) { if (Main.VERBOSE) System.err.println("Word too complex: " + line); return; } var key = shardKey(w); var sb = builders.computeIfAbsent(key, k -> new ShardBuilder()); map.add(Lemma.pack(w, sb.addRecord())); }); } return Dicts.makeDict(map.toArray()); } interface Dicts { static Dict makeDict(long[] wordz) { var index = new DictEntryDTO[SwedishGenerator.MAX_WORD_LENGTH_PLUS_ONE]; Arrays.setAll(index, DictEntryDTO::new); for (var lemma : wordz) { var L = Lemma.unpackSize(lemma) + 1;//Lemma.unpackSize(lemma) + 2; val entry = index[L]; val idx = entry.words().size(); val pos = entry.pos(); entry.words().add(lemma); var i = 0; for (var w = lemma & Lemma.LETTER_MASK; w != 0; w >>>= 5, i++) { pos[i][(int) ((w & 31) - 1)].add(idx); } } for (var i = 2; i < index.length; i++) if (index[i].words().size() <= 0) throw new RuntimeException("No words for length " + i); return new Dict(Arrays.stream(index).map(i -> { var words = i.words().toArray(); var numWords = words.length; var numLongs = (numWords + 63) >>> 6; var bitsets = new long[i.pos().length * 26][numLongs]; for (var p = 0; p < i.pos().length; p++) { for (var l = 0; l < 26; l++) { var list = i.pos()[p][l]; var bs = bitsets[p * 26 + l]; for (var k = 0; k < list.size(); k++) { var wordIdx = list.data()[k]; bs[wordIdx >>> 6] |= (1L << (wordIdx & 63)); } } } return new DictEntry(words, bitsets, words.length, (words.length + 63) >>> 6); }).toArray(DictEntry[]::new), Arrays.stream(index).mapToInt(i -> i.words().size()).sum()); } } static final class ShardBuilder { int c; int addRecord() { return c++; } } private static void writeAggregator(Path outDir, String pkg, String cls, int totalLen, int thress) throws IOException { var out = outDir.resolve(cls + ".java"); try (var w = writer(out)) { w.write("package " + pkg + ";\n\n"); w.write("public final class " + cls + " {\n"); w.write(" private " + cls + "() {}\n\n"); w.write(" public static final puzzle.SwedishGenerator.Dict DICT" + thress + " = build();\n\n"); w.write(" private static puzzle.SwedishGenerator.Dict build() {\n"); w.write(" puzzle.SwedishGenerator.DictEntry[] idx = new puzzle.SwedishGenerator.DictEntry[puzzle.SwedishGenerator.MAX_WORD_LENGTH_PLUS_ONE];\n"); for (var L = 2; L <= 8; L++) w.write(" idx[" + L + "] = DictDataL" + L + ".entry();\n"); w.write(" return new puzzle.SwedishGenerator.Dict(idx, " + totalLen + ");\n"); w.write(" }\n"); w.write("}\n"); } } private static void writeLengthBundle(Path outDir, String pkg, int L, SwedishGenerator.DictEntry e) throws IOException { var words = e.words(); // flatten posBitsets: [rows][cols] -> flat[] var bs = e.posBitsets(); var rows = bs.length; var cols = bs[0].length; var flat = new long[rows * cols]; var t = 0; for (var r = 0; r < rows; r++) { System.arraycopy(bs[r], 0, flat, t, cols); t += cols; } var base = "DictDataL" + L; // 1) chunk classes var wChunks = writeChunkClasses(outDir, pkg, base + "W", words, WORDS_CHUNK); var pChunks = writeChunkClasses(outDir, pkg, base + "P", flat, POS_CHUNK); // 2) assembler class writeLengthAssembler(outDir, pkg, base, L, rows, cols, words.length, flat.length, wChunks, pChunks); } /** Writes classes like Prefix0..PrefixN each with static long[] DATA. Returns chunk count. */ private static int writeChunkClasses(Path outDir, String pkg, String prefix, long[] data, int chunkSize) throws IOException { var chunks = (data.length + chunkSize - 1) / chunkSize; for (var ci = 0; ci < chunks; ci++) { var from = ci * chunkSize; var to = Math.min(data.length, from + chunkSize); var out = outDir.resolve(prefix + ci + ".java"); try (var w = writer(out)) { w.write("package " + pkg + ";\n\n"); w.write("public final class " + prefix + ci + " {\n"); w.write(" private " + prefix + ci + "() {}\n"); w.write(" public static long[] get() {\n"); w.write(" return new long[] { \n"); for (var i = from; i < to; i++) { w.write(" " + toLongLiteral(data[i]) + (i + 1 < to ? "," : "") + "\n"); } w.write(" };\n"); w.write(" }\n"); w.write("}\n"); } } return chunks; } private static void writeLengthAssembler(Path outDir, String pkg, String cls, int L, int rows, int cols, int wordsLen, int posLen, int wChunks, int pChunks) throws IOException { var out = outDir.resolve(cls + ".java"); try (var w = writer(out)) { w.write("package " + pkg + ";\n\n"); w.write("public final class " + cls + " {\n"); w.write(" private " + cls + "() {}\n\n"); w.write(" static final int LEN = " + L + ";\n"); w.write(" static final int ROWS = " + rows + ";\n"); w.write(" static final int COLS = " + cols + ";\n"); w.write(" static final int WORDS_LEN = " + wordsLen + ";\n"); w.write(" static final int POS_LEN = " + posLen + ";\n\n"); // assemble words w.write(" private static long[] words() {\n"); var wPrefix = "DictDataL" + L + "W"; if (wChunks == 1) { w.write(" return " + wPrefix + "0.get();\n"); } else { w.write(" long[] out = new long[WORDS_LEN];\n"); w.write(" int k = 0;\n"); for (var ci = 0; ci < wChunks; ci++) { w.write(" k = copy(out, k, " + wPrefix + ci + ".get());\n"); } w.write(" return out;\n"); } w.write(" }\n\n"); // assemble pos w.write(" private static long[] posFlat() {\n"); var pPrefix = "DictDataL" + L + "P"; if (pChunks == 1) { w.write(" return " + pPrefix + "0.get();\n"); } else { w.write(" long[] out = new long[POS_LEN];\n"); w.write(" int k = 0;\n"); for (var ci = 0; ci < pChunks; ci++) { w.write(" k = copy(out, k, " + pPrefix + ci + ".get());\n"); } w.write(" return out;\n"); } w.write(" }\n\n"); // entry w.write(" public static puzzle.SwedishGenerator.DictEntry entry() {\n"); w.write(" long[] wds = words();\n"); w.write(" long[] flat = posFlat();\n"); w.write(" long[][] pos = reshape(flat, ROWS, COLS);\n"); w.write(" return new puzzle.SwedishGenerator.DictEntry(wds, pos, wds.length, (wds.length + 63) >>> 6);\n"); w.write(" }\n\n"); // helpers w.write(" private static int copy(long[] dst, int at, long[] src) {\n"); w.write(" System.arraycopy(src, 0, dst, at, src.length);\n"); w.write(" return at + src.length;\n"); w.write(" }\n\n"); w.write(" private static long[][] reshape(long[] flat, int rows, int cols) {\n"); w.write(" long[][] out = new long[rows][cols];\n"); w.write(" int k = 0;\n"); w.write(" for (int r = 0; r < rows; r++) {\n"); w.write(" System.arraycopy(flat, k, out[r], 0, cols);\n"); w.write(" k += cols;\n"); w.write(" }\n"); w.write(" return out;\n"); w.write(" }\n"); w.write("}\n"); } } private static BufferedWriter writer(Path out) throws IOException { return Files.newBufferedWriter(out, StandardCharsets.UTF_8, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE); } private static String toLongLiteral(long v) { return "0x" + Long.toUnsignedString(v, 16) + "L"; } public static final class CsvIndexService { static int SIMPEL_IDX = 2; } record DictEntryDTO(LongArrayList words, IntListDTO[][] pos) { public DictEntryDTO(int L) { this(new LongArrayList(1024), new IntListDTO[L][26]); for (var i = 0; i < L; i++) for (var j = 0; j < 26; j++) pos[i][j] = new IntListDTO(); } @Getter @Accessors(fluent = true) @NoArgsConstructor static final class IntListDTO { int[] data = new int[8]; int size = 0; void add(int v) { if (size >= data.length) data = Arrays.copyOf(data, data.length * 2); data[size++] = v; } } } static final class LongArrayList { long[] a; int size; LongArrayList(int initialCapacity) { if (initialCapacity < 0) throw new IllegalArgumentException(); a = new long[initialCapacity]; } int size() { return size; } void add(long v) { if (size == a.length) grow(); a[size++] = v; } void grow() { var newCap = a.length == 0 ? 1 : a.length * 2; var n = new long[newCap]; System.arraycopy(a, 0, n, 0, size); a = n; } long[] toArray() { return Arrays.copyOf(a, this.size); } } }