introduce bitloops
This commit is contained in:
279
src/test/java/puzzle/DictJavaGeneratorMulti.java
Normal file
279
src/test/java/puzzle/DictJavaGeneratorMulti.java
Normal file
@@ -0,0 +1,279 @@
|
||||
package puzzle;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import puzzle.Export.Dicts;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.*;
|
||||
import java.util.Arrays;
|
||||
|
||||
public final class DictJavaGeneratorMulti {
|
||||
|
||||
// Chunk sizes: the maximum number of long values emitted into one generated chunk class.
// Smaller = more files, but safer for javac/class limits.
|
||||
// Chunk size passed to writeChunkClasses for each length's word array.
private static final int WORDS_CHUNK = 8_192;
|
||||
// Chunk size passed to writeChunkClasses for each length's flattened position bitsets.
private static final int POS_CHUNK = 8_192;
|
||||
@Test
|
||||
public void dictCodeGen15() {
|
||||
System.out.println(DictData.DICT);
|
||||
}
|
||||
public static void main(String[] args) throws Exception {
|
||||
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v3.csv");
|
||||
Path outDir = Path.of(args.length > 1 ? args[1] : "/home/mike/dev/puzzle-generator/src/main/generated-sources/puzzle");
|
||||
String pkg = "puzzle";
|
||||
|
||||
SwedishGenerator.Dict dict = buildDict(wordsFile);
|
||||
|
||||
Files.createDirectories(outDir);
|
||||
|
||||
// Generate L2..L8
|
||||
for (int L = 2; L <= 8; L++) {
|
||||
var entry = dict.index()[L];
|
||||
if (entry == null || entry.words() == null || entry.words().length == 0) {
|
||||
throw new IllegalStateException("No words for length " + L);
|
||||
}
|
||||
writeLengthBundle(outDir, pkg, L, entry);
|
||||
}
|
||||
|
||||
// Aggregator
|
||||
writeAggregator(outDir, pkg, "DictData", dict.length());
|
||||
generateHintShards(wordsFile, outDir);
|
||||
System.out.println("Generated sources into: " + outDir.toAbsolutePath());
|
||||
}
|
||||
|
||||
private static SwedishGenerator.Dict buildDict(Path wordsPath) throws IOException {
|
||||
var map = new LongArrayList(100_000);
|
||||
try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> CsvIndexService.lineToLemma(line, map::add));
|
||||
}
|
||||
return Dicts.makeDict(map.toArray());
|
||||
}
|
||||
|
||||
/** Format version of the indexed shard files; written as the second int of every shard header. */
static final int VERSION = 1;
|
||||
static String wordFromLine(String line) {
|
||||
// ID,WORD,*,*,"JSON"
|
||||
var parts = line.split(",", 5);
|
||||
return parts[1].trim();
|
||||
}
|
||||
static final class IntArrayList {
|
||||
|
||||
int[] a;
|
||||
int size;
|
||||
IntArrayList(int cap) { a = new int[cap]; }
|
||||
void add(int v) {
|
||||
if (size == a.length) a = Arrays.copyOf(a, a.length * 2);
|
||||
a[size++] = v;
|
||||
}
|
||||
int size() { return size; }
|
||||
int get(int i) { return a[i]; }
|
||||
int[] toArray() { return Arrays.copyOf(a, size); }
|
||||
}
|
||||
|
||||
static final class ShardBuilder {
|
||||
|
||||
final IntArrayList offsets = new IntArrayList(4096);
|
||||
final ByteArrayOutputStream data = new ByteArrayOutputStream(1 << 20); // grows
|
||||
void addRecord(byte[] rec) throws IOException {
|
||||
offsets.add(data.size());
|
||||
data.write(rec);
|
||||
}
|
||||
}
|
||||
static void generateHintShards(Path csv, Path outDir) throws IOException {
|
||||
Files.createDirectories(outDir);
|
||||
|
||||
var builders = new java.util.HashMap<String, ShardBuilder>(256);
|
||||
|
||||
try (var lines = Files.lines(csv, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
if (line == null || line.isBlank()) return;
|
||||
|
||||
String word = wordFromLine(line);
|
||||
String[] clues = CsvIndexService.lineToClue(line);
|
||||
int simpel = CsvIndexService.lineToSimpel(line);
|
||||
|
||||
// serialize to: WORD \t JSON \n
|
||||
// (als je al JSON string wilt bewaren: gebruik Gson.toJson(clues))
|
||||
String json = Meta.GSON.toJson(clues);
|
||||
String recStr = word + "\t" + simpel + "\t" + json + "\n";
|
||||
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
String key = Meta.shardKey(word);
|
||||
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
||||
try {
|
||||
sb.addRecord(rec);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
});
|
||||
} catch (UncheckedIOException uioe) {
|
||||
throw uioe.getCause();
|
||||
}
|
||||
|
||||
// flush all shards to disk as <key>.idx (e.g. 6Z.idx)
|
||||
for (var e : builders.entrySet()) {
|
||||
writeIndexedShard(outDir.resolve(e.getKey() + ".idx"), e.getValue());
|
||||
}
|
||||
}
|
||||
static void writeIndexedShard(Path out, ShardBuilder sb) throws IOException {
|
||||
int n = sb.offsets.size();
|
||||
int[] offs = sb.offsets.toArray();
|
||||
byte[] data = sb.data.toByteArray();
|
||||
|
||||
try (FileChannel ch = FileChannel.open(out,
|
||||
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING,
|
||||
StandardOpenOption.WRITE)) {
|
||||
|
||||
// header
|
||||
ByteBuffer hdr = ByteBuffer.allocate(12);
|
||||
hdr.putInt(Meta.SHARD_MAGIC).putInt(VERSION).putInt(n).flip();
|
||||
ch.write(hdr);
|
||||
|
||||
// offsets table (int per record)
|
||||
ByteBuffer tbl = ByteBuffer.allocate(n * 4);
|
||||
for (int i = 0; i < n; i++) tbl.putInt(offs[i]);
|
||||
tbl.flip();
|
||||
ch.write(tbl);
|
||||
|
||||
// data
|
||||
ch.write(ByteBuffer.wrap(data));
|
||||
}
|
||||
}
|
||||
|
||||
private static void writeAggregator(Path outDir, String pkg, String cls, int totalLen) throws IOException {
|
||||
Path out = outDir.resolve(cls + ".java");
|
||||
try (BufferedWriter w = writer(out)) {
|
||||
w.write("package " + pkg + ";\n\n");
|
||||
w.write("public final class " + cls + " {\n");
|
||||
w.write(" private " + cls + "() {}\n\n");
|
||||
w.write(" public static final SwedishGenerator.Dict DICT = build();\n\n");
|
||||
w.write(" private static SwedishGenerator.Dict build() {\n");
|
||||
w.write(" SwedishGenerator.DictEntry[] idx = new SwedishGenerator.DictEntry[SwedishGenerator.MAX_WORD_LENGTH_PLUS_ONE];\n");
|
||||
for (int L = 2; L <= 8; L++) w.write(" idx[" + L + "] = DictDataL" + L + ".entry();\n");
|
||||
w.write(" return new SwedishGenerator.Dict(idx, " + totalLen + ");\n");
|
||||
w.write(" }\n");
|
||||
w.write("}\n");
|
||||
}
|
||||
}
|
||||
|
||||
private static void writeLengthBundle(Path outDir, String pkg, int L, SwedishGenerator.DictEntry e) throws IOException {
|
||||
long[] words = e.words();
|
||||
|
||||
// flatten posBitsets: [rows][cols] -> flat[]
|
||||
long[][] bs = e.posBitsets();
|
||||
int rows = bs.length;
|
||||
int cols = bs[0].length;
|
||||
long[] flat = new long[rows * cols];
|
||||
int t = 0;
|
||||
for (int r = 0; r < rows; r++) {
|
||||
System.arraycopy(bs[r], 0, flat, t, cols);
|
||||
t += cols;
|
||||
}
|
||||
|
||||
String base = "DictDataL" + L;
|
||||
|
||||
// 1) chunk classes
|
||||
int wChunks = writeChunkClasses(outDir, pkg, base + "W", words, WORDS_CHUNK);
|
||||
int pChunks = writeChunkClasses(outDir, pkg, base + "P", flat, POS_CHUNK);
|
||||
|
||||
// 2) assembler class
|
||||
writeLengthAssembler(outDir, pkg, base, L, rows, cols, words.length, flat.length, wChunks, pChunks);
|
||||
}
|
||||
|
||||
/** Writes classes like Prefix0..PrefixN each with static final long[] DATA. Returns chunk count. */
|
||||
private static int writeChunkClasses(Path outDir, String pkg, String prefix, long[] data, int chunkSize) throws IOException {
|
||||
int chunks = (data.length + chunkSize - 1) / chunkSize;
|
||||
for (int ci = 0; ci < chunks; ci++) {
|
||||
int from = ci * chunkSize;
|
||||
int to = Math.min(data.length, from + chunkSize);
|
||||
|
||||
Path out = outDir.resolve(prefix + ci + ".java");
|
||||
try (BufferedWriter w = writer(out)) {
|
||||
w.write("package " + pkg + ";\n\n");
|
||||
w.write("public final class " + prefix + ci + " {\n");
|
||||
w.write(" private " + prefix + ci + "() {}\n");
|
||||
w.write(" public static final long[] DATA = new long[] {\n");
|
||||
for (int i = from; i < to; i++) {
|
||||
w.write(" " + toLongLiteral(data[i]) + (i + 1 < to ? "," : "") + "\n");
|
||||
}
|
||||
w.write(" };\n");
|
||||
w.write("}\n");
|
||||
}
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
private static void writeLengthAssembler(Path outDir, String pkg, String cls, int L,
|
||||
int rows, int cols,
|
||||
int wordsLen, int posLen,
|
||||
int wChunks, int pChunks) throws IOException {
|
||||
Path out = outDir.resolve(cls + ".java");
|
||||
try (BufferedWriter w = writer(out)) {
|
||||
w.write("package " + pkg + ";\n\n");
|
||||
w.write("public final class " + cls + " {\n");
|
||||
w.write(" private " + cls + "() {}\n\n");
|
||||
|
||||
w.write(" static final int LEN = " + L + ";\n");
|
||||
w.write(" static final int ROWS = " + rows + ";\n");
|
||||
w.write(" static final int COLS = " + cols + ";\n");
|
||||
w.write(" static final int WORDS_LEN = " + wordsLen + ";\n");
|
||||
w.write(" static final int POS_LEN = " + posLen + ";\n\n");
|
||||
|
||||
// assemble words
|
||||
w.write(" private static long[] words() {\n");
|
||||
w.write(" long[] out = new long[WORDS_LEN];\n");
|
||||
w.write(" int k = 0;\n");
|
||||
for (int ci = 0; ci < wChunks; ci++) {
|
||||
w.write(" k = copy(out, k, DictDataL" + L + "W" + ci + ".DATA);\n");
|
||||
}
|
||||
w.write(" return out;\n");
|
||||
w.write(" }\n\n");
|
||||
|
||||
// assemble pos
|
||||
w.write(" private static long[] posFlat() {\n");
|
||||
w.write(" long[] out = new long[POS_LEN];\n");
|
||||
w.write(" int k = 0;\n");
|
||||
for (int ci = 0; ci < pChunks; ci++) {
|
||||
w.write(" k = copy(out, k, DictDataL" + L + "P" + ci + ".DATA);\n");
|
||||
}
|
||||
w.write(" return out;\n");
|
||||
w.write(" }\n\n");
|
||||
|
||||
// entry
|
||||
w.write(" public static SwedishGenerator.DictEntry entry() {\n");
|
||||
w.write(" long[] wds = words();\n");
|
||||
w.write(" long[] flat = posFlat();\n");
|
||||
w.write(" long[][] pos = reshape(flat, ROWS, COLS);\n");
|
||||
w.write(" return new SwedishGenerator.DictEntry(wds, pos, wds.length, (wds.length + 63) >>> 6);\n");
|
||||
w.write(" }\n\n");
|
||||
|
||||
// helpers
|
||||
w.write(" private static int copy(long[] dst, int at, long[] src) {\n");
|
||||
w.write(" System.arraycopy(src, 0, dst, at, src.length);\n");
|
||||
w.write(" return at + src.length;\n");
|
||||
w.write(" }\n\n");
|
||||
|
||||
w.write(" private static long[][] reshape(long[] flat, int rows, int cols) {\n");
|
||||
w.write(" long[][] out = new long[rows][cols];\n");
|
||||
w.write(" int k = 0;\n");
|
||||
w.write(" for (int r = 0; r < rows; r++) {\n");
|
||||
w.write(" System.arraycopy(flat, k, out[r], 0, cols);\n");
|
||||
w.write(" k += cols;\n");
|
||||
w.write(" }\n");
|
||||
w.write(" return out;\n");
|
||||
w.write(" }\n");
|
||||
|
||||
w.write("}\n");
|
||||
}
|
||||
}
|
||||
|
||||
private static BufferedWriter writer(Path out) throws IOException {
|
||||
return Files.newBufferedWriter(out, StandardCharsets.UTF_8,
|
||||
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
|
||||
}
|
||||
|
||||
private static String toLongLiteral(long v) {
|
||||
return "0x" + Long.toUnsignedString(v, 16) + "L";
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user