// Listing metadata: 391 lines, 16 KiB, Java.
package puzzle;
|
|
|
|
import module java.base;
|
|
import lombok.Getter;
|
|
import lombok.NoArgsConstructor;
|
|
import lombok.experimental.Accessors;
|
|
import lombok.val;
|
|
import puzzle.DictJavaGeneratorMulti.DictEntryDTO.IntListDTO;
|
|
import puzzle.SwedishGenerator.Dict;
|
|
import puzzle.SwedishGenerator.DictEntry;
|
|
import puzzle.SwedishGenerator.Lemma;
|
|
import static java.nio.charset.StandardCharsets.US_ASCII;
|
|
import static puzzle.SwedishGenerator.THRESS;
|
|
|
|
public final class DictJavaGeneratorMulti {
|
|
|
|
// Smaller = more files, but safer for javac/class limits.
|
|
private static final int WORDS_CHUNK = 8_192;
|
|
private static final int POS_CHUNK = 8_192;
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v3.csv");
|
|
Path outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle/dict" + THRESS);
|
|
String pkg = "puzzle.dict" + THRESS;
|
|
HashMap<Path, ShardBuilder> builders = new HashMap<Path, ShardBuilder>(16);
|
|
|
|
SwedishGenerator.Dict dict = buildDict(wordsFile, builders);
|
|
|
|
Files.createDirectories(outDir);
|
|
|
|
// Generate L2..L8
|
|
for (int L = 2; L <= 8; L++) {
|
|
var entry = dict.index()[L];
|
|
if (entry == null || entry.words() == null || entry.words().length == 0) {
|
|
throw new IllegalStateException("No words for length " + L);
|
|
}
|
|
writeLengthBundle(outDir, pkg, L, entry);
|
|
}
|
|
|
|
// Aggregator
|
|
writeAggregator(outDir, pkg, "DictData", dict.length());
|
|
System.out.println("Generated sources into: " + outDir.toAbsolutePath());
|
|
builders.forEach(DictJavaGeneratorMulti::writeIndexedShard);
|
|
|
|
}
|
|
static final Path[] SHARDS = IntStream.range(0, 10).mapToObj(sId -> Path.of("src/main/generated-sources/puzzle/dict"+THRESS).resolve(sId + ".idx")).toArray(
|
|
Path[]::new);
|
|
static Path shardKey(long word) {
|
|
return SHARDS[Lemma.unpackSize(word) + 1];
|
|
}
|
|
private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap<Path, ShardBuilder> builders) throws IOException {
|
|
var map = new LongArrayList(100_000);
|
|
try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
|
|
lines.forEach(line -> {
|
|
CsvIndexService.lineToLemma(line, w -> {
|
|
String word = Lemma.asWord(w, Export.BYTES.get());
|
|
String[] clues = CsvIndexService.lineToClue(line);
|
|
int simpel = CsvIndexService.lineToSimpel(line);
|
|
|
|
// serialize to: WORD \t JSON \n
|
|
// (als je al JSON string wilt bewaren: gebruik Gson.toJson(clues))
|
|
String json = Meta.GSON.toJson(clues);
|
|
String recStr = word + "\t" + simpel + "\t" + json + "\n";
|
|
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
|
|
|
|
var key = shardKey(w) ;
|
|
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
|
try {
|
|
map.add(Lemma.pack(w, sb.addRecord(rec)));
|
|
} catch (IOException e) {
|
|
throw new UncheckedIOException(e);
|
|
}
|
|
});
|
|
|
|
});
|
|
}
|
|
return Dicts.makeDict(map.toArray());
|
|
}
|
|
|
|
interface Dicts {
|
|
|
|
static Dict makeDict(long[] wordz) {
|
|
var index = new DictEntryDTO[SwedishGenerator.MAX_WORD_LENGTH_PLUS_ONE];
|
|
Arrays.setAll(index, DictEntryDTO::new);
|
|
for (var lemma : wordz) {
|
|
var L = Lemma.unpackSize(lemma) + 1;//Lemma.unpackSize(lemma) + 2;
|
|
val entry = index[L];
|
|
val idx = entry.words().size();
|
|
val pos = entry.pos();
|
|
entry.words().add(lemma);
|
|
int i = 0;
|
|
for (long w = lemma & Lemma.LETTER_MASK; w != 0; w >>>= 5, i++) {
|
|
pos[i][(int) ((w & 31) - 1)].add(idx);
|
|
}
|
|
}
|
|
for (int i = 2; i < index.length; i++) if (index[i].words().size() <= 0) throw new RuntimeException("No words for length " + i);
|
|
return new Dict(Arrays.stream(index).map(i -> {
|
|
var words = i.words().toArray();
|
|
int numWords = words.length;
|
|
int numLongs = (numWords + 63) >>> 6;
|
|
var bitsets = new long[i.pos().length * 26][numLongs];
|
|
for (int p = 0; p < i.pos().length; p++) {
|
|
for (int l = 0; l < 26; l++) {
|
|
var list = i.pos()[p][l];
|
|
var bs = bitsets[p * 26 + l];
|
|
for (int k = 0; k < list.size(); k++) {
|
|
int wordIdx = list.data()[k];
|
|
bs[wordIdx >>> 6] |= (1L << (wordIdx & 63));
|
|
}
|
|
}
|
|
}
|
|
return new DictEntry(words, bitsets, words.length, (words.length + 63) >>> 6);
|
|
}).toArray(DictEntry[]::new),
|
|
Arrays.stream(index).mapToInt(i -> i.words().size()).sum());
|
|
}
|
|
}
|
|
|
|
static final class ShardBuilder {
|
|
|
|
final IntListDTO offsets = new IntListDTO(4096);
|
|
final ByteArrayOutputStream data = new ByteArrayOutputStream(1 << 20); // grows
|
|
int addRecord(byte[] rec) throws IOException {
|
|
var size = data.size();
|
|
val currSize = offsets.size();
|
|
offsets.add(size);
|
|
data.write(rec);
|
|
return currSize;
|
|
}
|
|
}
|
|
|
|
static final int VERSION = 1;
|
|
static void writeIndexedShard(Path out, ShardBuilder sb) {
|
|
int n = sb.offsets.size();
|
|
int[] offs = sb.offsets.toArray();
|
|
byte[] data = sb.data.toByteArray();
|
|
|
|
try (FileChannel ch = FileChannel.open(out,
|
|
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING,
|
|
StandardOpenOption.WRITE)) {
|
|
|
|
// header
|
|
ByteBuffer hdr = ByteBuffer.allocate(12);
|
|
hdr.putInt(Meta.SHARD_MAGIC).putInt(VERSION).putInt(n).flip();
|
|
ch.write(hdr);
|
|
|
|
// offsets table (int per record)
|
|
ByteBuffer tbl = ByteBuffer.allocate(n * 4);
|
|
for (int i = 0; i < n; i++) tbl.putInt(offs[i]);
|
|
tbl.flip();
|
|
ch.write(tbl);
|
|
|
|
// data
|
|
ch.write(ByteBuffer.wrap(data));
|
|
}catch (IOException e){
|
|
throw new RuntimeException("Failed to write shard to " + out, e);
|
|
}
|
|
}
|
|
|
|
private static void writeAggregator(Path outDir, String pkg, String cls, int totalLen) throws IOException {
|
|
Path out = outDir.resolve(cls + ".java");
|
|
try (BufferedWriter w = writer(out)) {
|
|
w.write("package " + pkg + ";\n\n");
|
|
w.write("public final class " + cls + " {\n");
|
|
w.write(" private " + cls + "() {}\n\n");
|
|
w.write(" public static final puzzle.SwedishGenerator.Dict DICT" + THRESS + " = build();\n\n");
|
|
w.write(" private static puzzle.SwedishGenerator.Dict build() {\n");
|
|
w.write(" puzzle.SwedishGenerator.DictEntry[] idx = new puzzle.SwedishGenerator.DictEntry[puzzle.SwedishGenerator.MAX_WORD_LENGTH_PLUS_ONE];\n");
|
|
for (int L = 2; L <= 8; L++) w.write(" idx[" + L + "] = DictDataL" + L + ".entry();\n");
|
|
w.write(" return new puzzle.SwedishGenerator.Dict(idx, " + totalLen + ");\n");
|
|
w.write(" }\n");
|
|
w.write("}\n");
|
|
}
|
|
}
|
|
|
|
private static void writeLengthBundle(Path outDir, String pkg, int L, SwedishGenerator.DictEntry e) throws IOException {
|
|
long[] words = e.words();
|
|
|
|
// flatten posBitsets: [rows][cols] -> flat[]
|
|
long[][] bs = e.posBitsets();
|
|
int rows = bs.length;
|
|
int cols = bs[0].length;
|
|
long[] flat = new long[rows * cols];
|
|
int t = 0;
|
|
for (int r = 0; r < rows; r++) {
|
|
System.arraycopy(bs[r], 0, flat, t, cols);
|
|
t += cols;
|
|
}
|
|
|
|
String base = "DictDataL" + L;
|
|
|
|
// 1) chunk classes
|
|
int wChunks = writeChunkClasses(outDir, pkg, base + "W", words, WORDS_CHUNK);
|
|
int pChunks = writeChunkClasses(outDir, pkg, base + "P", flat, POS_CHUNK);
|
|
|
|
// 2) assembler class
|
|
writeLengthAssembler(outDir, pkg, base, L, rows, cols, words.length, flat.length, wChunks, pChunks);
|
|
}
|
|
|
|
/** Writes classes like Prefix0..PrefixN each with static long[] DATA. Returns chunk count. */
|
|
private static int writeChunkClasses(Path outDir, String pkg, String prefix, long[] data, int chunkSize) throws IOException {
|
|
int chunks = (data.length + chunkSize - 1) / chunkSize;
|
|
for (int ci = 0; ci < chunks; ci++) {
|
|
int from = ci * chunkSize;
|
|
int to = Math.min(data.length, from + chunkSize);
|
|
|
|
Path out = outDir.resolve(prefix + ci + ".java");
|
|
try (BufferedWriter w = writer(out)) {
|
|
w.write("package " + pkg + ";\n\n");
|
|
w.write("public final class " + prefix + ci + " {\n");
|
|
w.write(" private " + prefix + ci + "() {}\n");
|
|
|
|
w.write(" public static long[] get() {\n");
|
|
w.write(" return new long[] { \n");
|
|
for (int i = from; i < to; i++) {
|
|
w.write(" " + toLongLiteral(data[i]) + (i + 1 < to ? "," : "") + "\n");
|
|
}
|
|
w.write(" };\n");
|
|
w.write(" }\n");
|
|
w.write("}\n");
|
|
}
|
|
}
|
|
return chunks;
|
|
}
|
|
|
|
private static void writeLengthAssembler(Path outDir, String pkg, String cls, int L,
|
|
int rows, int cols,
|
|
int wordsLen, int posLen,
|
|
int wChunks, int pChunks) throws IOException {
|
|
Path out = outDir.resolve(cls + ".java");
|
|
try (BufferedWriter w = writer(out)) {
|
|
w.write("package " + pkg + ";\n\n");
|
|
w.write("public final class " + cls + " {\n");
|
|
w.write(" private " + cls + "() {}\n\n");
|
|
|
|
w.write(" static final int LEN = " + L + ";\n");
|
|
w.write(" static final int ROWS = " + rows + ";\n");
|
|
w.write(" static final int COLS = " + cols + ";\n");
|
|
w.write(" static final int WORDS_LEN = " + wordsLen + ";\n");
|
|
w.write(" static final int POS_LEN = " + posLen + ";\n\n");
|
|
|
|
// assemble words
|
|
w.write(" private static long[] words() {\n");
|
|
String wPrefix = "DictDataL" + L + "W";
|
|
if (wChunks == 1) {
|
|
w.write(" return " + wPrefix + "0.get();\n");
|
|
} else {
|
|
w.write(" long[] out = new long[WORDS_LEN];\n");
|
|
w.write(" int k = 0;\n");
|
|
for (int ci = 0; ci < wChunks; ci++) {
|
|
w.write(" k = copy(out, k, " + wPrefix + ci + ".get());\n");
|
|
}
|
|
w.write(" return out;\n");
|
|
}
|
|
w.write(" }\n\n");
|
|
|
|
// assemble pos
|
|
w.write(" private static long[] posFlat() {\n");
|
|
String pPrefix = "DictDataL" + L + "P";
|
|
if (pChunks == 1) {
|
|
w.write(" return " + pPrefix + "0.get();\n");
|
|
} else {
|
|
w.write(" long[] out = new long[POS_LEN];\n");
|
|
w.write(" int k = 0;\n");
|
|
for (int ci = 0; ci < pChunks; ci++) {
|
|
w.write(" k = copy(out, k, " + pPrefix + ci + ".get());\n");
|
|
}
|
|
w.write(" return out;\n");
|
|
}
|
|
w.write(" }\n\n");
|
|
|
|
// entry
|
|
w.write(" public static puzzle.SwedishGenerator.DictEntry entry() {\n");
|
|
w.write(" long[] wds = words();\n");
|
|
w.write(" long[] flat = posFlat();\n");
|
|
w.write(" long[][] pos = reshape(flat, ROWS, COLS);\n");
|
|
w.write(" return new puzzle.SwedishGenerator.DictEntry(wds, pos, wds.length, (wds.length + 63) >>> 6);\n");
|
|
w.write(" }\n\n");
|
|
|
|
// helpers
|
|
w.write(" private static int copy(long[] dst, int at, long[] src) {\n");
|
|
w.write(" System.arraycopy(src, 0, dst, at, src.length);\n");
|
|
w.write(" return at + src.length;\n");
|
|
w.write(" }\n\n");
|
|
|
|
w.write(" private static long[][] reshape(long[] flat, int rows, int cols) {\n");
|
|
w.write(" long[][] out = new long[rows][cols];\n");
|
|
w.write(" int k = 0;\n");
|
|
w.write(" for (int r = 0; r < rows; r++) {\n");
|
|
w.write(" System.arraycopy(flat, k, out[r], 0, cols);\n");
|
|
w.write(" k += cols;\n");
|
|
w.write(" }\n");
|
|
w.write(" return out;\n");
|
|
w.write(" }\n");
|
|
|
|
w.write("}\n");
|
|
}
|
|
}
|
|
|
|
private static BufferedWriter writer(Path out) throws IOException {
|
|
return Files.newBufferedWriter(out, StandardCharsets.UTF_8,
|
|
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
|
|
}
|
|
|
|
private static String toLongLiteral(long v) {
|
|
return "0x" + Long.toUnsignedString(v, 16) + "L";
|
|
}
|
|
public static final class CsvIndexService {
|
|
|
|
static int SIMPEL_IDX = 3;
|
|
|
|
public static int lineToSimpel(String line) {
|
|
var parts = line.split(",", 5);
|
|
return Integer.parseInt(parts[SIMPEL_IDX].trim());
|
|
}
|
|
public static String[] lineToClue(String line) {
|
|
if (line.isBlank()) throw new RuntimeException("Empty line");
|
|
var parts = line.split(",", 5);
|
|
var rawClue = parts[4].trim();
|
|
if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) {
|
|
rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\"");
|
|
}
|
|
return Meta.GSON.fromJson(rawClue, String[].class);
|
|
}
|
|
public static void lineToLemma(String line, LongConsumer ok) {
|
|
if (line.isBlank()) {
|
|
throw new RuntimeException("Empty line");
|
|
}
|
|
var parts = line.split(",", 5);
|
|
var id = Integer.parseInt(parts[0].trim());
|
|
var word = parts[1].trim();
|
|
int score = Integer.parseInt(parts[2].trim());
|
|
int simpel = Integer.parseInt(parts[3].trim());
|
|
if (score < 1 || simpel>THRESS) {
|
|
if (Main.VERBOSE) System.err.println("Word too complex: " + line);
|
|
return;
|
|
}
|
|
ok.accept(Lemma.from(word.getBytes(US_ASCII)));
|
|
}
|
|
}
|
|
|
|
record DictEntryDTO(LongArrayList words, IntListDTO[][] pos) {
|
|
|
|
public DictEntryDTO(int L) {
|
|
this(new LongArrayList(1024), new IntListDTO[L][26]);
|
|
for (var i = 0; i < L; i++) for (var j = 0; j < 26; j++) pos[i][j] = new IntListDTO();
|
|
}
|
|
@Getter
|
|
@Accessors(fluent = true)
|
|
@NoArgsConstructor
|
|
static final class IntListDTO {
|
|
|
|
int[] data = new int[8];
|
|
int size = 0;
|
|
public IntListDTO(int size) {
|
|
data = new int[size];
|
|
}
|
|
void add(int v) {
|
|
if (size >= data.length) data = Arrays.copyOf(data, data.length * 2);
|
|
data[size++] = v;
|
|
}
|
|
int[] toArray() { return Arrays.copyOf(data, size); }
|
|
}
|
|
}
|
|
|
|
static final class LongArrayList {
|
|
|
|
long[] a;
|
|
int size;
|
|
|
|
LongArrayList(int initialCapacity) {
|
|
if (initialCapacity < 0) throw new IllegalArgumentException();
|
|
a = new long[initialCapacity];
|
|
}
|
|
|
|
int size() { return size; }
|
|
|
|
void add(long v) {
|
|
if (size == a.length) grow();
|
|
a[size++] = v;
|
|
}
|
|
|
|
void grow() {
|
|
int newCap = a.length == 0 ? 1 : a.length * 2;
|
|
long[] n = new long[newCap];
|
|
System.arraycopy(a, 0, n, 0, size);
|
|
a = n;
|
|
}
|
|
long[] toArray() { return Arrays.copyOf(a, this.size); }
|
|
}
|
|
}
|