introduce bitloops

This commit is contained in:
mike
2026-01-17 14:21:53 +01:00
parent bd25f65194
commit 9bd85c81a3
4 changed files with 94 additions and 29 deletions

View File

@@ -1,7 +1,11 @@
package puzzle;
import org.junit.jupiter.api.Test;
import puzzle.DictCodeGen.DictJavaGenerator;
import puzzle.Export.Dicts;
import puzzle.Export.IntListDTO;
import puzzle.SwedishGenerator.Dict;
import puzzle.SwedishGenerator.Lemma;
import java.io.*;
import java.nio.ByteBuffer;
@@ -39,7 +43,10 @@ public final class DictJavaGeneratorMulti {
// Aggregator
writeAggregator(outDir, pkg, "DictData", dict.length());
generateHintShards(wordsFile, outDir);
var csv = Paths.get("nl_score_hints_v3.csv");
var idx = Paths.get("nl_score_hints_v3.idx");
ScopedValue.where(CsvIndexService.SC, new CsvIndexService(csv, idx)).run(() -> generateHintShards(dict, outDir));
System.out.println("Generated sources into: " + outDir.toAbsolutePath());
}
@@ -57,23 +64,10 @@ public final class DictJavaGeneratorMulti {
var parts = line.split(",", 5);
return parts[1].trim();
}
/**
 * Minimal growable list of primitive {@code int}s (avoids boxing).
 *
 * <p>Values are appended with {@link #add(int)} and the backing array is
 * doubled whenever it is full. Not thread-safe.
 */
static final class IntArrayList {
    /** Backing storage; only the first {@link #size} slots hold valid values. */
    int[] a;
    /** Number of values added so far. */
    int size;

    /**
     * Creates an empty list.
     *
     * @param cap initial capacity of the backing array; {@code 0} is allowed
     * @throws NegativeArraySizeException if {@code cap} is negative
     */
    IntArrayList(int cap) {
        a = new int[cap];
    }

    /**
     * Appends {@code v}, growing the backing array when it is full.
     *
     * @param v value to append
     */
    void add(int v) {
        if (size == a.length) {
            // Doubling a zero-length array yields zero again, so start from 1
            // in that case; otherwise keep the usual doubling strategy.
            int grown = (a.length == 0) ? 1 : a.length * 2;
            a = Arrays.copyOf(a, grown);
        }
        a[size++] = v;
    }

    /** @return the number of values added so far */
    int size() {
        return size;
    }

    /**
     * Returns the value stored at position {@code i} of the backing array.
     * No check against {@link #size()} is performed.
     *
     * @param i zero-based position
     * @return the stored value
     */
    int get(int i) {
        return a[i];
    }

    /** @return a fresh array holding exactly the {@link #size()} added values */
    int[] toArray() {
        return Arrays.copyOf(a, size);
    }
}
static final class ShardBuilder {
final IntArrayList offsets = new IntArrayList(4096);
final IntListDTO offsets = new IntListDTO(4096);
final ByteArrayOutputStream data = new ByteArrayOutputStream(1 << 20); // grows
void addRecord(byte[] rec) throws IOException {
offsets.add(data.size());
@@ -116,6 +110,50 @@ public final class DictJavaGeneratorMulti {
writeIndexedShard(outDir.resolve(e.getKey() + ".idx"), e.getValue());
}
}
/**
 * Builds one hint record per word in {@code dict}, groups the records into
 * shards and writes every shard with {@code writeIndexedShard}.
 *
 * <p>Each record is the UTF-8 encoding of
 * {@code WORD \t SIMPEL \t JSON-CLUES \n}, where the clues and the
 * {@code simpel} value are looked up through {@link CsvIndexService} by the
 * index unpacked from the packed word.
 *
 * <p>NOTE(review): the static {@code CsvIndexService} lookups presumably rely
 * on {@code CsvIndexService.SC} being bound by the caller — confirm.
 *
 * @param dict   dictionary whose index entries supply the packed words
 * @param outDir directory that is created (with parents) before shards are built
 * @throws UncheckedIOException if the directory cannot be created, a record
 *                              cannot be buffered, or a shard cannot be written
 */
static void generateHintShards(Dict dict, Path outDir) {
    try {
        Files.createDirectories(outDir);
    } catch (IOException e) {
        throw new UncheckedIOException(e);
    }

    var builders = new java.util.HashMap<Path, ShardBuilder>(256);

    for (var index : dict.index()) {
        for (long w : index.words()) {
            String word = Lemma.asWord(w);
            int wIdx = Lemma.unpackIndex(w);

            String[] clues = CsvIndexService.clues(wIdx);
            int simpel = CsvIndexService.simpel(wIdx);

            // Serialize to: WORD \t SIMPEL \t JSON \n (clues stored as a JSON array string).
            String json = Meta.GSON.toJson(clues);
            String recStr = word + "\t" + simpel + "\t" + json + "\n";
            byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);

            // NOTE(review): the shard path comes straight from Meta.shardKey(w);
            // outDir is not used to resolve it here — confirm shardKey already
            // yields the intended location.
            var key = Meta.shardKey(w);
            ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
            try {
                sb.addRecord(rec);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }
    }

    // Flush every accumulated shard to the path it was keyed by.
    for (var e : builders.entrySet()) {
        try {
            writeIndexedShard(e.getKey(), e.getValue());
        } catch (IOException ex) {
            throw new UncheckedIOException(ex);
        }
    }
}
static void writeIndexedShard(Path out, ShardBuilder sb) throws IOException {
int n = sb.offsets.size();
int[] offs = sb.offsets.toArray();