introduce bitloops

This commit is contained in:
mike
2026-01-20 19:00:27 +01:00
parent 58b8b57688
commit dadde53f76
28 changed files with 42000 additions and 41654 deletions

View File

@@ -0,0 +1,123 @@
package puzzle;
import lombok.val;
import puzzle.DictJavaGeneratorMulti.CsvIndexService;
import puzzle.Meta.ShardRec;
import puzzle.SwedishGenerator.Lemma;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.function.LongConsumer;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static puzzle.SwedishGenerator.*;
/**
 * Command-line tool that builds the clue/"simpel" lookup index from the word CSV and then runs a
 * few demo lookups against the freshly written files.
 *
 * <p>Two files are produced:
 * <ul>
 *   <li>{@code Meta.shardData}: header, one {@code int} offset per record, then the record bytes;
 *   <li>{@code Meta.shardMap}: header followed by the lemma keys in ascending order.
 * </ul>
 * Both headers consist of three ints: magic, {@code Meta.VERSION} and the record count.
 */
public class BuildClueAndSimpelIndex {
    /** Bytes taken by the {@code magic, version, count} header at the start of both files. */
    private static final int HEADER_BYTES = 3 * Integer.BYTES;

    /**
     * Demo entry point: rebuilds the index from {@code nl_score_hints_v4.csv}, then looks up a
     * few sample words and prints what was found.
     *
     * @param args ignored
     * @throws Exception if the CSV cannot be read or the index files cannot be written or read
     */
    public static void main(String[] args) throws Exception {
        val records = buildDict(Path.of("nl_score_hints_v4.csv"));
        buildShard(records);
        for (var qRaw : List.of("FIETS", "huis", "kiwi")) {
            var q = Meta.normWord(qRaw);
            var w = Lemma.from(q);
            var i = Meta.findIndexInMapMmap(Meta.shardMap, w);
            System.out.println("\nQuery: " + qRaw + " (norm=" + q + ") w=" + w + " -> i=" + i);
            if (i >= 0) {
                var rec = Meta.readRecord(Meta.shardData, w, i);
                System.out.println(" simpel=" + rec.simpel());
                System.out.println(" clues=" + Arrays.toString(rec.clues()));
            } else {
                System.out.println(" NOT FOUND");
            }
        }
        System.out.println("\nFiles written to: " + Meta.dir);
        System.out.println(" " + Meta.shardData);
        System.out.println(" " + Meta.shardMap);
    }

    /**
     * Writes {@code Meta.shardData} and {@code Meta.shardMap} for the given records.
     *
     * <p>Records are sorted by their lemma key {@code w}; the index of a record in the data file
     * equals its position in that sorted order, which is also the order of the keys in the map
     * file. Offsets in the table are relative to the start of the record area, not to the start
     * of the file.
     *
     * @param records records to index; the list itself is not modified
     * @throws IOException if either file cannot be written
     * @throws ArithmeticException if an offset or a table size does not fit in an {@code int}
     */
    static void buildShard(List<ShardRec> records) throws IOException {
        // Sort a copy so the caller's list keeps its order.
        List<ShardRec> sorted = new ArrayList<>(records);
        sorted.sort(Comparator.comparingLong(ShardRec::w));
        var n = sorted.size();

        List<byte[]> recBytes = new ArrayList<>(n);
        var offsets = new int[n];
        // Accumulate in a long so an oversized data area is detected instead of wrapping around.
        var off = 0L;
        for (var i = 0; i < n; i++) {
            var r = sorted.get(i);
            // Record layout: actual word, simpel score and the clues as JSON, tab-separated.
            var line = r.word() + "\t" + r.simpel() + "\t" + Meta.GSON.toJson(r.clues());
            var bytes = line.getBytes(StandardCharsets.UTF_8);
            recBytes.add(bytes);
            offsets[i] = Math.toIntExact(off);
            off += bytes.length;
        }

        var dataStart = HEADER_BYTES + (long) n * Integer.BYTES;
        try (var ch = FileChannel.open(Meta.shardData, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)) {
            writeFully(ch, header(Meta.SHARD_MAGIC, n));
            var table = ByteBuffer.allocate(Math.multiplyExact(n, Integer.BYTES)).order(Meta.ORDER);
            for (var offset : offsets) {
                table.putInt(offset);
            }
            table.flip();
            writeFully(ch, table);
            // The channel is already here after header + table; kept to make the layout explicit.
            ch.position(dataStart);
            for (var b : recBytes) {
                writeFully(ch, ByteBuffer.wrap(b));
            }
        }
        try (var ch = FileChannel.open(Meta.shardMap, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)) {
            writeFully(ch, header(Meta.MAP_MAGIC, n));
            var keys = ByteBuffer.allocate(Math.multiplyExact(n, Long.BYTES)).order(Meta.ORDER);
            for (var r : sorted) {
                keys.putLong(r.w());
            }
            keys.flip();
            writeFully(ch, keys);
        }
    }

    /** Builds a flipped, ready-to-write header: magic, {@code Meta.VERSION}, record count. */
    private static ByteBuffer header(int magic, int count) {
        var hdr = ByteBuffer.allocate(HEADER_BYTES).order(Meta.ORDER);
        hdr.putInt(magic).putInt(Meta.VERSION).putInt(count);
        hdr.flip();
        return hdr;
    }

    /** Writes the whole remaining content of {@code buf}, retrying if a write is partial. */
    private static void writeFully(FileChannel ch, ByteBuffer buf) throws IOException {
        while (buf.hasRemaining()) {
            ch.write(buf);
        }
    }

    /**
     * Reads the word CSV and converts every line into a {@link ShardRec}.
     *
     * @param wordsPath UTF-8 CSV file with one word per line
     * @return the parsed records in file order
     * @throws IOException if the file cannot be read
     * @throws IllegalArgumentException if a line is malformed (see {@link #parseRecord})
     */
    private static List<ShardRec> buildDict(Path wordsPath) throws IOException {
        var recs = new ArrayList<ShardRec>();
        try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
            lines.forEach(line -> recs.add(parseRecord(line)));
        }
        return recs;
    }

    /**
     * Parses one CSV line: column 0 is the word, column {@code CsvIndexService.SIMPEL_IDX} the
     * simpel score, and column 3 (the rest of the line) the clues as a JSON string array,
     * optionally wrapped in CSV quotes with doubled inner quotes.
     *
     * @throws IllegalArgumentException if the line has fewer than four columns or the word does
     *     not survive a round-trip through the lemma encoding
     */
    private static ShardRec parseRecord(String line) {
        var parts = line.split(",", 4);
        if (parts.length < 4) {
            throw new IllegalArgumentException(
                    "Expected 4 comma-separated columns but found " + parts.length + ": " + line);
        }
        var word = parts[0].trim();
        long w = Lemma.from(word.getBytes(US_ASCII));
        // Reject words that the lemma encoding cannot represent exactly.
        if (!word.equals(Lemma.asWord(w, Export.BYTES.get()))) {
            throw new IllegalArgumentException("Word does not round-trip through Lemma: " + line);
        }
        var rawClue = parts[3].trim();
        if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) {
            // Strip the CSV quoting and un-double the embedded quotes to get plain JSON.
            rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\"");
        }
        var clues = Meta.GSON.fromJson(rawClue, String[].class);
        var simpel = Integer.parseInt(parts[CsvIndexService.SIMPEL_IDX].trim());
        return new ShardRec(word, w, simpel, clues);
    }
}

View File

@@ -1,135 +0,0 @@
package puzzle;
import lombok.val;
import puzzle.DictJavaGeneratorMulti.CsvIndexService;
import puzzle.Meta.ShardLem;
import puzzle.Meta.ShardRec;
import puzzle.SwedishGenerator.Lemma;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.function.LongConsumer;
import static java.nio.charset.StandardCharsets.US_ASCII;
/**
 * Builds the shard data/map index files from the word CSV and offers two small demo entry points
 * for looking words up again.
 *
 * <p>The data file holds a header, one {@code int} offset per record and the record bytes; the
 * map file holds a header followed by the lemma keys in ascending order. Both headers consist of
 * three ints: magic, {@code Meta.VERSION} and the record count.
 */
public class BuildMeta2 {
    /** Bytes taken by the {@code magic, version, count} header at the start of both files. */
    private static final int HEADER_BYTES = 3 * Integer.BYTES;

    /**
     * Extracts the word in the first CSV column of {@code line}, encodes it as a lemma and hands
     * the result to {@code ok}.
     *
     * @param line one CSV line; must not be blank
     * @param ok receives the encoded lemma
     * @throws RuntimeException if the line is blank
     */
    public static void lineToLemma(String line, LongConsumer ok) {
        if (line.isBlank()) {
            throw new RuntimeException("Empty line");
        }
        var word = line.split(",", 4)[0].trim();
        ok.accept(SwedishGenerator.Lemma.from(word.getBytes(US_ASCII)));
    }

    /**
     * Writes the shard data file and the shard map file for the given records.
     *
     * <p>Records are sorted by their lemma key {@code w}; the index of a record in the data file
     * equals its position in that sorted order, which is also the order of the keys in the map
     * file. Offsets in the table are relative to the start of the record area, not to the start
     * of the file.
     *
     * @param shardData target path of the data file (created or truncated)
     * @param shardMap target path of the map file (created or truncated)
     * @param records records to index; the list itself is not modified
     * @throws IOException if either file cannot be written
     * @throws ArithmeticException if an offset or a table size does not fit in an {@code int}
     */
    static void buildShard(Path shardData, Path shardMap, List<ShardRec> records) throws IOException {
        // Sort a copy so the caller's list keeps its order.
        List<ShardRec> sorted = new ArrayList<>(records);
        sorted.sort(Comparator.comparingLong(ShardRec::w));
        int n = sorted.size();

        List<byte[]> recBytes = new ArrayList<>(n);
        int[] offsets = new int[n];
        // Accumulate in a long so an oversized data area is detected instead of wrapping around.
        long off = 0L;
        for (int i = 0; i < n; i++) {
            ShardRec r = sorted.get(i);
            // Record layout: actual word, simpel score and the clues as JSON, tab-separated.
            String line = r.word() + "\t" + r.simpel() + "\t" + Meta.GSON.toJson(r.clues());
            byte[] bytes = line.getBytes(StandardCharsets.UTF_8);
            recBytes.add(bytes);
            offsets[i] = Math.toIntExact(off);
            off += bytes.length;
        }

        long dataStart = HEADER_BYTES + (long) n * Integer.BYTES;
        try (FileChannel ch = FileChannel.open(shardData,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)) {
            writeFully(ch, header(Meta.SHARD_MAGIC, n));
            ByteBuffer table = ByteBuffer.allocate(Math.multiplyExact(n, Integer.BYTES)).order(Meta.ORDER);
            for (int offset : offsets) {
                table.putInt(offset);
            }
            table.flip();
            writeFully(ch, table);
            // The channel is already here after header + table; kept to make the layout explicit.
            ch.position(dataStart);
            for (byte[] b : recBytes) {
                writeFully(ch, ByteBuffer.wrap(b));
            }
        }
        try (FileChannel ch = FileChannel.open(shardMap,
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)) {
            writeFully(ch, header(Meta.MAP_MAGIC, n));
            ByteBuffer keys = ByteBuffer.allocate(Math.multiplyExact(n, Long.BYTES)).order(Meta.ORDER);
            for (ShardRec r : sorted) {
                keys.putLong(r.w());
            }
            keys.flip();
            writeFully(ch, keys);
        }
    }

    /** Builds a flipped, ready-to-write header: magic, {@code Meta.VERSION}, record count. */
    private static ByteBuffer header(int magic, int count) {
        ByteBuffer hdr = ByteBuffer.allocate(HEADER_BYTES).order(Meta.ORDER);
        hdr.putInt(magic).putInt(Meta.VERSION).putInt(count);
        hdr.flip();
        return hdr;
    }

    /** Writes the whole remaining content of {@code buf}, retrying if a write is partial. */
    private static void writeFully(FileChannel ch, ByteBuffer buf) throws IOException {
        while (buf.hasRemaining()) {
            ch.write(buf);
        }
    }

    /**
     * Demo entry point: rebuilds the index from {@code nl_score_hints_v4.csv}, then looks up a
     * few sample words and prints what was found.
     *
     * @param args ignored
     * @throws Exception if the CSV cannot be read or the index files cannot be written or read
     */
    public static void main(String[] args) throws Exception {
        val records = buildDict(Path.of("nl_score_hints_v4.csv"));
        buildShard(Meta.shardData, Meta.shardMap, records);
        for (String qRaw : List.of("FIETS", "huis", "kiwi")) {
            String q = Meta.normWord(qRaw);
            long w = Lemma.from(q);
            int i = Meta.findIndexInMapMmap(Meta.shardMap, w);
            System.out.println("\nQuery: " + qRaw + " (norm=" + q + ") w=" + w + " -> i=" + i);
            if (i >= 0) {
                ShardLem rec = Meta.readRecord(Meta.shardData, w, i);
                System.out.println(" simpel=" + rec.simpel());
                System.out.println(" clues=" + Arrays.toString(rec.clues()));
            } else {
                System.out.println(" NOT FOUND");
            }
        }
        System.out.println("\nFiles written to: " + Meta.dir);
        System.out.println(" " + Meta.shardData);
        System.out.println(" " + Meta.shardMap);
    }

    /**
     * Alternative demo: looks up a few words through {@code Meta.lookup} and prints the result.
     *
     * @param args ignored
     * @throws Exception if the lookup fails
     */
    public static void main1(String[] args) throws Exception {
        for (String qRaw : List.of("FIETS", "HUIS", "KIWI")) {
            // NOTE(review): 3897L << 43 presumably places a record index in the upper bits of
            // the packed lemma; confirm against Lemma.pack.
            long w = Lemma.from(qRaw) | (3897L << 43L);
            ShardLem rec = Meta.lookup(w);
            System.out.println(rec);
        }
    }

    /**
     * Reads the word CSV and converts every line into a {@link ShardRec}, using
     * {@link #lineToLemma} for the key and {@code CsvIndexService} for the clues and simpel score.
     *
     * @param wordsPath UTF-8 CSV file with one word per line
     * @return the parsed records in file order
     * @throws IOException if the file cannot be read
     */
    private static List<ShardRec> buildDict(Path wordsPath) throws IOException {
        var recs = new ArrayList<ShardRec>();
        try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
            lines.forEach(line -> lineToLemma(line, w -> {
                // The stored word is the decoded lemma, not the raw CSV text.
                String word = SwedishGenerator.Lemma.asWord(w, Export.BYTES.get());
                String[] clues = CsvIndexService.lineToClue(line);
                int simpel = CsvIndexService.lineToSimpel(line);
                recs.add(new ShardRec(word, w, simpel, clues));
            }));
        }
        return recs;
    }
}

View File

@@ -5,7 +5,6 @@ import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.experimental.Accessors;
import lombok.val;
import puzzle.DictJavaGeneratorMulti.DictEntryDTO.IntListDTO;
import puzzle.SwedishGenerator.Dict;
import puzzle.SwedishGenerator.DictEntry;
import puzzle.SwedishGenerator.Lemma;
@@ -16,14 +15,14 @@ public final class DictJavaGeneratorMulti {
// Smaller = more files, but safer for javac/class limits.
private static final int WORDS_CHUNK = 8_192;
private static final int POS_CHUNK = 8_192;
public static final int THRESS = 800;
public static void main(String[] args) throws Exception {
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v4.csv");
Path outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle/dict" + THRESS);
String pkg = "puzzle.dict" + THRESS;
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v4.csv");
Path outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle/dict" + THRESS);
String pkg = "puzzle.dict" + THRESS;
HashMap<String, ShardBuilder> builders = new HashMap<String, ShardBuilder>(16);
SwedishGenerator.Dict dict = buildDict(wordsFile, builders);
SwedishGenerator.Dict dict = buildDict(wordsFile, builders, THRESS);
Files.createDirectories(outDir);
@@ -41,34 +40,30 @@ public final class DictJavaGeneratorMulti {
System.out.println("Generated sources into: " + outDir.toAbsolutePath());
}
public static final int THRESS = 800;
// Derives the key used to select a ShardBuilder from the value of Lemma.unpackSize(word).
// NOTE(review): "" + size + 1 is evaluated left to right as string concatenation, so the
// literal 1 is appended as a character (e.g. an int size of 4 yields "41") instead of being
// added numerically; confirm this is intended.
// NOTE(review): the two return lines below are the removed and the added side of one
// reformatted diff line; only one of them exists in the actual file.
static String shardKey(long word) {
return ""+Lemma.unpackSize(word) + 1;
return "" + Lemma.unpackSize(word) + 1;
}
private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap<String, ShardBuilder> builders) throws IOException {
private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap<String, ShardBuilder> builders, int thress) throws IOException {
var map = new LongArrayList(100_000);
try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
lines.forEach(line -> {
CsvIndexService.lineToLemma(line, w -> {
String word = Lemma.asWord(w, Export.BYTES.get());
String[] clues = CsvIndexService.lineToClue(line);
int simpel = CsvIndexService.lineToSimpel(line);
// serialize to: WORD \t JSON \n
// (als je al JSON string wilt bewaren: gebruik Gson.toJson(clues))
String json = Meta.GSON.toJson(clues);
String recStr = word + "\t" + simpel + "\t" + json + "\n";
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
var key = shardKey(w);
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
try {
map.add(Lemma.pack(w, sb.addRecord(rec)));
} catch (IOException e) {
throw new UncheckedIOException(e);
}
});
var parts = line.split(",", 4);
var word = parts[0].trim();
long w = SwedishGenerator.Lemma.from(word.getBytes(US_ASCII));
if (!word.equals(SwedishGenerator.Lemma.asWord(w, Export.BYTES.get()))) {
throw new RuntimeException();
}
int score = Integer.parseInt(parts[1].trim());
var simpel = Integer.parseInt(parts[CsvIndexService.SIMPEL_IDX].trim());
if (score < 1 || simpel > thress) {
if (Main.VERBOSE) System.err.println("Word too complex: " + line);
return;
}
var key = shardKey(w);
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
map.add(Lemma.pack(w, sb.addRecord()));
});
}
return Dicts.makeDict(map.toArray());
@@ -114,13 +109,10 @@ public final class DictJavaGeneratorMulti {
// Per-shard helper whose addRecord result is packed together with the lemma in buildDict.
// NOTE(review): this span is a diff hunk without +/- markers. It shows the removed members
// (offsets, data, addRecord(byte[])) next to their replacement (counter c, addRecord()).
static final class ShardBuilder {
// Removed side: start offset of every record inside 'data'.
final IntListDTO offsets = new IntListDTO(4096);
// Removed side: concatenated record bytes of this shard.
final ByteArrayOutputStream data = new ByteArrayOutputStream(1 << 20); // grows
// Removed side: appended the record bytes and returned the record's index in 'offsets'.
int addRecord(byte[] rec) throws IOException {
var size = data.size();
val currSize = offsets.size();
offsets.add(size);
data.write(rec);
// Added side: number of records registered so far.
int c;
// Added side: returns the current count as the new record's index, then increments it.
int addRecord() {
val currSize = c;
c++;
return currSize;
}
}