introduce bitloops
This commit is contained in:
@@ -1,10 +1,8 @@
|
||||
package puzzle;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import puzzle.DictCodeGen.DictJavaGenerator;
|
||||
import lombok.val;
|
||||
import puzzle.Export.Dicts;
|
||||
import puzzle.Export.IntListDTO;
|
||||
import puzzle.SwedishGenerator.Dict;
|
||||
import puzzle.SwedishGenerator.Lemma;
|
||||
|
||||
import java.io.*;
|
||||
@@ -12,23 +10,20 @@ import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
public final class DictJavaGeneratorMulti {
|
||||
|
||||
// Smaller = more files, but safer for javac/class limits.
|
||||
private static final int WORDS_CHUNK = 8_192;
|
||||
private static final int POS_CHUNK = 8_192;
|
||||
@Test
|
||||
public void dictCodeGen15() {
|
||||
System.out.println(DictData.DICT);
|
||||
}
|
||||
public static void main(String[] args) throws Exception {
|
||||
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v3.csv");
|
||||
Path outDir = Path.of(args.length > 1 ? args[1] : "/home/mike/dev/puzzle-generator/src/main/generated-sources/puzzle");
|
||||
String pkg = "puzzle";
|
||||
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v3.csv");
|
||||
Path outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle");
|
||||
String pkg = "puzzle";
|
||||
HashMap<Path, ShardBuilder> builders = new HashMap<Path, ShardBuilder>(16);
|
||||
|
||||
SwedishGenerator.Dict dict = buildDict(wordsFile);
|
||||
SwedishGenerator.Dict dict = buildDict(wordsFile, builders);
|
||||
|
||||
Files.createDirectories(outDir);
|
||||
|
||||
@@ -43,117 +38,60 @@ public final class DictJavaGeneratorMulti {
|
||||
|
||||
// Aggregator
|
||||
writeAggregator(outDir, pkg, "DictData", dict.length());
|
||||
var csv = Paths.get("nl_score_hints_v3.csv");
|
||||
var idx = Paths.get("nl_score_hints_v3.idx");
|
||||
//var csv = Paths.get("nl_score_hints_v3.csv");
|
||||
//var idx = Paths.get("nl_score_hints_v3.idx");
|
||||
|
||||
ScopedValue.where(CsvIndexService.SC, new CsvIndexService(csv, idx)).run(() -> generateHintShards(dict, outDir));
|
||||
//ScopedValue.where(CsvIndexService.SC, new CsvIndexService(csv, idx)).run(() -> generateHintShards(dict, builders, outDir));
|
||||
System.out.println("Generated sources into: " + outDir.toAbsolutePath());
|
||||
}
|
||||
|
||||
private static SwedishGenerator.Dict buildDict(Path wordsPath) throws IOException {
|
||||
private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap<Path, ShardBuilder> builders) throws IOException {
|
||||
var map = new LongArrayList(100_000);
|
||||
try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> CsvIndexService.lineToLemma(line, map::add));
|
||||
lines.forEach(line -> {
|
||||
CsvIndexService.lineToLemma(line, w -> {
|
||||
long len = Lemma.length0(w);
|
||||
|
||||
String word = Lemma.asWord(w);
|
||||
String[] clues = CsvIndexService.lineToClue(line);
|
||||
int simpel = CsvIndexService.lineToSimpel(line);
|
||||
|
||||
// serialize to: WORD \t SIMPEL \t JSON \n
|
||||
// (if you would rather store a ready-made JSON string: use Gson.toJson(clues))
|
||||
String json = Meta.GSON.toJson(clues);
|
||||
String recStr = word + "\t" + simpel + "\t" + json + "\n";
|
||||
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
var key = Meta.shardKey(w);
|
||||
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
||||
try {
|
||||
long index = ((long) sb.addRecord(rec) << 3) | len;
|
||||
map.add(w | (index << 40));
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
});
|
||||
|
||||
});
|
||||
}
|
||||
return Dicts.makeDict(map.toArray());
|
||||
}
|
||||
|
||||
static final int VERSION = 1;
|
||||
static String wordFromLine(String line) {
|
||||
// ID,WORD,*,*,"JSON"
|
||||
var parts = line.split(",", 5);
|
||||
return parts[1].trim();
|
||||
}
|
||||
|
||||
static final class ShardBuilder {
|
||||
|
||||
final IntListDTO offsets = new IntListDTO(4096);
|
||||
final ByteArrayOutputStream data = new ByteArrayOutputStream(1 << 20); // grows
|
||||
void addRecord(byte[] rec) throws IOException {
|
||||
offsets.add(data.size());
|
||||
int addRecord(byte[] rec) throws IOException {
|
||||
var size = data.size();
|
||||
val currSize = offsets.size();
|
||||
offsets.add(size);
|
||||
data.write(rec);
|
||||
return currSize;
|
||||
}
|
||||
}
|
||||
static void generateHintShards(Path csv, Path outDir) throws IOException {
|
||||
Files.createDirectories(outDir);
|
||||
|
||||
var builders = new java.util.HashMap<String, ShardBuilder>(256);
|
||||
|
||||
try (var lines = Files.lines(csv, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
if (line == null || line.isBlank()) return;
|
||||
|
||||
String word = wordFromLine(line);
|
||||
String[] clues = CsvIndexService.lineToClue(line);
|
||||
int simpel = CsvIndexService.lineToSimpel(line);
|
||||
|
||||
// serialize to: WORD \t SIMPEL \t JSON \n
|
||||
// (if you would rather store a ready-made JSON string: use Gson.toJson(clues))
|
||||
String json = Meta.GSON.toJson(clues);
|
||||
String recStr = word + "\t" + simpel + "\t" + json + "\n";
|
||||
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
String key = Meta.shardKey(word);
|
||||
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
||||
try {
|
||||
sb.addRecord(rec);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
});
|
||||
} catch (UncheckedIOException uioe) {
|
||||
throw uioe.getCause();
|
||||
}
|
||||
|
||||
// flush all shards to disk as <key>.idx (e.g. 6Z.idx)
|
||||
for (var e : builders.entrySet()) {
|
||||
writeIndexedShard(outDir.resolve(e.getKey() + ".idx"), e.getValue());
|
||||
}
|
||||
}
|
||||
static void generateHintShards(Dict dict, Path outDir) {
|
||||
try {
|
||||
Files.createDirectories(outDir);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
var builders = new java.util.HashMap<Path, ShardBuilder>(256);
|
||||
|
||||
for (var index : dict.index()) {
|
||||
long[] words = index.words();
|
||||
for (int shardIdx = 0; shardIdx < words.length; shardIdx++) {
|
||||
var w = words[shardIdx];
|
||||
String word = Lemma.asWord(w);
|
||||
int wIdx = Lemma.unpackIndex(w);
|
||||
String[] clues = CsvIndexService.clues(wIdx);
|
||||
int simpel = CsvIndexService.simpel(wIdx);
|
||||
|
||||
// serialize to: WORD \t SIMPEL \t JSON \n
|
||||
// (if you would rather store a ready-made JSON string: use Gson.toJson(clues))
|
||||
String json = Meta.GSON.toJson(clues);
|
||||
String recStr = word + "\t" + simpel + "\t" + json + "\n";
|
||||
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
var key = Meta.shardKey(w);
|
||||
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
||||
try {
|
||||
sb.addRecord(rec);
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// flush all shards to disk as <key>.idx (e.g. 6Z.idx)
|
||||
for (var e : builders.entrySet()) {
|
||||
try {
|
||||
writeIndexedShard(e.getKey(), e.getValue());
|
||||
} catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void writeIndexedShard(Path out, ShardBuilder sb) throws IOException {
|
||||
int n = sb.offsets.size();
|
||||
int[] offs = sb.offsets.toArray();
|
||||
|
||||
Reference in New Issue
Block a user