introduce bitloops
This commit is contained in:
@@ -5,7 +5,6 @@ import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.Accessors;
|
||||
import lombok.val;
|
||||
import puzzle.DictJavaGeneratorMulti.DictEntryDTO.IntListDTO;
|
||||
import puzzle.SwedishGenerator.Dict;
|
||||
import puzzle.SwedishGenerator.DictEntry;
|
||||
import puzzle.SwedishGenerator.Lemma;
|
||||
@@ -16,14 +15,14 @@ public final class DictJavaGeneratorMulti {
|
||||
// Smaller = more files, but safer for javac/class limits.
|
||||
private static final int WORDS_CHUNK = 8_192;
|
||||
private static final int POS_CHUNK = 8_192;
|
||||
|
||||
public static final int THRESS = 800;
|
||||
public static void main(String[] args) throws Exception {
|
||||
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v4.csv");
|
||||
Path outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle/dict" + THRESS);
|
||||
String pkg = "puzzle.dict" + THRESS;
|
||||
Path wordsFile = Path.of(args.length > 0 ? args[0] : "nl_score_hints_v4.csv");
|
||||
Path outDir = Path.of(args.length > 1 ? args[1] : "src/main/generated-sources/puzzle/dict" + THRESS);
|
||||
String pkg = "puzzle.dict" + THRESS;
|
||||
HashMap<String, ShardBuilder> builders = new HashMap<String, ShardBuilder>(16);
|
||||
|
||||
SwedishGenerator.Dict dict = buildDict(wordsFile, builders);
|
||||
SwedishGenerator.Dict dict = buildDict(wordsFile, builders, THRESS);
|
||||
|
||||
Files.createDirectories(outDir);
|
||||
|
||||
@@ -41,34 +40,30 @@ public final class DictJavaGeneratorMulti {
|
||||
System.out.println("Generated sources into: " + outDir.toAbsolutePath());
|
||||
|
||||
}
|
||||
public static final int THRESS = 800;
|
||||
|
||||
static String shardKey(long word) {
|
||||
return ""+Lemma.unpackSize(word) + 1;
|
||||
return "" + Lemma.unpackSize(word) + 1;
|
||||
}
|
||||
private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap<String, ShardBuilder> builders) throws IOException {
|
||||
private static SwedishGenerator.Dict buildDict(Path wordsPath, HashMap<String, ShardBuilder> builders, int thress) throws IOException {
|
||||
var map = new LongArrayList(100_000);
|
||||
try (var lines = Files.lines(wordsPath, StandardCharsets.UTF_8)) {
|
||||
lines.forEach(line -> {
|
||||
CsvIndexService.lineToLemma(line, w -> {
|
||||
String word = Lemma.asWord(w, Export.BYTES.get());
|
||||
String[] clues = CsvIndexService.lineToClue(line);
|
||||
int simpel = CsvIndexService.lineToSimpel(line);
|
||||
|
||||
// serialize to: WORD \t JSON \n
|
||||
// (als je al JSON string wilt bewaren: gebruik Gson.toJson(clues))
|
||||
String json = Meta.GSON.toJson(clues);
|
||||
String recStr = word + "\t" + simpel + "\t" + json + "\n";
|
||||
byte[] rec = recStr.getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
var key = shardKey(w);
|
||||
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
||||
try {
|
||||
map.add(Lemma.pack(w, sb.addRecord(rec)));
|
||||
} catch (IOException e) {
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
});
|
||||
var parts = line.split(",", 4);
|
||||
var word = parts[0].trim();
|
||||
long w = SwedishGenerator.Lemma.from(word.getBytes(US_ASCII));
|
||||
if (!word.equals(SwedishGenerator.Lemma.asWord(w, Export.BYTES.get()))) {
|
||||
throw new RuntimeException();
|
||||
}
|
||||
int score = Integer.parseInt(parts[1].trim());
|
||||
var simpel = Integer.parseInt(parts[CsvIndexService.SIMPEL_IDX].trim());
|
||||
if (score < 1 || simpel > thress) {
|
||||
if (Main.VERBOSE) System.err.println("Word too complex: " + line);
|
||||
return;
|
||||
}
|
||||
|
||||
var key = shardKey(w);
|
||||
ShardBuilder sb = builders.computeIfAbsent(key, k -> new ShardBuilder());
|
||||
map.add(Lemma.pack(w, sb.addRecord()));
|
||||
});
|
||||
}
|
||||
return Dicts.makeDict(map.toArray());
|
||||
@@ -114,13 +109,10 @@ public final class DictJavaGeneratorMulti {
|
||||
|
||||
static final class ShardBuilder {
|
||||
|
||||
final IntListDTO offsets = new IntListDTO(4096);
|
||||
final ByteArrayOutputStream data = new ByteArrayOutputStream(1 << 20); // grows
|
||||
int addRecord(byte[] rec) throws IOException {
|
||||
var size = data.size();
|
||||
val currSize = offsets.size();
|
||||
offsets.add(size);
|
||||
data.write(rec);
|
||||
int c;
|
||||
int addRecord() {
|
||||
val currSize = c;
|
||||
c++;
|
||||
return currSize;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user