introduce bitloops
This commit is contained in:
244
src/main/java/puzzle/CsvIndexService.java
Normal file
244
src/main/java/puzzle/CsvIndexService.java
Normal file
@@ -0,0 +1,244 @@
|
||||
package puzzle;
|
||||
|
||||
import com.google.gson.Gson;
import puzzle.SwedishGenerator.Lemma;

import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.util.Arrays;
import java.util.Locale;
import java.util.function.Consumer;
import java.util.regex.Pattern;
|
||||
|
||||
public final class CsvIndexService
|
||||
implements Closeable {
|
||||
|
||||
static final ScopedValue<CsvIndexService> SC = ScopedValue.newInstance();
|
||||
static final Gson GSON = new Gson();
|
||||
private static final int MAGIC = 0x4C494458; // "LIDX"
|
||||
private static final int VERSION = 1;
|
||||
|
||||
private final Path csvPath;
|
||||
private final Path idxPath;
|
||||
|
||||
private volatile long[] offsets; // lazy
|
||||
private volatile FileChannel csvChannel; // open once
|
||||
private final Object lock = new Object();
|
||||
|
||||
public CsvIndexService(Path csvPath, Path idxPath) {
|
||||
this.csvPath = csvPath;
|
||||
this.idxPath = idxPath;
|
||||
}
|
||||
|
||||
public static String[] lineToClue(String line) {
|
||||
if (line.isBlank()) throw new RuntimeException("Empty line");
|
||||
var parts = line.split(",", 5);
|
||||
var rawClue = parts[4].trim();
|
||||
if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) {
|
||||
rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\"");
|
||||
}
|
||||
return GSON.fromJson(rawClue, String[].class);
|
||||
}
|
||||
public static void lineToLemma(String line, Consumer<Lemma> ok) {
|
||||
if (line.isBlank()) {
|
||||
throw new RuntimeException("Empty line");
|
||||
}
|
||||
var parts = line.split(",", 5);
|
||||
var id = Integer.parseInt(parts[0].trim());
|
||||
var word = parts[1].trim();
|
||||
if (!word.matches("^[A-Z]{2,8}$")) {
|
||||
throw new RuntimeException("Invalid word:" + line);
|
||||
}
|
||||
|
||||
// CSV has level 1-10. llmScores use 10-level.
|
||||
int score = Integer.parseInt(parts[2].trim());
|
||||
if (score < 1) {
|
||||
if (Main.VERBOSE) System.err.println("Word too complex: " + line);
|
||||
return;
|
||||
}
|
||||
int simpel = Integer.parseInt(parts[3].trim());
|
||||
ok.accept(new Lemma(id, word, simpel));
|
||||
}
|
||||
|
||||
public static String[] clues(int index) {
|
||||
try {
|
||||
if (SC.isBound())
|
||||
return lineToClue(SC.get().getLine(index));
|
||||
return new String[0];
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException("Failed to get clues for index " + index, e);
|
||||
}
|
||||
}
|
||||
/** Haal één regel op (0-based line index), met self-healing index (1x rebuild). */
|
||||
public String getLine(int lineIndex) throws IOException {
|
||||
ensureLoaded();
|
||||
|
||||
var line = readLineAt(lineIndex);
|
||||
|
||||
if (startsWithIndex(line, lineIndex)) return line;
|
||||
|
||||
// mismatch => rebuild index en nog 1x proberen
|
||||
synchronized (lock) {
|
||||
rebuildIndexLocked();
|
||||
line = readLineAt(lineIndex);
|
||||
if (startsWithIndex(line, lineIndex)) return line;
|
||||
}
|
||||
|
||||
throw new RuntimeException("Index mismatch after rebuild. Requested=" + lineIndex + ", got line=" + preview(line));
|
||||
}
|
||||
|
||||
private void ensureLoaded() throws IOException {
|
||||
if (offsets != null && csvChannel != null && csvChannel.isOpen()) return;
|
||||
|
||||
synchronized (lock) {
|
||||
if (offsets != null && csvChannel != null && csvChannel.isOpen()) return;
|
||||
|
||||
csvChannel = FileChannel.open(csvPath, StandardOpenOption.READ);
|
||||
|
||||
if (Files.exists(idxPath)) {
|
||||
try {
|
||||
offsets = readIndex(idxPath);
|
||||
return;
|
||||
} catch (IOException badIndex) {
|
||||
// fall-through -> rebuild
|
||||
}
|
||||
}
|
||||
|
||||
rebuildIndexLocked();
|
||||
}
|
||||
}
|
||||
|
||||
private void rebuildIndexLocked() throws IOException {
|
||||
var built = buildOffsets(csvPath);
|
||||
writeIndex(idxPath, built);
|
||||
offsets = built;
|
||||
}
|
||||
|
||||
private String readLineAt(int lineIndex) throws IOException {
|
||||
var local = offsets;
|
||||
if (lineIndex < 0 || lineIndex >= local.length) {
|
||||
throw new IndexOutOfBoundsException("lineIndex=" + lineIndex + ", max=" + (local.length - 1));
|
||||
}
|
||||
|
||||
var start = local[lineIndex];
|
||||
csvChannel.position(start);
|
||||
|
||||
// lees in blokjes (sneller dan 1 byte) tot newline
|
||||
var buf = new byte[8192];
|
||||
var total = 0;
|
||||
var out = new byte[256];
|
||||
|
||||
while (true) {
|
||||
var bb = ByteBuffer.wrap(buf);
|
||||
var n = csvChannel.read(bb);
|
||||
if (n < 0) break; // EOF
|
||||
var end = n;
|
||||
|
||||
for (var i = 0; i < end; i++) {
|
||||
var b = buf[i];
|
||||
|
||||
if (b == (byte) '\n') {
|
||||
// reposition kanaal op byte na newline
|
||||
long back = (end - i - 1);
|
||||
csvChannel.position(csvChannel.position() - back);
|
||||
return new String(out, 0, total, StandardCharsets.UTF_8);
|
||||
}
|
||||
if (b == (byte) '\r') continue;
|
||||
|
||||
if (total == out.length) out = Arrays.copyOf(out, out.length * 2);
|
||||
out[total++] = b;
|
||||
}
|
||||
}
|
||||
|
||||
return new String(out, 0, total, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
/** Check: begint de regel met "<lineIndex>," */
|
||||
private static boolean startsWithIndex(String line, int lineIndex) {
|
||||
if (line == null || line.isEmpty()) return false;
|
||||
|
||||
var comma = line.indexOf(',');
|
||||
if (comma <= 0) return false;
|
||||
|
||||
// snelle parse zonder split
|
||||
long v = 0;
|
||||
for (var i = 0; i < comma; i++) {
|
||||
var c = line.charAt(i);
|
||||
if (c < '0' || c > '9') return false;
|
||||
v = (v * 10) + (c - '0');
|
||||
if (v > Integer.MAX_VALUE) return false;
|
||||
}
|
||||
return v == lineIndex;
|
||||
}
|
||||
|
||||
private static String preview(String s) {
|
||||
if (s == null) return "null";
|
||||
return s.length() <= 120 ? s : s.substring(0, 120) + "...";
|
||||
}
|
||||
|
||||
/** Bouw offsets door newlines te scannen. Resultaat is exact getrimd. */
|
||||
public static long[] buildOffsets(Path path) throws IOException {
|
||||
try (var ch = FileChannel.open(path, StandardOpenOption.READ)) {
|
||||
var offs = new long[131072]; // start-capacity, groeit indien nodig
|
||||
var c = 0;
|
||||
offs[c++] = 0L;
|
||||
|
||||
var buf = ByteBuffer.allocateDirect(1 << 20);
|
||||
long pos = 0;
|
||||
|
||||
while (true) {
|
||||
buf.clear();
|
||||
var n = ch.read(buf);
|
||||
if (n < 0) break;
|
||||
buf.flip();
|
||||
|
||||
for (var i = 0; i < n; i++) {
|
||||
if (buf.get(i) == (byte) '\n') {
|
||||
if (c == offs.length) offs = Arrays.copyOf(offs, offs.length * 2);
|
||||
offs[c++] = pos + i + 1;
|
||||
}
|
||||
}
|
||||
pos += n;
|
||||
}
|
||||
|
||||
return Arrays.copyOf(offs, c);
|
||||
}
|
||||
}
|
||||
|
||||
public static void writeIndex(Path out, long[] offsets) throws IOException {
|
||||
try (var dos = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream(
|
||||
out, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)))) {
|
||||
dos.writeInt(MAGIC);
|
||||
dos.writeInt(VERSION);
|
||||
dos.writeInt(offsets.length);
|
||||
for (var v : offsets) dos.writeLong(v);
|
||||
}
|
||||
}
|
||||
|
||||
public static long[] readIndex(Path in) throws IOException {
|
||||
try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(in)))) {
|
||||
var magic = dis.readInt();
|
||||
if (magic != MAGIC) throw new IOException("Not a LIDX file");
|
||||
|
||||
var version = dis.readInt();
|
||||
if (version != VERSION) throw new IOException("Unsupported version: " + version);
|
||||
|
||||
var n = dis.readInt();
|
||||
if (n < 0) throw new IOException("Corrupt length: " + n);
|
||||
|
||||
var offsets = new long[n];
|
||||
for (var i = 0; i < n; i++) offsets[i] = dis.readLong();
|
||||
return offsets;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
synchronized (lock) {
|
||||
if (csvChannel != null) csvChannel.close();
|
||||
csvChannel = null;
|
||||
offsets = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user