Files
puzzle-generator/src/test/java/puzzle/CsvIndexService.java
2026-01-17 13:22:04 +01:00

255 lines
8.5 KiB
Java

package puzzle;
import puzzle.SwedishGenerator.Lemma;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.function.LongConsumer;
import static java.nio.charset.StandardCharsets.US_ASCII;
public final class CsvIndexService
implements Closeable {
static final ScopedValue<CsvIndexService> SC = ScopedValue.newInstance();
private static final int MAGIC = 0x4C494458; // "LIDX"
private static final int VERSION = 1;
static int SIMPEL_IDX = 3;
private final Path csvPath;
private final Path idxPath;
private volatile long[] offsets; // lazy
private volatile FileChannel csvChannel; // open once
private final Object lock = new Object();
public CsvIndexService(Path csvPath, Path idxPath) {
this.csvPath = csvPath;
this.idxPath = idxPath;
}
public static int lineToSimpel(String line) {
var parts = line.split(",", 5);
return Integer.parseInt(parts[SIMPEL_IDX].trim());
}
public static String[] lineToClue(String line) {
if (line.isBlank()) throw new RuntimeException("Empty line");
var parts = line.split(",", 5);
var rawClue = parts[4].trim();
if (rawClue.startsWith("\"") && rawClue.endsWith("\"")) {
rawClue = rawClue.substring(1, rawClue.length() - 1).replace("\"\"", "\"");
}
return Meta.GSON.fromJson(rawClue, String[].class);
}
public static void lineToLemma(String line, LongConsumer ok) {
if (line.isBlank()) {
throw new RuntimeException("Empty line");
}
var parts = line.split(",", 5);
var id = Integer.parseInt(parts[0].trim());
var word = parts[1].trim();
int score = Integer.parseInt(parts[2].trim());
if (score < 1) {
if (Main.VERBOSE) System.err.println("Word too complex: " + line);
return;
}
ok.accept(Lemma.pack(id, word.getBytes(US_ASCII)));
}
public static int simpel(int index) {
try {
if (SC.isBound())
return lineToSimpel(SC.get().getLine(index));
return -1;
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Failed to get clues for index " + index, e);
}
}
public static String[] clues(int index) {
try {
if (SC.isBound())
return lineToClue(SC.get().getLine(index));
return new String[0];
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Failed to get clues for index " + index, e);
}
}
/** Haal één regel op (0-based line index), met self-healing index (1x rebuild). */
public String getLine(int lineIndex) throws IOException {
ensureLoaded();
var line = readLineAt(lineIndex);
if (startsWithIndex(line, lineIndex)) return line;
// mismatch => rebuild index en nog 1x proberen
synchronized (lock) {
rebuildIndexLocked();
line = readLineAt(lineIndex);
if (startsWithIndex(line, lineIndex)) return line;
}
throw new RuntimeException("Index mismatch after rebuild. Requested=" + lineIndex + ", got line=" + preview(line));
}
public void ensureLoaded() throws IOException {
if (offsets != null && csvChannel != null && csvChannel.isOpen()) return;
synchronized (lock) {
if (offsets != null && csvChannel != null && csvChannel.isOpen()) return;
csvChannel = FileChannel.open(csvPath, StandardOpenOption.READ);
if (Files.exists(idxPath)) {
try {
offsets = readIndex(idxPath);
return;
} catch (IOException badIndex) {
// fall-through -> rebuild
}
}
rebuildIndexLocked();
}
}
private void rebuildIndexLocked() throws IOException {
var built = buildOffsets(csvPath);
writeIndex(idxPath, built);
offsets = built;
}
private String readLineAt(int lineIndex) throws IOException {
var local = offsets;
if (lineIndex < 0 || lineIndex >= local.length) {
throw new IndexOutOfBoundsException("lineIndex=" + lineIndex + ", max=" + (local.length - 1));
}
long currentPos = local[lineIndex];
// lees in blokjes (sneller dan 1 byte) tot newline
var buf = new byte[8192];
var total = 0;
var out = new byte[256];
while (true) {
var bb = ByteBuffer.wrap(buf);
var n = csvChannel.read(bb, currentPos);
if (n < 0) break; // EOF
currentPos += n;
var end = n;
for (var i = 0; i < end; i++) {
var b = buf[i];
if (b == (byte) '\n') {
return new String(out, 0, total, StandardCharsets.UTF_8);
}
if (b == (byte) '\r') continue;
if (total == out.length) out = Arrays.copyOf(out, out.length * 2);
out[total++] = b;
}
}
return new String(out, 0, total, StandardCharsets.UTF_8);
}
/** Check: begint de regel met "<lineIndex>," */
private static boolean startsWithIndex(String line, int lineIndex) {
if (line == null || line.isEmpty()) return false;
var comma = line.indexOf(',');
if (comma <= 0) return false;
// snelle parse zonder split
long v = 0;
for (var i = 0; i < comma; i++) {
var c = line.charAt(i);
if (c < '0' || c > '9') return false;
v = (v * 10) + (c - '0');
if (v > Integer.MAX_VALUE) return false;
}
return v == lineIndex;
}
private static String preview(String s) {
if (s == null) return "null";
return s.length() <= 120 ? s : s.substring(0, 120) + "...";
}
/** Bouw offsets door newlines te scannen. Resultaat is exact getrimd. */
public static long[] buildOffsets(Path path) throws IOException {
try (var ch = FileChannel.open(path, StandardOpenOption.READ)) {
var offs = new long[131072]; // start-capacity, groeit indien nodig
var c = 0;
offs[c++] = 0;
var buf = ByteBuffer.allocateDirect(1 << 20);
int pos = 0;
while (true) {
buf.clear();
var n = ch.read(buf);
if (n < 0) break;
buf.flip();
for (var i = 0; i < n; i++) {
if (buf.get(i) == (byte) '\n') {
if (c == offs.length) offs = Arrays.copyOf(offs, offs.length * 2);
offs[c++] = pos + i + 1;
}
}
pos += n;
}
return Arrays.copyOf(offs, c);
}
}
public static void writeIndex(Path out, long[] offsets) throws IOException {
try (var dos = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream(
out, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)))) {
dos.writeInt(MAGIC);
dos.writeInt(VERSION);
dos.writeInt(offsets.length);
for (var v : offsets) dos.writeLong(v);
}
}
public static long[] readIndex(Path in) throws IOException {
try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(in)))) {
if (dis.readInt() != MAGIC) throw new IOException("Not a LIDX file");
var version = dis.readInt();
if (version != VERSION) throw new IOException("Unsupported version: " + version);
var n = dis.readInt();
if (n < 0) throw new IOException("Corrupt length: " + n);
var offsets = new long[n];
for (var i = 0; i < n; i++) offsets[i] = dis.readLong();
return offsets;
}
}
@Override
public void close() throws IOException {
synchronized (lock) {
if (csvChannel != null) csvChannel.close();
csvChannel = null;
offsets = null;
}
}
}