Gather data

This commit is contained in:
mike
2026-01-04 01:37:42 +01:00
parent 795067472f
commit 3e25ce3e1f
22 changed files with 233 additions and 1414 deletions

38
tools/hint/dbjsonl.sh Executable file
View File

@@ -0,0 +1,38 @@
#!/usr/bin/env bash
set -euo pipefail
# Import a JSON Lines file into a PostgreSQL table, one jsonb row per line.
#
# Usage:
# ./dbjsonl.sh "postgresql://user:pass@host:5432/dbname" gloss doc /path/to/file.jsonl
#
# Arguments:
#   1  connection URL handed to psql
#   2  target table name
#   3  name of the jsonb column in that table
#   4  path to the local .jsonl file
#
# Notes:
# - Creates table if it doesn't exist.
# - Inserts each JSON line into a jsonb column.
# - Skips blank lines.
# - The table and column names are pasted into the SQL verbatim (no identifier
#   quoting or validation), so only pass trusted, valid identifiers.
DB_URL="${1:?db url}"
TABLE="${2:?table name}"
COL="${3:?json column name}"
FILE="${4:?jsonl file path}"

# Fail early with a clear message instead of a psql error after connecting.
if [[ ! -r "$FILE" ]]; then
  echo "cannot read jsonl file: $FILE" >&2
  exit 1
fi

# The path ends up inside a single-quoted \copy argument; double any single
# quotes so a path such as /tmp/it's.jsonl does not terminate the literal.
SQ="'"
FILE_SQL="${FILE//$SQ/$SQ$SQ}"

psql "$DB_URL" -v ON_ERROR_STOP=1 <<SQL
CREATE TABLE IF NOT EXISTS ${TABLE} (
  id bigserial PRIMARY KEY,
  ${COL} jsonb NOT NULL
);
SQL

# \copy runs on the client, so we can feed it from a local file.
# We copy into a 1-column staging table, then cast to jsonb and insert.
#
# The load deliberately uses CSV mode with control characters as quote and
# delimiter: COPY's text format would decode backslash escapes (\" \n \\ ...)
# and split on tabs, mangling or rejecting perfectly valid JSON. \x01 and \x02
# cannot occur unescaped in valid JSON, so every line arrives byte-for-byte.
# In CSV mode an empty line is loaded as NULL, which the filter below drops
# (NULL <> '' is not true); lines holding only spaces/tabs are dropped as well.
#
# The heredoc is unquoted, so "\\copy" reaches psql as "\copy" while "\x01",
# "\x02" and "\t" pass through unchanged.
psql "$DB_URL" -v ON_ERROR_STOP=1 <<SQL
CREATE TEMP TABLE _jsonl_stage(line text);
\\copy _jsonl_stage(line) FROM '${FILE_SQL}' WITH (FORMAT csv, QUOTE E'\x01', DELIMITER E'\x02');
INSERT INTO ${TABLE}(${COL})
SELECT line::jsonb
FROM _jsonl_stage
WHERE btrim(line, E' \t') <> '';
-- optional: show count inserted this run
SELECT count(*) AS inserted_now FROM _jsonl_stage WHERE btrim(line, E' \t') <> '';
SQL

View File

@@ -0,0 +1,59 @@
// jsonl-to-sqlite.mjs
import fs from 'node:fs'
import readline from 'node:readline'
import Database from 'better-sqlite3'
// Command-line arguments: <file.jsonl> [out.sqlite] [table]
const jsonlPath = process.argv[2]
const dbPath = process.argv[3] ?? 'out.sqlite'
const table = process.argv[4] ?? 'events'
if (!jsonlPath) {
  console.error('Usage: node jsonl-to-sqlite.mjs <file.jsonl> [out.sqlite] [table]')
  process.exit(1)
}

// Number of validated lines buffered before they are written in one transaction.
const BATCH_SIZE = 1000

const db = new Database(dbPath)
try {
  db.pragma('journal_mode = WAL')

  // NOTE(review): `table` is interpolated into the SQL verbatim (identifiers
  // cannot be bound as parameters), so it must be a trusted, valid identifier.
  db.exec(`
    CREATE TABLE IF NOT EXISTS ${ table }
    (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      json TEXT NOT NULL
    );
  `)
  const insert = db.prepare(`INSERT INTO ${ table }(json)
    VALUES (?)`)

  // Wrapping the per-row inserts in a transaction commits each batch at once.
  const insertMany = db.transaction((rows) => {
    for (const r of rows) insert.run(r)
  })

  const rl = readline.createInterface({
    input    : fs.createReadStream(jsonlPath, { encoding: 'utf8' }),
    crlfDelay: Infinity
  })

  let batch = []
  let lineNo = 0 // 1-based physical line number, used only in warnings
  for await (const line of rl) {
    lineNo++
    const trimmed = line.trim()
    if (!trimmed) continue // skip blank lines

    try {
      JSON.parse(trimmed) // validate only; the trimmed text is what gets stored
    } catch (e) {
      console.warn(`Skipping invalid JSON on line ${ lineNo }: ${ e.message }`)
      continue
    }

    batch.push(trimmed)
    if (batch.length >= BATCH_SIZE) {
      insertMany(batch)
      batch = []
    }
  }

  // Flush whatever is left over from the final, partially filled batch.
  if (batch.length) insertMany(batch)
} finally {
  // Always release the handle, on success and on failure alike, so the
  // connection is shut down cleanly instead of being abandoned at exit.
  db.close()
}
console.log(`Done. Imported into ${ dbPath }, table=${ table }`)