Gather data
This commit is contained in:
38
tools/hint/dbjsonl.sh
Executable file
38
tools/hint/dbjsonl.sh
Executable file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# Import a JSON Lines file into a PostgreSQL table, one jsonb row per line.
#
# Usage:
#   ./dbjsonl.sh "postgresql://user:pass@host:5432/dbname" gloss doc /path/to/file.jsonl
#
# Arguments:
#   1  connection URL handed to psql
#   2  target table name (optionally schema-qualified)
#   3  name of the jsonb column
#   4  path to the .jsonl file (read on the client by psql's \copy)
#
# Notes:
#   - Creates table if it doesn't exist.
#   - Inserts each JSON line into a jsonb column.
#   - Skips blank lines.
#   - Aborts on the first SQL error (ON_ERROR_STOP), including a line that is
#     not valid JSON.

DB_URL="${1:?db url}"
TABLE="${2:?table name}"
COL="${3:?json column name}"
FILE="${4:?jsonl file path}"

# TABLE and COL are spliced into the SQL text as identifiers, so only plain
# unquoted identifiers are accepted (TABLE may carry a schema prefix).
# This keeps the generated SQL identical for ordinary names while refusing
# anything that could change the meaning of the statements.
ident='[A-Za-z_][A-Za-z0-9_]*'
table_re="^${ident}(\.${ident})?\$"
col_re="^${ident}\$"

if [[ ! "$TABLE" =~ $table_re ]]; then
  echo "invalid table name: $TABLE" >&2
  exit 1
fi

if [[ ! "$COL" =~ $col_re ]]; then
  echo "invalid column name: $COL" >&2
  exit 1
fi

if [[ ! -r "$FILE" ]]; then
  echo "cannot read jsonl file: $FILE" >&2
  exit 1
fi

# The path is embedded in a single-quoted \copy argument; double any single
# quote so a path containing one does not terminate the literal early.
squote="'"
FILE_SQL="${FILE//$squote/$squote$squote}"

psql "$DB_URL" -v ON_ERROR_STOP=1 <<SQL
CREATE TABLE IF NOT EXISTS ${TABLE} (
  id bigserial PRIMARY KEY,
  ${COL} jsonb NOT NULL
);
SQL

# \copy runs on the client, so we can feed it from a local file.
# We copy into a 1-column staging table, then cast to jsonb and insert.
#
# The load uses CSV format with control bytes 0x01 / 0x02 as quote and
# delimiter. Text format would interpret backslash sequences (\n, \\, ...)
# and split on tabs, corrupting or rejecting valid JSON lines. Those two
# control bytes cannot appear unescaped in JSON, so every input line arrives
# in the staging column exactly as written. Empty lines load as NULL and are
# dropped by the btrim filter below.
psql "$DB_URL" -v ON_ERROR_STOP=1 <<SQL
CREATE TEMP TABLE _jsonl_stage(line text);

\\copy _jsonl_stage(line) FROM '${FILE_SQL}' WITH (FORMAT csv, QUOTE E'\x01', DELIMITER E'\x02');

INSERT INTO ${TABLE}(${COL})
SELECT line::jsonb
FROM _jsonl_stage
WHERE btrim(line) <> '';

-- optional: show count inserted this run
SELECT count(*) AS inserted_now FROM _jsonl_stage WHERE btrim(line) <> '';
SQL
|
||||
59
tools/hint/jsonl-to-sqlite.mjs
Normal file
59
tools/hint/jsonl-to-sqlite.mjs
Normal file
@@ -0,0 +1,59 @@
|
||||
// jsonl-to-sqlite.mjs
|
||||
import fs from 'node:fs'
|
||||
import readline from 'node:readline'
|
||||
import Database from 'better-sqlite3'
|
||||
|
||||
// Command-line arguments: <file.jsonl> [out.sqlite] [table]
const jsonlPath = process.argv[2]
const dbPath = process.argv[3] ?? 'out.sqlite'
const table = process.argv[4] ?? 'events'

if (!jsonlPath) {
  console.error('Usage: node jsonl-to-sqlite.mjs <file.jsonl> [out.sqlite] [table]')
  process.exit(1)
}

// The table name cannot be bound as a statement parameter, so it is embedded
// in the SQL text as a double-quoted identifier with inner quotes doubled.
// This lets reserved words and names with spaces work, and keeps the name
// from being parsed as anything other than an identifier.
const tableIdent = `"${ table.replaceAll('"', '""') }"`

// Rows are buffered and written in one transaction per this many lines.
const BATCH_SIZE = 1000

const db = new Database(dbPath)

try {
  db.pragma('journal_mode = WAL')

  db.exec(`
    CREATE TABLE IF NOT EXISTS ${ tableIdent }
    (
      id INTEGER PRIMARY KEY AUTOINCREMENT,
      json TEXT NOT NULL
    );
  `)

  const insert = db.prepare(`INSERT INTO ${ tableIdent }(json)
                             VALUES (?)`)

  // Wrap a whole batch in a single transaction.
  const insertMany = db.transaction((rows) => {
    for (const r of rows) insert.run(r)
  })

  const rl = readline.createInterface({
    input    : fs.createReadStream(jsonlPath, { encoding: 'utf8' }),
    crlfDelay: Infinity
  })

  let batch = []
  let lineNo = 0

  for await (const line of rl) {
    lineNo++
    const trimmed = line.trim()
    if (!trimmed) continue // skip blank lines

    try {
      JSON.parse(trimmed) // validate only; the original text is what gets stored
      batch.push(trimmed)
    } catch (e) {
      console.warn(`Skipping invalid JSON on line ${ lineNo }: ${ e.message }`)
      continue
    }

    if (batch.length >= BATCH_SIZE) {
      insertMany(batch)
      batch = []
    }
  }

  // Flush whatever is left after the last full batch.
  if (batch.length) insertMany(batch)
} finally {
  // Release the database handle on both success and failure.
  db.close()
}

console.log(`Done. Imported into ${ dbPath }, table=${ table }`)
|
||||
Reference in New Issue
Block a user