chunking: add structured

2025-12-13 14:32:43 +01:00
parent 1583df8f57
commit b3d06660a4
7 changed files with 507 additions and 1 deletions
--- a/sql/migration/V004__content_graph.sql
+++ b/sql/migration/V004__content_graph.sql
--- a/sql/migration/V005__fts_and_chunks.sql
+++ b/sql/migration/V005__fts_and_chunks.sql
@@ -0,0 +1,94 @@
+ALTER TABLE files ADD COLUMN IF NOT EXISTS fts_vector tsvector;
+
+CREATE INDEX IF NOT EXISTS idx_files_fts ON files USING GIN(fts_vector);
+
+CREATE OR REPLACE FUNCTION files_fts_update() RETURNS trigger AS $$
+BEGIN
+    NEW.fts_vector :=
+        setweight(to_tsvector('english', COALESCE(NEW.path, '')), 'A') ||
+        setweight(to_tsvector('english', COALESCE(NEW.extracted_text, '')), 'B');
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER files_fts_trigger
+    BEFORE INSERT OR UPDATE OF path, extracted_text
+    ON files
+    FOR EACH ROW
+    EXECUTE FUNCTION files_fts_update();
+
+UPDATE files SET fts_vector =
+    setweight(to_tsvector('english', COALESCE(path, '')), 'A') ||
+    setweight(to_tsvector('english', COALESCE(extracted_text, '')), 'B')
+WHERE extracted_text IS NOT NULL AND fts_vector IS NULL;
+
+CREATE TABLE IF NOT EXISTS content_chunks (
+    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
+    node_id UUID REFERENCES content_nodes(id) ON DELETE CASCADE,
+    file_path TEXT NOT NULL,
+    disk_label VARCHAR(50),
+
+    chunk_id VARCHAR(32) NOT NULL,
+    chunk_index INT NOT NULL,
+    chunk_text TEXT NOT NULL,
+    chunk_size INT,
+
+    offset_start INT,
+    offset_end INT,
+    section_title TEXT,
+
+    metadata JSONB,
+    fts_vector tsvector,
+
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+
+    UNIQUE(file_path, chunk_id)
+);
+
+CREATE INDEX IF NOT EXISTS idx_chunks_node ON content_chunks(node_id);
+CREATE INDEX IF NOT EXISTS idx_chunks_file ON content_chunks(file_path);
+CREATE INDEX IF NOT EXISTS idx_chunks_fts ON content_chunks USING GIN(fts_vector);
+CREATE INDEX IF NOT EXISTS idx_chunks_metadata ON content_chunks USING GIN(metadata);
+
+CREATE OR REPLACE FUNCTION chunks_fts_update() RETURNS trigger AS $$
+BEGIN
+    NEW.fts_vector :=
+        setweight(to_tsvector('english', COALESCE(NEW.section_title, '')), 'A') ||
+        setweight(to_tsvector('english', COALESCE(NEW.chunk_text, '')), 'B');
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE TRIGGER chunks_fts_trigger
+    BEFORE INSERT OR UPDATE OF chunk_text, section_title
+    ON content_chunks
+    FOR EACH ROW
+    EXECUTE FUNCTION chunks_fts_update();
+
+CREATE TABLE IF NOT EXISTS directory_index (
+    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
+    dir_path TEXT NOT NULL UNIQUE,
+    disk_label VARCHAR(50),
+
+    file_count INT DEFAULT 0,
+    total_size BIGINT DEFAULT 0,
+    indexed_files INT DEFAULT 0,
+
+    aggregated_text TEXT,
+    fts_vector tsvector,
+
+    bm25_stats JSONB,
+
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+CREATE INDEX IF NOT EXISTS idx_dir_index_path ON directory_index(dir_path);
+CREATE INDEX IF NOT EXISTS idx_dir_index_disk ON directory_index(disk_label);
+CREATE INDEX IF NOT EXISTS idx_dir_index_fts ON directory_index USING GIN(fts_vector);
+
+COMMENT ON TABLE content_chunks IS 'Structured chunks for BM25 and vector search';
+COMMENT ON TABLE directory_index IS 'Aggregated directory-level BM25 index';
+COMMENT ON COLUMN content_chunks.chunk_id IS 'Stable hash-based chunk identifier';
+COMMENT ON COLUMN content_chunks.offset_start IS 'Character offset or paragraph index';
+COMMENT ON COLUMN content_chunks.section_title IS 'Heading/function name for structured chunks';