chunking: add structured
This commit is contained in:
76
sql/migration/V004__content_graph.sql
Normal file
76
sql/migration/V004__content_graph.sql
Normal file
@@ -0,0 +1,76 @@
|
||||
-- Nodes of the content graph. parent_id is a self-reference; deleting a parent
-- row cascades to every row that points at it.
CREATE TABLE IF NOT EXISTS content_nodes (
    id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    node_type         VARCHAR(50) NOT NULL,
    path              TEXT NOT NULL,
    disk_label        VARCHAR(50),
    parent_id         UUID REFERENCES content_nodes (id) ON DELETE CASCADE,

    checksum          VARCHAR(64),
    size              BIGINT,
    modified_time     TIMESTAMP,

    content_hash      VARCHAR(64),
    extracted_at      TIMESTAMP,
    extraction_method VARCHAR(100),

    metadata          JSONB,

    created_at        TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    updated_at        TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,

    -- NOTE(review): disk_label is nullable, and PostgreSQL treats NULLs as
    -- distinct in UNIQUE constraints by default, so two rows with the same
    -- (node_type, path) and a NULL disk_label are both accepted. Confirm that
    -- this is intended.
    UNIQUE (node_type, path, disk_label)
);

-- Single-column lookup indexes on content_nodes.
CREATE INDEX IF NOT EXISTS idx_content_nodes_type
    ON content_nodes (node_type);

CREATE INDEX IF NOT EXISTS idx_content_nodes_path
    ON content_nodes (path);

CREATE INDEX IF NOT EXISTS idx_content_nodes_parent
    ON content_nodes (parent_id);

CREATE INDEX IF NOT EXISTS idx_content_nodes_checksum
    ON content_nodes (checksum);

CREATE INDEX IF NOT EXISTS idx_content_nodes_content_hash
    ON content_nodes (content_hash);
|
||||
|
||||
-- Directed, typed edges between content_nodes rows. Both endpoints cascade on
-- delete, so removing a node also removes every edge that touches it. At most
-- one edge of a given type may exist per (source, target) pair.
CREATE TABLE IF NOT EXISTS content_edges (
    id         UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    source_id  UUID NOT NULL REFERENCES content_nodes (id) ON DELETE CASCADE,
    target_id  UUID NOT NULL REFERENCES content_nodes (id) ON DELETE CASCADE,
    edge_type  VARCHAR(50) NOT NULL,

    metadata   JSONB,
    -- NOTE(review): no CHECK constrains the range of confidence; presumably
    -- a 0..1 score -- verify against the writer.
    confidence DOUBLE PRECISION,

    created_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (source_id, target_id, edge_type)
);

-- Lookup indexes for walking edges from either endpoint or filtering by type.
CREATE INDEX IF NOT EXISTS idx_content_edges_source
    ON content_edges (source_id);

CREATE INDEX IF NOT EXISTS idx_content_edges_target
    ON content_edges (target_id);

CREATE INDEX IF NOT EXISTS idx_content_edges_type
    ON content_edges (edge_type);
|
||||
|
||||
-- One row per extraction attempt. node_id is optional; when it is set, the log
-- row is deleted together with the referenced content_nodes row.
CREATE TABLE IF NOT EXISTS extraction_log (
    id                 UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    node_id            UUID REFERENCES content_nodes (id) ON DELETE CASCADE,
    file_path          TEXT NOT NULL,
    file_checksum      VARCHAR(64),

    extraction_method  VARCHAR(100),
    status             VARCHAR(50),
    error_message      TEXT,

    extracted_size     BIGINT,
    processing_time_ms INTEGER,

    created_at         TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
);

-- Lookup indexes; the created_at index is descending.
CREATE INDEX IF NOT EXISTS idx_extraction_log_node
    ON extraction_log (node_id);

CREATE INDEX IF NOT EXISTS idx_extraction_log_file
    ON extraction_log (file_path);

CREATE INDEX IF NOT EXISTS idx_extraction_log_checksum
    ON extraction_log (file_checksum);

CREATE INDEX IF NOT EXISTS idx_extraction_log_status
    ON extraction_log (status);

CREATE INDEX IF NOT EXISTS idx_extraction_log_created
    ON extraction_log (created_at DESC);
|
||||
|
||||
-- Catalog descriptions for the content graph tables.
COMMENT ON TABLE content_nodes
    IS 'Content graph nodes: directories, files, chunks';

COMMENT ON TABLE content_edges
    IS 'Content graph edges: contains, derived_from, references, duplicates';

COMMENT ON TABLE extraction_log
    IS 'Tracks extraction history for incremental updates';

-- Catalog descriptions for columns whose values are not self-explanatory.
COMMENT ON COLUMN content_nodes.node_type
    IS 'directory, file, chunk, embedding';

COMMENT ON COLUMN content_nodes.content_hash
    IS 'Hash of extracted content (not file bytes)';

COMMENT ON COLUMN content_edges.edge_type
    IS 'contains, derived_from, references, duplicates, similar_to';
|
||||
94
sql/migration/V005__fts_and_chunks.sql
Normal file
94
sql/migration/V005__fts_and_chunks.sql
Normal file
@@ -0,0 +1,94 @@
|
||||
-- Add a full-text search vector column to the existing files table and index
-- it with GIN. Both statements are no-ops if already applied.
ALTER TABLE files
    ADD COLUMN IF NOT EXISTS fts_vector tsvector;

CREATE INDEX IF NOT EXISTS idx_files_fts
    ON files
    USING gin (fts_vector);
|
||||
|
||||
-- Row-level trigger function for files: rebuilds NEW.fts_vector from the row's
-- path (weight A) and extracted_text (weight B). NULL inputs are treated as
-- empty strings, so the resulting vector is never NULL.
CREATE OR REPLACE FUNCTION files_fts_update()
RETURNS trigger
LANGUAGE plpgsql
AS $body$
DECLARE
    path_terms tsvector;
    text_terms tsvector;
BEGIN
    path_terms := setweight(to_tsvector('english', COALESCE(NEW.path, '')), 'A');
    text_terms := setweight(to_tsvector('english', COALESCE(NEW.extracted_text, '')), 'B');

    NEW.fts_vector := path_terms || text_terms;
    RETURN NEW;
END;
$body$;
|
||||
|
||||
-- Recompute files.fts_vector before every INSERT and before any UPDATE that
-- touches path or extracted_text.
--
-- CREATE TRIGGER has no IF NOT EXISTS form, so the trigger is dropped first.
-- This keeps the statement re-runnable, like the IF NOT EXISTS / OR REPLACE
-- statements elsewhere in this migration.
DROP TRIGGER IF EXISTS files_fts_trigger ON files;

CREATE TRIGGER files_fts_trigger
    BEFORE INSERT OR UPDATE OF path, extracted_text
    ON files
    FOR EACH ROW
    EXECUTE FUNCTION files_fts_update();
|
||||
|
||||
-- Backfill fts_vector for rows that existed before the trigger was created.
-- The expression mirrors files_fts_update(): path at weight A, extracted_text
-- at weight B, NULLs treated as empty strings.
--
-- Rows without extracted_text are included on purpose: the trigger indexes
-- such rows by path alone, so skipping them here would leave pre-existing
-- files unsearchable by path until their next update.
-- The fts_vector IS NULL filter keeps the backfill safe to re-run.
UPDATE files
SET fts_vector =
        setweight(to_tsvector('english', COALESCE(path, '')), 'A') ||
        setweight(to_tsvector('english', COALESCE(extracted_text, '')), 'B')
WHERE fts_vector IS NULL;
|
||||
|
||||
-- Text chunks with their own full-text search vector. node_id is optional;
-- when it is set, the chunk is deleted together with the referenced
-- content_nodes row.
CREATE TABLE IF NOT EXISTS content_chunks (
    id            UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    node_id       UUID REFERENCES content_nodes (id) ON DELETE CASCADE,
    file_path     TEXT NOT NULL,
    disk_label    VARCHAR(50),

    chunk_id      VARCHAR(32) NOT NULL,
    chunk_index   INTEGER NOT NULL,
    chunk_text    TEXT NOT NULL,
    chunk_size    INTEGER,

    offset_start  INTEGER,
    offset_end    INTEGER,
    section_title TEXT,

    metadata      JSONB,
    fts_vector    tsvector,

    created_at    TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,

    -- NOTE(review): the key does not include disk_label, so the same
    -- file_path on two disks shares one chunk_id namespace. Confirm that
    -- this is intended.
    UNIQUE (file_path, chunk_id)
);

-- B-tree lookups by owning node and file; GIN for full-text and JSONB search.
CREATE INDEX IF NOT EXISTS idx_chunks_node
    ON content_chunks (node_id);

CREATE INDEX IF NOT EXISTS idx_chunks_file
    ON content_chunks (file_path);

CREATE INDEX IF NOT EXISTS idx_chunks_fts
    ON content_chunks
    USING gin (fts_vector);

CREATE INDEX IF NOT EXISTS idx_chunks_metadata
    ON content_chunks
    USING gin (metadata);
|
||||
|
||||
-- Row-level trigger function for content_chunks: rebuilds NEW.fts_vector from
-- section_title (weight A) and chunk_text (weight B). NULL inputs are treated
-- as empty strings.
CREATE OR REPLACE FUNCTION chunks_fts_update()
RETURNS trigger
LANGUAGE plpgsql
AS $body$
DECLARE
    title_terms tsvector;
    text_terms  tsvector;
BEGIN
    title_terms := setweight(to_tsvector('english', COALESCE(NEW.section_title, '')), 'A');
    text_terms  := setweight(to_tsvector('english', COALESCE(NEW.chunk_text, '')), 'B');

    NEW.fts_vector := title_terms || text_terms;
    RETURN NEW;
END;
$body$;
|
||||
|
||||
-- Recompute content_chunks.fts_vector before every INSERT and before any
-- UPDATE that touches chunk_text or section_title.
--
-- CREATE TRIGGER has no IF NOT EXISTS form, so the trigger is dropped first.
-- This keeps the statement re-runnable, like the IF NOT EXISTS / OR REPLACE
-- statements elsewhere in this migration.
DROP TRIGGER IF EXISTS chunks_fts_trigger ON content_chunks;

CREATE TRIGGER chunks_fts_trigger
    BEFORE INSERT OR UPDATE OF chunk_text, section_title
    ON content_chunks
    FOR EACH ROW
    EXECUTE FUNCTION chunks_fts_update();
|
||||
|
||||
-- Per-directory aggregate row: counters, concatenated text, a full-text
-- vector, and a JSONB blob of BM25 statistics. dir_path is unique across the
-- whole table.
-- NOTE(review): uniqueness is on dir_path alone, not (dir_path, disk_label),
-- so the same path on two disks cannot coexist. Confirm that this is intended.
CREATE TABLE IF NOT EXISTS directory_index (
    id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    dir_path        TEXT NOT NULL UNIQUE,
    disk_label      VARCHAR(50),

    file_count      INTEGER DEFAULT 0,
    total_size      BIGINT DEFAULT 0,
    indexed_files   INTEGER DEFAULT 0,

    aggregated_text TEXT,
    -- No trigger in this migration maintains fts_vector for this table;
    -- presumably the application writes it. Verify against the writer.
    fts_vector      tsvector,

    bm25_stats      JSONB,

    created_at      TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP,
    updated_at      TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
);

-- No separate index on dir_path: the UNIQUE constraint above already creates
-- a unique B-tree index on that column, so a second one would only add write
-- overhead and storage.
CREATE INDEX IF NOT EXISTS idx_dir_index_disk
    ON directory_index (disk_label);

CREATE INDEX IF NOT EXISTS idx_dir_index_fts
    ON directory_index
    USING gin (fts_vector);
|
||||
|
||||
-- Catalog descriptions for the chunk and directory index tables.
COMMENT ON TABLE content_chunks
    IS 'Structured chunks for BM25 and vector search';

COMMENT ON TABLE directory_index
    IS 'Aggregated directory-level BM25 index';

-- Catalog descriptions for content_chunks columns.
COMMENT ON COLUMN content_chunks.chunk_id
    IS 'Stable hash-based chunk identifier';

COMMENT ON COLUMN content_chunks.offset_start
    IS 'Character offset or paragraph index';

COMMENT ON COLUMN content_chunks.section_title
    IS 'Heading/function name for structured chunks';
|
||||
Reference in New Issue
Block a user