Files
defrag/sql/migrations/003_content_graph.sql
2025-12-13 13:57:13 +01:00

77 lines
3.0 KiB
SQL

CREATE TABLE IF NOT EXISTS content_nodes (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
node_type VARCHAR(50) NOT NULL,
path TEXT NOT NULL,
disk_label VARCHAR(50),
parent_id UUID REFERENCES content_nodes(id) ON DELETE CASCADE,
checksum VARCHAR(64),
size BIGINT,
modified_time TIMESTAMP,
content_hash VARCHAR(64),
extracted_at TIMESTAMP,
extraction_method VARCHAR(100),
metadata JSONB,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
UNIQUE(node_type, path, disk_label)
);
CREATE INDEX IF NOT EXISTS idx_content_nodes_type ON content_nodes(node_type);
CREATE INDEX IF NOT EXISTS idx_content_nodes_path ON content_nodes(path);
CREATE INDEX IF NOT EXISTS idx_content_nodes_parent ON content_nodes(parent_id);
CREATE INDEX IF NOT EXISTS idx_content_nodes_checksum ON content_nodes(checksum);
CREATE INDEX IF NOT EXISTS idx_content_nodes_content_hash ON content_nodes(content_hash);
CREATE TABLE IF NOT EXISTS content_edges (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
source_id UUID NOT NULL REFERENCES content_nodes(id) ON DELETE CASCADE,
target_id UUID NOT NULL REFERENCES content_nodes(id) ON DELETE CASCADE,
edge_type VARCHAR(50) NOT NULL,
metadata JSONB,
confidence FLOAT,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
UNIQUE(source_id, target_id, edge_type)
);
CREATE INDEX IF NOT EXISTS idx_content_edges_source ON content_edges(source_id);
CREATE INDEX IF NOT EXISTS idx_content_edges_target ON content_edges(target_id);
CREATE INDEX IF NOT EXISTS idx_content_edges_type ON content_edges(edge_type);
CREATE TABLE IF NOT EXISTS extraction_log (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
node_id UUID REFERENCES content_nodes(id) ON DELETE CASCADE,
file_path TEXT NOT NULL,
file_checksum VARCHAR(64),
extraction_method VARCHAR(100),
status VARCHAR(50),
error_message TEXT,
extracted_size BIGINT,
processing_time_ms INT,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_extraction_log_node ON extraction_log(node_id);
CREATE INDEX IF NOT EXISTS idx_extraction_log_file ON extraction_log(file_path);
CREATE INDEX IF NOT EXISTS idx_extraction_log_checksum ON extraction_log(file_checksum);
CREATE INDEX IF NOT EXISTS idx_extraction_log_status ON extraction_log(status);
CREATE INDEX IF NOT EXISTS idx_extraction_log_created ON extraction_log(created_at DESC);
COMMENT ON TABLE content_nodes IS 'Content graph nodes: directories, files, chunks';
COMMENT ON TABLE content_edges IS 'Content graph edges: contains, derived_from, references, duplicates';
COMMENT ON TABLE extraction_log IS 'Tracks extraction history for incremental updates';
COMMENT ON COLUMN content_nodes.node_type IS 'directory, file, chunk, embedding';
COMMENT ON COLUMN content_nodes.content_hash IS 'Hash of extracted content (not file bytes)';
COMMENT ON COLUMN content_edges.edge_type IS 'contains, derived_from, references, duplicates, similar_to';