-- PostgreSQL (PL/pgSQL) schema script: full-text search support for files,
-- content_chunks and directory_index.
-- Add the full-text search vector column to files. IF NOT EXISTS keeps the
-- statement safe to re-run.
ALTER TABLE files
    ADD COLUMN IF NOT EXISTS fts_vector tsvector;

-- GIN index over the search vector column.
CREATE INDEX IF NOT EXISTS idx_files_fts
    ON files
    USING gin (fts_vector);
|
|
|
|
-- Trigger function: rebuilds NEW.fts_vector from the row's path (weight A)
-- and extracted_text (weight B) using the 'english' text search
-- configuration. NULL inputs are treated as empty strings, so the resulting
-- vector is never NULL.
-- Returns the modified NEW row, as required for a BEFORE row-level trigger.
CREATE OR REPLACE FUNCTION files_fts_update()
RETURNS trigger
LANGUAGE plpgsql
AS $fn$
DECLARE
    path_terms tsvector;
    body_terms tsvector;
BEGIN
    -- Path terms carry the higher weight (A).
    path_terms := setweight(to_tsvector('english', COALESCE(NEW.path, '')), 'A');
    -- Extracted body text carries the lower weight (B).
    body_terms := setweight(to_tsvector('english', COALESCE(NEW.extracted_text, '')), 'B');

    NEW.fts_vector := path_terms || body_terms;
    RETURN NEW;
END;
$fn$;
|
|
|
|
-- Keep files.fts_vector current: run files_fts_update() before every INSERT,
-- and before any UPDATE that assigns path or extracted_text.
--
-- CREATE TRIGGER has no IF NOT EXISTS form, so a second run of this script
-- would fail with "trigger already exists" even though every other statement
-- here is re-runnable. Dropping any previous definition first makes this
-- step idempotent as well.
DROP TRIGGER IF EXISTS files_fts_trigger ON files;

CREATE TRIGGER files_fts_trigger
    BEFORE INSERT OR UPDATE OF path, extracted_text
    ON files
    FOR EACH ROW
    EXECUTE FUNCTION files_fts_update();
|
|
|
|
-- Backfill fts_vector for rows that existed before the trigger was installed.
-- The expression mirrors files_fts_update(): path at weight A, extracted_text
-- at weight B, NULLs treated as empty strings.
--
-- Only rows whose vector is still NULL are touched, so re-running is a no-op.
-- Rows without extracted_text are included on purpose: the trigger builds a
-- path-only vector for such rows on INSERT, so skipping them here would leave
-- pre-existing files unsearchable by path until their next update.
-- NOTE(review): if the previous "extracted_text IS NOT NULL" filter was a
-- deliberate way to limit the size of this UPDATE, restore it.
UPDATE files
SET fts_vector =
        setweight(to_tsvector('english', COALESCE(path, '')), 'A')
     || setweight(to_tsvector('english', COALESCE(extracted_text, '')), 'B')
WHERE fts_vector IS NULL;
|
|
|
|
-- Structured chunks of file content, used for BM25 and vector search.
-- Each (file_path, chunk_id) pair is unique.
CREATE TABLE IF NOT EXISTS content_chunks (
    -- Identity and ownership. Chunks are removed together with their
    -- content_nodes row (ON DELETE CASCADE).
    id            uuid PRIMARY KEY DEFAULT gen_random_uuid(),
    node_id       uuid REFERENCES content_nodes (id) ON DELETE CASCADE,
    file_path     text NOT NULL,
    disk_label    varchar(50),

    -- Chunk payload. chunk_id is a stable hash-based chunk identifier.
    chunk_id      varchar(32) NOT NULL,
    chunk_index   integer NOT NULL,
    chunk_text    text NOT NULL,
    chunk_size    integer,

    -- Location within the source: character offset or paragraph index, plus
    -- the heading/function name for structured chunks.
    offset_start  integer,
    offset_end    integer,
    section_title text,

    -- Free-form attributes and the full-text search vector.
    metadata      jsonb,
    fts_vector    tsvector,

    created_at    timestamptz DEFAULT CURRENT_TIMESTAMP,

    UNIQUE (file_path, chunk_id)
);
|
|
|
|
-- Secondary indexes for content_chunks: default (B-tree) indexes on node_id
-- and file_path, GIN indexes on the search vector and the JSONB metadata.
CREATE INDEX IF NOT EXISTS idx_chunks_node
    ON content_chunks (node_id);

CREATE INDEX IF NOT EXISTS idx_chunks_file
    ON content_chunks (file_path);

CREATE INDEX IF NOT EXISTS idx_chunks_fts
    ON content_chunks
    USING gin (fts_vector);

CREATE INDEX IF NOT EXISTS idx_chunks_metadata
    ON content_chunks
    USING gin (metadata);
|
|
|
|
-- Trigger function: rebuilds NEW.fts_vector from the chunk's section_title
-- (weight A) and chunk_text (weight B) using the 'english' text search
-- configuration. NULL inputs are treated as empty strings.
-- Returns the modified NEW row, as required for a BEFORE row-level trigger.
CREATE OR REPLACE FUNCTION chunks_fts_update()
RETURNS trigger
LANGUAGE plpgsql
AS $fn$
DECLARE
    title_terms tsvector;
    body_terms  tsvector;
BEGIN
    -- Section title terms carry the higher weight (A).
    title_terms := setweight(to_tsvector('english', COALESCE(NEW.section_title, '')), 'A');
    -- Chunk body terms carry the lower weight (B).
    body_terms := setweight(to_tsvector('english', COALESCE(NEW.chunk_text, '')), 'B');

    NEW.fts_vector := title_terms || body_terms;
    RETURN NEW;
END;
$fn$;
|
|
|
|
-- Keep content_chunks.fts_vector current: run chunks_fts_update() before
-- every INSERT, and before any UPDATE that assigns chunk_text or
-- section_title.
--
-- CREATE TRIGGER has no IF NOT EXISTS form, so a second run of this script
-- (where CREATE TABLE IF NOT EXISTS is skipped) would fail with
-- "trigger already exists". Dropping any previous definition first makes
-- this step idempotent like the rest of the script.
DROP TRIGGER IF EXISTS chunks_fts_trigger ON content_chunks;

CREATE TRIGGER chunks_fts_trigger
    BEFORE INSERT OR UPDATE OF chunk_text, section_title
    ON content_chunks
    FOR EACH ROW
    EXECUTE FUNCTION chunks_fts_update();
|
|
|
|
-- Aggregated directory-level BM25 index: one row per directory path
-- (dir_path is UNIQUE).
CREATE TABLE IF NOT EXISTS directory_index (
    id              uuid PRIMARY KEY DEFAULT gen_random_uuid(),
    dir_path        text NOT NULL UNIQUE,
    disk_label      varchar(50),

    -- Per-directory counters; all start at zero.
    file_count      integer DEFAULT 0,
    total_size      bigint DEFAULT 0,
    indexed_files   integer DEFAULT 0,

    -- Aggregated text and its search vector. No trigger in this script
    -- maintains fts_vector for this table.
    aggregated_text text,
    fts_vector      tsvector,

    bm25_stats      jsonb,

    -- Both timestamps default to insertion time. Nothing in this script
    -- refreshes updated_at on UPDATE.
    created_at      timestamptz DEFAULT CURRENT_TIMESTAMP,
    updated_at      timestamptz DEFAULT CURRENT_TIMESTAMP
);
|
|
|
|
-- Secondary indexes for directory_index.
-- NOTE(review): dir_path is declared UNIQUE in the table definition, which
-- in PostgreSQL is already backed by a unique index; idx_dir_index_path
-- looks redundant. Confirm before dropping it.
CREATE INDEX IF NOT EXISTS idx_dir_index_path
    ON directory_index (dir_path);

CREATE INDEX IF NOT EXISTS idx_dir_index_disk
    ON directory_index (disk_label);

-- GIN index over the search vector column.
CREATE INDEX IF NOT EXISTS idx_dir_index_fts
    ON directory_index
    USING gin (fts_vector);
|
|
|
|
-- Catalog descriptions (visible via \d+ in psql) for the tables and columns
-- defined in this script.
COMMENT ON TABLE content_chunks
    IS 'Structured chunks for BM25 and vector search';

COMMENT ON TABLE directory_index
    IS 'Aggregated directory-level BM25 index';

COMMENT ON COLUMN content_chunks.chunk_id
    IS 'Stable hash-based chunk identifier';

COMMENT ON COLUMN content_chunks.offset_start
    IS 'Character offset or paragraph index';

COMMENT ON COLUMN content_chunks.section_title
    IS 'Heading/function name for structured chunks';
|