CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; CREATE EXTENSION IF NOT EXISTS pg_trgm; DROP TABLE IF EXISTS chunks CASCADE; DROP TABLE IF EXISTS documents CASCADE; DROP INDEX IF EXISTS idx_chunks_embedding; DROP INDEX IF EXISTS idx_chunks_document_id; DROP INDEX IF EXISTS idx_documents_metadata; DROP INDEX IF EXISTS idx_chunks_content_trgm; CREATE TABLE documents ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), title TEXT NOT NULL, source TEXT NOT NULL, content TEXT NOT NULL, metadata JSONB DEFAULT '{}', created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_documents_metadata ON documents USING GIN (metadata); CREATE INDEX idx_documents_created_at ON documents (created_at DESC); CREATE TABLE chunks ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE, content TEXT NOT NULL, embedding vector(1536), chunk_index INTEGER NOT NULL, metadata JSONB DEFAULT '{}', token_count INTEGER, created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP ); CREATE INDEX idx_chunks_embedding ON chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 1); CREATE INDEX idx_chunks_document_id ON chunks (document_id); CREATE INDEX idx_chunks_chunk_index ON chunks (document_id, chunk_index); CREATE INDEX idx_chunks_content_trgm ON chunks USING GIN (content gin_trgm_ops); CREATE OR REPLACE FUNCTION match_chunks( query_embedding vector(1536), match_count INT DEFAULT 10 ) RETURNS TABLE ( chunk_id UUID, document_id UUID, content TEXT, similarity FLOAT, metadata JSONB, document_title TEXT, document_source TEXT ) LANGUAGE plpgsql AS $$ BEGIN RETURN QUERY SELECT c.id AS chunk_id, c.document_id, c.content, 1 - (c.embedding <=> query_embedding) AS similarity, c.metadata, d.title AS document_title, d.source AS document_source FROM chunks c JOIN documents d ON c.document_id = d.id WHERE c.embedding IS NOT NULL ORDER BY c.embedding <=> query_embedding LIMIT match_count; END; $$; CREATE OR REPLACE FUNCTION hybrid_search( query_embedding vector(1536), query_text TEXT, match_count INT DEFAULT 10, text_weight FLOAT DEFAULT 0.3 ) RETURNS TABLE ( chunk_id UUID, document_id UUID, content TEXT, combined_score FLOAT, vector_similarity FLOAT, text_similarity FLOAT, metadata JSONB, document_title TEXT, document_source TEXT ) LANGUAGE plpgsql AS $$ BEGIN RETURN QUERY WITH vector_results AS ( SELECT c.id AS chunk_id, c.document_id, c.content, 1 - (c.embedding <=> query_embedding) AS vector_sim, c.metadata, d.title AS doc_title, d.source AS doc_source FROM chunks c JOIN documents d ON c.document_id = d.id WHERE c.embedding IS NOT NULL ), text_results AS ( SELECT c.id AS chunk_id, c.document_id, c.content, ts_rank_cd(to_tsvector('english', c.content), plainto_tsquery('english', query_text)) AS text_sim, c.metadata, d.title AS doc_title, d.source AS doc_source FROM chunks c JOIN documents d ON c.document_id = d.id WHERE to_tsvector('english', c.content) @@ plainto_tsquery('english', query_text) ) SELECT COALESCE(v.chunk_id, t.chunk_id) AS chunk_id, COALESCE(v.document_id, t.document_id) AS document_id, COALESCE(v.content, t.content) AS content, (COALESCE(v.vector_sim, 0) * (1 - text_weight) + COALESCE(t.text_sim, 0) * text_weight)::float8 AS combined_score, COALESCE(v.vector_sim, 0)::float8 AS vector_similarity, COALESCE(t.text_sim, 0)::float8 AS text_similarity, COALESCE(v.metadata, t.metadata) AS metadata, COALESCE(v.doc_title, t.doc_title) AS document_title, COALESCE(v.doc_source, t.doc_source) AS document_source FROM vector_results v FULL OUTER JOIN text_results t ON v.chunk_id = t.chunk_id ORDER BY combined_score DESC LIMIT match_count; END; $$; CREATE OR REPLACE FUNCTION get_document_chunks(doc_id UUID) RETURNS TABLE ( chunk_id UUID, content TEXT, chunk_index INTEGER, metadata JSONB ) LANGUAGE plpgsql AS $$ BEGIN RETURN QUERY SELECT id AS chunk_id, chunks.content, chunks.chunk_index, chunks.metadata FROM chunks WHERE document_id = doc_id ORDER BY chunk_index; END; $$; CREATE OR REPLACE FUNCTION update_updated_at_column() RETURNS TRIGGER AS $$ BEGIN NEW.updated_at = CURRENT_TIMESTAMP; RETURN NEW; END; $$ LANGUAGE plpgsql; CREATE TRIGGER update_documents_updated_at BEFORE UPDATE ON documents FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();