mirror of
https://github.com/coleam00/context-engineering-intro.git
synced 2025-12-18 02:05:25 +00:00
170 lines
4.9 KiB
MySQL
170 lines
4.9 KiB
MySQL
|
|
CREATE EXTENSION IF NOT EXISTS vector;
|
||
|
|
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
|
||
|
|
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||
|
|
|
||
|
|
DROP TABLE IF EXISTS chunks CASCADE;
|
||
|
|
DROP TABLE IF EXISTS documents CASCADE;
|
||
|
|
DROP INDEX IF EXISTS idx_chunks_embedding;
|
||
|
|
DROP INDEX IF EXISTS idx_chunks_document_id;
|
||
|
|
DROP INDEX IF EXISTS idx_documents_metadata;
|
||
|
|
DROP INDEX IF EXISTS idx_chunks_content_trgm;
|
||
|
|
|
||
|
|
CREATE TABLE documents (
|
||
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||
|
|
title TEXT NOT NULL,
|
||
|
|
source TEXT NOT NULL,
|
||
|
|
content TEXT NOT NULL,
|
||
|
|
metadata JSONB DEFAULT '{}',
|
||
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||
|
|
updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
||
|
|
);
|
||
|
|
|
||
|
|
CREATE INDEX idx_documents_metadata ON documents USING GIN (metadata);
|
||
|
|
CREATE INDEX idx_documents_created_at ON documents (created_at DESC);
|
||
|
|
|
||
|
|
CREATE TABLE chunks (
|
||
|
|
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
|
||
|
|
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
|
||
|
|
content TEXT NOT NULL,
|
||
|
|
embedding vector(1536),
|
||
|
|
chunk_index INTEGER NOT NULL,
|
||
|
|
metadata JSONB DEFAULT '{}',
|
||
|
|
token_count INTEGER,
|
||
|
|
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
||
|
|
);
|
||
|
|
|
||
|
|
CREATE INDEX idx_chunks_embedding ON chunks USING ivfflat (embedding vector_cosine_ops) WITH (lists = 1);
|
||
|
|
CREATE INDEX idx_chunks_document_id ON chunks (document_id);
|
||
|
|
CREATE INDEX idx_chunks_chunk_index ON chunks (document_id, chunk_index);
|
||
|
|
CREATE INDEX idx_chunks_content_trgm ON chunks USING GIN (content gin_trgm_ops);
|
||
|
|
|
||
|
|
CREATE OR REPLACE FUNCTION match_chunks(
|
||
|
|
query_embedding vector(1536),
|
||
|
|
match_count INT DEFAULT 10
|
||
|
|
)
|
||
|
|
RETURNS TABLE (
|
||
|
|
chunk_id UUID,
|
||
|
|
document_id UUID,
|
||
|
|
content TEXT,
|
||
|
|
similarity FLOAT,
|
||
|
|
metadata JSONB,
|
||
|
|
document_title TEXT,
|
||
|
|
document_source TEXT
|
||
|
|
)
|
||
|
|
LANGUAGE plpgsql
|
||
|
|
AS $$
|
||
|
|
BEGIN
|
||
|
|
RETURN QUERY
|
||
|
|
SELECT
|
||
|
|
c.id AS chunk_id,
|
||
|
|
c.document_id,
|
||
|
|
c.content,
|
||
|
|
1 - (c.embedding <=> query_embedding) AS similarity,
|
||
|
|
c.metadata,
|
||
|
|
d.title AS document_title,
|
||
|
|
d.source AS document_source
|
||
|
|
FROM chunks c
|
||
|
|
JOIN documents d ON c.document_id = d.id
|
||
|
|
WHERE c.embedding IS NOT NULL
|
||
|
|
ORDER BY c.embedding <=> query_embedding
|
||
|
|
LIMIT match_count;
|
||
|
|
END;
|
||
|
|
$$;
|
||
|
|
|
||
|
|
CREATE OR REPLACE FUNCTION hybrid_search(
|
||
|
|
query_embedding vector(1536),
|
||
|
|
query_text TEXT,
|
||
|
|
match_count INT DEFAULT 10,
|
||
|
|
text_weight FLOAT DEFAULT 0.3
|
||
|
|
)
|
||
|
|
RETURNS TABLE (
|
||
|
|
chunk_id UUID,
|
||
|
|
document_id UUID,
|
||
|
|
content TEXT,
|
||
|
|
combined_score FLOAT,
|
||
|
|
vector_similarity FLOAT,
|
||
|
|
text_similarity FLOAT,
|
||
|
|
metadata JSONB,
|
||
|
|
document_title TEXT,
|
||
|
|
document_source TEXT
|
||
|
|
)
|
||
|
|
LANGUAGE plpgsql
|
||
|
|
AS $$
|
||
|
|
BEGIN
|
||
|
|
RETURN QUERY
|
||
|
|
WITH vector_results AS (
|
||
|
|
SELECT
|
||
|
|
c.id AS chunk_id,
|
||
|
|
c.document_id,
|
||
|
|
c.content,
|
||
|
|
1 - (c.embedding <=> query_embedding) AS vector_sim,
|
||
|
|
c.metadata,
|
||
|
|
d.title AS doc_title,
|
||
|
|
d.source AS doc_source
|
||
|
|
FROM chunks c
|
||
|
|
JOIN documents d ON c.document_id = d.id
|
||
|
|
WHERE c.embedding IS NOT NULL
|
||
|
|
),
|
||
|
|
text_results AS (
|
||
|
|
SELECT
|
||
|
|
c.id AS chunk_id,
|
||
|
|
c.document_id,
|
||
|
|
c.content,
|
||
|
|
ts_rank_cd(to_tsvector('english', c.content), plainto_tsquery('english', query_text)) AS text_sim,
|
||
|
|
c.metadata,
|
||
|
|
d.title AS doc_title,
|
||
|
|
d.source AS doc_source
|
||
|
|
FROM chunks c
|
||
|
|
JOIN documents d ON c.document_id = d.id
|
||
|
|
WHERE to_tsvector('english', c.content) @@ plainto_tsquery('english', query_text)
|
||
|
|
)
|
||
|
|
SELECT
|
||
|
|
COALESCE(v.chunk_id, t.chunk_id) AS chunk_id,
|
||
|
|
COALESCE(v.document_id, t.document_id) AS document_id,
|
||
|
|
COALESCE(v.content, t.content) AS content,
|
||
|
|
(COALESCE(v.vector_sim, 0) * (1 - text_weight) + COALESCE(t.text_sim, 0) * text_weight)::float8 AS combined_score,
|
||
|
|
COALESCE(v.vector_sim, 0)::float8 AS vector_similarity,
|
||
|
|
COALESCE(t.text_sim, 0)::float8 AS text_similarity,
|
||
|
|
COALESCE(v.metadata, t.metadata) AS metadata,
|
||
|
|
COALESCE(v.doc_title, t.doc_title) AS document_title,
|
||
|
|
COALESCE(v.doc_source, t.doc_source) AS document_source
|
||
|
|
FROM vector_results v
|
||
|
|
FULL OUTER JOIN text_results t ON v.chunk_id = t.chunk_id
|
||
|
|
ORDER BY combined_score DESC
|
||
|
|
LIMIT match_count;
|
||
|
|
END;
|
||
|
|
$$;
|
||
|
|
|
||
|
|
CREATE OR REPLACE FUNCTION get_document_chunks(doc_id UUID)
|
||
|
|
RETURNS TABLE (
|
||
|
|
chunk_id UUID,
|
||
|
|
content TEXT,
|
||
|
|
chunk_index INTEGER,
|
||
|
|
metadata JSONB
|
||
|
|
)
|
||
|
|
LANGUAGE plpgsql
|
||
|
|
AS $$
|
||
|
|
BEGIN
|
||
|
|
RETURN QUERY
|
||
|
|
SELECT
|
||
|
|
id AS chunk_id,
|
||
|
|
chunks.content,
|
||
|
|
chunks.chunk_index,
|
||
|
|
chunks.metadata
|
||
|
|
FROM chunks
|
||
|
|
WHERE document_id = doc_id
|
||
|
|
ORDER BY chunk_index;
|
||
|
|
END;
|
||
|
|
$$;
|
||
|
|
|
||
|
|
CREATE OR REPLACE FUNCTION update_updated_at_column()
|
||
|
|
RETURNS TRIGGER AS $$
|
||
|
|
BEGIN
|
||
|
|
NEW.updated_at = CURRENT_TIMESTAMP;
|
||
|
|
RETURN NEW;
|
||
|
|
END;
|
||
|
|
$$ LANGUAGE plpgsql;
|
||
|
|
|
||
|
|
CREATE TRIGGER update_documents_updated_at BEFORE UPDATE ON documents
|
||
|
|
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
|
||
|
|
|