AI Agent Factory with Claude Code Subagents

This commit is contained in:
Cole Medin
2025-08-22 21:01:17 -05:00
parent 4e1240a0b3
commit 8d9f46ecfa
104 changed files with 24521 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
"""Ingestion package for processing documents into vector DB and knowledge graph."""
__version__ = "0.1.0"

View File

@@ -0,0 +1,518 @@
"""
Semantic chunking implementation for intelligent document splitting.
"""
import os
import re
import logging
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import asyncio
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
logger = logging.getLogger(__name__)
# Import flexible providers
try:
from ..utils.providers import get_embedding_client, get_ingestion_model
except ImportError:
# For direct execution or testing
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.providers import get_embedding_client, get_ingestion_model
# Initialize clients with flexible providers
embedding_client = get_embedding_client()
ingestion_model = get_ingestion_model()
@dataclass
class ChunkingConfig:
    """Configuration for chunking.

    Validated in __post_init__ so that an inconsistent configuration fails
    fast instead of causing subtle splitting bugs (e.g. a non-advancing
    overlap window).
    """
    chunk_size: int = 1000      # target size of a chunk, in characters
    chunk_overlap: int = 200    # characters shared between adjacent chunks
    max_chunk_size: int = 2000  # hard upper bound for any single chunk
    min_chunk_size: int = 100   # chunks smaller than this are discarded
    use_semantic_splitting: bool = True  # LLM-assisted splitting on/off
    preserve_structure: bool = True      # keep structural boundaries intact

    def __post_init__(self):
        """Validate configuration; raise ValueError on inconsistent values."""
        if self.chunk_overlap < 0:
            raise ValueError("Chunk overlap must be non-negative")
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError("Chunk overlap must be less than chunk size")
        if self.min_chunk_size <= 0:
            raise ValueError("Minimum chunk size must be positive")
        if self.max_chunk_size < self.chunk_size:
            raise ValueError("Maximum chunk size must be at least chunk size")
@dataclass
class DocumentChunk:
    """A contiguous slice of a source document plus bookkeeping metadata."""
    content: str
    index: int
    start_char: int
    end_char: int
    metadata: Dict[str, Any]
    token_count: Optional[int] = None

    def __post_init__(self):
        """Fill in an estimated token count when none was supplied."""
        if self.token_count is not None:
            return
        # Heuristic: roughly 4 characters per token.
        self.token_count = len(self.content) // 4
class SemanticChunker:
    """Semantic document chunker using LLM for intelligent splitting."""

    def __init__(self, config: "ChunkingConfig"):
        """
        Initialize chunker.

        Args:
            config: Chunking configuration
        """
        self.config = config
        # Module-level clients are resolved once at import time via the
        # flexible provider layer.
        self.client = embedding_client
        self.model = ingestion_model

    async def chunk_document(
        self,
        content: str,
        title: str,
        source: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List["DocumentChunk"]:
        """
        Chunk a document into semantically coherent pieces.

        Tries LLM-assisted semantic chunking first (when enabled and the
        content spans more than one chunk); falls back to rule-based
        splitting on any failure.

        Args:
            content: Document content
            title: Document title
            source: Document source
            metadata: Additional metadata merged into every chunk

        Returns:
            List of document chunks (empty for blank content)
        """
        if not content.strip():
            return []

        base_metadata = {
            "title": title,
            "source": source,
            **(metadata or {})
        }

        # Semantic chunking only pays off when the content is larger than
        # a single chunk.
        if self.config.use_semantic_splitting and len(content) > self.config.chunk_size:
            try:
                semantic_chunks = await self._semantic_chunk(content)
                if semantic_chunks:
                    return self._create_chunk_objects(
                        semantic_chunks,
                        content,
                        base_metadata
                    )
            except Exception as e:
                logger.warning(f"Semantic chunking failed, falling back to simple chunking: {e}")

        # Fallback to rule-based chunking
        return self._simple_chunk(content, base_metadata)

    async def _semantic_chunk(self, content: str) -> List[str]:
        """
        Perform semantic chunking: greedy grouping of structural sections.

        Sections that alone exceed max_chunk_size are handed to the LLM
        for semantic sub-splitting.

        Args:
            content: Content to chunk

        Returns:
            List of chunk texts, each at least min_chunk_size characters
        """
        # First, split on natural (structural) boundaries.
        sections = self._split_on_structure(content)

        # Group sections greedily into chunks no larger than chunk_size.
        chunks = []
        current_chunk = ""

        for section in sections:
            potential_chunk = current_chunk + "\n\n" + section if current_chunk else section

            if len(potential_chunk) <= self.config.chunk_size:
                current_chunk = potential_chunk
            else:
                # Flush the chunk built so far.
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # Oversized sections get semantic sub-splitting; normal ones
                # start the next chunk.
                if len(section) > self.config.max_chunk_size:
                    sub_chunks = await self._split_long_section(section)
                    chunks.extend(sub_chunks)
                else:
                    current_chunk = section

        # Add the last chunk
        if current_chunk:
            chunks.append(current_chunk.strip())

        # Drop fragments below the configured minimum size.
        return [chunk for chunk in chunks if len(chunk.strip()) >= self.config.min_chunk_size]

    def _split_on_structure(self, content: str) -> List[str]:
        """
        Split content on structural boundaries, keeping the separators.

        Args:
            content: Content to split

        Returns:
            List of non-blank sections
        """
        # Markdown headers, paragraph breaks, lists, code blocks, tables.
        patterns = [
            r'\n#{1,6}\s+.+?\n',   # Markdown headers
            r'\n\n+',              # Multiple newlines (paragraph breaks)
            r'\n[-*+]\s+',         # List items
            r'\n\d+\.\s+',         # Numbered lists
            r'\n```.*?```\n',      # Code blocks
            r'\n\|\s*.+?\|\s*\n',  # Tables
        ]

        # Apply each pattern in turn; the capturing group keeps separators
        # as their own sections.
        sections = [content]
        for pattern in patterns:
            new_sections = []
            for section in sections:
                parts = re.split(f'({pattern})', section, flags=re.MULTILINE | re.DOTALL)
                new_sections.extend([part for part in parts if part.strip()])
            sections = new_sections

        return sections

    async def _split_long_section(self, section: str) -> List[str]:
        """
        Split a long section using the LLM to find semantic boundaries.

        Falls back to simple character splitting if the LLM call fails or
        returns no usable chunks.

        Args:
            section: Section to split

        Returns:
            List of sub-chunks
        """
        try:
            prompt = f"""
Split the following text into semantically coherent chunks. Each chunk should:
1. Be roughly {self.config.chunk_size} characters long
2. End at natural semantic boundaries
3. Maintain context and readability
4. Not exceed {self.config.max_chunk_size} characters

Return only the split text with "---CHUNK---" as separator between chunks.

Text to split:
{section}
"""

            # Use Pydantic AI for LLM calls
            from pydantic_ai import Agent
            temp_agent = Agent(self.model)
            response = await temp_agent.run(prompt)
            # NOTE(review): `.data` is the result accessor on older pydantic_ai
            # versions; newer releases expose `.output` — confirm against the
            # pinned dependency.
            result = response.data

            chunks = [chunk.strip() for chunk in result.split("---CHUNK---")]

            # Keep only chunks inside the configured size bounds.
            valid_chunks = []
            for chunk in chunks:
                if self.config.min_chunk_size <= len(chunk) <= self.config.max_chunk_size:
                    valid_chunks.append(chunk)

            return valid_chunks if valid_chunks else self._simple_split(section)

        except Exception as e:
            logger.error(f"LLM chunking failed: {e}")
            return self._simple_split(section)

    def _simple_split(self, text: str) -> List[str]:
        """
        Simple character-window splitting as a fallback.

        Prefers ending each chunk at a sentence boundary found within the
        trailing 200 characters of the window.

        Args:
            text: Text to split

        Returns:
            List of chunks
        """
        chunks = []
        start = 0

        while start < len(text):
            end = start + self.config.chunk_size

            if end >= len(text):
                # Last chunk
                chunks.append(text[start:])
                break

            # Try to end at a sentence boundary near the window's end.
            chunk_end = end
            for i in range(end, max(start + self.config.min_chunk_size, end - 200), -1):
                if text[i] in '.!?\n':
                    chunk_end = i + 1
                    break

            chunks.append(text[start:chunk_end])
            # Back up by the configured overlap, but always advance by at
            # least one character: the previous implementation could loop
            # forever when the overlap reached back past the boundary-adjusted
            # chunk start.
            start = max(chunk_end - self.config.chunk_overlap, start + 1)

        return chunks

    def _simple_chunk(
        self,
        content: str,
        base_metadata: Dict[str, Any]
    ) -> List["DocumentChunk"]:
        """
        Simple rule-based chunking.

        Args:
            content: Content to chunk
            base_metadata: Base metadata for chunks

        Returns:
            List of document chunks
        """
        chunks = self._simple_split(content)
        return self._create_chunk_objects(chunks, content, base_metadata)

    def _create_chunk_objects(
        self,
        chunks: List[str],
        original_content: str,
        base_metadata: Dict[str, Any]
    ) -> List["DocumentChunk"]:
        """
        Create DocumentChunk objects from text chunks.

        Args:
            chunks: List of chunk texts
            original_content: Original document content
            base_metadata: Base metadata

        Returns:
            List of DocumentChunk objects
        """
        chunk_objects = []
        current_pos = 0

        for i, chunk_text in enumerate(chunks):
            # Find the position of this chunk in the original content.
            start_pos = original_content.find(chunk_text, current_pos)
            if start_pos == -1:
                # Fallback: estimate position (chunk text may have been
                # re-written by the LLM and not appear verbatim).
                start_pos = current_pos

            end_pos = start_pos + len(chunk_text)

            chunk_metadata = {
                **base_metadata,
                "chunk_method": "semantic" if self.config.use_semantic_splitting else "simple",
                "total_chunks": len(chunks)
            }

            chunk_objects.append(DocumentChunk(
                content=chunk_text.strip(),
                index=i,
                start_char=start_pos,
                end_char=end_pos,
                metadata=chunk_metadata
            ))

            current_pos = end_pos

        return chunk_objects
class SimpleChunker:
    """Simple non-semantic chunker for faster processing."""

    def __init__(self, config: "ChunkingConfig"):
        """Initialize simple chunker."""
        self.config = config

    def chunk_document(
        self,
        content: str,
        title: str,
        source: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> List["DocumentChunk"]:
        """
        Chunk document using simple paragraph-based rules.

        Paragraphs are greedily packed into chunks of at most
        ``config.chunk_size`` characters. NOTE: a single paragraph longer
        than ``chunk_size`` is emitted as one oversized chunk rather than
        split further.

        Args:
            content: Document content
            title: Document title
            source: Document source
            metadata: Additional metadata merged into every chunk

        Returns:
            List of document chunks (empty for blank content)
        """
        if not content.strip():
            return []

        base_metadata = {
            "title": title,
            "source": source,
            "chunk_method": "simple",
            **(metadata or {})
        }

        # Split on paragraph breaks (blank lines).
        paragraphs = re.split(r'\n\s*\n', content)

        chunks = []
        current_chunk = ""
        current_start = 0  # char offset of the current chunk's first paragraph
        search_pos = 0     # cursor for locating paragraphs in `content`
        chunk_index = 0

        for paragraph in paragraphs:
            paragraph = paragraph.strip()
            if not paragraph:
                continue

            # Locate this paragraph in the original content so start/end
            # offsets refer to real character positions. (The previous
            # implementation accumulated offsets from chunk lengths, which
            # drifted away from the actual positions.)
            para_pos = content.find(paragraph, search_pos)
            if para_pos == -1:
                # Should not happen since paragraphs come from `content`;
                # fall back to the cursor position.
                para_pos = search_pos
            search_pos = para_pos + len(paragraph)

            # Greedily extend the current chunk while it fits the budget.
            potential_chunk = current_chunk + "\n\n" + paragraph if current_chunk else paragraph

            if len(potential_chunk) <= self.config.chunk_size:
                if not current_chunk:
                    current_start = para_pos
                current_chunk = potential_chunk
            else:
                # Flush the full chunk, then start a new one here.
                if current_chunk:
                    chunks.append(self._create_chunk(
                        current_chunk,
                        chunk_index,
                        current_start,
                        current_start + len(current_chunk),
                        base_metadata.copy()
                    ))
                    chunk_index += 1

                current_chunk = paragraph
                current_start = para_pos

        # Add final chunk
        if current_chunk:
            chunks.append(self._create_chunk(
                current_chunk,
                chunk_index,
                current_start,
                current_start + len(current_chunk),
                base_metadata.copy()
            ))

        # Update total chunks in metadata now that the count is known.
        for chunk in chunks:
            chunk.metadata["total_chunks"] = len(chunks)

        return chunks

    def _create_chunk(
        self,
        content: str,
        index: int,
        start_pos: int,
        end_pos: int,
        metadata: Dict[str, Any]
    ) -> "DocumentChunk":
        """Create a DocumentChunk object."""
        return DocumentChunk(
            content=content.strip(),
            index=index,
            start_char=start_pos,
            end_char=end_pos,
            metadata=metadata
        )
# Factory function
def create_chunker(config: ChunkingConfig):
    """
    Create appropriate chunker based on configuration.

    Args:
        config: Chunking configuration

    Returns:
        Chunker instance: a SemanticChunker when semantic splitting is
        enabled, otherwise a SimpleChunker.
    """
    chunker_cls = SemanticChunker if config.use_semantic_splitting else SimpleChunker
    return chunker_cls(config)
# Example usage
async def main():
    """Example usage of the chunker.

    Builds a semantic-splitting configuration, chunks a small markdown
    sample, and prints a short summary of every resulting chunk.
    """
    # NOTE(review): with use_semantic_splitting=True this example needs the
    # provider-backed LLM client configured; on failure chunk_document falls
    # back to the rule-based splitter.
    config = ChunkingConfig(
        chunk_size=500,
        chunk_overlap=50,
        use_semantic_splitting=True
    )

    chunker = create_chunker(config)

    sample_text = """
    # Big Tech AI Initiatives

    ## Google's AI Strategy
    Google has been investing heavily in artificial intelligence research and development.
    Their main focus areas include:

    - Large language models (LaMDA, PaLM, Gemini)
    - Computer vision and image recognition
    - Natural language processing
    - AI-powered search improvements

    The company's DeepMind division continues to push the boundaries of AI research,
    with breakthrough achievements in protein folding prediction and game playing.

    ## Microsoft's Partnership with OpenAI
    Microsoft's strategic partnership with OpenAI has positioned them as a leader
    in the generative AI space. Key developments include:

    1. Integration of GPT models into Office 365
    2. Azure OpenAI Service for enterprise customers
    3. Investment in OpenAI's continued research
    """

    chunks = await chunker.chunk_document(
        content=sample_text,
        title="Big Tech AI Report",
        source="example.md"
    )

    # Print a short preview of each chunk for eyeballing the split quality.
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: {len(chunk.content)} chars")
        print(f"Content: {chunk.content[:100]}...")
        print(f"Metadata: {chunk.metadata}")
        print("---")

if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,418 @@
"""
Document embedding generation for vector search.
"""
import os
import asyncio
import logging
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import json
from openai import RateLimitError, APIError
from dotenv import load_dotenv
from .chunker import DocumentChunk
# Import flexible providers
try:
from ..utils.providers import get_embedding_client, get_embedding_model
except ImportError:
# For direct execution or testing
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.providers import get_embedding_client, get_embedding_model
# Load environment variables
load_dotenv()
logger = logging.getLogger(__name__)
# Initialize client with flexible provider
embedding_client = get_embedding_client()
EMBEDDING_MODEL = get_embedding_model()
class EmbeddingGenerator:
    """Generates embeddings for document chunks via the configured provider."""

    def __init__(
        self,
        model: Optional[str] = None,
        batch_size: int = 100,
        max_retries: int = 3,
        retry_delay: float = 1.0
    ):
        """
        Initialize embedding generator.

        Args:
            model: Embedding model to use. Defaults to the provider-configured
                EMBEDDING_MODEL; resolved here at call time rather than being
                baked in as a def-time default, so provider changes after
                import are picked up.
            batch_size: Number of texts to process per API request
            max_retries: Maximum number of retry attempts
            retry_delay: Base delay between retries in seconds
        """
        if model is None:
            model = EMBEDDING_MODEL
        self.model = model
        self.batch_size = batch_size
        self.max_retries = max_retries
        self.retry_delay = retry_delay

        # Model-specific configurations (output dimensions, input token cap).
        self.model_configs = {
            "text-embedding-3-small": {"dimensions": 1536, "max_tokens": 8191},
            "text-embedding-3-large": {"dimensions": 3072, "max_tokens": 8191},
            "text-embedding-ada-002": {"dimensions": 1536, "max_tokens": 8191}
        }

        if model not in self.model_configs:
            logger.warning(f"Unknown model {model}, using default config")
            self.config = {"dimensions": 1536, "max_tokens": 8191}
        else:
            self.config = self.model_configs[model]

    async def generate_embedding(self, text: str) -> List[float]:
        """
        Generate embedding for a single text.

        Retries on rate limits (exponential backoff) and transient API errors.

        Args:
            text: Text to embed

        Returns:
            Embedding vector

        Raises:
            RateLimitError/APIError/Exception: the last provider error when
                all retries are exhausted.
        """
        # Truncate text if too long (rough estimate of ~4 chars per token).
        if len(text) > self.config["max_tokens"] * 4:
            text = text[:self.config["max_tokens"] * 4]

        for attempt in range(self.max_retries):
            try:
                response = await embedding_client.embeddings.create(
                    model=self.model,
                    input=text
                )
                return response.data[0].embedding

            except RateLimitError:
                if attempt == self.max_retries - 1:
                    raise
                # Exponential backoff for rate limits
                delay = self.retry_delay * (2 ** attempt)
                logger.warning(f"Rate limit hit, retrying in {delay}s")
                await asyncio.sleep(delay)

            except APIError as e:
                logger.error(f"OpenAI API error: {e}")
                if attempt == self.max_retries - 1:
                    raise
                await asyncio.sleep(self.retry_delay)

            except Exception as e:
                logger.error(f"Unexpected error generating embedding: {e}")
                if attempt == self.max_retries - 1:
                    raise
                await asyncio.sleep(self.retry_delay)

    async def generate_embeddings_batch(
        self,
        texts: List[str]
    ) -> List[List[float]]:
        """
        Generate embeddings for a batch of texts.

        Empty texts are embedded as "" placeholders; over-long texts are
        truncated. Falls back to per-text processing if batch retries fail.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors (same order as `texts`)
        """
        # Filter and truncate texts
        processed_texts = []
        for text in texts:
            if not text or not text.strip():
                processed_texts.append("")
                continue

            # Truncate if too long
            if len(text) > self.config["max_tokens"] * 4:
                text = text[:self.config["max_tokens"] * 4]

            processed_texts.append(text)

        for attempt in range(self.max_retries):
            try:
                response = await embedding_client.embeddings.create(
                    model=self.model,
                    input=processed_texts
                )
                return [data.embedding for data in response.data]

            except RateLimitError:
                if attempt == self.max_retries - 1:
                    raise
                delay = self.retry_delay * (2 ** attempt)
                logger.warning(f"Rate limit hit, retrying batch in {delay}s")
                await asyncio.sleep(delay)

            except APIError as e:
                logger.error(f"OpenAI API error in batch: {e}")
                if attempt == self.max_retries - 1:
                    # Fallback to individual processing
                    return await self._process_individually(processed_texts)
                await asyncio.sleep(self.retry_delay)

            except Exception as e:
                logger.error(f"Unexpected error in batch embedding: {e}")
                if attempt == self.max_retries - 1:
                    return await self._process_individually(processed_texts)
                await asyncio.sleep(self.retry_delay)

    async def _process_individually(
        self,
        texts: List[str]
    ) -> List[List[float]]:
        """
        Process texts individually as a fallback after batch failures.

        Texts that still fail (or are empty) get a zero vector so the result
        stays aligned with the input order.

        Args:
            texts: List of texts to embed

        Returns:
            List of embedding vectors
        """
        embeddings = []

        for text in texts:
            try:
                if not text or not text.strip():
                    embeddings.append([0.0] * self.config["dimensions"])
                    continue

                embedding = await self.generate_embedding(text)
                embeddings.append(embedding)

                # Small delay to avoid overwhelming the API
                await asyncio.sleep(0.1)

            except Exception as e:
                logger.error(f"Failed to embed text: {e}")
                # Use zero vector as fallback
                embeddings.append([0.0] * self.config["dimensions"])

        return embeddings

    async def embed_chunks(
        self,
        chunks: List["DocumentChunk"],
        progress_callback: Optional[callable] = None
    ) -> List["DocumentChunk"]:
        """
        Generate embeddings for document chunks.

        Successful chunks are rebuilt as new DocumentChunk objects with
        embedding metadata; a failed batch keeps the original chunk objects,
        tagged with the error and a zero-vector embedding.

        Args:
            chunks: List of document chunks
            progress_callback: Optional callable(current_batch, total_batches)

        Returns:
            Chunks with an `.embedding` attribute added
        """
        if not chunks:
            return chunks

        logger.info(f"Generating embeddings for {len(chunks)} chunks")

        # Process chunks in batches
        embedded_chunks = []
        total_batches = (len(chunks) + self.batch_size - 1) // self.batch_size

        for i in range(0, len(chunks), self.batch_size):
            batch_chunks = chunks[i:i + self.batch_size]
            batch_texts = [chunk.content for chunk in batch_chunks]

            try:
                # Generate embeddings for this batch
                embeddings = await self.generate_embeddings_batch(batch_texts)

                # Add embeddings to chunks
                for chunk, embedding in zip(batch_chunks, embeddings):
                    # Create a new chunk with embedding metadata
                    embedded_chunk = DocumentChunk(
                        content=chunk.content,
                        index=chunk.index,
                        start_char=chunk.start_char,
                        end_char=chunk.end_char,
                        metadata={
                            **chunk.metadata,
                            "embedding_model": self.model,
                            "embedding_generated_at": datetime.now().isoformat()
                        },
                        token_count=chunk.token_count
                    )
                    # Embedding is carried as a plain attribute (DocumentChunk
                    # is a non-slots dataclass).
                    embedded_chunk.embedding = embedding
                    embedded_chunks.append(embedded_chunk)

                # Progress update
                current_batch = (i // self.batch_size) + 1
                if progress_callback:
                    progress_callback(current_batch, total_batches)

                logger.info(f"Processed batch {current_batch}/{total_batches}")

            except Exception as e:
                logger.error(f"Failed to process batch {i//self.batch_size + 1}: {e}")

                # Add chunks without real embeddings as fallback so the
                # pipeline can continue; the error is recorded in metadata.
                for chunk in batch_chunks:
                    chunk.metadata.update({
                        "embedding_error": str(e),
                        "embedding_generated_at": datetime.now().isoformat()
                    })
                    chunk.embedding = [0.0] * self.config["dimensions"]
                    embedded_chunks.append(chunk)

        logger.info(f"Generated embeddings for {len(embedded_chunks)} chunks")
        return embedded_chunks

    async def embed_query(self, query: str) -> List[float]:
        """
        Generate embedding for a search query.

        Args:
            query: Search query

        Returns:
            Query embedding
        """
        return await self.generate_embedding(query)

    def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings for this model."""
        return self.config["dimensions"]
# Cache for embeddings
class EmbeddingCache:
    """Simple in-memory cache for embeddings, keyed by an MD5 of the text.

    When full, the entry with the oldest access time is evicted.
    """

    def __init__(self, max_size: int = 1000):
        """Initialize an empty cache bounded at `max_size` entries."""
        self.cache: Dict[str, List[float]] = {}
        self.access_times: Dict[str, datetime] = {}
        self.max_size = max_size

    def get(self, text: str) -> Optional[List[float]]:
        """Return the cached embedding for `text`, or None on a miss."""
        key = self._hash_text(text)
        if key not in self.cache:
            return None
        # Refresh the access time so the entry survives eviction longer.
        self.access_times[key] = datetime.now()
        return self.cache[key]

    def put(self, text: str, embedding: List[float]):
        """Store `embedding` under the hash of `text`, evicting if full."""
        key = self._hash_text(text)
        if len(self.cache) >= self.max_size:
            stalest = min(self.access_times, key=self.access_times.get)
            del self.cache[stalest]
            del self.access_times[stalest]
        self.cache[key] = embedding
        self.access_times[key] = datetime.now()

    def _hash_text(self, text: str) -> str:
        """Hash the text to a fixed-size cache key (non-cryptographic use)."""
        import hashlib
        return hashlib.md5(text.encode()).hexdigest()
# Factory function
def create_embedder(
    model: str = EMBEDDING_MODEL,
    use_cache: bool = True,
    **kwargs
) -> EmbeddingGenerator:
    """
    Create embedding generator with optional caching.

    Args:
        model: Embedding model to use
        use_cache: Whether to wrap single-text embedding in an in-memory cache
        **kwargs: Additional arguments for EmbeddingGenerator

    Returns:
        EmbeddingGenerator instance
    """
    embedder = EmbeddingGenerator(model=model, **kwargs)

    if not use_cache:
        return embedder

    # Wrap generate_embedding with a read-through cache.
    cache = EmbeddingCache()
    uncached_generate = embedder.generate_embedding

    async def generate_with_cache(text: str) -> List[float]:
        hit = cache.get(text)
        if hit is not None:
            return hit
        embedding = await uncached_generate(text)
        cache.put(text, embedding)
        return embedding

    embedder.generate_embedding = generate_with_cache
    return embedder
# Example usage
async def main():
    """Example usage of the embedder.

    Chunks a small sample with the non-semantic chunker, generates
    embeddings for each chunk, and embeds a standalone query.
    """
    from .chunker import ChunkingConfig, create_chunker

    # Create chunker and embedder
    config = ChunkingConfig(chunk_size=200, use_semantic_splitting=False)
    chunker = create_chunker(config)
    embedder = create_embedder()

    sample_text = """
    Google's AI initiatives include advanced language models, computer vision,
    and machine learning research. The company has invested heavily in
    transformer architectures and neural network optimization.

    Microsoft's partnership with OpenAI has led to integration of GPT models
    into various products and services, making AI accessible to enterprise
    customers through Azure cloud services.
    """

    # Chunk the document.
    # NOTE(review): SimpleChunker.chunk_document is synchronous, so no await
    # here — this only holds while use_semantic_splitting stays False above.
    chunks = chunker.chunk_document(
        content=sample_text,
        title="AI Initiatives",
        source="example.md"
    )

    print(f"Created {len(chunks)} chunks")

    # Generate embeddings with a simple progress printout per batch.
    def progress_callback(current, total):
        print(f"Processing batch {current}/{total}")

    embedded_chunks = await embedder.embed_chunks(chunks, progress_callback)

    for i, chunk in enumerate(embedded_chunks):
        print(f"Chunk {i}: {len(chunk.content)} chars, embedding dim: {len(chunk.embedding)}")

    # Test query embedding
    query_embedding = await embedder.embed_query("Google AI research")
    print(f"Query embedding dimension: {len(query_embedding)}")

if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,434 @@
"""
Main ingestion script for processing markdown documents into vector DB and knowledge graph.
"""
import os
import asyncio
import logging
import json
import glob
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import argparse
import asyncpg
from dotenv import load_dotenv
from .chunker import ChunkingConfig, create_chunker, DocumentChunk
from .embedder import create_embedder
# Import utilities
try:
from ..utils.db_utils import initialize_database, close_database, db_pool
from ..utils.models import IngestionConfig, IngestionResult
except ImportError:
# For direct execution or testing
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.db_utils import initialize_database, close_database, db_pool
from utils.models import IngestionConfig, IngestionResult
# Load environment variables
load_dotenv()
logger = logging.getLogger(__name__)
class DocumentIngestionPipeline:
    """Pipeline for ingesting markdown documents into the vector database."""

    def __init__(
        self,
        config: "IngestionConfig",
        documents_folder: str = "documents",
        clean_before_ingest: bool = False
    ):
        """
        Initialize ingestion pipeline.

        Args:
            config: Ingestion configuration
            documents_folder: Folder containing markdown documents
            clean_before_ingest: Whether to clean existing data before ingestion
        """
        self.config = config
        self.documents_folder = documents_folder
        self.clean_before_ingest = clean_before_ingest

        # Initialize components
        self.chunker_config = ChunkingConfig(
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            max_chunk_size=config.max_chunk_size,
            use_semantic_splitting=config.use_semantic_chunking
        )
        self.chunker = create_chunker(self.chunker_config)
        self.embedder = create_embedder()

        self._initialized = False

    async def initialize(self):
        """Initialize database connections (idempotent)."""
        if self._initialized:
            return

        logger.info("Initializing ingestion pipeline...")
        await initialize_database()
        self._initialized = True
        logger.info("Ingestion pipeline initialized")

    async def close(self):
        """Close database connections (no-op when never initialized)."""
        if self._initialized:
            await close_database()
            self._initialized = False

    async def ingest_documents(
        self,
        progress_callback: Optional[callable] = None
    ) -> List["IngestionResult"]:
        """
        Ingest all documents from the documents folder.

        Per-file failures are recorded as error results rather than aborting
        the whole run.

        Args:
            progress_callback: Optional callable(processed_count, total_count)

        Returns:
            List of ingestion results, one per discovered file
        """
        if not self._initialized:
            await self.initialize()

        # Clean existing data if requested
        if self.clean_before_ingest:
            await self._clean_databases()

        # Find all markdown files
        markdown_files = self._find_markdown_files()
        if not markdown_files:
            logger.warning(f"No markdown files found in {self.documents_folder}")
            return []

        logger.info(f"Found {len(markdown_files)} markdown files to process")

        results = []
        for i, file_path in enumerate(markdown_files):
            try:
                logger.info(f"Processing file {i+1}/{len(markdown_files)}: {file_path}")

                result = await self._ingest_single_document(file_path)
                results.append(result)

                if progress_callback:
                    progress_callback(i + 1, len(markdown_files))

            except Exception as e:
                logger.error(f"Failed to process {file_path}: {e}")
                # Record the failure and keep going with the remaining files.
                results.append(IngestionResult(
                    document_id="",
                    title=os.path.basename(file_path),
                    chunks_created=0,
                    entities_extracted=0,
                    relationships_created=0,
                    processing_time_ms=0,
                    errors=[str(e)]
                ))

        # Log summary
        total_chunks = sum(r.chunks_created for r in results)
        total_errors = sum(len(r.errors) for r in results)

        logger.info(f"Ingestion complete: {len(results)} documents, {total_chunks} chunks, {total_errors} errors")

        return results

    async def _ingest_single_document(self, file_path: str) -> "IngestionResult":
        """
        Ingest a single document: read, chunk, embed, and persist.

        Args:
            file_path: Path to the document file

        Returns:
            Ingestion result
        """
        start_time = datetime.now()

        # Read document
        document_content = self._read_document(file_path)
        document_title = self._extract_title(document_content, file_path)
        document_source = os.path.relpath(file_path, self.documents_folder)

        # Extract metadata from content
        document_metadata = self._extract_document_metadata(document_content, file_path)

        logger.info(f"Processing document: {document_title}")

        # Chunk the document. SemanticChunker.chunk_document is async while
        # SimpleChunker's is synchronous; support both so that
        # use_semantic_chunking=False does not raise a TypeError on `await`.
        chunk_result = self.chunker.chunk_document(
            content=document_content,
            title=document_title,
            source=document_source,
            metadata=document_metadata
        )
        chunks = await chunk_result if asyncio.iscoroutine(chunk_result) else chunk_result

        if not chunks:
            logger.warning(f"No chunks created for {document_title}")
            return IngestionResult(
                document_id="",
                title=document_title,
                chunks_created=0,
                entities_extracted=0,
                relationships_created=0,
                processing_time_ms=(datetime.now() - start_time).total_seconds() * 1000,
                errors=["No chunks created"]
            )

        logger.info(f"Created {len(chunks)} chunks")

        # Entity extraction removed (graph-related functionality)
        entities_extracted = 0

        # Generate embeddings
        embedded_chunks = await self.embedder.embed_chunks(chunks)
        logger.info(f"Generated embeddings for {len(embedded_chunks)} chunks")

        # Save to PostgreSQL
        document_id = await self._save_to_postgres(
            document_title,
            document_source,
            document_content,
            embedded_chunks,
            document_metadata
        )

        logger.info(f"Saved document to PostgreSQL with ID: {document_id}")

        # Knowledge graph functionality removed
        relationships_created = 0
        graph_errors = []

        # Calculate processing time
        processing_time = (datetime.now() - start_time).total_seconds() * 1000

        return IngestionResult(
            document_id=document_id,
            title=document_title,
            chunks_created=len(chunks),
            entities_extracted=entities_extracted,
            relationships_created=relationships_created,
            processing_time_ms=processing_time,
            errors=graph_errors
        )

    def _find_markdown_files(self) -> List[str]:
        """Find all markdown/text files in the documents folder (recursive)."""
        if not os.path.exists(self.documents_folder):
            logger.error(f"Documents folder not found: {self.documents_folder}")
            return []

        patterns = ["*.md", "*.markdown", "*.txt"]
        files = []

        for pattern in patterns:
            files.extend(glob.glob(os.path.join(self.documents_folder, "**", pattern), recursive=True))

        return sorted(files)

    def _read_document(self, file_path: str) -> str:
        """Read document content, falling back to latin-1 on decode errors."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            # Try with different encoding
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    def _extract_title(self, content: str, file_path: str) -> str:
        """Extract title from the first markdown H1, else from the filename."""
        lines = content.split('\n')
        for line in lines[:10]:  # Check first 10 lines
            line = line.strip()
            if line.startswith('# '):
                return line[2:].strip()

        # Fallback to filename
        return os.path.splitext(os.path.basename(file_path))[0]

    def _extract_document_metadata(self, content: str, file_path: str) -> Dict[str, Any]:
        """Extract metadata: file info, optional YAML frontmatter, basic stats."""
        metadata = {
            "file_path": file_path,
            "file_size": len(content),
            "ingestion_date": datetime.now().isoformat()
        }

        # Try to extract YAML frontmatter (best-effort; PyYAML is optional).
        if content.startswith('---'):
            try:
                import yaml
                end_marker = content.find('\n---\n', 4)
                if end_marker != -1:
                    frontmatter = content[4:end_marker]
                    yaml_metadata = yaml.safe_load(frontmatter)
                    if isinstance(yaml_metadata, dict):
                        metadata.update(yaml_metadata)
            except ImportError:
                logger.warning("PyYAML not installed, skipping frontmatter extraction")
            except Exception as e:
                logger.warning(f"Failed to parse frontmatter: {e}")

        # Extract some basic metadata from content
        lines = content.split('\n')
        metadata['line_count'] = len(lines)
        metadata['word_count'] = len(content.split())

        return metadata

    async def _save_to_postgres(
        self,
        title: str,
        source: str,
        content: str,
        chunks: List["DocumentChunk"],
        metadata: Dict[str, Any]
    ) -> str:
        """Save the document and its chunks to PostgreSQL in one transaction."""
        async with db_pool.acquire() as conn:
            async with conn.transaction():
                # Insert document
                document_result = await conn.fetchrow(
                    """
                    INSERT INTO documents (title, source, content, metadata)
                    VALUES ($1, $2, $3, $4)
                    RETURNING id::text
                    """,
                    title,
                    source,
                    content,
                    json.dumps(metadata)
                )

                document_id = document_result["id"]

                # Insert chunks
                for chunk in chunks:
                    # Convert embedding to PostgreSQL vector string format
                    embedding_data = None
                    if hasattr(chunk, 'embedding') and chunk.embedding:
                        # PostgreSQL vector format: '[1.0,2.0,3.0]' (no spaces after commas)
                        embedding_data = '[' + ','.join(map(str, chunk.embedding)) + ']'

                    await conn.execute(
                        """
                        INSERT INTO chunks (document_id, content, embedding, chunk_index, metadata, token_count)
                        VALUES ($1::uuid, $2, $3::vector, $4, $5, $6)
                        """,
                        document_id,
                        chunk.content,
                        embedding_data,
                        chunk.index,
                        json.dumps(chunk.metadata),
                        chunk.token_count
                    )

                return document_id

    async def _clean_databases(self):
        """Delete all existing chunks and documents (destructive)."""
        logger.warning("Cleaning existing data from databases...")

        # Clean PostgreSQL: chunks first to satisfy the FK to documents.
        async with db_pool.acquire() as conn:
            async with conn.transaction():
                await conn.execute("DELETE FROM chunks")
                await conn.execute("DELETE FROM documents")

        logger.info("Cleaned PostgreSQL database")
async def main():
    """CLI entry point: parse arguments, run ingestion, print a summary."""
    parser = argparse.ArgumentParser(description="Ingest documents into vector DB")
    parser.add_argument("--documents", "-d", default="documents", help="Documents folder path")
    parser.add_argument("--clean", "-c", action="store_true", help="Clean existing data before ingestion")
    parser.add_argument("--chunk-size", type=int, default=1000, help="Chunk size for splitting documents")
    parser.add_argument("--chunk-overlap", type=int, default=200, help="Chunk overlap size")
    parser.add_argument("--no-semantic", action="store_true", help="Disable semantic chunking")
    # Graph-related arguments removed
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging")

    args = parser.parse_args()

    # Configure logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # Create ingestion configuration
    config = IngestionConfig(
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        use_semantic_chunking=not args.no_semantic
    )

    # Create and run pipeline
    pipeline = DocumentIngestionPipeline(
        config=config,
        documents_folder=args.documents,
        clean_before_ingest=args.clean
    )

    def progress_callback(current: int, total: int):
        print(f"Progress: {current}/{total} documents processed")

    try:
        start_time = datetime.now()
        results = await pipeline.ingest_documents(progress_callback)
        end_time = datetime.now()

        total_time = (end_time - start_time).total_seconds()

        # Print summary
        print("\n" + "="*50)
        print("INGESTION SUMMARY")
        print("="*50)
        print(f"Documents processed: {len(results)}")
        print(f"Total chunks created: {sum(r.chunks_created for r in results)}")
        # Graph-related stats removed
        print(f"Total errors: {sum(len(r.errors) for r in results)}")
        print(f"Total processing time: {total_time:.2f} seconds")
        print()

        # Print individual results with a distinct marker per outcome
        # (previously both branches produced an empty string, so success and
        # failure were indistinguishable in the output).
        for result in results:
            status = "[ok]" if not result.errors else "[failed]"
            print(f"{status} {result.title}: {result.chunks_created} chunks")
            if result.errors:
                for error in result.errors:
                    print(f"  Error: {error}")

    except KeyboardInterrupt:
        print("\nIngestion interrupted by user")
    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        raise
    finally:
        # close() is a no-op if the pipeline never finished initializing.
        await pipeline.close()

if __name__ == "__main__":
    asyncio.run(main())