mirror of
https://github.com/coleam00/context-engineering-intro.git
synced 2025-12-29 16:14:56 +00:00
AI Agent Factory with Claude Code Subagents
This commit is contained in:
@@ -0,0 +1,518 @@
|
||||
"""
|
||||
Semantic chunking implementation for intelligent document splitting.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from dataclasses import dataclass
|
||||
import asyncio
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables from a local .env file (provider keys, model
# names) before any provider client is created below.
load_dotenv()

logger = logging.getLogger(__name__)

# Import flexible providers.  The relative import works when this module is
# loaded as part of its package; the except branch supports direct execution
# (scripts/tests) by putting the package parent directory on sys.path first.
try:
    from ..utils.providers import get_embedding_client, get_ingestion_model
except ImportError:
    # For direct execution or testing
    import sys
    import os
    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from utils.providers import get_embedding_client, get_ingestion_model

# Initialize clients with flexible providers.  NOTE(review): these are
# module-level singletons created at import time and shared by all chunker
# instances — importing this module performs provider setup as a side effect.
embedding_client = get_embedding_client()
ingestion_model = get_ingestion_model()
|
||||
|
||||
|
||||
@dataclass
class ChunkingConfig:
    """Configuration for chunking.

    Attributes are validated in ``__post_init__``; invalid combinations
    raise ``ValueError`` at construction time.
    """
    chunk_size: int = 1000                 # target chunk length, in characters
    chunk_overlap: int = 200               # characters shared between consecutive chunks
    max_chunk_size: int = 2000             # hard upper bound on a single chunk
    min_chunk_size: int = 100              # fragments below this are discarded
    use_semantic_splitting: bool = True    # prefer LLM-assisted splitting
    preserve_structure: bool = True        # respect markdown/structural boundaries

    def __post_init__(self):
        """Validate configuration, raising ValueError on bad values."""
        overlap_fits = self.chunk_overlap < self.chunk_size
        if not overlap_fits:
            raise ValueError("Chunk overlap must be less than chunk size")
        if not self.min_chunk_size > 0:
            raise ValueError("Minimum chunk size must be positive")
|
||||
|
||||
|
||||
@dataclass
class DocumentChunk:
    """Represents a document chunk.

    ``start_char``/``end_char`` are character offsets into the original
    document; ``token_count`` is estimated when not supplied.
    """
    content: str               # chunk text
    index: int                 # ordinal position of the chunk in the document
    start_char: int            # offset of the chunk start in the source text
    end_char: int              # offset of the chunk end in the source text
    metadata: Dict[str, Any]   # title/source plus caller-supplied metadata
    token_count: Optional[int] = None

    def __post_init__(self):
        """Estimate token_count when the caller did not provide one."""
        if self.token_count is not None:
            return
        # Rough heuristic: about four characters per token.
        self.token_count = len(self.content) // 4
|
||||
|
||||
|
||||
class SemanticChunker:
|
||||
"""Semantic document chunker using LLM for intelligent splitting."""
|
||||
|
||||
def __init__(self, config: ChunkingConfig):
|
||||
"""
|
||||
Initialize chunker.
|
||||
|
||||
Args:
|
||||
config: Chunking configuration
|
||||
"""
|
||||
self.config = config
|
||||
self.client = embedding_client
|
||||
self.model = ingestion_model
|
||||
|
||||
async def chunk_document(
|
||||
self,
|
||||
content: str,
|
||||
title: str,
|
||||
source: str,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
) -> List[DocumentChunk]:
|
||||
"""
|
||||
Chunk a document into semantically coherent pieces.
|
||||
|
||||
Args:
|
||||
content: Document content
|
||||
title: Document title
|
||||
source: Document source
|
||||
metadata: Additional metadata
|
||||
|
||||
Returns:
|
||||
List of document chunks
|
||||
"""
|
||||
if not content.strip():
|
||||
return []
|
||||
|
||||
base_metadata = {
|
||||
"title": title,
|
||||
"source": source,
|
||||
**(metadata or {})
|
||||
}
|
||||
|
||||
# First, try semantic chunking if enabled
|
||||
if self.config.use_semantic_splitting and len(content) > self.config.chunk_size:
|
||||
try:
|
||||
semantic_chunks = await self._semantic_chunk(content)
|
||||
if semantic_chunks:
|
||||
return self._create_chunk_objects(
|
||||
semantic_chunks,
|
||||
content,
|
||||
base_metadata
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Semantic chunking failed, falling back to simple chunking: {e}")
|
||||
|
||||
# Fallback to rule-based chunking
|
||||
return self._simple_chunk(content, base_metadata)
|
||||
|
||||
async def _semantic_chunk(self, content: str) -> List[str]:
|
||||
"""
|
||||
Perform semantic chunking using LLM.
|
||||
|
||||
Args:
|
||||
content: Content to chunk
|
||||
|
||||
Returns:
|
||||
List of chunk boundaries
|
||||
"""
|
||||
# First, split on natural boundaries
|
||||
sections = self._split_on_structure(content)
|
||||
|
||||
# Group sections into semantic chunks
|
||||
chunks = []
|
||||
current_chunk = ""
|
||||
|
||||
for section in sections:
|
||||
# Check if adding this section would exceed chunk size
|
||||
potential_chunk = current_chunk + "\n\n" + section if current_chunk else section
|
||||
|
||||
if len(potential_chunk) <= self.config.chunk_size:
|
||||
current_chunk = potential_chunk
|
||||
else:
|
||||
# Current chunk is ready, decide if we should split the section
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
current_chunk = ""
|
||||
|
||||
# Handle oversized sections
|
||||
if len(section) > self.config.max_chunk_size:
|
||||
# Split the section semantically
|
||||
sub_chunks = await self._split_long_section(section)
|
||||
chunks.extend(sub_chunks)
|
||||
else:
|
||||
current_chunk = section
|
||||
|
||||
# Add the last chunk
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
|
||||
return [chunk for chunk in chunks if len(chunk.strip()) >= self.config.min_chunk_size]
|
||||
|
||||
def _split_on_structure(self, content: str) -> List[str]:
|
||||
"""
|
||||
Split content on structural boundaries.
|
||||
|
||||
Args:
|
||||
content: Content to split
|
||||
|
||||
Returns:
|
||||
List of sections
|
||||
"""
|
||||
# Split on markdown headers, paragraphs, and other structural elements
|
||||
patterns = [
|
||||
r'\n#{1,6}\s+.+?\n', # Markdown headers
|
||||
r'\n\n+', # Multiple newlines (paragraph breaks)
|
||||
r'\n[-*+]\s+', # List items
|
||||
r'\n\d+\.\s+', # Numbered lists
|
||||
r'\n```.*?```\n', # Code blocks
|
||||
r'\n\|\s*.+?\|\s*\n', # Tables
|
||||
]
|
||||
|
||||
# Split by patterns but keep the separators
|
||||
sections = [content]
|
||||
|
||||
for pattern in patterns:
|
||||
new_sections = []
|
||||
for section in sections:
|
||||
parts = re.split(f'({pattern})', section, flags=re.MULTILINE | re.DOTALL)
|
||||
new_sections.extend([part for part in parts if part.strip()])
|
||||
sections = new_sections
|
||||
|
||||
return sections
|
||||
|
||||
async def _split_long_section(self, section: str) -> List[str]:
|
||||
"""
|
||||
Split a long section using LLM for semantic boundaries.
|
||||
|
||||
Args:
|
||||
section: Section to split
|
||||
|
||||
Returns:
|
||||
List of sub-chunks
|
||||
"""
|
||||
try:
|
||||
prompt = f"""
|
||||
Split the following text into semantically coherent chunks. Each chunk should:
|
||||
1. Be roughly {self.config.chunk_size} characters long
|
||||
2. End at natural semantic boundaries
|
||||
3. Maintain context and readability
|
||||
4. Not exceed {self.config.max_chunk_size} characters
|
||||
|
||||
Return only the split text with "---CHUNK---" as separator between chunks.
|
||||
|
||||
Text to split:
|
||||
{section}
|
||||
"""
|
||||
|
||||
# Use Pydantic AI for LLM calls
|
||||
from pydantic_ai import Agent
|
||||
temp_agent = Agent(self.model)
|
||||
|
||||
response = await temp_agent.run(prompt)
|
||||
result = response.data
|
||||
chunks = [chunk.strip() for chunk in result.split("---CHUNK---")]
|
||||
|
||||
# Validate chunks
|
||||
valid_chunks = []
|
||||
for chunk in chunks:
|
||||
if (self.config.min_chunk_size <= len(chunk) <= self.config.max_chunk_size):
|
||||
valid_chunks.append(chunk)
|
||||
|
||||
return valid_chunks if valid_chunks else self._simple_split(section)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"LLM chunking failed: {e}")
|
||||
return self._simple_split(section)
|
||||
|
||||
def _simple_split(self, text: str) -> List[str]:
|
||||
"""
|
||||
Simple text splitting as fallback.
|
||||
|
||||
Args:
|
||||
text: Text to split
|
||||
|
||||
Returns:
|
||||
List of chunks
|
||||
"""
|
||||
chunks = []
|
||||
start = 0
|
||||
|
||||
while start < len(text):
|
||||
end = start + self.config.chunk_size
|
||||
|
||||
if end >= len(text):
|
||||
# Last chunk
|
||||
chunks.append(text[start:])
|
||||
break
|
||||
|
||||
# Try to end at a sentence boundary
|
||||
chunk_end = end
|
||||
for i in range(end, max(start + self.config.min_chunk_size, end - 200), -1):
|
||||
if text[i] in '.!?\n':
|
||||
chunk_end = i + 1
|
||||
break
|
||||
|
||||
chunks.append(text[start:chunk_end])
|
||||
start = chunk_end - self.config.chunk_overlap
|
||||
|
||||
return chunks
|
||||
|
||||
def _simple_chunk(
|
||||
self,
|
||||
content: str,
|
||||
base_metadata: Dict[str, Any]
|
||||
) -> List[DocumentChunk]:
|
||||
"""
|
||||
Simple rule-based chunking.
|
||||
|
||||
Args:
|
||||
content: Content to chunk
|
||||
base_metadata: Base metadata for chunks
|
||||
|
||||
Returns:
|
||||
List of document chunks
|
||||
"""
|
||||
chunks = self._simple_split(content)
|
||||
return self._create_chunk_objects(chunks, content, base_metadata)
|
||||
|
||||
def _create_chunk_objects(
|
||||
self,
|
||||
chunks: List[str],
|
||||
original_content: str,
|
||||
base_metadata: Dict[str, Any]
|
||||
) -> List[DocumentChunk]:
|
||||
"""
|
||||
Create DocumentChunk objects from text chunks.
|
||||
|
||||
Args:
|
||||
chunks: List of chunk texts
|
||||
original_content: Original document content
|
||||
base_metadata: Base metadata
|
||||
|
||||
Returns:
|
||||
List of DocumentChunk objects
|
||||
"""
|
||||
chunk_objects = []
|
||||
current_pos = 0
|
||||
|
||||
for i, chunk_text in enumerate(chunks):
|
||||
# Find the position of this chunk in the original content
|
||||
start_pos = original_content.find(chunk_text, current_pos)
|
||||
if start_pos == -1:
|
||||
# Fallback: estimate position
|
||||
start_pos = current_pos
|
||||
|
||||
end_pos = start_pos + len(chunk_text)
|
||||
|
||||
# Create chunk metadata
|
||||
chunk_metadata = {
|
||||
**base_metadata,
|
||||
"chunk_method": "semantic" if self.config.use_semantic_splitting else "simple",
|
||||
"total_chunks": len(chunks)
|
||||
}
|
||||
|
||||
chunk_objects.append(DocumentChunk(
|
||||
content=chunk_text.strip(),
|
||||
index=i,
|
||||
start_char=start_pos,
|
||||
end_char=end_pos,
|
||||
metadata=chunk_metadata
|
||||
))
|
||||
|
||||
current_pos = end_pos
|
||||
|
||||
return chunk_objects
|
||||
|
||||
|
||||
class SimpleChunker:
|
||||
"""Simple non-semantic chunker for faster processing."""
|
||||
|
||||
def __init__(self, config: ChunkingConfig):
|
||||
"""Initialize simple chunker."""
|
||||
self.config = config
|
||||
|
||||
def chunk_document(
|
||||
self,
|
||||
content: str,
|
||||
title: str,
|
||||
source: str,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
) -> List[DocumentChunk]:
|
||||
"""
|
||||
Chunk document using simple rules.
|
||||
|
||||
Args:
|
||||
content: Document content
|
||||
title: Document title
|
||||
source: Document source
|
||||
metadata: Additional metadata
|
||||
|
||||
Returns:
|
||||
List of document chunks
|
||||
"""
|
||||
if not content.strip():
|
||||
return []
|
||||
|
||||
base_metadata = {
|
||||
"title": title,
|
||||
"source": source,
|
||||
"chunk_method": "simple",
|
||||
**(metadata or {})
|
||||
}
|
||||
|
||||
# Split on paragraphs first
|
||||
paragraphs = re.split(r'\n\s*\n', content)
|
||||
chunks = []
|
||||
current_chunk = ""
|
||||
current_pos = 0
|
||||
chunk_index = 0
|
||||
|
||||
for paragraph in paragraphs:
|
||||
paragraph = paragraph.strip()
|
||||
if not paragraph:
|
||||
continue
|
||||
|
||||
# Check if adding this paragraph exceeds chunk size
|
||||
potential_chunk = current_chunk + "\n\n" + paragraph if current_chunk else paragraph
|
||||
|
||||
if len(potential_chunk) <= self.config.chunk_size:
|
||||
current_chunk = potential_chunk
|
||||
else:
|
||||
# Save current chunk if it exists
|
||||
if current_chunk:
|
||||
chunks.append(self._create_chunk(
|
||||
current_chunk,
|
||||
chunk_index,
|
||||
current_pos,
|
||||
current_pos + len(current_chunk),
|
||||
base_metadata.copy()
|
||||
))
|
||||
|
||||
# Move position, but ensure overlap is respected
|
||||
overlap_start = max(0, len(current_chunk) - self.config.chunk_overlap)
|
||||
current_pos += overlap_start
|
||||
chunk_index += 1
|
||||
|
||||
# Start new chunk with current paragraph
|
||||
current_chunk = paragraph
|
||||
|
||||
# Add final chunk
|
||||
if current_chunk:
|
||||
chunks.append(self._create_chunk(
|
||||
current_chunk,
|
||||
chunk_index,
|
||||
current_pos,
|
||||
current_pos + len(current_chunk),
|
||||
base_metadata.copy()
|
||||
))
|
||||
|
||||
# Update total chunks in metadata
|
||||
for chunk in chunks:
|
||||
chunk.metadata["total_chunks"] = len(chunks)
|
||||
|
||||
return chunks
|
||||
|
||||
def _create_chunk(
|
||||
self,
|
||||
content: str,
|
||||
index: int,
|
||||
start_pos: int,
|
||||
end_pos: int,
|
||||
metadata: Dict[str, Any]
|
||||
) -> DocumentChunk:
|
||||
"""Create a DocumentChunk object."""
|
||||
return DocumentChunk(
|
||||
content=content.strip(),
|
||||
index=index,
|
||||
start_char=start_pos,
|
||||
end_char=end_pos,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
|
||||
# Factory function
def create_chunker(config: ChunkingConfig):
    """
    Create appropriate chunker based on configuration.

    Args:
        config: Chunking configuration

    Returns:
        Chunker instance: a SemanticChunker when semantic splitting is
        enabled, otherwise a SimpleChunker
    """
    chunker_cls = SemanticChunker if config.use_semantic_splitting else SimpleChunker
    return chunker_cls(config)
|
||||
|
||||
|
||||
# Example usage
async def main():
    """Example usage of the chunker.

    Builds a small config, chunks a markdown sample, and prints a summary
    of each resulting chunk.  With use_semantic_splitting=True this path
    may invoke the LLM provider configured at module import time.
    """
    config = ChunkingConfig(
        chunk_size=500,
        chunk_overlap=50,
        use_semantic_splitting=True
    )

    chunker = create_chunker(config)

    sample_text = """
    # Big Tech AI Initiatives

    ## Google's AI Strategy
    Google has been investing heavily in artificial intelligence research and development.
    Their main focus areas include:

    - Large language models (LaMDA, PaLM, Gemini)
    - Computer vision and image recognition
    - Natural language processing
    - AI-powered search improvements

    The company's DeepMind division continues to push the boundaries of AI research,
    with breakthrough achievements in protein folding prediction and game playing.

    ## Microsoft's Partnership with OpenAI
    Microsoft's strategic partnership with OpenAI has positioned them as a leader
    in the generative AI space. Key developments include:

    1. Integration of GPT models into Office 365
    2. Azure OpenAI Service for enterprise customers
    3. Investment in OpenAI's continued research
    """

    chunks = await chunker.chunk_document(
        content=sample_text,
        title="Big Tech AI Report",
        source="example.md"
    )

    # Print a short preview (first 100 chars) and metadata for each chunk.
    for i, chunk in enumerate(chunks):
        print(f"Chunk {i}: {len(chunk.content)} chars")
        print(f"Content: {chunk.content[:100]}...")
        print(f"Metadata: {chunk.metadata}")
        print("---")


if __name__ == "__main__":
    # Demo entry point; requires provider credentials/config to be set up.
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user