From 324ebe8955965faa909e9f977309e9c6e8942dc4 Mon Sep 17 00:00:00 2001
From: rarebuffalo <info.krishnasingh.codes@gmail.com>
Date: Sat, 25 Apr 2026 19:27:06 +0530
Subject: [PATCH] integrated the AI agent

---
 app/main.py                                |   3 +-
 app/routers/__init__.py                    |  17 +++
 app/routers/code_scan.py                   |  91 +++++++++++++
 app/schemas/code_scan.py                   |  29 ++++
 app/services/code_scanner/__init__.py      |   1 +
 app/services/code_scanner/github_client.py |  88 ++++++++++++
 app/services/code_scanner/orchestrator.py  | 151 +++++++++++++++++++++
 7 files changed, 379 insertions(+), 1 deletion(-)
 create mode 100644 app/routers/code_scan.py
 create mode 100644 app/schemas/code_scan.py
 create mode 100644 app/services/code_scanner/__init__.py
 create mode 100644 app/services/code_scanner/github_client.py
 create mode 100644 app/services/code_scanner/orchestrator.py

diff --git a/app/main.py b/app/main.py
index 135c557..673658d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -10,7 +10,7 @@ from slowapi.middleware import SlowAPIMiddleware
 from app.config import settings
 from app.database import close_db, init_db
 from app.middleware.rate_limiter import limiter
-from app.routers import auth, health, history, scan, apikey, report
+from app.routers import auth, health, history, scan, apikey, report, code_scan
 
 logging.basicConfig(
     level=logging.DEBUG if settings.debug else logging.INFO,
@@ -56,6 +56,7 @@ def create_app() -> FastAPI:
     application.include_router(history.router)
     application.include_router(apikey.router)
     application.include_router(report.router)
+    application.include_router(code_scan.router)
 
     logger.info(f"{settings.app_name} v{settings.app_version} initialized")
 
diff --git a/app/routers/__init__.py b/app/routers/__init__.py
index e69de29..23d41e2 100644
--- a/app/routers/__init__.py
+++ b/app/routers/__init__.py
@@ -0,0 +1,23 @@
+"""Router modules of the application.
+
+Every name exported here is a router *module*, not an ``APIRouter`` instance:
+``app.main`` registers them with ``include_router(<name>.router)``, so binding
+these names to the router objects themselves would break that attribute lookup.
+"""
+from . import auth
+from . import health
+from . import history
+from . import scan
+from . import apikey
+from . import report
+from . import code_scan
+
+__all__ = [
+    "auth",
+    "health",
+    "history",
+    "scan",
+    "apikey",
+    "report",
+    "code_scan",
+]
diff --git a/app/routers/code_scan.py b/app/routers/code_scan.py
new file mode 100644
index 0000000..c4a9057
--- /dev/null
+++ b/app/routers/code_scan.py
@@ -0,0 +1,126 @@
+import logging
+import uuid
+import json
+from fastapi import APIRouter, HTTPException
+from typing import Dict, Any
+
+from app.schemas.code_scan import CodeScanRequest, CodeScanResponse, CodeChatRequest, CodeChatResponse
+from app.services.code_scanner.orchestrator import CodeScanOrchestrator, client
+from app.config import settings
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["code-scan"])
+
+# In-memory store for scan results to support chat context.
+# In a real production app, this would be stored in the database.
+scan_store: Dict[str, CodeScanResponse] = {}
+
+# Cap on retained scans so the in-memory store cannot grow without bound;
+# the oldest scans are evicted first.
+MAX_STORED_SCANS = 100
+
+
+def _remember_scan(scan_id: str, response: CodeScanResponse) -> None:
+    """Keeps a scan result for the chat endpoint, evicting the oldest beyond the cap."""
+    scan_store[scan_id] = response
+    while len(scan_store) > MAX_STORED_SCANS:
+        # dicts iterate in insertion order, so the first key is the oldest scan
+        del scan_store[next(iter(scan_store))]
+
+
+@router.post("/code-scan/analyze", response_model=CodeScanResponse)
+async def analyze_codebase(request: CodeScanRequest):
+    """
+    Scans a GitHub repository and returns the findings with a summary.
+
+    Fetches the file tree, lets the orchestrator triage and analyze files, then
+    stores the result under a fresh scan id so /code-scan/chat can refer to it.
+
+    Raises:
+        HTTPException: 500 on any failure; details are logged, not returned.
+    """
+    logger.info(f"Starting code scan for {request.repo_url}")
+    branch = request.branch or "main"
+
+    try:
+        orchestrator = CodeScanOrchestrator(
+            repo_url=request.repo_url,
+            github_token=request.github_token,
+            branch=branch,
+        )
+
+        # 1. Fetch repo structure
+        all_files = await orchestrator.github.get_repo_tree(request.repo_url, branch)
+
+        # 2. Triage files
+        triaged_files = await orchestrator.triage_files(all_files)
+        logger.info(f"Triaged {len(triaged_files)} files out of {len(all_files)}.")
+
+        # 3. Analyze triaged files
+        vulnerabilities = await orchestrator.analyze_files(triaged_files)
+
+        # 4. Generate Summary
+        summary = await orchestrator.generate_summary(vulnerabilities)
+
+        scan_id = str(uuid.uuid4())
+        response = CodeScanResponse(
+            scan_id=scan_id,
+            repo_url=request.repo_url,
+            summary=summary,
+            issues=vulnerabilities,
+        )
+    except Exception as e:
+        # Log the full error server-side; the client only gets a generic message,
+        # like the chat endpoint below, so internal details are not exposed.
+        logger.exception("Code scan failed: %s", e)
+        raise HTTPException(
+            status_code=500,
+            detail="Code scan failed. Verify the repository URL, branch and GitHub token.",
+        ) from e
+
+    # Save to in-memory store for the chat feature
+    _remember_scan(scan_id, response)
+    return response
+
+
+@router.post("/code-scan/chat", response_model=CodeChatResponse)
+async def chat_with_scan(request: CodeChatRequest):
+    """
+    Answers a question about a stored scan using the chat model.
+
+    Raises:
+        HTTPException: 400 when OPENAI_API_KEY is not configured,
+            404 when the scan id is not in the store (unknown or evicted),
+            500 when the model call fails.
+    """
+    if not settings.openai_api_key:
+        raise HTTPException(status_code=400, detail="AI Chat is disabled because OPENAI_API_KEY is not configured.")
+
+    scan_data = scan_store.get(request.scan_id)
+    if not scan_data:
+        raise HTTPException(status_code=404, detail="Scan ID not found or expired.")
+
+    system_prompt = (
+        "You are SecureLens AI, an expert application security assistant. "
+        "You are helping a developer understand a security scan report for their codebase. "
+        f"Here is the context of the scan for the repository {scan_data.repo_url}:\n"
+        f"Summary: {scan_data.summary}\n"
+        f"Vulnerabilities: {json.dumps([v.model_dump() for v in scan_data.issues])}\n\n"
+        "Answer the user's questions clearly, concisely, and professionally. Provide code fixes if requested."
+    )
+
+    try:
+        response = await client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": request.message}
+            ],
+            temperature=0.5,
+        )
+        reply = response.choices[0].message.content or "No response from AI."
+        return CodeChatResponse(reply=reply)
+    except Exception as e:
+        logger.error(f"AI Chat Error: {str(e)}")
+        raise HTTPException(status_code=500, detail="I encountered an error trying to process your request.") from e
diff --git a/app/schemas/code_scan.py b/app/schemas/code_scan.py
new file mode 100644
index 0000000..85999d6
--- /dev/null
+++ b/app/schemas/code_scan.py
@@ -0,0 +1,29 @@
+from pydantic import BaseModel, HttpUrl
+from typing import List, Optional, Dict, Any
+
+class CodeScanRequest(BaseModel):  # request body of POST /code-scan/analyze
+    repo_url: str  # repository URL; the GitHub client takes owner/repo from its path
+    github_token: str  # sent to the GitHub API as a Bearer token
+    # Branch name or commit hash to scan; optional, defaults to "main"
+    branch: Optional[str] = "main"
+
+class VulnerabilityIssue(BaseModel):  # one finding reported for one file
+    file_path: str  # path of the file the finding was reported for
+    severity: str  # High, Medium, Low, Critical
+    issue: str  # short title of the finding
+    explanation: str  # brief explanation of the vulnerability
+    suggested_fix: Optional[str] = None  # code snippet or instructions, when provided
+    line_number: Optional[int] = None  # None when the finding is not tied to a line
+
+class CodeScanResponse(BaseModel):  # scan result; also kept in memory for the chat endpoint
+    scan_id: str  # UUID4 string generated for each scan
+    repo_url: str  # echoed from the request
+    summary: str  # summary text produced for the scan
+    issues: List[VulnerabilityIssue]  # all findings collected across the analyzed files
+
+class CodeChatRequest(BaseModel):  # request body of POST /code-scan/chat
+    scan_id: str  # id of a previously stored scan
+    message: str  # question forwarded to the model as the user message
+
+class CodeChatResponse(BaseModel):  # response body of POST /code-scan/chat
+    reply: str  # text of the model's answer
diff --git a/app/services/code_scanner/__init__.py b/app/services/code_scanner/__init__.py
new file mode 100644
index 0000000..65892c0
--- /dev/null
+++ b/app/services/code_scanner/__init__.py
@@ -0,0 +1 @@
+# Code Scanner Package
diff --git a/app/services/code_scanner/github_client.py b/app/services/code_scanner/github_client.py
new file mode 100644
index 0000000..16f33ae
--- /dev/null
+++ b/app/services/code_scanner/github_client.py
@@ -0,0 +1,119 @@
+import httpx
+import logging
+import base64
+from urllib.parse import quote, urlparse
+from typing import List, Dict, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+class GitHubClient:
+    """Small async wrapper around the GitHub REST API calls used by the code scanner."""
+
+    def __init__(self, token: str):
+        self.token = token
+        self.headers = {
+            "Authorization": f"Bearer {self.token}",
+            "Accept": "application/vnd.github.v3+json"
+        }
+        self.base_url = "https://api.github.com"
+
+    def _parse_repo_url(self, repo_url: str) -> Optional[str]:
+        """
+        Extracts owner/repo from https://github.com/owner/repo
+
+        A trailing ".git" on the repository name is dropped. Returns None when
+        the URL path has no non-empty owner and repository segments.
+        """
+        try:
+            parsed = urlparse(repo_url)
+            path_parts = parsed.path.strip('/').split('/')
+        except Exception as e:
+            logger.error(f"Failed to parse repo URL {repo_url}: {e}")
+            return None
+
+        if len(path_parts) < 2:
+            return None
+        owner, repo = path_parts[0], path_parts[1]
+        if repo.endswith(".git"):
+            # clone URLs end in ".git", which is not part of the repository name
+            repo = repo[:-4]
+        if not owner or not repo:
+            return None
+        return f"{owner}/{repo}"
+
+    async def get_repo_tree(self, repo_url: str, branch: str = "main") -> List[str]:
+        """
+        Fetches the recursive tree of the repository to get all file paths.
+        Returns a list of file paths.
+
+        Raises:
+            ValueError: repo_url cannot be parsed into owner/repo.
+            Exception: a GitHub request failed (chained to the httpx error).
+        """
+        repo_path = self._parse_repo_url(repo_url)
+        if not repo_path:
+            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")
+
+        # First, get the commit SHA for the branch to get the tree SHA
+        # quote() keeps "/" but escapes characters such as "#" or "?" in the ref
+        commits_url = f"{self.base_url}/repos/{repo_path}/commits/{quote(branch)}"
+        async with httpx.AsyncClient() as client:
+            try:
+                commit_resp = await client.get(commits_url, headers=self.headers, timeout=10.0)
+                commit_resp.raise_for_status()
+                tree_sha = commit_resp.json()["commit"]["tree"]["sha"]
+
+                # Now get the recursive tree
+                tree_url = f"{self.base_url}/repos/{repo_path}/git/trees/{tree_sha}?recursive=1"
+                tree_resp = await client.get(tree_url, headers=self.headers, timeout=15.0)
+                tree_resp.raise_for_status()
+
+                tree_data = tree_resp.json()
+                if tree_data.get("truncated"):
+                    # GitHub sets this flag when the tree exceeded its response limit
+                    logger.warning(f"GitHub returned a truncated tree for {repo_path}; some files are missing.")
+
+                # Only files ("blob"), not directories
+                return [
+                    item["path"]
+                    for item in tree_data.get("tree", [])
+                    if item["type"] == "blob"
+                ]
+            except httpx.HTTPError as e:
+                logger.error(f"GitHub API error fetching tree for {repo_path}: {e}")
+                raise Exception(f"Failed to fetch repository structure: {e}") from e
+
+    async def get_file_content(self, repo_url: str, file_path: str, branch: str = "main") -> str:
+        """
+        Fetches the content of a specific file.
+
+        Returns the decoded text, or "" when the file cannot be fetched or the
+        response does not carry base64 content.
+
+        Raises:
+            ValueError: repo_url cannot be parsed into owner/repo.
+        """
+        repo_path = self._parse_repo_url(repo_url)
+        if not repo_path:
+            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")
+
+        # Escape the path so characters like "#" or "?" are not read as URL syntax;
+        # the ref goes through params so httpx encodes it as well.
+        content_url = f"{self.base_url}/repos/{repo_path}/contents/{quote(file_path)}"
+
+        async with httpx.AsyncClient() as client:
+            try:
+                resp = await client.get(
+                    content_url, headers=self.headers, params={"ref": branch}, timeout=10.0
+                )
+                resp.raise_for_status()
+                data = resp.json()
+
+                # A directory path yields a list instead of a file object
+                if isinstance(data, dict) and "content" in data and data.get("encoding") == "base64":
+                    return base64.b64decode(data["content"]).decode('utf-8', errors='replace')
+                return ""
+            except (httpx.HTTPError, ValueError) as e:
+                # ValueError covers malformed JSON and invalid base64 payloads
+                logger.error(f"GitHub API error fetching file {file_path}: {e}")
+                return "" # Return empty if file cannot be read, agent will just skip it
diff --git a/app/services/code_scanner/orchestrator.py b/app/services/code_scanner/orchestrator.py
new file mode 100644
index 0000000..4c297ce
--- /dev/null
+++ b/app/services/code_scanner/orchestrator.py
@@ -0,0 +1,216 @@
+import json
+import logging
+from typing import List, Dict, Any, Optional
+from openai import AsyncOpenAI
+
+from app.config import settings
+from app.services.code_scanner.github_client import GitHubClient
+from app.schemas.code_scan import VulnerabilityIssue
+
+logger = logging.getLogger(__name__)
+
+api_key = settings.openai_api_key or "mock-key-for-testing"
+client = AsyncOpenAI(api_key=api_key)
+
+LLM_MODEL = "gpt-3.5-turbo"
+MAX_TRIAGED_FILES = 15  # upper bound on files selected by LLM triage
+FALLBACK_FILE_COUNT = 10  # files scanned when LLM triage is unavailable or unusable
+
+class CodeScanOrchestrator:
+    """Runs a scan: LLM triage of the file list, per-file analysis, then a summary."""
+
+    def __init__(self, repo_url: str, github_token: str, branch: str = "main"):
+        self.repo_url = repo_url
+        self.branch = branch
+        self.github = GitHubClient(token=github_token)
+
+    async def triage_files(self, all_files: List[str]) -> List[str]:
+        """
+        Uses the LLM to select which files are most likely to contain security vulnerabilities
+        (e.g., config files, routers, auth logic).
+
+        Only paths that actually appear in all_files are returned, at most
+        MAX_TRIAGED_FILES of them. Falls back to the first FALLBACK_FILE_COUNT
+        files when no API key is set, the call fails, or none of the returned
+        paths exist in the repository.
+        """
+        if not settings.openai_api_key:
+            logger.warning("OPENAI_API_KEY is not set. Triaging all files up to a limit.")
+            return all_files[:FALLBACK_FILE_COUNT] # Hard limit for testing
+
+        # To avoid context limit issues, we might want to chunk this, but for now we pass the list
+        # We can enforce a soft limit on the string length
+        files_str = "\n".join(all_files)
+        if len(files_str) > 15000:
+            files_str = files_str[:15000] + "\n... (truncated)"
+
+        prompt = (
+            "You are a Senior Application Security Engineer. I have a repository with the following files:\n"
+            f"{files_str}\n\n"
+            "Select the most critical files to review for security vulnerabilities (e.g., SAST, hardcoded secrets, SQLi, Auth bypass). "
+            "Return a JSON object with a single key 'critical_files' containing a list of the exact file paths. "
+            f"Do not select more than {MAX_TRIAGED_FILES} files."
+        )
+
+        try:
+            response = await client.chat.completions.create(
+                model=LLM_MODEL,
+                messages=[
+                    {"role": "system", "content": "You always respond with valid JSON."},
+                    {"role": "user", "content": prompt}
+                ],
+                response_format={"type": "json_object"},
+                temperature=0.1,
+            )
+            content = response.choices[0].message.content
+            if content:
+                data = json.loads(content)
+                raw = data.get("critical_files", [])
+                candidates = raw if isinstance(raw, list) else [raw]
+
+                # The model can invent paths, echo a truncated one, repeat itself or
+                # ignore the limit, so keep only distinct paths known to exist.
+                known = set(all_files)
+                selected: List[str] = []
+                for path in candidates:
+                    if isinstance(path, str) and path in known and path not in selected:
+                        selected.append(path)
+                selected = selected[:MAX_TRIAGED_FILES]
+
+                # An explicitly empty selection is honored; a selection made up only
+                # of unusable paths falls through to the fallback below.
+                if selected or not candidates:
+                    return selected
+                logger.warning("Triage returned no usable file paths. Using fallback.")
+        except Exception as e:
+            logger.error(f"Error triaging files: {e}")
+
+        return all_files[:FALLBACK_FILE_COUNT] # Fallback
+
+    @staticmethod
+    def _build_issue(file_path: str, raw: Any) -> Optional[VulnerabilityIssue]:
+        """Builds one issue from an LLM finding; returns None if the finding is unusable."""
+        if not isinstance(raw, dict):
+            logger.warning(f"Skipping malformed finding for {file_path}: {raw!r}")
+            return None
+
+        # Line references such as "12-15" or "N/A" must not discard the finding itself
+        try:
+            line_number = int(raw["line_number"])
+        except (KeyError, TypeError, ValueError, OverflowError):
+            line_number = None
+
+        try:
+            return VulnerabilityIssue(
+                file_path=file_path,
+                severity=raw.get("severity") or "Medium",
+                issue=raw.get("issue") or "Unknown Issue",
+                explanation=raw.get("explanation") or "",
+                suggested_fix=raw.get("suggested_fix"),
+                line_number=line_number,
+            )
+        except ValueError as e:
+            # pydantic validation errors are ValueError subclasses
+            logger.warning(f"Skipping invalid finding for {file_path}: {e}")
+            return None
+
+    async def analyze_files(self, triaged_files: List[str]) -> List[VulnerabilityIssue]:
+        """
+        Fetches the contents of the triaged files and uses the LLM to find vulnerabilities.
+
+        A file that cannot be fetched or analyzed is logged and skipped, and a
+        malformed finding is dropped without losing the file's other findings.
+        Returns an empty list when no OpenAI API key is configured.
+        """
+        vulnerabilities = []
+
+        if not settings.openai_api_key:
+            return []
+
+        # Analyze files sequentially or in batches (sequential to avoid rate limits for now)
+        for file_path in triaged_files:
+            try:
+                content = await self.github.get_file_content(self.repo_url, file_path, self.branch)
+            except Exception as e:
+                logger.error(f"Error fetching file {file_path}: {e}")
+                continue
+            if not content:
+                continue
+
+            # Truncate very large files
+            if len(content) > 20000:
+                content = content[:20000]
+
+            prompt = (
+                f"Review the following code from the file '{file_path}' for security vulnerabilities.\n"
+                "Focus on OWASP Top 10: SQLi, XSS, Hardcoded Secrets, IDOR, Misconfigurations, etc.\n\n"
+                f"CODE:\n{content}\n\n"
+                "Return a JSON object with a key 'vulnerabilities' containing a list of objects. "
+                "Each object MUST have the following keys: "
+                "'severity' (Critical, High, Medium, Low), "
+                "'issue' (A short title), "
+                "'explanation' (1-2 sentences explaining the vulnerability), "
+                "'suggested_fix' (Code snippet or clear instructions to fix), "
+                "'line_number' (integer or null if general)."
+            )
+
+            try:
+                response = await client.chat.completions.create(
+                    model=LLM_MODEL,
+                    messages=[
+                        {"role": "system", "content": "You are a SAST security agent. Always respond with valid JSON."},
+                        {"role": "user", "content": prompt}
+                    ],
+                    response_format={"type": "json_object"},
+                    temperature=0.2,
+                )
+
+                resp_content = response.choices[0].message.content
+                if resp_content:
+                    data = json.loads(resp_content)
+                    vulns = data.get("vulnerabilities", [])
+                    if not isinstance(vulns, list):
+                        vulns = []
+                    for v in vulns:
+                        issue = self._build_issue(file_path, v)
+                        if issue is not None:
+                            vulnerabilities.append(issue)
+            except Exception as e:
+                logger.error(f"Error analyzing file {file_path}: {e}")
+
+        return vulnerabilities
+
+    async def generate_summary(self, vulnerabilities: List[VulnerabilityIssue]) -> str:
+        """
+        Produces the summary text for a scan.
+
+        Returns a fixed message when there are no findings, and a plain count when
+        no OpenAI API key is configured or the LLM call fails.
+        """
+        if not vulnerabilities:
+            return "No obvious security vulnerabilities found in the scanned files."
+
+        if not settings.openai_api_key:
+            return f"Found {len(vulnerabilities)} potential issues."
+
+        issues_data = [v.model_dump() for v in vulnerabilities]
+        prompt = (
+            "You are a Senior AppSec Manager. Summarize the following list of vulnerabilities found in a recent scan. "
+            "Provide a 2-3 paragraph executive summary of the repository's security posture. "
+            "Keep it professional and highlight the most critical risks.\n\n"
+            f"{json.dumps(issues_data)}"
+        )
+
+        try:
+            response = await client.chat.completions.create(
+                model=LLM_MODEL,
+                messages=[
+                    {"role": "system", "content": "You are a cybersecurity expert."},
+                    {"role": "user", "content": prompt}
+                ],
+                temperature=0.4,
+            )
+            return response.choices[0].message.content or "Could not generate summary."
+        except Exception as e:
+            logger.error(f"Error generating summary: {e}")
+            return f"Found {len(vulnerabilities)} potential issues."