From 324ebe8955965faa909e9f977309e9c6e8942dc4 Mon Sep 17 00:00:00 2001
From: rarebuffalo
Date: Sat, 25 Apr 2026 19:27:06 +0530
Subject: [PATCH] integrated the ai agent

---
Review notes (this section is ignored by git am):
- app/routers/__init__.py re-exports the router modules instead of rebinding
  their names to APIRouter objects, so `<module>.router` in app/main.py resolves.
- Blob ids on the index lines are those of the originally submitted patch.

 app/main.py                                |   3 +-
 app/routers/__init__.py                    |  18 ++
 app/routers/code_scan.py                   | 129 +++++++++++++
 app/schemas/code_scan.py                   |  46 +++++
 app/services/code_scanner/__init__.py      |   1 +
 app/services/code_scanner/github_client.py | 125 ++++++++++++
 app/services/code_scanner/orchestrator.py  | 209 +++++++++++++++++++++
 7 files changed, 530 insertions(+), 1 deletion(-)
 create mode 100644 app/routers/code_scan.py
 create mode 100644 app/schemas/code_scan.py
 create mode 100644 app/services/code_scanner/__init__.py
 create mode 100644 app/services/code_scanner/github_client.py
 create mode 100644 app/services/code_scanner/orchestrator.py

diff --git a/app/main.py b/app/main.py
index 135c557..673658d 100644
--- a/app/main.py
+++ b/app/main.py
@@ -10,7 +10,7 @@ from slowapi.middleware import SlowAPIMiddleware
 from app.config import settings
 from app.database import close_db, init_db
 from app.middleware.rate_limiter import limiter
-from app.routers import auth, health, history, scan, apikey, report
+from app.routers import auth, health, history, scan, apikey, report, code_scan
 
 logging.basicConfig(
     level=logging.DEBUG if settings.debug else logging.INFO,
@@ -56,6 +56,7 @@ def create_app() -> FastAPI:
     application.include_router(history.router)
     application.include_router(apikey.router)
     application.include_router(report.router)
+    application.include_router(code_scan.router)
 
     logger.info(f"{settings.app_name} v{settings.app_version} initialized")
 
diff --git a/app/routers/__init__.py b/app/routers/__init__.py
index e69de29..23d41e2 100644
--- a/app/routers/__init__.py
+++ b/app/routers/__init__.py
@@ -0,0 +1,18 @@
+"""Router modules of the application.
+
+Each submodule exposes a module-level ``router`` that ``app.main`` registers
+through ``application.include_router(<module>.router)``, so the submodules
+themselves are re-exported here rather than their router objects.
+"""
+
+from . import apikey, auth, code_scan, health, history, report, scan
+
+__all__ = [
+    "auth",
+    "health",
+    "history",
+    "scan",
+    "apikey",
+    "report",
+    "code_scan",
+]
diff --git a/app/routers/code_scan.py b/app/routers/code_scan.py
new file mode 100644
index 0000000..c4a9057
--- /dev/null
+++ b/app/routers/code_scan.py
@@ -0,0 +1,129 @@
+"""API routes for AI-assisted repository security scans and follow-up chat."""
+
+import json
+import logging
+import uuid
+from typing import Dict
+
+from fastapi import APIRouter, HTTPException
+
+from app.config import settings
+from app.schemas.code_scan import (
+    CodeChatRequest,
+    CodeChatResponse,
+    CodeScanRequest,
+    CodeScanResponse,
+)
+from app.services.code_scanner.orchestrator import CodeScanOrchestrator, client
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(tags=["code-scan"])
+
+# Upper bound on retained scan reports. The oldest report is evicted first so
+# this process-local store cannot grow without limit.
+MAX_STORED_SCANS = 100
+
+# In-memory store for scan results to support chat context.
+# In a real production app, this would be stored in the database.
+scan_store: Dict[str, CodeScanResponse] = {}
+
+
+def _remember_scan(scan: CodeScanResponse) -> None:
+    """Keep *scan* for the chat endpoint, evicting the oldest scans when full."""
+    while len(scan_store) >= MAX_STORED_SCANS:
+        # Dicts preserve insertion order, so the first key is the oldest scan.
+        scan_store.pop(next(iter(scan_store)))
+    scan_store[scan.scan_id] = scan
+
+
+@router.post("/code-scan/analyze", response_model=CodeScanResponse)
+async def analyze_codebase(request: CodeScanRequest):
+    """Scan a GitHub repository and return its findings with a summary.
+
+    The repository tree is fetched, triaged down to a short list of files,
+    each of those files is analyzed, and the findings are summarized. The
+    response is also kept in memory under its ``scan_id`` so that
+    ``/code-scan/chat`` can use it as context.
+
+    Raises:
+        HTTPException: 500 with the error text if any step of the scan fails.
+    """
+    logger.info(f"Starting code scan for {request.repo_url}")
+    branch = request.branch or "main"
+
+    try:
+        orchestrator = CodeScanOrchestrator(
+            repo_url=request.repo_url,
+            github_token=request.github_token,
+            branch=branch,
+        )
+
+        # 1. Fetch repo structure
+        all_files = await orchestrator.github.get_repo_tree(request.repo_url, branch)
+
+        # 2. Triage files
+        triaged_files = await orchestrator.triage_files(all_files)
+        logger.info(f"Triaged {len(triaged_files)} files out of {len(all_files)}.")
+
+        # 3. Analyze triaged files
+        vulnerabilities = await orchestrator.analyze_files(triaged_files)
+
+        # 4. Generate Summary
+        summary = await orchestrator.generate_summary(vulnerabilities)
+
+        response = CodeScanResponse(
+            scan_id=str(uuid.uuid4()),
+            repo_url=request.repo_url,
+            summary=summary,
+            issues=vulnerabilities,
+        )
+
+        # Save to in-memory store for the chat feature
+        _remember_scan(response)
+
+        return response
+    except Exception as e:
+        # logger.exception records the traceback, which str(e) alone loses.
+        logger.exception(f"Code scan failed: {e}")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+
+@router.post("/code-scan/chat", response_model=CodeChatResponse)
+async def chat_with_scan(request: CodeChatRequest):
+    """Answer a question about a previously stored scan using the LLM.
+
+    Raises:
+        HTTPException: 400 if no OpenAI API key is configured, 404 if the
+            ``scan_id`` is not in the in-memory store, 500 if the LLM call fails.
+    """
+    if not settings.openai_api_key:
+        raise HTTPException(status_code=400, detail="AI Chat is disabled because OPENAI_API_KEY is not configured.")
+
+    scan_data = scan_store.get(request.scan_id)
+    if not scan_data:
+        raise HTTPException(status_code=404, detail="Scan ID not found or expired.")
+
+    system_prompt = (
+        "You are SecureLens AI, an expert application security assistant. "
+        "You are helping a developer understand a security scan report for their codebase. "
+        f"Here is the context of the scan for the repository {scan_data.repo_url}:\n"
+        f"Summary: {scan_data.summary}\n"
+        f"Vulnerabilities: {json.dumps([v.model_dump() for v in scan_data.issues])}\n\n"
+        "Answer the user's questions clearly, concisely, and professionally. Provide code fixes if requested."
+    )
+
+    try:
+        response = await client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": request.message}
+            ],
+            temperature=0.5,
+        )
+        reply = response.choices[0].message.content or "No response from AI."
+        return CodeChatResponse(reply=reply)
+    except Exception as e:
+        logger.exception(f"AI Chat Error: {e}")
+        raise HTTPException(status_code=500, detail="I encountered an error trying to process your request.") from e
diff --git a/app/schemas/code_scan.py b/app/schemas/code_scan.py
new file mode 100644
index 0000000..85999d6
--- /dev/null
+++ b/app/schemas/code_scan.py
@@ -0,0 +1,46 @@
+"""Request and response models for the code-scan endpoints."""
+
+from pydantic import BaseModel, HttpUrl
+from typing import List, Optional, Dict, Any
+
+
+class CodeScanRequest(BaseModel):
+    """Input of ``/code-scan/analyze``."""
+
+    repo_url: str
+    github_token: str
+    # branch or commit hash optional
+    branch: Optional[str] = "main"
+
+
+class VulnerabilityIssue(BaseModel):
+    """A single finding reported for one file."""
+
+    file_path: str
+    severity: str  # High, Medium, Low, Critical
+    issue: str
+    explanation: str
+    suggested_fix: Optional[str] = None
+    line_number: Optional[int] = None
+
+
+class CodeScanResponse(BaseModel):
+    """Result of a scan: an id, the scanned repository, summary and findings."""
+
+    scan_id: str
+    repo_url: str
+    summary: str
+    issues: List[VulnerabilityIssue]
+
+
+class CodeChatRequest(BaseModel):
+    """Input of ``/code-scan/chat``: a stored scan id and the user's message."""
+
+    scan_id: str
+    message: str
+
+
+class CodeChatResponse(BaseModel):
+    """Output of ``/code-scan/chat``: the assistant's reply."""
+
+    reply: str
diff --git a/app/services/code_scanner/__init__.py b/app/services/code_scanner/__init__.py
new file mode 100644
index 0000000..65892c0
--- /dev/null
+++ b/app/services/code_scanner/__init__.py
@@ -0,0 +1 @@
+# Code Scanner Package
diff --git a/app/services/code_scanner/github_client.py b/app/services/code_scanner/github_client.py
new file mode 100644
index 0000000..16f33ae
--- /dev/null
+++ b/app/services/code_scanner/github_client.py
@@ -0,0 +1,125 @@
+"""Minimal async client for the GitHub REST API calls the code scanner needs."""
+
+import httpx
+import logging
+import base64
+from urllib.parse import quote, urlparse
+from typing import List, Dict, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+# Hosts accepted in repository URLs. Requests always go to api.github.com, so a
+# URL on any other host would be looked up against the wrong service.
+_GITHUB_HOSTS = {"github.com", "www.github.com"}
+
+
+class GitHubClient:
+    """Reads repository trees and file contents using a bearer token."""
+
+    def __init__(self, token: str):
+        self.token = token
+        self.headers = {
+            "Authorization": f"Bearer {self.token}",
+            "Accept": "application/vnd.github.v3+json"
+        }
+        self.base_url = "https://api.github.com"
+
+    def _parse_repo_url(self, repo_url: str) -> Optional[str]:
+        """
+        Extracts owner/repo from https://github.com/owner/repo
+
+        A trailing ".git" on the repository name is dropped. Returns None when
+        the URL names another host or lacks an owner or a repository name.
+        """
+        try:
+            parsed = urlparse(repo_url)
+            if parsed.hostname and parsed.hostname.lower() not in _GITHUB_HOSTS:
+                return None
+            path_parts = parsed.path.strip('/').split('/')
+            if len(path_parts) < 2:
+                return None
+            owner, repo = path_parts[0], path_parts[1]
+            if repo.endswith(".git"):
+                repo = repo[:-len(".git")]
+            if not owner or not repo:
+                return None
+            return f"{owner}/{repo}"
+        except Exception as e:
+            logger.error(f"Failed to parse repo URL {repo_url}: {e}")
+            return None
+
+    async def get_repo_tree(self, repo_url: str, branch: str = "main") -> List[str]:
+        """
+        Fetches the recursive tree of the repository to get all file paths.
+        Returns a list of file paths.
+
+        Raises ValueError for an unusable repository URL and RuntimeError when a
+        GitHub API request fails.
+        """
+        repo_path = self._parse_repo_url(repo_url)
+        if not repo_path:
+            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")
+
+        # First, get the commit SHA for the branch to get the tree SHA.
+        # The ref is percent-encoded so characters such as '#' or '?' stay in the path.
+        commits_url = f"{self.base_url}/repos/{repo_path}/commits/{quote(branch)}"
+        async with httpx.AsyncClient() as client:
+            try:
+                commit_resp = await client.get(commits_url, headers=self.headers, timeout=10.0)
+                commit_resp.raise_for_status()
+                tree_sha = commit_resp.json()["commit"]["tree"]["sha"]
+
+                # Now get the recursive tree
+                tree_url = f"{self.base_url}/repos/{repo_path}/git/trees/{tree_sha}?recursive=1"
+                tree_resp = await client.get(tree_url, headers=self.headers, timeout=15.0)
+                tree_resp.raise_for_status()
+
+                tree_data = tree_resp.json()
+                if tree_data.get("truncated"):
+                    # GitHub caps recursive listings; the result is then partial.
+                    logger.warning(f"GitHub returned a truncated tree for {repo_path}")
+
+                # Only files ("blob"), not directories
+                return [
+                    item["path"]
+                    for item in tree_data.get("tree", [])
+                    if item["type"] == "blob"
+                ]
+            except httpx.HTTPError as e:
+                logger.error(f"GitHub API error fetching tree for {repo_path}: {e}")
+                raise RuntimeError(f"Failed to fetch repository structure: {e}") from e
+
+    async def get_file_content(self, repo_url: str, file_path: str, branch: str = "main") -> str:
+        """
+        Fetches the content of a specific file.
+
+        Returns the decoded text, or "" when the request fails or the response
+        carries no base64-encoded content.
+        """
+        repo_path = self._parse_repo_url(repo_url)
+        if not repo_path:
+            raise ValueError(f"Invalid GitHub repository URL: {repo_url}")
+
+        # We can use the raw URL or the contents API.
+        # The path is percent-encoded and the ref travels as a query parameter so
+        # names containing spaces, '#', '?' or '&' cannot corrupt the request URL.
+        content_url = f"{self.base_url}/repos/{repo_path}/contents/{quote(file_path)}"
+
+        async with httpx.AsyncClient() as client:
+            try:
+                resp = await client.get(
+                    content_url,
+                    params={"ref": branch},
+                    headers=self.headers,
+                    timeout=10.0,
+                )
+                resp.raise_for_status()
+                data = resp.json()
+
+                # A directory path yields a JSON list instead of a file object.
+                if isinstance(data, dict) and data.get("encoding") == "base64" and "content" in data:
+                    return base64.b64decode(data["content"]).decode('utf-8', errors='replace')
+                return ""
+            except httpx.HTTPError as e:
+                logger.error(f"GitHub API error fetching file {file_path}: {e}")
+                return ""  # Return empty if file cannot be read, agent will just skip it
diff --git a/app/services/code_scanner/orchestrator.py b/app/services/code_scanner/orchestrator.py
new file mode 100644
index 0000000..4c297ce
--- /dev/null
+++ b/app/services/code_scanner/orchestrator.py
@@ -0,0 +1,209 @@
+"""LLM-driven steps of a repository scan: triage, per-file analysis, summary."""
+
+import json
+import logging
+from typing import List, Dict, Any, Optional
+from openai import AsyncOpenAI
+
+from app.config import settings
+from app.services.code_scanner.github_client import GitHubClient
+from app.schemas.code_scan import VulnerabilityIssue
+
+logger = logging.getLogger(__name__)
+
+# Most files the triage step may return (the limit its prompt asks for).
+MAX_TRIAGED_FILES = 15
+# Files taken from the head of the listing when the LLM cannot be used.
+FALLBACK_FILE_LIMIT = 10
+
+api_key = settings.openai_api_key or "mock-key-for-testing"
+client = AsyncOpenAI(api_key=api_key)
+
+
+class CodeScanOrchestrator:
+    """Runs the scan steps for one repository and branch."""
+
+    def __init__(self, repo_url: str, github_token: str, branch: str = "main"):
+        self.repo_url = repo_url
+        self.branch = branch
+        self.github = GitHubClient(token=github_token)
+
+    @staticmethod
+    def _select_known_files(data: Any, all_files: List[str]) -> List[str]:
+        """Return the 'critical_files' of an LLM reply that exist in *all_files*."""
+        candidates = data.get("critical_files") if isinstance(data, dict) else None
+        if not isinstance(candidates, list):
+            return []
+
+        known = set(all_files)
+        selected: List[str] = []
+        for path in candidates:
+            # The model may invent or repeat paths; keep real, first-seen ones only.
+            if isinstance(path, str) and path in known and path not in selected:
+                selected.append(path)
+        return selected[:MAX_TRIAGED_FILES]
+
+    @staticmethod
+    def _build_issue(file_path: str, raw: Any) -> Optional[VulnerabilityIssue]:
+        """Convert one LLM finding into a VulnerabilityIssue, or None if unusable."""
+        if not isinstance(raw, dict):
+            return None
+        try:
+            return VulnerabilityIssue(
+                file_path=file_path,
+                # "or" also replaces explicit nulls, which a .get() default would not.
+                severity=raw.get("severity") or "Medium",
+                issue=raw.get("issue") or "Unknown Issue",
+                explanation=raw.get("explanation") or "",
+                suggested_fix=raw.get("suggested_fix"),
+                line_number=raw.get("line_number"),
+            )
+        except (TypeError, ValueError) as e:
+            # pydantic's ValidationError derives from ValueError.
+            logger.warning(f"Skipping malformed finding for {file_path}: {e}")
+            return None
+
+    async def triage_files(self, all_files: List[str]) -> List[str]:
+        """
+        Uses the LLM to select which files are most likely to contain security vulnerabilities
+        (e.g., config files, routers, auth logic).
+
+        Only paths present in all_files are returned, without duplicates and capped
+        at MAX_TRIAGED_FILES. Falls back to the first FALLBACK_FILE_LIMIT files when
+        no API key is set, the request fails, or the reply names no usable path.
+        """
+        if not settings.openai_api_key:
+            logger.warning("OPENAI_API_KEY is not set. Triaging all files up to a limit.")
+            return all_files[:FALLBACK_FILE_LIMIT]  # Hard limit for testing
+
+        # To avoid context limit issues, we might want to chunk this, but for now we pass the list
+        # We can enforce a soft limit on the string length
+        files_str = "\n".join(all_files)
+        if len(files_str) > 15000:
+            files_str = files_str[:15000] + "\n... (truncated)"
+
+        prompt = (
+            "You are a Senior Application Security Engineer. I have a repository with the following files:\n"
+            f"{files_str}\n\n"
+            "Select the most critical files to review for security vulnerabilities (e.g., SAST, hardcoded secrets, SQLi, Auth bypass). "
+            "Return a JSON object with a single key 'critical_files' containing a list of the exact file paths. "
+            "Do not select more than 15 files."
+        )
+
+        try:
+            response = await client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "You always respond with valid JSON."},
+                    {"role": "user", "content": prompt}
+                ],
+                response_format={"type": "json_object"},
+                temperature=0.1,
+            )
+            content = response.choices[0].message.content
+            if content:
+                selected = self._select_known_files(json.loads(content), all_files)
+                if selected:
+                    return selected
+        except Exception as e:
+            logger.error(f"Error triaging files: {e}")
+
+        return all_files[:FALLBACK_FILE_LIMIT]  # Fallback
+
+    async def analyze_files(self, triaged_files: List[str]) -> List[VulnerabilityIssue]:
+        """
+        Fetches the contents of the triaged files and uses the LLM to find vulnerabilities.
+
+        Files that cannot be read and findings that are malformed are skipped.
+        Returns an empty list when no API key is set.
+        """
+        vulnerabilities = []
+
+        if not settings.openai_api_key:
+            return []
+
+        # Analyze files sequentially or in batches (sequential to avoid rate limits for now)
+        for file_path in triaged_files:
+            content = await self.github.get_file_content(self.repo_url, file_path, self.branch)
+            if not content:
+                continue
+
+            # Truncate very large files
+            if len(content) > 20000:
+                content = content[:20000]
+
+            prompt = (
+                f"Review the following code from the file '{file_path}' for security vulnerabilities.\n"
+                "Focus on OWASP Top 10: SQLi, XSS, Hardcoded Secrets, IDOR, Misconfigurations, etc.\n\n"
+                f"CODE:\n{content}\n\n"
+                "Return a JSON object with a key 'vulnerabilities' containing a list of objects. "
+                "Each object MUST have the following keys: "
+                "'severity' (Critical, High, Medium, Low), "
+                "'issue' (A short title), "
+                "'explanation' (1-2 sentences explaining the vulnerability), "
+                "'suggested_fix' (Code snippet or clear instructions to fix), "
+                "'line_number' (integer or null if general)."
+            )
+
+            try:
+                response = await client.chat.completions.create(
+                    model="gpt-3.5-turbo",
+                    messages=[
+                        {"role": "system", "content": "You are a SAST security agent. Always respond with valid JSON."},
+                        {"role": "user", "content": prompt}
+                    ],
+                    response_format={"type": "json_object"},
+                    temperature=0.2,
+                )
+
+                resp_content = response.choices[0].message.content
+                if resp_content:
+                    data = json.loads(resp_content)
+                    vulns = data.get("vulnerabilities") if isinstance(data, dict) else None
+                    if not isinstance(vulns, list):
+                        vulns = []
+                    for v in vulns:
+                        # Each finding is validated on its own so one bad entry
+                        # does not discard the others reported for this file.
+                        issue = self._build_issue(file_path, v)
+                        if issue is not None:
+                            vulnerabilities.append(issue)
+            except Exception as e:
+                logger.error(f"Error analyzing file {file_path}: {e}")
+
+        return vulnerabilities
+
+    async def generate_summary(self, vulnerabilities: List[VulnerabilityIssue]) -> str:
+        """
+        Asks the LLM for an executive summary of the findings.
+
+        Returns a fixed sentence when there are no findings, and a plain count
+        when no API key is set or the request fails.
+        """
+        if not vulnerabilities:
+            return "No obvious security vulnerabilities found in the scanned files."
+
+        if not settings.openai_api_key:
+            return f"Found {len(vulnerabilities)} potential issues."
+
+        issues_data = [v.model_dump() for v in vulnerabilities]
+        prompt = (
+            "You are a Senior AppSec Manager. Summarize the following list of vulnerabilities found in a recent scan. "
+            "Provide a 2-3 paragraph executive summary of the repository's security posture. "
+            "Keep it professional and highlight the most critical risks.\n\n"
+            f"{json.dumps(issues_data)}"
+        )
+
+        try:
+            response = await client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": "You are a cybersecurity expert."},
+                    {"role": "user", "content": prompt}
+                ],
+                temperature=0.4,
+            )
+            return response.choices[0].message.content or "Could not generate summary."
+        except Exception as e:
+            logger.error(f"Error generating summary: {e}")
+            return f"Found {len(vulnerabilities)} potential issues."