From 8dd9f12be3dc5aeda394407b02208b84d9927596 Mon Sep 17 00:00:00 2001 From: rarebuffalo Date: Sun, 3 May 2026 16:09:39 +0530 Subject: [PATCH] fix: persist code scan results to PostgreSQL (replace in-memory store) --- app/models/__init__.py | 3 +- app/models/code_scan.py | 43 +++++++ app/models/user.py | 1 + app/routers/code_scan.py | 236 +++++++++++++++++++++++++++------------ 4 files changed, 210 insertions(+), 73 deletions(-) create mode 100644 app/models/code_scan.py diff --git a/app/models/__init__.py b/app/models/__init__.py index a8b37b7..acbf549 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -1,4 +1,5 @@ from app.models.user import User from app.models.scan import ScanResult +from app.models.code_scan import CodeScanResult -__all__ = ["User", "ScanResult"] +__all__ = ["User", "ScanResult", "CodeScanResult"] diff --git a/app/models/code_scan.py b/app/models/code_scan.py new file mode 100644 index 0000000..4f67f14 --- /dev/null +++ b/app/models/code_scan.py @@ -0,0 +1,43 @@ +import uuid +from datetime import datetime, timezone + +from sqlalchemy import DateTime, ForeignKey, JSON, String, Text +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.database import Base + + +class CodeScanResult(Base): + """ + Persists the result of an AI-powered code repository scan to the database. + + Why this exists: + - Previously, code scan results were stored in a plain Python dict (scan_store) + in memory. This caused data loss on every server restart and prevented the + chat feature from working reliably. This model fixes that permanently. + + Columns: + - id: UUID primary key, used as the scan_id returned to the client. + - user_id: Optional FK to users table. NULL for unauthenticated scans. + - repo_url: The GitHub repository URL that was scanned. + - summary: The AI-generated executive summary of the scan. + - issues: JSON list of VulnerabilityIssue dicts. + - created_at: Timestamp of when the scan was performed. + """ + + __tablename__ = "code_scan_results" + + id: Mapped[str] = mapped_column( + String(36), primary_key=True, default=lambda: str(uuid.uuid4()) + ) + user_id: Mapped[str | None] = mapped_column( + String(36), ForeignKey("users.id"), index=True, nullable=True + ) + repo_url: Mapped[str] = mapped_column(String(2048)) + summary: Mapped[str] = mapped_column(Text, default="") + issues: Mapped[list] = mapped_column(JSON, default=list) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), default=lambda: datetime.now(timezone.utc) + ) + + user = relationship("User", back_populates="code_scans") diff --git a/app/models/user.py b/app/models/user.py index 713d51d..348becf 100644 --- a/app/models/user.py +++ b/app/models/user.py @@ -21,5 +21,6 @@ class User(Base): ) scans = relationship("ScanResult", back_populates="user", lazy="selectin") + code_scans = relationship("CodeScanResult", back_populates="user", lazy="selectin", cascade="all, delete") api_keys = relationship("ApiKey", back_populates="user", lazy="selectin", cascade="all, delete") webhooks = relationship("Webhook", back_populates="user", lazy="selectin", cascade="all, delete") diff --git a/app/routers/code_scan.py b/app/routers/code_scan.py index 45326bf..7524539 100644 --- a/app/routers/code_scan.py +++ b/app/routers/code_scan.py @@ -1,10 +1,41 @@ -import logging -import uuid -import json -from fastapi import APIRouter, HTTPException -from typing import Dict, Any +""" +Code Scan Router +================ -from app.schemas.code_scan import CodeScanRequest, CodeScanResponse, CodeChatRequest, CodeChatResponse +Two endpoints: + POST /code-scan/analyze — Clone repo tree, triage + analyze files with AI, + persist result to PostgreSQL, return scan_id. + POST /code-scan/chat — Load the persisted scan from DB and answer questions. + GET /code-scan/models — List available AI models (debug / informational). + +Why we moved away from in-memory scan_store: + The original implementation stored scan results in a plain Python dict. + This caused two critical problems: + 1. Any server restart or crash wiped all stored context, breaking chat. + 2. Multiple Uvicorn workers have isolated memory; a scan on worker A + cannot be chatted with on worker B. + By persisting to PostgreSQL we get durability, scalability, and historical + records of all code scans (same pattern as the web scanner). +""" + +import logging +import json +from fastapi import APIRouter, Depends, HTTPException +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from typing import Dict + +from app.database import get_db +from app.middleware.auth import get_optional_user +from app.models.code_scan import CodeScanResult +from app.models.user import User +from app.schemas.code_scan import ( + CodeScanRequest, + CodeScanResponse, + CodeChatRequest, + CodeChatResponse, + VulnerabilityIssue, +) from app.services.code_scanner.orchestrator import CodeScanOrchestrator from app.config import settings @@ -12,98 +43,159 @@ logger = logging.getLogger(__name__) router = APIRouter(tags=["code-scan"]) -# In-memory store for scan results to support chat context. -# In a real production app, this would be stored in the database. -scan_store: Dict[str, CodeScanResponse] = {} + +# --------------------------------------------------------------------------- +# POST /code-scan/analyze +# --------------------------------------------------------------------------- @router.post("/code-scan/analyze", response_model=CodeScanResponse) -async def analyze_codebase(request: CodeScanRequest): +async def analyze_codebase( + request: CodeScanRequest, + db: AsyncSession = Depends(get_db), + current_user: User | None = Depends(get_optional_user), +): + """ + Full agentic scan of a GitHub repository. + + Flow: + 1. Fetch the full file tree from GitHub. + 2. Ask the AI to triage (select) the most security-critical files. + 3. Analyze each triaged file for OWASP Top-10 vulnerabilities. + 4. Generate an executive summary of all findings. + 5. Persist the result to `code_scan_results` table. + 6. Return the scan_id so the frontend can open a chat session. + """ logger.info(f"Starting code scan for {request.repo_url}") - + try: orchestrator = CodeScanOrchestrator( repo_url=request.repo_url, github_token=request.github_token, - branch=request.branch or "main" + branch=request.branch or "main", ) - - # 1. Fetch repo structure - all_files = await orchestrator.github.get_repo_tree(request.repo_url, request.branch or "main") - - # 2. Triage files + + # Step 1 — Fetch file tree + all_files = await orchestrator.github.get_repo_tree( + request.repo_url, request.branch or "main" + ) + + # Step 2 — AI triage: pick the most security-sensitive files triaged_files = await orchestrator.triage_files(all_files) logger.info(f"Triaged {len(triaged_files)} files out of {len(all_files)}.") - - # 3. Analyze triaged files + + # Step 3 — Analyze each file vulnerabilities = await orchestrator.analyze_files(triaged_files) - - # 4. Generate Summary + + # Step 4 — Generate summary summary = await orchestrator.generate_summary(vulnerabilities) - - scan_id = str(uuid.uuid4()) - - response = CodeScanResponse( + + # Step 5 — Persist to database + # Serialise VulnerabilityIssue objects to plain dicts for JSON storage. + issues_as_dicts = [v.model_dump() for v in vulnerabilities] + + scan_record = CodeScanResult( + user_id=current_user.id if current_user else None, + repo_url=request.repo_url, + summary=summary, + issues=issues_as_dicts, + ) + db.add(scan_record) + await db.flush() # flush to get the auto-generated id without committing yet + scan_id = scan_record.id + + logger.info(f"Code scan {scan_id} persisted to database.") + + return CodeScanResponse( scan_id=scan_id, repo_url=request.repo_url, summary=summary, - issues=vulnerabilities + issues=vulnerabilities, ) - - # Save to in-memory store for the chat feature - scan_store[scan_id] = response - - return response + except Exception as e: logger.error(f"Code scan failed: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) + + +# --------------------------------------------------------------------------- +# POST /code-scan/chat +# --------------------------------------------------------------------------- + +@router.post("/code-scan/chat", response_model=CodeChatResponse) +async def chat_with_scan( + request: CodeChatRequest, + db: AsyncSession = Depends(get_db), +): + """ + Conversational Q&A grounded in a previously completed code scan. + + We load the scan from PostgreSQL using the scan_id, so this works + correctly across server restarts and multiple workers. + """ + if not settings.ai_api_key: + raise HTTPException( + status_code=400, + detail="AI Chat is disabled because no AI API key is configured.", + ) + + # Load the scan record from DB + result = await db.execute( + select(CodeScanResult).where(CodeScanResult.id == request.scan_id) + ) + scan_data = result.scalar_one_or_none() + + if not scan_data: + raise HTTPException( + status_code=404, + detail="Scan ID not found. The scan may have expired or never existed.", + ) + + # Build a rich context-aware prompt + prompt = ( + "You are SecureLens AI, an expert application security assistant. " + "You are helping a developer understand a security scan report for their codebase. " + f"Here is the context of the scan for the repository '{scan_data.repo_url}':\n\n" + f"Executive Summary:\n{scan_data.summary}\n\n" + f"Vulnerabilities Found:\n{json.dumps(scan_data.issues, indent=2)}\n\n" + f"Developer's Question: {request.message}\n\n" + "Answer clearly, concisely, and professionally. " + "Provide specific code fixes when requested. " + "Reference the exact file paths and line numbers from the scan data when relevant." + ) + + try: + # Use the unified AI service so it respects whatever provider is configured + from app.services.ai import call_ai + reply = await call_ai(prompt, temperature=0.5) + return CodeChatResponse(reply=reply) + except Exception as e: + logger.error(f"AI Chat Error: {str(e)}") + raise HTTPException( + status_code=500, + detail="I encountered an error trying to process your request.", + ) + + +# --------------------------------------------------------------------------- +# GET /code-scan/models (informational / debug) +# --------------------------------------------------------------------------- + @router.get("/code-scan/models") async def list_available_models(): - if not settings.gemini_api_key: - raise HTTPException(status_code=500, detail="GEMINI_API_KEY is not set.") + """ + Lists AI models available to the configured provider. + Only meaningful when using the Gemini provider. + """ + if not settings.ai_api_key: + raise HTTPException(status_code=500, detail="No AI API key is set.") try: from google import genai - client = genai.Client(api_key=settings.gemini_api_key) + + client = genai.Client(api_key=settings.ai_api_key) models = [] for model in client.models.list(): - if 'generateContent' in model.supported_actions: + if "generateContent" in model.supported_actions: models.append(model.name) return {"models": models} except Exception as e: raise HTTPException(status_code=500, detail=f"Error fetching models: {e}") - -@router.post("/code-scan/chat", response_model=CodeChatResponse) -async def chat_with_scan(request: CodeChatRequest): - if not settings.gemini_api_key: - raise HTTPException(status_code=400, detail="AI Chat is disabled because GEMINI_API_KEY is not configured.") - - scan_data = scan_store.get(request.scan_id) - if not scan_data: - raise HTTPException(status_code=404, detail="Scan ID not found or expired.") - - prompt = ( - "You are SecureLens AI, an expert application security assistant. " - "You are helping a developer understand a security scan report for their codebase. " - f"Here is the context of the scan for the repository {scan_data.repo_url}:\n" - f"Summary: {scan_data.summary}\n" - f"Vulnerabilities: {json.dumps([v.model_dump() for v in scan_data.issues])}\n\n" - f"User Message: {request.message}\n\n" - "Answer the user's questions clearly, concisely, and professionally. Provide code fixes if requested." - ) - - try: - from google import genai - from google.genai import types - - client = genai.Client(api_key=settings.gemini_api_key) - response = await client.aio.models.generate_content( - model='gemini-2.0-flash', - contents=prompt, - config=types.GenerateContentConfig( - temperature=0.5, - ) - ) - reply = response.text or "No response from AI." - return CodeChatResponse(reply=reply) - except Exception as e: - logger.error(f"AI Chat Error: {str(e)}") - raise HTTPException(status_code=500, detail="I encountered an error trying to process your request.")