fix: persist code scan results to PostgreSQL (replace in-memory store)

2026-06-19 07:00:30 +00:00 · 2026-05-03 16:09:39 +05:30
parent 7159f4b9af
commit 8dd9f12be3
4 changed files with 210 additions and 73 deletions
--- a/app/models/__init__.py
+++ b/app/models/__init__.py
@@ -1,4 +1,5 @@
 from app.models.user import User
 from app.models.scan import ScanResult
+from app.models.code_scan import CodeScanResult

-__all__ = ["User", "ScanResult"]
+__all__ = ["User", "ScanResult", "CodeScanResult"]
--- a/app/models/code_scan.py
+++ b/app/models/code_scan.py
@@ -0,0 +1,43 @@
+import uuid
+from datetime import datetime, timezone
+
+from sqlalchemy import DateTime, ForeignKey, JSON, String, Text
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.database import Base
+
+
+class CodeScanResult(Base):
+    """
+    Persists the result of an AI-powered code repository scan to the database.
+
+    Why this exists:
+    - Previously, code scan results were stored in a plain Python dict (scan_store)
+      in memory. This caused data loss on every server restart and prevented the
+      chat feature from working reliably. This model fixes that permanently.
+
+    Columns:
+    - id:           UUID primary key, used as the scan_id returned to the client.
+    - user_id:      Optional FK to users table. NULL for unauthenticated scans.
+    - repo_url:     The GitHub repository URL that was scanned.
+    - summary:      The AI-generated executive summary of the scan.
+    - issues:       JSON list of VulnerabilityIssue dicts.
+    - created_at:   Timestamp of when the scan was performed.
+    """
+
+    __tablename__ = "code_scan_results"
+
+    id: Mapped[str] = mapped_column(
+        String(36), primary_key=True, default=lambda: str(uuid.uuid4())
+    )
+    user_id: Mapped[str | None] = mapped_column(
+        String(36), ForeignKey("users.id"), index=True, nullable=True
+    )
+    repo_url: Mapped[str] = mapped_column(String(2048))
+    summary: Mapped[str] = mapped_column(Text, default="")
+    issues: Mapped[list] = mapped_column(JSON, default=list)
+    created_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
+    )
+
+    user = relationship("User", back_populates="code_scans")
--- a/app/models/user.py
+++ b/app/models/user.py
@@ -21,5 +21,6 @@ class User(Base):
    )

    scans = relationship("ScanResult", back_populates="user", lazy="selectin")
+    code_scans = relationship("CodeScanResult", back_populates="user", lazy="selectin", cascade="all, delete")
    api_keys = relationship("ApiKey", back_populates="user", lazy="selectin", cascade="all, delete")
    webhooks = relationship("Webhook", back_populates="user", lazy="selectin", cascade="all, delete")
--- a/app/routers/code_scan.py
+++ b/app/routers/code_scan.py
@@ -1,10 +1,41 @@
-import logging
-import uuid
-import json
-from fastapi import APIRouter, HTTPException
-from typing import Dict, Any
+"""
+Code Scan Router
+================

-from app.schemas.code_scan import CodeScanRequest, CodeScanResponse, CodeChatRequest, CodeChatResponse
+Three endpoints:
+  POST /code-scan/analyze  — Fetch repo file tree, triage + analyze files with AI,
+                              persist result to PostgreSQL, return scan_id.
+  POST /code-scan/chat     — Load the persisted scan from DB and answer questions.
+  GET  /code-scan/models   — List available AI models (debug / informational).
+
+Why we moved away from in-memory scan_store:
+  The original implementation stored scan results in a plain Python dict.
+  This caused two critical problems:
+    1. Any server restart or crash wiped all stored context, breaking chat.
+    2. Multiple Uvicorn workers have isolated memory; a scan on worker A
+       cannot be chatted with on worker B.
+  By persisting to PostgreSQL we get durability, scalability, and historical
+  records of all code scans (same pattern as the web scanner).
+"""
+
+import logging
+import json
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+from typing import Dict
+
+from app.database import get_db
+from app.middleware.auth import get_optional_user
+from app.models.code_scan import CodeScanResult
+from app.models.user import User
+from app.schemas.code_scan import (
+    CodeScanRequest,
+    CodeScanResponse,
+    CodeChatRequest,
+    CodeChatResponse,
+    VulnerabilityIssue,
+)
 from app.services.code_scanner.orchestrator import CodeScanOrchestrator
 from app.config import settings

@@ -12,98 +43,159 @@ logger = logging.getLogger(__name__)

 router = APIRouter(tags=["code-scan"])

-# In-memory store for scan results to support chat context.
-# In a real production app, this would be stored in the database.
-scan_store: Dict[str, CodeScanResponse] = {}
+
+# ---------------------------------------------------------------------------
+# POST /code-scan/analyze
+# ---------------------------------------------------------------------------

@router.post("/code-scan/analyze", response_model=CodeScanResponse)
-async def analyze_codebase(request: CodeScanRequest):
+async def analyze_codebase(
+    request: CodeScanRequest,
+    db: AsyncSession = Depends(get_db),
+    current_user: User | None = Depends(get_optional_user),
+):
+    """
+    Full agentic scan of a GitHub repository.
+
+    Flow:
+      1. Fetch the full file tree from GitHub.
+      2. Ask the AI to triage (select) the most security-critical files.
+      3. Analyze each triaged file for OWASP Top-10 vulnerabilities.
+      4. Generate an executive summary of all findings.
+      5. Persist the result to `code_scan_results` table.
+      6. Return the scan_id so the frontend can open a chat session.
+    """
    logger.info(f"Starting code scan for {request.repo_url}")
-    
+
    try:
        orchestrator = CodeScanOrchestrator(
            repo_url=request.repo_url,
            github_token=request.github_token,
-            branch=request.branch or "main"
+            branch=request.branch or "main",
        )
-        
-        # 1. Fetch repo structure
-        all_files = await orchestrator.github.get_repo_tree(request.repo_url, request.branch or "main")
-        
-        # 2. Triage files
+
+        # Step 1 — Fetch file tree
+        all_files = await orchestrator.github.get_repo_tree(
+            request.repo_url, request.branch or "main"
+        )
+
+        # Step 2 — AI triage: pick the most security-sensitive files
        triaged_files = await orchestrator.triage_files(all_files)
        logger.info(f"Triaged {len(triaged_files)} files out of {len(all_files)}.")
-        
-        # 3. Analyze triaged files
+
+        # Step 3 — Analyze each file
        vulnerabilities = await orchestrator.analyze_files(triaged_files)
-        
-        # 4. Generate Summary
+
+        # Step 4 — Generate summary
        summary = await orchestrator.generate_summary(vulnerabilities)
-        
-        scan_id = str(uuid.uuid4())
-        
-        response = CodeScanResponse(
+
+        # Step 5 — Persist to database
+        # Serialise VulnerabilityIssue objects to plain dicts for JSON storage.
+        issues_as_dicts = [v.model_dump() for v in vulnerabilities]
+
+        scan_record = CodeScanResult(
+            user_id=current_user.id if current_user else None,
+            repo_url=request.repo_url,
+            summary=summary,
+            issues=issues_as_dicts,
+        )
+        db.add(scan_record)
+        await db.flush()  # flush to get the auto-generated id without committing yet
+        scan_id = scan_record.id
+
+        logger.info(f"Code scan {scan_id} persisted to database.")
+
+        return CodeScanResponse(
            scan_id=scan_id,
            repo_url=request.repo_url,
            summary=summary,
-            issues=vulnerabilities
+            issues=vulnerabilities,
        )
-        
-        # Save to in-memory store for the chat feature
-        scan_store[scan_id] = response
-        
-        return response
+
    except Exception as e:
        logger.error(f"Code scan failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
+
+
+# ---------------------------------------------------------------------------
+# POST /code-scan/chat
+# ---------------------------------------------------------------------------
+
+@router.post("/code-scan/chat", response_model=CodeChatResponse)
+async def chat_with_scan(
+    request: CodeChatRequest,
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Conversational Q&A grounded in a previously completed code scan.
+
+    We load the scan from PostgreSQL using the scan_id, so this works
+    correctly across server restarts and multiple workers.
+    """
+    if not settings.ai_api_key:
+        raise HTTPException(
+            status_code=400,
+            detail="AI Chat is disabled because no AI API key is configured.",
+        )
+
+    # Load the scan record from DB
+    result = await db.execute(
+        select(CodeScanResult).where(CodeScanResult.id == request.scan_id)
+    )
+    scan_data = result.scalar_one_or_none()
+
+    if not scan_data:
+        raise HTTPException(
+            status_code=404,
+            detail="Scan ID not found. The scan may have expired or never existed.",
+        )
+
+    # Build a rich context-aware prompt
+    prompt = (
+        "You are SecureLens AI, an expert application security assistant. "
+        "You are helping a developer understand a security scan report for their codebase. "
+        f"Here is the context of the scan for the repository '{scan_data.repo_url}':\n\n"
+        f"Executive Summary:\n{scan_data.summary}\n\n"
+        f"Vulnerabilities Found:\n{json.dumps(scan_data.issues, indent=2)}\n\n"
+        f"Developer's Question: {request.message}\n\n"
+        "Answer clearly, concisely, and professionally. "
+        "Provide specific code fixes when requested. "
+        "Reference the exact file paths and line numbers from the scan data when relevant."
+    )
+
+    try:
+        # Use the unified AI service so it respects whatever provider is configured
+        from app.services.ai import call_ai
+        reply = await call_ai(prompt, temperature=0.5)
+        return CodeChatResponse(reply=reply)
+    except Exception as e:
+        logger.error(f"AI Chat Error: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail="I encountered an error trying to process your request.",
+        )
+
+
+# ---------------------------------------------------------------------------
+# GET /code-scan/models  (informational / debug)
+# ---------------------------------------------------------------------------
+
@router.get("/code-scan/models")
 async def list_available_models():
-    if not settings.gemini_api_key:
-        raise HTTPException(status_code=500, detail="GEMINI_API_KEY is not set.")
+    """
+    Lists models from the Google GenAI client that support `generateContent`.
+    Only meaningful when the configured AI provider is Gemini.
+    """
+    if not settings.ai_api_key:
+        raise HTTPException(status_code=500, detail="No AI API key is set.")
    try:
        from google import genai
-        client = genai.Client(api_key=settings.gemini_api_key)
+
+        client = genai.Client(api_key=settings.ai_api_key)
        models = []
        for model in client.models.list():
-            if 'generateContent' in model.supported_actions:
+            if "generateContent" in model.supported_actions:
                models.append(model.name)
        return {"models": models}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching models: {e}")
-
-@router.post("/code-scan/chat", response_model=CodeChatResponse)
-async def chat_with_scan(request: CodeChatRequest):
-    if not settings.gemini_api_key:
-        raise HTTPException(status_code=400, detail="AI Chat is disabled because GEMINI_API_KEY is not configured.")
-        
-    scan_data = scan_store.get(request.scan_id)
-    if not scan_data:
-        raise HTTPException(status_code=404, detail="Scan ID not found or expired.")
-        
-    prompt = (
-        "You are SecureLens AI, an expert application security assistant. "
-        "You are helping a developer understand a security scan report for their codebase. "
-        f"Here is the context of the scan for the repository {scan_data.repo_url}:\n"
-        f"Summary: {scan_data.summary}\n"
-        f"Vulnerabilities: {json.dumps([v.model_dump() for v in scan_data.issues])}\n\n"
-        f"User Message: {request.message}\n\n"
-        "Answer the user's questions clearly, concisely, and professionally. Provide code fixes if requested."
-    )
-
-    try:
-        from google import genai
-        from google.genai import types
-        
-        client = genai.Client(api_key=settings.gemini_api_key)
-        response = await client.aio.models.generate_content(
-            model='gemini-2.0-flash',
-            contents=prompt,
-            config=types.GenerateContentConfig(
-                temperature=0.5,
-            )
-        )
-        reply = response.text or "No response from AI."
-        return CodeChatResponse(reply=reply)
-    except Exception as e:
-        logger.error(f"AI Chat Error: {str(e)}")
-        raise HTTPException(status_code=500, detail="I encountered an error trying to process your request.")