"""
Nuclei Scanner
==============

Wraps the Nuclei CLI tool (https://github.com/projectdiscovery/nuclei) as an
async subprocess call so it can run as a background task after the main scan
response has been returned to the client.

Why a background task?
    Nuclei is an active scanner — it actually sends probe requests to the target.
    A typical run takes 30–120 seconds, which is far too slow to block an API
    response. Running it in the background lets the client get the passive scan
    results immediately, and then poll GET /scans/{id}/nuclei for the Nuclei
    findings when they are ready.

Nuclei is completely optional. If the binary is not found in PATH (or the
configured path), the scan is skipped and the NucleiScanResult row is saved
with status "skipped". No error is raised.

Output format:
    Nuclei is asked for one JSON object per line. The fields this module
    reads from each object are:
    {
        "template-id": "...",
        "info": {"name": "...", "severity": "...", "description": "..."},
        "matched-at": "https://..."
    }

Installation:
    go install -v github.com/projectdiscovery/nuclei/v3/cmd/nuclei@latest
    Or download the binary from https://github.com/projectdiscovery/nuclei/releases
"""

import asyncio
import contextlib
import json
import logging
import shutil
from datetime import datetime, timezone

from app.config import settings
from app.database import AsyncSessionLocal
from app.models.nuclei_result import NucleiScanResult

logger = logging.getLogger(__name__)

# Conservative timeout — Nuclei can be slow, but we cap it at 90 seconds
# to prevent background tasks from running indefinitely.
NUCLEI_TIMEOUT_SECONDS = 90


def _find_nuclei_binary() -> str | None:
    """
    Resolve the Nuclei binary path.

    Returns ``settings.nuclei_binary_path`` when it is set (the value is not
    checked for existence here), otherwise the result of searching PATH for
    ``nuclei``. Returns None if neither yields a path.
    """
    return settings.nuclei_binary_path or shutil.which("nuclei")


def _parse_nuclei_output(stdout: bytes) -> list[dict]:
    """
    Parse Nuclei's JSONL output into a list of finding dicts.

    Each non-blank line of ``stdout`` is expected to be a JSON object. Lines
    that are not valid JSON, or that decode to something other than an
    object, are skipped with a warning so that one bad line cannot discard
    the findings from every other line.

    Returns:
        One dict per accepted line with the keys ``template_id``, ``name``,
        ``severity``, ``matched_at`` and ``description``.
    """
    findings: list[dict] = []
    for line in stdout.decode("utf-8", errors="replace").splitlines():
        line = line.strip()
        if not line:
            continue

        try:
            raw = json.loads(line)
        except json.JSONDecodeError:
            logger.warning("Nuclei: could not parse output line: %s", line[:200])
            continue

        if not isinstance(raw, dict):
            # Valid JSON but not an object (e.g. a bare string or list).
            logger.warning("Nuclei: unexpected non-object output line: %s", line[:200])
            continue

        # "info" may be absent or null; fall back to an empty mapping so the
        # per-field defaults below apply instead of raising AttributeError.
        info = raw.get("info")
        if not isinstance(info, dict):
            info = {}

        # Normalise into a flat schema we control
        findings.append({
            "template_id": raw.get("template-id", "unknown"),
            "name": info.get("name", "Unknown"),
            "severity": info.get("severity", "info"),
            "matched_at": raw.get("matched-at", ""),
            "description": info.get("description", None),
        })

    return findings


def _kill_quietly(proc: asyncio.subprocess.Process) -> None:
    """Kill ``proc`` if it is still running, ignoring an already-exited process."""
    if proc.returncode is None:
        with contextlib.suppress(ProcessLookupError):
            proc.kill()


async def _execute_nuclei(nuclei_path: str, url: str) -> tuple[list[dict], str]:
    """
    Run the Nuclei subprocess against ``url`` and classify the outcome.

    Returns:
        ``(findings, status)`` where status is "completed", "timeout" or
        "error". Findings are only populated for "completed".

    Raises:
        Whatever ``asyncio.create_subprocess_exec`` raises when the binary
        cannot be started (e.g. FileNotFoundError); the caller maps those to
        a status.
    """
    proc = await asyncio.create_subprocess_exec(
        nuclei_path,
        "-u", url,
        # NOTE(review): the module docstring recommends installing Nuclei v3;
        # newer releases appear to document `-jsonl` / `-j` for JSON-lines
        # output — confirm `-json` is accepted by the installed version.
        "-json",              # output as JSON lines
        "-silent",            # suppress banner/progress
        "-timeout", "10",     # per-request timeout in seconds (inside Nuclei)
        "-rate-limit", "10",  # be polite — 10 req/s max
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )

    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(),
            timeout=NUCLEI_TIMEOUT_SECONDS,
        )
    except asyncio.TimeoutError:
        _kill_quietly(proc)
        await proc.communicate()  # drain pipes and reap the killed process
        logger.warning(
            "Nuclei timed out after %ss for %s", NUCLEI_TIMEOUT_SECONDS, url
        )
        return [], "timeout"
    except BaseException:
        # Task cancellation (or any other unexpected failure) while waiting:
        # do not leave the scanner running after this task is gone.
        _kill_quietly(proc)
        raise

    if proc.returncode not in (0, 1):
        # Nuclei exits 1 when it finds nothing — that's not an error
        err = stderr.decode("utf-8", errors="replace")[:500]
        logger.error("Nuclei exited with code %s: %s", proc.returncode, err)
        return [], "error"

    findings = _parse_nuclei_output(stdout)
    logger.info("Nuclei scan complete: %s — %d finding(s)", url, len(findings))
    return findings, "completed"


async def run_nuclei_scan(scan_result_id: str, url: str) -> None:
    """
    Entry point for the background Nuclei scan task.

    This runs after the main scan response has been sent. It:
      1. Checks if the Nuclei binary is available
      2. Runs Nuclei against the URL with a timeout
      3. Parses the JSONL output
      4. Saves a NucleiScanResult row to the database

    The NucleiScanResult.status field reflects the outcome:
      - "completed" : Nuclei ran and (possibly) found issues
      - "skipped"   : Nuclei binary not found
      - "timeout"   : Nuclei ran but exceeded the timeout
      - "error"     : Nuclei subprocess failed

    The row is written exactly once, outside the scan's error handling, so a
    database failure is not misreported as a scan error or retried as a
    second insert.
    """
    nuclei_path = _find_nuclei_binary()

    if not nuclei_path:
        logger.info(
            "Nuclei binary not found — active scan skipped. "
            "Install nuclei and set NUCLEI_BINARY_PATH if you want active scanning."
        )
        await _save_nuclei_result(scan_result_id, url, [], "skipped")
        return

    logger.info("Nuclei active scan starting: %s", url)

    try:
        findings, status = await _execute_nuclei(nuclei_path, url)
    except FileNotFoundError:
        logger.error("Nuclei binary not executable at path: %s", nuclei_path)
        findings, status = [], "skipped"
    except Exception as e:
        logger.error("Nuclei scan failed for %s: %s", url, e, exc_info=True)
        findings, status = [], "error"

    await _save_nuclei_result(scan_result_id, url, findings, status)


async def _save_nuclei_result(
    scan_result_id: str,
    url: str,
    findings: list[dict],
    status: str,
) -> None:
    """
    Persist the Nuclei scan result.

    Opens its own ``AsyncSessionLocal`` session (background context), adds one
    NucleiScanResult row stamped with the current UTC time, and commits.
    """
    async with AsyncSessionLocal() as db:
        row = NucleiScanResult(
            scan_result_id=scan_result_id,
            url=url,
            findings=findings,
            status=status,
            completed_at=datetime.now(timezone.utc),
        )
        db.add(row)
        await db.commit()