import os
import asyncio
import logging
from pathlib import Path
from contextlib import asynccontextmanager

import httpx
import aiosqlite
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from dotenv import load_dotenv

load_dotenv()

from app.db import (
    DATA_DIR,
    PARQUET_PATH,
    SQLITE_PATH,
    init_db,
    get_stats,
    get_domains,
    get_enriched,
    queue_domains,
    get_queue_status,
    build_duckdb_index,
    index_status,
)
from app.enricher import start_worker, pause_worker, resume_worker, is_running
from app.scorer import run_scoring

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

PARQUET_URL = os.getenv("PARQUET_URL", "")


async def download_parquet():
    """Download the source parquet file, resuming a partial download if a .tmp file exists."""
    if PARQUET_PATH.exists():
        logger.info("Using cached parquet at %s", PARQUET_PATH)
        return

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    tmp_path = PARQUET_PATH.with_suffix(".tmp")
    downloaded = tmp_path.stat().st_size if tmp_path.exists() else 0
    headers = {"Range": f"bytes={downloaded}-"} if downloaded > 0 else {}

    logger.info("Downloading parquet from %s (offset=%d)...", PARQUET_URL, downloaded)
    async with httpx.AsyncClient(follow_redirects=True, timeout=None) as client:
        async with client.stream("GET", PARQUET_URL, headers=headers) as resp:
            if resp.status_code == 416:
                # Requested range not satisfiable: the partial file is already complete.
                tmp_path.rename(PARQUET_PATH)
                return
            resp.raise_for_status()
            total = int(resp.headers.get("content-length", 0)) + downloaded
            mode = "ab" if downloaded > 0 else "wb"
            with open(tmp_path, mode) as f:
                received = downloaded
                async for chunk in resp.aiter_bytes(chunk_size=1024 * 1024):
                    f.write(chunk)
                    received += len(chunk)
                    if total:
                        logger.info("Download: %.1f%% (%d/%d)", received / total * 100, received, total)

    tmp_path.rename(PARQUET_PATH)
    logger.info("Parquet download complete")


@asynccontextmanager
async def lifespan(app: FastAPI):
    await download_parquet()
    await init_db()
    # Build DuckDB index in background; queries still work (slower) while it builds
    asyncio.create_task(build_duckdb_index())
    start_worker()
    logger.info("DomGod ready on port 6677")
    yield


app = FastAPI(title="DomGod", lifespan=lifespan)


# ── API ──────────────────────────────────────────────────────────────────────


@app.get("/api/stats")
async def stats():
    return await get_stats()


@app.get("/api/index/status")
async def get_index_status():
    return index_status()


@app.get("/api/domains")
async def domains(
    tld: str = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=500),
    live_only: bool = Query(False),
    alpha_only: bool = Query(False),
    no_sld: bool = Query(False),
    keyword: str = Query(None),
):
    total, rows = await get_domains(
        tld=tld,
        page=page,
        limit=limit,
        alpha_only=alpha_only,
        no_sld=no_sld,
        keyword=keyword,
        live_only=live_only,
    )
    return {"page": page, "limit": limit, "total": total, "results": rows}


@app.post("/api/enrich/batch")
async def enrich_batch(body: dict):
    domains_list = body.get("domains", [])
    if not domains_list:
        return JSONResponse({"error": "no domains provided"}, status_code=400)
    await queue_domains(domains_list)
    resume_worker()
    return {"queued": len(domains_list)}


@app.get("/api/enrich/status")
async def enrich_status():
    status = await get_queue_status()
    status["worker_running"] = is_running()
    return status


@app.post("/api/enrich/retry")
async def enrich_retry():
    # Reset all failed jobs to pending and wake the worker
    async with aiosqlite.connect(SQLITE_PATH) as db:
        await db.execute("UPDATE job_queue SET status='pending', error=NULL WHERE status='failed'")
        await db.commit()
    resume_worker()
    return {"status": "retrying"}


@app.post("/api/enrich/pause")
async def enrich_pause():
    pause_worker()
    return {"status": "paused"}


@app.post("/api/enrich/resume")
async def enrich_resume():
    resume_worker()
    return {"status": "resumed"}


@app.get("/api/enriched")
async def enriched(
    min_score: int = Query(0, ge=0, le=100),
    cms: str = Query(None),
    country: str = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=1000),
):
    total, rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page, limit=limit)
    return {"page": page, "limit": limit, "total": total, "results": rows}


@app.get("/api/export")
async def export_csv(
    min_score: int = Query(0),
    cms: str = Query(None),
    country: str = Query(None),
    tier: str = Query(None),
):
    # Tier shortcuts: "hot" = score >= 80, "warm" = score 50-79
    if tier == "hot":
        min_score = 80
    elif tier == "warm":
        min_score = 50
    max_score = 79 if tier == "warm" else 100

    async def generate():
        yield "domain,score,cms,ssl_expiry_days,ip_country,is_live,status_code,has_mx,server,page_title,enriched_at\n"
        p = 1
        while True:
            _, rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=p, limit=500)
            if not rows:
                break
            for r in rows:
                if r.get("score", 0) > max_score:
                    continue
                # Quote every field; swap embedded double quotes for single quotes to keep the CSV valid
                line = ",".join(
                    f'"{str(r.get(col) or "").replace(chr(34), chr(39))}"'
                    for col in [
                        "domain", "score", "cms", "ssl_expiry_days", "ip_country", "is_live",
                        "status_code", "has_mx", "server", "page_title", "enriched_at",
                    ]
                )
                yield line + "\n"
            p += 1

    fname = f"domgod_{tier or 'export'}_score{min_score}.csv"
    return StreamingResponse(
        generate(),
        media_type="text/csv",
        headers={"Content-Disposition": f'attachment; filename="{fname}"'},
    )


@app.post("/api/score/run")
async def score_run():
    return await run_scoring()


# ── Static UI ────────────────────────────────────────────────────────────────

static_dir = Path(__file__).parent / "static"
app.mount("/", StaticFiles(directory=str(static_dir), html=True), name="static")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("app.main:app", host="0.0.0.0", port=6677, log_level="info")