feat: initial Dockerized domain intelligence dashboard
- FastAPI backend with DuckDB pushdown queries on 72M parquet
- Async enrichment worker: HTTP, SSL, DNS MX, CMS fingerprint, ip-api.com
- Resumable parquet download with HTTP Range support
- Lead scoring engine (max 100 pts, target countries ES,GB,DE,FR,RO,PT,AD,IT)
- Single-file Alpine.js + Chart.js dashboard on port 6677
- SQLite enrichment DB with job queue and scores tables
- Dockerized with persistent /data volume

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
202
app/main.py
Normal file
202
app/main.py
Normal file
@@ -0,0 +1,202 @@
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import httpx
|
||||
import duckdb
|
||||
import aiosqlite
|
||||
from fastapi import FastAPI, Query
|
||||
from fastapi.responses import StreamingResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load variables from a .env file into the process environment. This runs
# before the `app.*` imports that follow it — presumably because those modules
# read their configuration from the environment at import time (TODO confirm).
load_dotenv()
|
||||
|
||||
from app.db import (
|
||||
DATA_DIR, PARQUET_PATH, SQLITE_PATH,
|
||||
init_db, get_stats, get_domains, get_enriched,
|
||||
queue_domains, get_queue_status,
|
||||
)
|
||||
from app.enricher import start_worker, pause_worker, resume_worker, is_running
|
||||
from app.scorer import run_scoring
|
||||
|
||||
# Root logging configuration: INFO level, "timestamp LEVEL message" lines.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Remote location of the parquet dataset; empty string when the variable is unset.
PARQUET_URL = os.environ.get("PARQUET_URL", "")
|
||||
|
||||
|
||||
async def download_parquet():
    """Download the parquet dataset to ``PARQUET_PATH`` unless it is already there.

    The response is streamed into a sibling ``.tmp`` file which is renamed onto
    ``PARQUET_PATH`` only once the stream has completed. When a partial
    ``.tmp`` file is present, the request carries a ``Range`` header so the
    transfer can resume from the number of bytes already on disk.

    Raises:
        RuntimeError: no cached file exists and ``PARQUET_URL`` is empty.
        httpx.HTTPStatusError: the server answered with an error status that
            is not a 416 reply to a resume request.
    """
    if PARQUET_PATH.exists():
        logger.info("Using cached parquet at %s", PARQUET_PATH)
        return

    if not PARQUET_URL:
        # Fail with an explicit message instead of an opaque httpx URL error.
        raise RuntimeError(
            f"PARQUET_URL is not set and no cached parquet exists at {PARQUET_PATH}"
        )

    DATA_DIR.mkdir(parents=True, exist_ok=True)
    tmp_path = PARQUET_PATH.with_suffix(".tmp")

    # Resumable download via Range header
    downloaded = tmp_path.stat().st_size if tmp_path.exists() else 0
    headers = {"Range": f"bytes={downloaded}-"} if downloaded > 0 else {}

    logger.info("Downloading parquet from %s (offset=%d)...", PARQUET_URL, downloaded)

    async with httpx.AsyncClient(follow_redirects=True, timeout=None) as client:
        async with client.stream("GET", PARQUET_URL, headers=headers) as resp:
            if resp.status_code == 416 and downloaded > 0:
                # The requested start offset is at/after the end of the remote
                # file, so the partial file is treated as already complete.
                # NOTE(review): the size is not cross-checked against the
                # Content-Range header of the 416 reply.
                tmp_path.rename(PARQUET_PATH)
                return
            resp.raise_for_status()

            if downloaded > 0 and resp.status_code != 206:
                # The server ignored the Range header and is sending the whole
                # file again; appending would corrupt it, so start over.
                logger.warning(
                    "Server ignored Range request (status %d); restarting download",
                    resp.status_code,
                )
                downloaded = 0

            # Content-Length covers only the bytes of this response, so add the
            # resumed offset. Without the header the total is unknown (0).
            content_length = int(resp.headers.get("content-length", 0))
            total = content_length + downloaded if content_length else 0
            mode = "ab" if downloaded > 0 else "wb"
            with open(tmp_path, mode) as f:
                received = downloaded
                last_logged_pct = -1
                async for chunk in resp.aiter_bytes(chunk_size=1024 * 1024):
                    f.write(chunk)
                    received += len(chunk)
                    if total:
                        pct = received / total * 100
                        # Log once per whole percent rather than once per chunk.
                        if int(pct) > last_logged_pct:
                            last_logged_pct = int(pct)
                            logger.info(
                                "Download progress: %.1f%% (%d/%d bytes)",
                                pct, received, total,
                            )

    tmp_path.rename(PARQUET_PATH)
    logger.info("Parquet download complete: %s", PARQUET_PATH)
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the startup sequence, then hand control to the application.

    Startup awaits ``download_parquet()`` and ``init_db()`` in that order,
    calls ``start_worker()`` and logs a readiness message. Nothing is executed
    after the ``yield``.
    """
    for startup_step in (download_parquet, init_db):
        await startup_step()

    start_worker()
    logger.info("DomGod dashboard ready on port 6677")

    yield
|
||||
|
||||
|
||||
# ASGI application object; `lifespan` supplies the startup sequence.
app = FastAPI(
    lifespan=lifespan,
    title="DomGod",
)
|
||||
|
||||
|
||||
# ── API routes ──────────────────────────────────────────────────────────────
|
||||
|
||||
@app.get("/api/stats")
async def stats():
    """Return the result of ``get_stats()`` as the response payload."""
    payload = await get_stats()
    return payload
|
||||
|
||||
|
||||
@app.get("/api/domains")
async def domains(
    tld: str = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=1000),
    live_only: bool = Query(False),
):
    """Return one page of domains from ``get_domains()``.

    All query parameters are forwarded unchanged; ``page`` and ``limit`` are
    echoed back next to the ``results``.
    """
    lookup = {"tld": tld, "page": page, "limit": limit, "live_only": live_only}
    results = await get_domains(**lookup)
    return dict(page=page, limit=limit, results=results)
|
||||
|
||||
|
||||
@app.post("/api/enrich/batch")
async def enrich_batch(body: dict):
    """Queue the domains listed in the request body and resume the worker.

    Args:
        body: JSON object; its ``domains`` entry must be a non-empty list.

    Returns:
        ``{"queued": <count>}`` on success, or a 400 ``JSONResponse`` with an
        ``error`` message when ``domains`` is missing, empty or not a list.
    """
    domains_list = body.get("domains", [])
    if not domains_list:
        return JSONResponse({"error": "no domains provided"}, status_code=400)
    if not isinstance(domains_list, list):
        # A string or object would otherwise be handed to queue_domains() and
        # len() as if it were a list of domains.
        return JSONResponse({"error": "domains must be a list"}, status_code=400)

    await queue_domains(domains_list)
    resume_worker()
    return {"queued": len(domains_list)}
|
||||
|
||||
|
||||
@app.get("/api/enrich/status")
async def enrich_status():
    """Return the queue status from ``get_queue_status()`` plus a ``worker_running`` flag."""
    queue_state = await get_queue_status()
    # Add the worker flag to the same object that is returned.
    queue_state["worker_running"] = is_running()
    return queue_state
|
||||
|
||||
|
||||
@app.post("/api/enrich/retry")
async def enrich_retry():
    """Reset every failed job to pending, clear its error, and resume the worker."""
    reset_failed_sql = "UPDATE job_queue SET status='pending', error=NULL WHERE status='failed'"

    async with aiosqlite.connect(SQLITE_PATH) as conn:
        await conn.execute(reset_failed_sql)
        await conn.commit()

    resume_worker()
    return {"status": "retrying failed jobs"}
|
||||
|
||||
|
||||
@app.post("/api/enrich/pause")
async def enrich_pause():
    """Call ``pause_worker()`` and acknowledge with a ``paused`` status."""
    pause_worker()
    acknowledgement = {"status": "paused"}
    return acknowledgement
|
||||
|
||||
|
||||
@app.post("/api/enrich/resume")
async def enrich_resume():
    """Call ``resume_worker()`` and acknowledge with a ``resumed`` status."""
    resume_worker()
    acknowledgement = {"status": "resumed"}
    return acknowledgement
|
||||
|
||||
|
||||
@app.get("/api/enriched")
async def enriched(
    min_score: int = Query(0, ge=0, le=100),
    cms: str = Query(None),
    country: str = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=1000),
):
    """Return one page of enriched domains from ``get_enriched()``.

    All query parameters are forwarded unchanged; ``page`` and ``limit`` are
    echoed back next to the ``results``.
    """
    lookup = {
        "min_score": min_score,
        "cms": cms,
        "country": country,
        "page": page,
        "limit": limit,
    }
    results = await get_enriched(**lookup)
    return dict(page=page, limit=limit, results=results)
|
||||
|
||||
|
||||
# Column order of the CSV export; the header row is built from the same tuple.
_EXPORT_COLUMNS = (
    "domain", "score", "cms", "ssl_expiry_days", "ip_country",
    "is_live", "status_code", "has_mx", "server", "page_title", "enriched_at",
)


def _csv_field(value):
    """Render one value as a double-quoted CSV field.

    Only ``None`` becomes an empty field, so falsy values such as ``0`` are
    kept. Embedded double quotes are escaped by doubling them.
    """
    text = "" if value is None else str(value)
    return '"' + text.replace('"', '""') + '"'


@app.get("/api/export")
async def export_csv(
    min_score: int = Query(0),
    cms: str = Query(None),
    country: str = Query(None),
    tier: str = Query(None),
):
    """Stream the enriched domains as a CSV attachment.

    Args:
        min_score: lower score bound passed to ``get_enriched()``; replaced by
            80 for ``tier="hot"`` and by 50 for ``tier="warm"``.
        cms: forwarded to ``get_enriched()``.
        country: forwarded to ``get_enriched()``.
        tier: ``"hot"`` or ``"warm"`` select the presets above; ``"warm"``
            also skips rows whose score is 80 or more. Other values only
            affect the download filename.

    Returns:
        A ``StreamingResponse`` of ``text/csv`` built from pages of 500 rows,
        so the full result set is never held in memory at once.
    """
    if tier == "hot":
        min_score = 80
    elif tier == "warm":
        min_score = 50

    async def generate():
        yield ",".join(_EXPORT_COLUMNS) + "\n"
        page = 1
        while True:
            rows = await get_enriched(
                min_score=min_score, cms=cms, country=country, page=page, limit=500
            )
            if not rows:
                break
            for r in rows:
                # Apply warm tier upper bound; a missing/None score counts as 0.
                if tier == "warm" and (r.get("score") or 0) >= 80:
                    continue
                # NOTE(review): values such as page_title look like they come
                # from scraped sites; fields starting with "=", "+", "-" or "@"
                # may be evaluated as formulas by spreadsheet apps — confirm
                # whether the export needs neutralising.
                yield ",".join(_csv_field(r.get(col)) for col in _EXPORT_COLUMNS) + "\n"
            page += 1

    # Keep only filename-safe characters from the user-supplied tier so it
    # cannot break out of the quoted Content-Disposition value.
    safe_tier = "".join(ch for ch in (tier or "") if ch.isalnum() or ch in "-_")
    filename = f"domgod_leads_score{min_score}{'_' + safe_tier if safe_tier else ''}.csv"
    return StreamingResponse(
        generate(),
        media_type="text/csv",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
|
||||
|
||||
|
||||
@app.post("/api/score/run")
async def score_run():
    """Await ``run_scoring()`` and return its result as the response."""
    return await run_scoring()
|
||||
|
||||
|
||||
# ── Static UI ───────────────────────────────────────────────────────────────
|
||||
# Serve the bundled UI from the "static" directory next to this module.
# html=True enables HTML mode so the directory index is served at "/".
static_dir = Path(__file__).with_name("static")
app.mount(
    "/",
    StaticFiles(html=True, directory=str(static_dir)),
    name="static",
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 6677.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=6677,
        log_level="info",
    )
|
||||
Reference in New Issue
Block a user