feat: persistent DuckDB index, new filters, pagination fix, enrich UX

- Build /data/domains.duckdb on first run (tld+parts columns + ART index)
  → TLD filter goes from ~60s full scan to <100ms index lookup
  → System still works (slower) while index builds in background
- New /api/domains params: alpha_only, no_sld, keyword
  → alpha_only: domains with only letters (no hyphens/numbers)
  → no_sld: keeps only two-part names (parts=2), which excludes second-level patterns such as com.es / net.es
  → keyword: substring match (LIKE '%term%') for niche searches
- /api/domains and /api/enriched now return total count for pagination
- Pagination: shows total matches, page X of Y, Next disabled at last page
- Enrich button: toast notifications instead of alert(), error handling
- Select all on page button, clear selection button
- Stats/TLD breakdown cached after first load (no repeat full scan)
- Header shows index build status (building → ready)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:00:08 +02:00
parent 2db95cc727
commit 7acff12242
3 changed files with 662 additions and 641 deletions

View File

@@ -1,12 +1,10 @@
import os
import sys
import asyncio
import logging
from pathlib import Path
from contextlib import asynccontextmanager
import httpx
import duckdb
import aiosqlite
from fastapi import FastAPI, Query
from fastapi.responses import StreamingResponse, JSONResponse
@@ -18,7 +16,7 @@ load_dotenv()
from app.db import (
DATA_DIR, PARQUET_PATH, SQLITE_PATH,
init_db, get_stats, get_domains, get_enriched,
queue_domains, get_queue_status,
queue_domains, get_queue_status, build_duckdb_index, index_status,
)
from app.enricher import start_worker, pause_worker, resume_worker, is_running
from app.scorer import run_scoring
@@ -37,16 +35,13 @@ async def download_parquet():
DATA_DIR.mkdir(parents=True, exist_ok=True)
tmp_path = PARQUET_PATH.with_suffix(".tmp")
# Resumable download via Range header
downloaded = tmp_path.stat().st_size if tmp_path.exists() else 0
headers = {"Range": f"bytes={downloaded}-"} if downloaded > 0 else {}
logger.info("Downloading parquet from %s (offset=%d)...", PARQUET_URL, downloaded)
async with httpx.AsyncClient(follow_redirects=True, timeout=None) as client:
async with client.stream("GET", PARQUET_URL, headers=headers) as resp:
if resp.status_code == 416:
# Already fully downloaded
tmp_path.rename(PARQUET_PATH)
return
resp.raise_for_status()
@@ -58,41 +53,54 @@ async def download_parquet():
f.write(chunk)
received += len(chunk)
if total:
pct = received / total * 100
logger.info("Download progress: %.1f%% (%d/%d bytes)", pct, received, total)
logger.info("Download: %.1f%% (%d/%d)", received / total * 100, received, total)
tmp_path.rename(PARQUET_PATH)
logger.info("Parquet download complete: %s", PARQUET_PATH)
logger.info("Parquet download complete")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the startup sequence, then hand control to the application.

    Startup order: ``download_parquet()``, ``init_db()``, schedule
    ``build_duckdb_index()`` as a background task, ``start_worker()``.
    No teardown is performed after the ``yield``.
    """
    await download_parquet()
    await init_db()
    # Build DuckDB index in background — queries still work (slower) while building.
    # The event loop only keeps a weak reference to tasks, so the task is parked on
    # app.state; otherwise it could be garbage-collected before the build finishes.
    app.state.index_task = asyncio.create_task(build_duckdb_index())
    start_worker()
    logger.info("DomGod ready on port 6677")
    yield
app = FastAPI(title="DomGod", lifespan=lifespan)
# ── API routes ──────────────────────────────────────────────────────────────
# ── API ──────────────────────────────────────────────────────────────────────
@app.get("/api/stats")
async def stats():
    """Return whatever ``get_stats()`` produces as the response body."""
    payload = await get_stats()
    return payload
@app.get("/api/index/status")
async def get_index_status():
    """Return the current value of ``index_status()`` (a synchronous call)."""
    current = index_status()
    return current
@app.get("/api/domains")
async def domains(
    tld: str = Query(None),
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=500),
    live_only: bool = Query(False),
    alpha_only: bool = Query(False),
    no_sld: bool = Query(False),
    keyword: str = Query(None),
):
    """Return one page of domains matching the given filters.

    All filter values are forwarded unchanged to ``get_domains``, which
    returns ``(total, rows)``.

    Args:
        tld: Optional TLD filter.
        page: 1-based page number.
        limit: Page size, between 1 and 500.
        live_only: Forwarded to ``get_domains`` as ``live_only``.
        alpha_only: Forwarded to ``get_domains`` as ``alpha_only``.
        no_sld: Forwarded to ``get_domains`` as ``no_sld``.
        keyword: Optional search term forwarded to ``get_domains``.

    Returns:
        A dict with ``page``, ``limit``, ``total`` (the overall match count,
        used by the client for pagination) and ``results``.
    """
    total, rows = await get_domains(
        tld=tld, page=page, limit=limit,
        alpha_only=alpha_only, no_sld=no_sld,
        keyword=keyword, live_only=live_only,
    )
    return {"page": page, "limit": limit, "total": total, "results": rows}
@app.post("/api/enrich/batch")
@@ -118,7 +126,7 @@ async def enrich_retry():
await db.execute("UPDATE job_queue SET status='pending', error=NULL WHERE status='failed'")
await db.commit()
resume_worker()
return {"status": "retrying failed jobs"}
return {"status": "retrying"}
@app.post("/api/enrich/pause")
@@ -141,8 +149,8 @@ async def enriched(
page: int = Query(1, ge=1),
limit: int = Query(100, ge=1, le=1000),
):
rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page, limit=limit)
return {"page": page, "limit": limit, "results": rows}
total, rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page, limit=limit)
return {"page": page, "limit": limit, "total": total, "results": rows}
@app.get("/api/export")
@@ -157,46 +165,42 @@ async def export_csv(
elif tier == "warm":
min_score = 50
max_score = 79 if tier == "warm" else 100
async def generate():
    """Stream the filtered enriched rows as CSV text, one line per ``yield``.

    Pages through ``get_enriched`` 500 rows at a time until an empty page is
    returned, skipping rows whose score is above ``max_score``. ``min_score``,
    ``max_score``, ``cms`` and ``country`` come from the enclosing scope.
    """
    columns = [
        "domain", "score", "cms", "ssl_expiry_days", "ip_country",
        "is_live", "status_code", "has_mx", "server", "page_title", "enriched_at",
    ]
    yield "domain,score,cms,ssl_expiry_days,ip_country,is_live,status_code,has_mx,server,page_title,enriched_at\n"
    page_no = 1
    while True:
        _, rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page_no, limit=500)
        if not rows:
            break
        for row in rows:
            # A missing or NULL score counts as 0 so the comparison cannot
            # raise TypeError halfway through a streamed download.
            if (row.get("score") or 0) > max_score:
                continue
            cells = []
            for col in columns:
                value = row.get(col)
                # Only None becomes a blank cell; 0 and False are real values
                # (e.g. is_live, has_mx) and must survive the export.
                text = "" if value is None else str(value)
                # Double quotes are swapped for apostrophes so the quoted
                # cell stays well-formed without further escaping.
                cells.append('"' + text.replace('"', "'") + '"')
            yield ",".join(cells) + "\n"
        page_no += 1
filename = f"domgod_leads_score{min_score}{'_' + tier if tier else ''}.csv"
fname = f"domgod_{tier or 'export'}_score{min_score}.csv"
return StreamingResponse(
generate(),
media_type="text/csv",
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
generate(), media_type="text/csv",
headers={"Content-Disposition": f'attachment; filename="{fname}"'},
)
@app.post("/api/score/run")
async def score_run():
    """Await ``run_scoring()`` and return its result as the response body."""
    return await run_scoring()
# ── Static UI ───────────────────────────────────────────────────────────────
# ── Static UI ───────────────────────────────────────────────────────────────
# Front-end assets live in the "static" folder next to this module.
static_dir = Path(__file__).parent.joinpath("static")
# Serve them from the site root in HTML mode.
app.mount(
    "/",
    StaticFiles(directory=str(static_dir), html=True),
    name="static",
)
if __name__ == "__main__":
    # uvicorn is imported only when the module is executed directly.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 6677, "log_level": "info"}
    uvicorn.run("app.main:app", **server_options)