feat: persistent DuckDB index, new filters, pagination fix, enrich UX

- Build /data/domains.duckdb on first run (tld + parts columns + ART index)
  → TLD filter goes from a ~60s full scan to a <100ms index lookup
  → System still works (slower) while the index builds in the background
- New /api/domains params: alpha_only, no_sld, keyword
  → alpha_only: domains with only letters (no hyphens/numbers)
  → no_sld: parts=2, excludes com.es / net.es patterns
  → keyword: LIKE '%term%' niche search
- /api/domains and /api/enriched now return total count for pagination
- Pagination: shows total matches, page X of Y, Next disabled at last page
- Enrich button: toast notifications instead of alert(), error handling
- Select all on page button, clear selection button
- Stats/TLD breakdown cached after first load (no repeat full scan)
- Header shows index build status (building → ready)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 17:00:08 +02:00
parent 2db95cc727
commit 7acff12242
3 changed files with 662 additions and 641 deletions
--- a/app/main.py
+++ b/app/main.py
@@ -1,12 +1,10 @@
 import os
-import sys
 import asyncio
 import logging
 from pathlib import Path
 from contextlib import asynccontextmanager

 import httpx
-import duckdb
 import aiosqlite
 from fastapi import FastAPI, Query
 from fastapi.responses import StreamingResponse, JSONResponse
@@ -18,7 +16,7 @@ load_dotenv()
 from app.db import (
    DATA_DIR, PARQUET_PATH, SQLITE_PATH,
    init_db, get_stats, get_domains, get_enriched,
-    queue_domains, get_queue_status,
+    queue_domains, get_queue_status, build_duckdb_index, index_status,
 )
 from app.enricher import start_worker, pause_worker, resume_worker, is_running
 from app.scorer import run_scoring
@@ -37,16 +35,13 @@ async def download_parquet():
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    tmp_path = PARQUET_PATH.with_suffix(".tmp")

-    # Resumable download via Range header
    downloaded = tmp_path.stat().st_size if tmp_path.exists() else 0
    headers = {"Range": f"bytes={downloaded}-"} if downloaded > 0 else {}

    logger.info("Downloading parquet from %s (offset=%d)...", PARQUET_URL, downloaded)
-
    async with httpx.AsyncClient(follow_redirects=True, timeout=None) as client:
        async with client.stream("GET", PARQUET_URL, headers=headers) as resp:
            if resp.status_code == 416:
-                # Already fully downloaded
                tmp_path.rename(PARQUET_PATH)
                return
            resp.raise_for_status()
@@ -58,41 +53,54 @@ async def download_parquet():
                    f.write(chunk)
                    received += len(chunk)
                    if total:
-                        pct = received / total * 100
-                        logger.info("Download progress: %.1f%% (%d/%d bytes)", pct, received, total)
+                        logger.info("Download: %.1f%% (%d/%d)", received / total * 100, received, total)

    tmp_path.rename(PARQUET_PATH)
-    logger.info("Parquet download complete: %s", PARQUET_PATH)
+    logger.info("Parquet download complete")


@asynccontextmanager
 async def lifespan(app: FastAPI):
    await download_parquet()
    await init_db()
+    # Build DuckDB index in background — queries still work (slower) while building
+    asyncio.create_task(build_duckdb_index())
    start_worker()
-    logger.info("DomGod dashboard ready on port 6677")
+    logger.info("DomGod ready on port 6677")
    yield


 app = FastAPI(title="DomGod", lifespan=lifespan)


-# ── API routes ──────────────────────────────────────────────────────────────
+# ── API ──────────────────────────────────────────────────────────────────────

@app.get("/api/stats")
 async def stats():
    return await get_stats()


+@app.get("/api/index/status")
+async def get_index_status():
+    return index_status()
+
+
@app.get("/api/domains")
 async def domains(
    tld: str = Query(None),
    page: int = Query(1, ge=1),
-    limit: int = Query(100, ge=1, le=1000),
+    limit: int = Query(100, ge=1, le=500),
    live_only: bool = Query(False),
+    alpha_only: bool = Query(False),
+    no_sld: bool = Query(False),
+    keyword: str = Query(None),
 ):
-    rows = await get_domains(tld=tld, page=page, limit=limit, live_only=live_only)
-    return {"page": page, "limit": limit, "results": rows}
+    total, rows = await get_domains(
+        tld=tld, page=page, limit=limit,
+        alpha_only=alpha_only, no_sld=no_sld,
+        keyword=keyword, live_only=live_only,
+    )
+    return {"page": page, "limit": limit, "total": total, "results": rows}


@app.post("/api/enrich/batch")
@@ -118,7 +126,7 @@ async def enrich_retry():
        await db.execute("UPDATE job_queue SET status='pending', error=NULL WHERE status='failed'")
        await db.commit()
    resume_worker()
-    return {"status": "retrying failed jobs"}
+    return {"status": "retrying"}


@app.post("/api/enrich/pause")
@@ -141,8 +149,8 @@ async def enriched(
    page: int = Query(1, ge=1),
    limit: int = Query(100, ge=1, le=1000),
 ):
-    rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page, limit=limit)
-    return {"page": page, "limit": limit, "results": rows}
+    total, rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page, limit=limit)
+    return {"page": page, "limit": limit, "total": total, "results": rows}


@app.get("/api/export")
@@ -157,46 +165,42 @@ async def export_csv(
    elif tier == "warm":
        min_score = 50

+    max_score = 79 if tier == "warm" else 100
+
    async def generate():
        yield "domain,score,cms,ssl_expiry_days,ip_country,is_live,status_code,has_mx,server,page_title,enriched_at\n"
-        page = 1
+        p = 1
        while True:
-            rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=page, limit=500)
+            _, rows = await get_enriched(min_score=min_score, cms=cms, country=country, page=p, limit=500)
            if not rows:
                break
            for r in rows:
-                # Apply warm tier upper bound
-                if tier == "warm" and r.get("score", 0) >= 80:
+                if r.get("score", 0) > max_score:
                    continue
                line = ",".join(
                    f'"{str(r.get(col) or "").replace(chr(34), chr(39))}"'
-                    for col in [
-                        "domain", "score", "cms", "ssl_expiry_days", "ip_country",
-                        "is_live", "status_code", "has_mx", "server", "page_title", "enriched_at"
-                    ]
+                    for col in ["domain", "score", "cms", "ssl_expiry_days", "ip_country",
+                                "is_live", "status_code", "has_mx", "server", "page_title", "enriched_at"]
                )
                yield line + "\n"
-            page += 1
+            p += 1

-    filename = f"domgod_leads_score{min_score}{'_' + tier if tier else ''}.csv"
+    fname = f"domgod_{tier or 'export'}_score{min_score}.csv"
    return StreamingResponse(
-        generate(),
-        media_type="text/csv",
-        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
+        generate(), media_type="text/csv",
+        headers={"Content-Disposition": f'attachment; filename="{fname}"'},
    )


@app.post("/api/score/run")
 async def score_run():
-    result = await run_scoring()
-    return result
+    return await run_scoring()


-# ── Static UI ───────────────────────────────────────────────────────────────
+# ── Static UI ────────────────────────────────────────────────────────────────
 static_dir = Path(__file__).parent / "static"
 app.mount("/", StaticFiles(directory=str(static_dir), html=True), name="static")

-
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app.main:app", host="0.0.0.0", port=6677, log_level="info")