feat: persistent DuckDB index, new filters, pagination fix, enrich UX

- Build /data/domains.duckdb on first run (tld+parts columns + ART index)
  → TLD filter goes from ~60s full scan to <100ms index lookup
  → System still works (slower) while index builds in background
- New /api/domains params: alpha_only, no_sld, keyword
  → alpha_only: domains with only letters (no hyphens/numbers)
  → no_sld: parts=2, excludes com.es / net.es patterns
  → keyword: LIKE '%term%' niche search
- /api/domains and /api/enriched now return total count for pagination
- Pagination: shows total matches, page X of Y, Next disabled at last page
- Enrich button: toast notifications instead of alert(), error handling
- Select all on page button, clear selection button
- Stats/TLD breakdown cached after first load (no repeat full scan)
- Header shows index build status (building → ready)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:00:08 +02:00
parent 2db95cc727
commit 7acff12242
3 changed files with 662 additions and 641 deletions

324
app/db.py
View File

@@ -1,10 +1,15 @@
import os
import asyncio
import logging
import aiosqlite
import duckdb
from pathlib import Path
logger = logging.getLogger(__name__)
DATA_DIR = Path(os.getenv("DATA_DIR", "/data"))
PARQUET_PATH = DATA_DIR / "domains.parquet"
DUCKDB_PATH = DATA_DIR / "domains.duckdb"
SQLITE_PATH = DATA_DIR / "enrichment.db"
SCHEMA = """
@@ -23,7 +28,6 @@ CREATE TABLE IF NOT EXISTS enriched_domains (
error TEXT,
score INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS job_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT UNIQUE NOT NULL,
@@ -33,7 +37,6 @@ CREATE TABLE IF NOT EXISTS job_queue (
completed_at TEXT,
error TEXT
);
CREATE TABLE IF NOT EXISTS scores (
domain TEXT PRIMARY KEY,
score INTEGER NOT NULL,
@@ -41,6 +44,15 @@ CREATE TABLE IF NOT EXISTS scores (
);
"""
# Index build state
_index_ready = False
_index_building = False
_index_total = 0
# Cached stats (TLD breakdown is expensive — compute once)
_tld_cache: list = []
_total_cache: int = 0
async def init_db():
async with aiosqlite.connect(SQLITE_PATH) as db:
@@ -48,142 +60,219 @@ async def init_db():
await db.commit()
async def get_db():
    """Open and hand back a fresh aiosqlite connection to the enrichment DB."""
    conn = await aiosqlite.connect(SQLITE_PATH)
    return conn
# ── DuckDB persistent index ──────────────────────────────────────────────────
def _build_index_sync():
    """Build (or verify) the persistent DuckDB domain index. Blocking.

    Materializes a `domains` table at DUCKDB_PATH with precomputed `tld`
    and `parts` columns plus an index on `tld`, so TLD filters hit the
    index instead of a full parquet scan. Intended to run in a thread
    executor (see build_duckdb_index). Updates the module-level
    _index_ready / _index_building / _index_total flags; failures are
    logged, never raised (the app keeps working in fallback mode).
    """
    global _index_ready, _index_building, _index_total
    _index_building = True
    conn = None
    try:
        conn = duckdb.connect(str(DUCKDB_PATH))
        conn.execute("SET threads=4")
        conn.execute("SET memory_limit='2GB'")
        # Fast path: a previous run already materialized the table.
        try:
            n = conn.execute("SELECT COUNT(*) FROM domains").fetchone()[0]
            if n > 0:
                _index_total = n
                _index_ready = True
                logger.info("DuckDB index already ready (%d rows)", n)
                return
        except Exception:
            # Table does not exist yet — fall through and build it.
            pass
        logger.info("Building DuckDB index from parquet (one-time ~2-3 min)...")
        conn.execute("""
            CREATE OR REPLACE TABLE domains AS
            SELECT
                domain,
                lower(regexp_extract(domain, '\\.([^.]+)$', 1)) AS tld,
                len(string_split(domain, '.')) AS parts
            FROM read_parquet(?)
        """, [str(PARQUET_PATH)])
        conn.execute("CREATE INDEX IF NOT EXISTS idx_tld ON domains(tld)")
        _index_total = conn.execute("SELECT COUNT(*) FROM domains").fetchone()[0]
        _index_ready = True
        logger.info("DuckDB index built: %d rows", _index_total)
    except Exception as e:
        logger.error("DuckDB index build failed: %s", e)
    finally:
        # Previously the connection leaked when the build raised; always
        # release it and clear the building flag.
        if conn is not None:
            conn.close()
        _index_building = False
def duckdb_query(sql: str, params=None):
conn = duckdb.connect(database=":memory:", read_only=False)
conn.execute(f"SET threads=4")
if params:
result = conn.execute(sql, params).fetchall()
async def build_duckdb_index():
    """Kick off the one-time DuckDB index build without blocking the event loop.

    The heavy lifting happens in _build_index_sync, pushed to the default
    thread-pool executor so request handling continues while the index
    builds in the background.
    """
    # get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() here has been deprecated since Python 3.10.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _build_index_sync)
def index_status() -> dict:
    """Snapshot the DuckDB index build state for API/header display."""
    ready, building, total = _index_ready, _index_building, _index_total
    return {"ready": ready, "building": building, "total": total}
# ── Domain queries ───────────────────────────────────────────────────────────
def _domains_sync(tld, page, limit, alpha_only, no_sld, keyword):
    """Run the (blocking) DuckDB domain query for one result page.

    Uses the persistent index table when it is ready, otherwise falls back
    to scanning the parquet file directly — slower but functional while
    the index builds in the background.

    Returns (total_matches, [domain, ...]) for the requested page.
    """
    conditions: list[str] = []
    params: list = []

    def _add(clause: str, val=None):
        # Collect a WHERE clause and, when present, its bound value.
        conditions.append(clause)
        if val is not None:
            params.append(val)

    use_index = _index_ready
    source = "domains" if use_index else f"read_parquet('{PARQUET_PATH}')"
    if tld:
        wanted = tld.lower().lstrip(".")
        if use_index:
            _add("tld = ?", wanted)
        else:
            _add("lower(regexp_extract(domain, '\\.([^.]+)$', 1)) = ?", wanted)
    if no_sld:
        # Exactly two dot-separated labels, i.e. excludes registry-suffix
        # patterns such as com.es / net.es.
        _add("parts = 2" if use_index else "len(string_split(domain, '.')) = 2")
    if alpha_only:
        # Letters and dots only — rejects hyphens and digits.
        _add("NOT regexp_matches(domain, '[^a-zA-Z.]')")
    if keyword:
        _add("domain LIKE ?", f"%{keyword.lower()}%")

    where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
    offset = (page - 1) * limit

    if use_index:
        conn = duckdb.connect(str(DUCKDB_PATH), read_only=True)
    else:
        conn = duckdb.connect(":memory:")
    try:
        conn.execute("SET threads=4")
        total = conn.execute(
            f"SELECT COUNT(*) FROM {source} {where}", params
        ).fetchone()[0]
        # LIMIT/OFFSET are bound parameters rather than interpolated into
        # the SQL string.
        rows = conn.execute(
            f"SELECT domain FROM {source} {where} LIMIT ? OFFSET ?",
            params + [limit, offset],
        ).fetchall()
    finally:
        # Always release the connection, even when a query raises.
        conn.close()
    return total, [r[0] for r in rows]
async def get_domains(tld=None, page=1, limit=100, alpha_only=False, no_sld=False, keyword=None, live_only=False):
    """Return (total, rows) for one page of domains, merged with enrichment data.

    The matching page comes from DuckDB (run in a thread executor so the
    event loop stays free); enrichment columns are then joined in from
    SQLite by domain name. Domains without an enrichment row come back as
    bare {"domain": d} dicts.

    NOTE(review): live_only is applied AFTER pagination, so a filtered
    page may hold fewer than `limit` rows and `total` counts unfiltered
    matches — confirm this is intended before changing it.
    """
    # get_event_loop() inside a coroutine is deprecated; use the running loop.
    loop = asyncio.get_running_loop()
    total, domain_list = await loop.run_in_executor(
        None, _domains_sync, tld, page, limit, alpha_only, no_sld, keyword
    )
    if not domain_list:
        return total, []
    placeholders = ",".join("?" * len(domain_list))
    async with aiosqlite.connect(SQLITE_PATH) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(
            f"SELECT * FROM enriched_domains WHERE domain IN ({placeholders})",
            domain_list,
        ) as cur:
            enriched_map = {r["domain"]: dict(r) async for r in cur}
    results = []
    for d in domain_list:
        row = enriched_map.get(d, {"domain": d})
        if live_only and not row.get("is_live"):
            continue
        results.append(row)
    return total, results
# ── Stats ────────────────────────────────────────────────────────────────────
def _tld_stats_sync() -> tuple[int, list]:
if _index_ready:
conn = duckdb.connect(str(DUCKDB_PATH), read_only=True)
total = conn.execute("SELECT COUNT(*) FROM domains").fetchone()[0]
rows = conn.execute("""
SELECT tld, COUNT(*) AS cnt FROM domains
WHERE tld != ''
GROUP BY tld ORDER BY cnt DESC LIMIT 20
""").fetchall()
conn.close()
else:
p = str(PARQUET_PATH)
conn = duckdb.connect(":memory:")
conn.execute("SET threads=4")
total = conn.execute(f"SELECT COUNT(*) FROM read_parquet('{p}')").fetchone()[0]
rows = conn.execute(f"""
SELECT lower(regexp_extract(domain, '\\.([^.]+)$', 1)) AS tld, COUNT(*) AS cnt
FROM read_parquet('{p}')
GROUP BY tld ORDER BY cnt DESC LIMIT 20
""").fetchall()
conn.close()
return total, [{"tld": r[0], "count": r[1]} for r in rows]
async def get_stats():
    """Assemble dashboard stats: totals, TLD breakdown, enrichment and queue counts.

    The total-domain count and TLD breakdown are expensive (full scan
    until the index is ready), so they are computed once in a thread
    executor and cached in _total_cache/_tld_cache for later calls.
    """
    global _tld_cache, _total_cache
    # Compute the TLD breakdown once and cache it.
    if not _tld_cache:
        # get_event_loop() inside a coroutine is deprecated; use the running loop.
        loop = asyncio.get_running_loop()
        _total_cache, _tld_cache = await loop.run_in_executor(None, _tld_stats_sync)
    async with aiosqlite.connect(SQLITE_PATH) as db:
        async with db.execute("SELECT COUNT(*) FROM enriched_domains") as cur:
            enriched = (await cur.fetchone())[0]
        threshold = int(os.getenv("SCORE_THRESHOLD", "60"))
        async with db.execute("SELECT COUNT(*) FROM enriched_domains WHERE score >= ?", (threshold,)) as cur:
            hot_leads = (await cur.fetchone())[0]
        # One GROUP BY instead of four separate COUNT queries.
        async with db.execute("SELECT status, COUNT(*) FROM job_queue GROUP BY status") as cur:
            q = {r[0]: r[1] async for r in cur}
    return {
        "total_domains": _total_cache,
        "enriched": enriched,
        "hot_leads": hot_leads,
        "tld_breakdown": _tld_cache,
        "index_status": index_status(),
        "queue": {
            "pending": q.get("pending", 0),
            "running": q.get("running", 0),
            "done": q.get("done", 0),
            "failed": q.get("failed", 0),
        },
    }
async def get_domains(tld=None, page=1, limit=100, live_only=False):
    """Return one page of domains from the parquet file, merged with enrichment rows.

    Domains without an enrichment row are returned as bare {"domain": d}.
    """
    parquet = str(PARQUET_PATH)
    conditions = []
    params = []
    if tld:
        # Parameterized — interpolating tld into the SQL string with an
        # f-string allowed SQL injection via the query parameter.
        conditions.append("regexp_extract(domain, '\\.([a-zA-Z0-9]+)$', 1) = ?")
        params.append(tld)
    if live_only:
        # TODO: join with enriched_domains to check is_live
        pass
    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    offset = (page - 1) * limit
    sql = f"""
        SELECT domain
        FROM read_parquet('{parquet}')
        {where}
        LIMIT {limit} OFFSET {offset}
    """
    rows = duckdb_query(sql, params or None)
    domains = [r[0] for r in rows]
    if not domains:
        return []
    # Merge enrichment data from SQLite.
    placeholders = ",".join("?" * len(domains))
    async with aiosqlite.connect(SQLITE_PATH) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(
            f"SELECT * FROM enriched_domains WHERE domain IN ({placeholders})",
            domains,
        ) as cur:
            enriched = {r["domain"]: dict(r) async for r in cur}
    return [enriched.get(d, {"domain": d}) for d in domains]
# ── Enrichment helpers ───────────────────────────────────────────────────────
async def get_enriched(min_score=0, cms=None, country=None, page=1, limit=100):
offset = (page - 1) * limit
conditions = ["score >= ?"]
params = [min_score]
params: list = [min_score]
if cms:
conditions.append("cms = ?")
params.append(cms)
if country:
conditions.append("ip_country = ?")
params.append(country)
where = "WHERE " + " AND ".join(conditions)
async with aiosqlite.connect(SQLITE_PATH) as db:
db.row_factory = aiosqlite.Row
@@ -192,7 +281,11 @@ async def get_enriched(min_score=0, cms=None, country=None, page=1, limit=100):
params + [limit, offset],
) as cur:
rows = [dict(r) async for r in cur]
return rows
async with db.execute(
f"SELECT COUNT(*) FROM enriched_domains {where}", params
) as cur:
total = (await cur.fetchone())[0]
return total, rows
async def queue_domains(domains: list[str]):
@@ -206,26 +299,13 @@ async def queue_domains(domains: list[str]):
async def get_queue_status():
    """Summarize the enrichment job queue: per-status counts plus a rough ETA.

    ETA heuristic: assumes roughly CONCURRENCY_LIMIT/10 jobs complete per
    second — an estimate, not a measurement. ETA is None when nothing is
    pending or running.
    """
    async with aiosqlite.connect(SQLITE_PATH) as db:
        async with db.execute("SELECT status, COUNT(*) FROM job_queue GROUP BY status") as cur:
            rows = {r[0]: r[1] async for r in cur}
    pending = rows.get("pending", 0)
    running = rows.get("running", 0)
    done = rows.get("done", 0)
    failed = rows.get("failed", 0)
    total = sum(rows.values())
    outstanding = pending + running
    eta_seconds = None
    if outstanding > 0:
        # Only read the rate when there is work left to estimate.
        rate = int(os.getenv("CONCURRENCY_LIMIT", "50"))
        eta_seconds = outstanding / max(rate / 10, 1)
    return {
        "total": total,
        "pending": pending,
        "running": running,
        "done": done,
        "failed": failed,
        "eta_seconds": eta_seconds,
    }