feat: persistent DuckDB index, new filters, pagination fix, enrich UX
- Build /data/domains.duckdb on first run (tld+parts columns + ART index)
  → TLD filter goes from ~60s full scan to <100ms index lookup
  → System still works (slower) while index builds in background
- New /api/domains params: alpha_only, no_sld, keyword
  → alpha_only: domains with only letters (no hyphens/numbers)
  → no_sld: parts=2, excludes com.es / net.es patterns
  → keyword: LIKE '%term%' niche search
- /api/domains and /api/enriched now return total count for pagination
- Pagination: shows total matches, page X of Y, Next disabled at last page
- Enrich button: toast notifications instead of alert(), error handling
- Select all on page button, clear selection button
- Stats/TLD breakdown cached after first load (no repeat full scan)
- Header shows index build status (building → ready)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
324
app/db.py
324
app/db.py
@@ -1,10 +1,15 @@
|
||||
import os
|
||||
import asyncio
|
||||
import logging
|
||||
import aiosqlite
|
||||
import duckdb
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(os.getenv("DATA_DIR", "/data"))
|
||||
PARQUET_PATH = DATA_DIR / "domains.parquet"
|
||||
DUCKDB_PATH = DATA_DIR / "domains.duckdb"
|
||||
SQLITE_PATH = DATA_DIR / "enrichment.db"
|
||||
|
||||
SCHEMA = """
|
||||
@@ -23,7 +28,6 @@ CREATE TABLE IF NOT EXISTS enriched_domains (
|
||||
error TEXT,
|
||||
score INTEGER DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS job_queue (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
domain TEXT UNIQUE NOT NULL,
|
||||
@@ -33,7 +37,6 @@ CREATE TABLE IF NOT EXISTS job_queue (
|
||||
completed_at TEXT,
|
||||
error TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS scores (
|
||||
domain TEXT PRIMARY KEY,
|
||||
score INTEGER NOT NULL,
|
||||
@@ -41,6 +44,15 @@ CREATE TABLE IF NOT EXISTS scores (
|
||||
);
|
||||
"""
|
||||
|
||||
# Index build state
|
||||
_index_ready = False
|
||||
_index_building = False
|
||||
_index_total = 0
|
||||
|
||||
# Cached stats (TLD breakdown is expensive — compute once)
|
||||
_tld_cache: list = []
|
||||
_total_cache: int = 0
|
||||
|
||||
|
||||
async def init_db():
|
||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||
@@ -48,142 +60,219 @@ async def init_db():
|
||||
await db.commit()
|
||||
|
||||
|
||||
async def get_db():
    """Open and return a new aiosqlite connection to the enrichment DB.

    NOTE(review): the caller owns the returned connection and must close
    it (e.g. ``async with await get_db() as db:``) — nothing here does.
    """
    return await aiosqlite.connect(SQLITE_PATH)
|
||||
# ── DuckDB persistent index ──────────────────────────────────────────────────
|
||||
|
||||
def _build_index_sync():
    """Build (or verify) the persistent DuckDB index of domains.

    Materializes a ``domains`` table (domain, tld, parts) from the
    parquet dump plus an index on ``tld`` so TLD filters become index
    lookups instead of full parquet scans.  Idempotent: if the table
    already has rows it is reused as-is.

    Blocking — call via ``build_duckdb_index`` from async code.  Updates
    the module-level ``_index_ready`` / ``_index_building`` /
    ``_index_total`` state flags; on failure ``_index_ready`` stays False.
    """
    global _index_ready, _index_building, _index_total
    _index_building = True
    try:
        conn = duckdb.connect(str(DUCKDB_PATH))
        try:
            conn.execute("SET threads=4")
            conn.execute("SET memory_limit='2GB'")

            # Fast path: a previous run already materialized the table.
            try:
                n = conn.execute("SELECT COUNT(*) FROM domains").fetchone()[0]
                if n > 0:
                    _index_total = n
                    _index_ready = True
                    logger.info("DuckDB index already ready (%d rows)", n)
                    return
            except Exception:
                # Table does not exist yet — fall through and build it.
                pass

            logger.info("Building DuckDB index from parquet (one-time ~2-3 min)...")
            conn.execute("""
                CREATE OR REPLACE TABLE domains AS
                SELECT
                    domain,
                    lower(regexp_extract(domain, '\\.([^.]+)$', 1)) AS tld,
                    len(string_split(domain, '.')) AS parts
                FROM read_parquet(?)
            """, [str(PARQUET_PATH)])
            conn.execute("CREATE INDEX IF NOT EXISTS idx_tld ON domains(tld)")
            _index_total = conn.execute("SELECT COUNT(*) FROM domains").fetchone()[0]
            _index_ready = True
            logger.info("DuckDB index built: %d rows", _index_total)
        finally:
            # Close on every path — the original leaked the handle when
            # the CREATE TABLE / index build raised.
            conn.close()
    except Exception as e:
        logger.error("DuckDB index build failed: %s", e)
    finally:
        _index_building = False
|
||||
|
||||
|
||||
def duckdb_query(sql: str, params=None):
|
||||
conn = duckdb.connect(database=":memory:", read_only=False)
|
||||
conn.execute(f"SET threads=4")
|
||||
if params:
|
||||
result = conn.execute(sql, params).fetchall()
|
||||
async def build_duckdb_index():
    """Run the blocking DuckDB index build on a worker thread.

    Keeps the event loop responsive while the one-time build runs.
    """
    # get_running_loop() is the correct call from inside a coroutine;
    # get_event_loop() is deprecated there since Python 3.10.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _build_index_sync)
|
||||
|
||||
|
||||
def index_status() -> dict:
    """Snapshot of the background index build state.

    Returns a dict with ``ready`` / ``building`` flags and the row
    ``total`` counted so far.
    """
    return dict(
        ready=_index_ready,
        building=_index_building,
        total=_index_total,
    )
|
||||
|
||||
|
||||
# ── Domain queries ───────────────────────────────────────────────────────────
|
||||
|
||||
def _domains_sync(tld, page, limit, alpha_only, no_sld, keyword):
|
||||
conditions = []
|
||||
params_count = []
|
||||
params_data = []
|
||||
|
||||
if _index_ready:
|
||||
source = "domains"
|
||||
|
||||
def _add(clause, val=None):
|
||||
conditions.append(clause)
|
||||
if val is not None:
|
||||
params_count.append(val)
|
||||
params_data.append(val)
|
||||
else:
|
||||
result = conn.execute(sql).fetchall()
|
||||
conn.close()
|
||||
return result
|
||||
source = f"read_parquet('{PARQUET_PATH}')"
|
||||
|
||||
def _add(clause, val=None):
|
||||
conditions.append(clause)
|
||||
if val is not None:
|
||||
params_count.append(val)
|
||||
params_data.append(val)
|
||||
|
||||
def duckdb_query_df(sql: str, params=None):
|
||||
conn = duckdb.connect(database=":memory:", read_only=False)
|
||||
if tld:
|
||||
if _index_ready:
|
||||
_add("tld = ?", tld.lower().lstrip("."))
|
||||
else:
|
||||
_add("lower(regexp_extract(domain, '\\.([^.]+)$', 1)) = ?", tld.lower().lstrip("."))
|
||||
|
||||
if no_sld:
|
||||
if _index_ready:
|
||||
_add("parts = 2")
|
||||
else:
|
||||
_add("len(string_split(domain, '.')) = 2")
|
||||
|
||||
if alpha_only:
|
||||
_add("NOT regexp_matches(domain, '[^a-zA-Z.]')")
|
||||
|
||||
if keyword:
|
||||
_add("domain LIKE ?", f"%{keyword.lower()}%")
|
||||
|
||||
where = ("WHERE " + " AND ".join(conditions)) if conditions else ""
|
||||
offset = (page - 1) * limit
|
||||
|
||||
if _index_ready:
|
||||
conn = duckdb.connect(str(DUCKDB_PATH), read_only=True)
|
||||
else:
|
||||
conn = duckdb.connect(":memory:")
|
||||
conn.execute("SET threads=4")
|
||||
if params:
|
||||
result = conn.execute(sql, params).df()
|
||||
else:
|
||||
result = conn.execute(sql).df()
|
||||
|
||||
total = conn.execute(f"SELECT COUNT(*) FROM {source} {where}", params_count).fetchone()[0]
|
||||
rows = conn.execute(
|
||||
f"SELECT domain FROM {source} {where} LIMIT {limit} OFFSET {offset}", params_data
|
||||
).fetchall()
|
||||
conn.close()
|
||||
return result
|
||||
return total, [r[0] for r in rows]
|
||||
|
||||
|
||||
async def get_domains(tld=None, page=1, limit=100, alpha_only=False, no_sld=False, keyword=None, live_only=False):
    """Return ``(total, rows)`` for one page of domains matching the filters.

    Args:
        tld: restrict to one top-level domain (e.g. "es").
        page / limit: 1-based pagination over the filtered set.
        alpha_only: only domains made of letters and dots.
        no_sld: only two-part domains (excludes com.es-style SLDs).
        keyword: substring match on the domain name.
        live_only: drop rows not marked live in the enrichment DB.

    NOTE(review): ``total`` counts the DuckDB-side filters only —
    ``live_only`` is applied *after* pagination, so a live-filtered page
    may hold fewer than ``limit`` rows while ``total`` stays unchanged.
    """
    # get_running_loop() replaces the deprecated get_event_loop() call.
    loop = asyncio.get_running_loop()
    total, domain_list = await loop.run_in_executor(
        None, _domains_sync, tld, page, limit, alpha_only, no_sld, keyword
    )

    if not domain_list:
        return total, []

    # Merge per-domain enrichment rows from SQLite onto this page.
    placeholders = ",".join("?" * len(domain_list))
    async with aiosqlite.connect(SQLITE_PATH) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(
            f"SELECT * FROM enriched_domains WHERE domain IN ({placeholders})",
            domain_list,
        ) as cur:
            enriched_map = {r["domain"]: dict(r) async for r in cur}

    results = []
    for d in domain_list:
        row = enriched_map.get(d, {"domain": d})
        if live_only and not row.get("is_live"):
            continue
        results.append(row)

    return total, results
|
||||
|
||||
|
||||
# ── Stats ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _tld_stats_sync() -> tuple[int, list]:
    """Compute ``(total_domains, top-20 TLD breakdown)``.

    Uses the persistent DuckDB index when ready; otherwise falls back to
    a (slow) full parquet scan.  Blocking — run in an executor.
    """
    if _index_ready:
        conn = duckdb.connect(str(DUCKDB_PATH), read_only=True)
        try:
            total = conn.execute("SELECT COUNT(*) FROM domains").fetchone()[0]
            rows = conn.execute("""
                SELECT tld, COUNT(*) AS cnt FROM domains
                WHERE tld != ''
                GROUP BY tld ORDER BY cnt DESC LIMIT 20
            """).fetchall()
        finally:
            # Close even when a query raises — the original leaked here.
            conn.close()
    else:
        conn = duckdb.connect(":memory:")
        try:
            conn.execute("SET threads=4")
            # Bind the path as a parameter (consistent with
            # _build_index_sync) so unusual characters in DATA_DIR
            # cannot break the SQL string.
            p = str(PARQUET_PATH)
            total = conn.execute(
                "SELECT COUNT(*) FROM read_parquet(?)", [p]
            ).fetchone()[0]
            rows = conn.execute("""
                SELECT lower(regexp_extract(domain, '\\.([^.]+)$', 1)) AS tld,
                       COUNT(*) AS cnt
                FROM read_parquet(?)
                GROUP BY tld ORDER BY cnt DESC LIMIT 20
            """, [p]).fetchall()
        finally:
            conn.close()
    return total, [{"tld": r[0], "count": r[1]} for r in rows]
|
||||
|
||||
|
||||
async def get_stats():
|
||||
parquet = str(PARQUET_PATH)
|
||||
global _tld_cache, _total_cache
|
||||
|
||||
# Total count + TLD breakdown via DuckDB pushdown
|
||||
total = duckdb_query(f"SELECT COUNT(*) FROM read_parquet('{parquet}')")[0][0]
|
||||
|
||||
tld_rows = duckdb_query(f"""
|
||||
SELECT
|
||||
regexp_extract(domain, '\\.([a-zA-Z0-9]+)$', 1) AS tld,
|
||||
COUNT(*) AS cnt
|
||||
FROM read_parquet('{parquet}')
|
||||
GROUP BY tld
|
||||
ORDER BY cnt DESC
|
||||
LIMIT 20
|
||||
""")
|
||||
# Compute TLD breakdown once and cache it
|
||||
if not _tld_cache:
|
||||
loop = asyncio.get_event_loop()
|
||||
_total_cache, _tld_cache = await loop.run_in_executor(None, _tld_stats_sync)
|
||||
|
||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||
async with db.execute("SELECT COUNT(*) FROM enriched_domains") as cur:
|
||||
enriched = (await cur.fetchone())[0]
|
||||
threshold = int(os.getenv("SCORE_THRESHOLD", "60"))
|
||||
async with db.execute(
|
||||
"SELECT COUNT(*) FROM enriched_domains WHERE score >= ?", (threshold,)
|
||||
) as cur:
|
||||
async with db.execute("SELECT COUNT(*) FROM enriched_domains WHERE score >= ?", (threshold,)) as cur:
|
||||
hot_leads = (await cur.fetchone())[0]
|
||||
async with db.execute(
|
||||
"SELECT COUNT(*) FROM job_queue WHERE status='pending'"
|
||||
) as cur:
|
||||
queue_pending = (await cur.fetchone())[0]
|
||||
async with db.execute(
|
||||
"SELECT COUNT(*) FROM job_queue WHERE status='running'"
|
||||
) as cur:
|
||||
queue_running = (await cur.fetchone())[0]
|
||||
async with db.execute(
|
||||
"SELECT COUNT(*) FROM job_queue WHERE status='done'"
|
||||
) as cur:
|
||||
queue_done = (await cur.fetchone())[0]
|
||||
async with db.execute(
|
||||
"SELECT COUNT(*) FROM job_queue WHERE status='failed'"
|
||||
) as cur:
|
||||
queue_failed = (await cur.fetchone())[0]
|
||||
async with db.execute("SELECT status, COUNT(*) FROM job_queue GROUP BY status") as cur:
|
||||
q = {r[0]: r[1] async for r in cur}
|
||||
|
||||
return {
|
||||
"total_domains": total,
|
||||
"total_domains": _total_cache,
|
||||
"enriched": enriched,
|
||||
"hot_leads": hot_leads,
|
||||
"tld_breakdown": [{"tld": r[0], "count": r[1]} for r in tld_rows],
|
||||
"tld_breakdown": _tld_cache,
|
||||
"index_status": index_status(),
|
||||
"queue": {
|
||||
"pending": queue_pending,
|
||||
"running": queue_running,
|
||||
"done": queue_done,
|
||||
"failed": queue_failed,
|
||||
"pending": q.get("pending", 0),
|
||||
"running": q.get("running", 0),
|
||||
"done": q.get("done", 0),
|
||||
"failed": q.get("failed", 0),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
async def get_domains(tld=None, page=1, limit=100, live_only=False):
    """Return one page of domains from the parquet dump, merged with any
    enrichment rows stored in SQLite.

    Args:
        tld: restrict to one top-level domain (e.g. "es").
        page / limit: 1-based pagination.
        live_only: placeholder — not implemented yet (would need a join
            against enriched_domains.is_live).
    """
    parquet = str(PARQUET_PATH)
    conditions = []
    params = []

    if tld:
        # Bind the TLD as a parameter — the previous f-string
        # interpolation of user input into the WHERE clause was an SQL
        # injection vector (and the params list was built but never used).
        conditions.append("regexp_extract(domain, '\\.([a-zA-Z0-9]+)$', 1) = ?")
        params.append(tld)
    if live_only:
        # TODO: join with enriched_domains to check is_live
        pass

    where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    offset = (page - 1) * limit

    sql = f"""
        SELECT domain
        FROM read_parquet('{parquet}')
        {where}
        LIMIT {limit} OFFSET {offset}
    """
    rows = duckdb_query(sql, params)
    domains = [r[0] for r in rows]

    if not domains:
        return []

    # Merge enrichment data from SQLite.
    placeholders = ",".join("?" * len(domains))
    async with aiosqlite.connect(SQLITE_PATH) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(
            f"SELECT * FROM enriched_domains WHERE domain IN ({placeholders})",
            domains,
        ) as cur:
            enriched = {r["domain"]: dict(r) async for r in cur}

    return [enriched.get(d, {"domain": d}) for d in domains]
|
||||
|
||||
# ── Enrichment helpers ───────────────────────────────────────────────────────
|
||||
|
||||
async def get_enriched(min_score=0, cms=None, country=None, page=1, limit=100):
|
||||
offset = (page - 1) * limit
|
||||
conditions = ["score >= ?"]
|
||||
params = [min_score]
|
||||
params: list = [min_score]
|
||||
if cms:
|
||||
conditions.append("cms = ?")
|
||||
params.append(cms)
|
||||
if country:
|
||||
conditions.append("ip_country = ?")
|
||||
params.append(country)
|
||||
|
||||
where = "WHERE " + " AND ".join(conditions)
|
||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||
db.row_factory = aiosqlite.Row
|
||||
@@ -192,7 +281,11 @@ async def get_enriched(min_score=0, cms=None, country=None, page=1, limit=100):
|
||||
params + [limit, offset],
|
||||
) as cur:
|
||||
rows = [dict(r) async for r in cur]
|
||||
return rows
|
||||
async with db.execute(
|
||||
f"SELECT COUNT(*) FROM enriched_domains {where}", params
|
||||
) as cur:
|
||||
total = (await cur.fetchone())[0]
|
||||
return total, rows
|
||||
|
||||
|
||||
async def queue_domains(domains: list[str]):
|
||||
@@ -206,26 +299,13 @@ async def queue_domains(domains: list[str]):
|
||||
|
||||
async def get_queue_status():
|
||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||
async with db.execute(
|
||||
"SELECT status, COUNT(*) FROM job_queue GROUP BY status"
|
||||
) as cur:
|
||||
async with db.execute("SELECT status, COUNT(*) FROM job_queue GROUP BY status") as cur:
|
||||
rows = {r[0]: r[1] async for r in cur}
|
||||
total = sum(rows.values())
|
||||
done = rows.get("done", 0)
|
||||
pending = rows.get("pending", 0)
|
||||
running = rows.get("running", 0)
|
||||
done = rows.get("done", 0)
|
||||
failed = rows.get("failed", 0)
|
||||
|
||||
eta_seconds = None
|
||||
if running > 0 or pending > 0:
|
||||
rate = int(os.getenv("CONCURRENCY_LIMIT", "50"))
|
||||
eta_seconds = (pending + running) / max(rate / 10, 1)
|
||||
|
||||
return {
|
||||
"total": total,
|
||||
"pending": pending,
|
||||
"running": running,
|
||||
"done": done,
|
||||
"failed": failed,
|
||||
"eta_seconds": eta_seconds,
|
||||
}
|
||||
total = sum(rows.values())
|
||||
rate = int(os.getenv("CONCURRENCY_LIMIT", "50"))
|
||||
eta_seconds = (pending + running) / max(rate / 10, 1) if (pending + running) > 0 else None
|
||||
return {"total": total, "pending": pending, "running": running, "done": done, "failed": failed, "eta_seconds": eta_seconds}
|
||||
|
||||
Reference in New Issue
Block a user