feat: two-phase pre-screening with HTTP check + DeepSeek batch classification

Phase 1 (no AI credits): httpx checks every selected domain concurrently
(30 parallel) with real browser UA — detects live/dead/parked/redirect.
Parked: keyword scan in body/title + known parking host redirect check.
Results saved to DB immediately; dead/parked never reach DeepSeek.

Phase 2 (single DeepSeek call): all live-site titles + snippets bundled
into ONE Replicate/DeepSeek-R1 request → returns niche + type for every
domain in batch (up to 80 per call, parallelised if more).

- app/prescreener.py (new): _check_one(), prescreen_domains(),
  classify_with_deepseek(), parking signal lists, same-domain redirect logic
- app/db.py: prescreen_status/niche/site_type/prescreen_at columns +
  migrations; save_prescreen_results() upsert helper
- app/main.py: POST /api/prescreen/batch endpoint
- app/static/index.html:
  - 🔍 Pre-screen button (disabled while running, shows spinner)
  - Niche + Type columns in Browse and Leads tables (.pni/.pty pills)
  - Prescreen status colour dot (●) when niche not yet set
  - prescreening state flag; result toast shows per-status counts

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 21:22:45 +02:00
parent 63f961dc80
commit 7fc510f903
4 changed files with 373 additions and 5 deletions

View File

@@ -36,7 +36,11 @@ CREATE TABLE IF NOT EXISTS enriched_domains (
ai_contact_channel TEXT,
ai_contact_value TEXT,
ai_assessed_at TEXT,
site_analysis TEXT
site_analysis TEXT,
prescreen_status TEXT,
niche TEXT,
site_type TEXT,
prescreen_at TEXT
);
CREATE TABLE IF NOT EXISTS job_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -76,6 +80,10 @@ _MIGRATIONS = [
"ALTER TABLE enriched_domains ADD COLUMN site_analysis TEXT",
"CREATE TABLE IF NOT EXISTS ai_queue (domain TEXT PRIMARY KEY, status TEXT DEFAULT 'pending', created_at TEXT DEFAULT (datetime('now')), completed_at TEXT, error TEXT)",
"ALTER TABLE ai_queue ADD COLUMN language TEXT DEFAULT 'ES'",
"ALTER TABLE enriched_domains ADD COLUMN prescreen_status TEXT",
"ALTER TABLE enriched_domains ADD COLUMN niche TEXT",
"ALTER TABLE enriched_domains ADD COLUMN site_type TEXT",
"ALTER TABLE enriched_domains ADD COLUMN prescreen_at TEXT",
]
# Index build state
@@ -418,6 +426,35 @@ async def save_ai_assessment(domain: str, assessment: dict, site_analysis: dict
await db.commit()
async def save_prescreen_results(results: list[dict]):
    """Upsert prescreen HTTP results and/or DeepSeek niche/type classifications.

    Each result dict must carry a "domain" key (entries without one are
    skipped) plus either or both of:
      - "prescreen_status" / "title" — Phase-1 HTTP check output
      - "niche" / "type"             — Phase-2 DeepSeek classification
        (DeepSeek returns the key "type"; it is stored as ``site_type``)

    Fix: a dict carrying BOTH a classification and a prescreen_status now has
    both persisted — previously the status/timestamp was silently dropped
    whenever niche or type was present.
    """
    async with aiosqlite.connect(SQLITE_PATH) as db:
        for r in results:
            domain = r.get("domain")
            if not domain:
                continue  # malformed entry — nothing to key the row on
            niche = r.get("niche")
            site_type = r.get("type")  # DeepSeek returns "type" key
            has_classification = bool(niche or site_type)
            if has_classification:
                # Classification-only update (domain row must already exist).
                await db.execute(
                    "UPDATE enriched_domains SET niche=?, site_type=? WHERE domain=?",
                    (niche, site_type, domain),
                )
            # Status upsert runs when there is no classification (original
            # behavior, even if the status itself is None) OR when a status
            # arrived alongside a classification (the fixed case).
            if not has_classification or r.get("prescreen_status"):
                # Prescreen status upsert — create row if it doesn't exist yet.
                # COALESCE keeps an existing non-NULL page_title over the
                # freshly scraped one.
                await db.execute(
                    """INSERT INTO enriched_domains (domain, prescreen_status, prescreen_at, page_title)
                       VALUES (?, ?, datetime('now'), ?)
                       ON CONFLICT(domain) DO UPDATE SET
                           prescreen_status = excluded.prescreen_status,
                           prescreen_at = excluded.prescreen_at,
                           page_title = COALESCE(page_title, excluded.page_title)""",
                    (domain, r.get("prescreen_status"), r.get("title")),
                )
        await db.commit()
async def queue_domains(domains: list[str]):
async with aiosqlite.connect(SQLITE_PATH) as db:
await db.executemany(