feat: two-phase pre-screening with HTTP check + DeepSeek batch classification
Phase 1 (no AI credits): httpx checks every selected domain concurrently (30 parallel) with real browser UA — detects live/dead/parked/redirect. Parked: keyword scan in body/title + known parking host redirect check. Results saved to DB immediately; dead/parked never reach DeepSeek. Phase 2 (single DeepSeek call): all live-site titles + snippets bundled into ONE Replicate/DeepSeek-R1 request → returns niche + type for every domain in batch (up to 80 per call, parallelised if more). - app/prescreener.py (new): _check_one(), prescreen_domains(), classify_with_deepseek(), parking signal lists, same-domain redirect logic - app/db.py: prescreen_status/niche/site_type/prescreen_at columns + migrations; save_prescreen_results() upsert helper - app/main.py: POST /api/prescreen/batch endpoint - app/static/index.html: - 🔍 Pre-screen button (disabled while running, shows spinner) - Niche + Type columns in Browse and Leads tables (.pni/.pty pills) - Prescreen status colour dot (●) when niche not yet set - prescreening state flag; result toast shows per-status counts Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
52
app/main.py
52
app/main.py
@@ -18,7 +18,7 @@ from app.db import (
|
||||
DATA_DIR, PARQUET_PATH, SQLITE_PATH,
|
||||
init_db, get_stats, get_domains, get_enriched,
|
||||
queue_domains, get_queue_status, build_duckdb_index, index_status,
|
||||
queue_ai, get_ai_queue_status, save_ai_assessment,
|
||||
queue_ai, get_ai_queue_status, save_ai_assessment, save_prescreen_results,
|
||||
)
|
||||
from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
|
||||
from app.scorer import run_scoring
|
||||
@@ -171,6 +171,56 @@ async def enriched(
|
||||
|
||||
# ── AI assessment endpoints ───────────────────────────────────────────────────
|
||||
|
||||
@app.post("/api/prescreen/batch")
async def prescreen_batch(body: dict):
    """
    Two-phase pre-screening for a batch of domains.

    Phase 1 — HTTP check every domain (no AI). Marks live/dead/parked/redirect.
    Phase 2 — Single DeepSeek call for all live domains → niche + type.
    Max 200 domains per call.

    Body: ``{"domains": ["example.com", ...]}``.
    Returns per-status counts plus how many live domains were classified.
    Responds 400 when the list is empty or exceeds 200 entries.
    """
    domains = body.get("domains", [])
    if not domains:
        return JSONResponse({"error": "no domains provided"}, status_code=400)
    if len(domains) > 200:
        return JSONResponse({"error": "max 200 domains per batch"}, status_code=400)

    # Local imports match the endpoint's lazy-import style: the prescreener
    # module (and its HTTP client) is only loaded when this route is hit.
    import logging
    from collections import Counter

    from app.prescreener import prescreen_domains, classify_with_deepseek, DEEPSEEK_BATCH_SIZE

    # Phase 1: concurrent HTTP checks — no AI credits spent. Results are
    # persisted immediately so dead/parked domains never reach phase 2 again.
    results = await prescreen_domains(domains)
    await save_prescreen_results(results)

    # Tally per-status counts; a result with no status is counted as dead.
    counts = Counter(r.get("prescreen_status", "dead") for r in results)

    # Phase 2: DeepSeek classification for live sites only, chunked into
    # batches of DEEPSEEK_BATCH_SIZE and run concurrently.
    live = [r for r in results if r.get("prescreen_status") == "live"]
    classified = 0
    if live:
        batches = [live[i:i + DEEPSEEK_BATCH_SIZE] for i in range(0, len(live), DEEPSEEK_BATCH_SIZE)]
        batch_cls = await asyncio.gather(
            *[classify_with_deepseek(b) for b in batches], return_exceptions=True
        )
        all_cls: list = []
        for bc in batch_cls:
            if isinstance(bc, list):
                all_cls.extend(bc)
            elif isinstance(bc, BaseException):
                # Best-effort: one failed DeepSeek batch must not sink the
                # whole request, but it must not disappear silently either.
                logging.getLogger(__name__).warning(
                    "DeepSeek classification batch failed: %r", bc
                )
        if all_cls:
            await save_prescreen_results(all_cls)
            classified = len(all_cls)

    return {
        "total": len(domains),
        "live": counts.get("live", 0),
        "parked": counts.get("parked", 0),
        "redirect": counts.get("redirect", 0),
        "dead": counts.get("dead", 0),
        "classified": classified,
    }
|
||||
|
||||
|
||||
@app.post("/api/ai/assess/batch")
|
||||
async def ai_assess_batch(body: dict):
|
||||
domains_list = body.get("domains", [])
|
||||
|
||||
Reference in New Issue
Block a user