feat: bulk validator tab + status/niche/type browse filters
- New app/validator.py: background HTTP checker for the entire dataset
- 50 concurrent checks; skips already-validated domains
- Extracts prescreen_status, server, IP, load_time_ms
- start/stop/status API at /api/validator/start|stop|status
- New dedicated "Validator 🔬" tab with stats grid, TLD filter, Start/Stop controls, and live progress indicator
- Browse tab: "Live" column replaced with a "Status" dot (color-coded ● from prescreen_status; falls back to is_live)
- Browse tab: new Status / Niche / Type filter dropdowns
- db.py: added ip TEXT + load_time_ms INTEGER columns and migrations; get_enriched() supports prescreen_status/niche/site_type filters
- main.py: /api/enriched extended with prescreen_status/niche/site_type

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
21
app/db.py
21
app/db.py
@@ -40,7 +40,9 @@ CREATE TABLE IF NOT EXISTS enriched_domains (
|
|||||||
prescreen_status TEXT,
|
prescreen_status TEXT,
|
||||||
niche TEXT,
|
niche TEXT,
|
||||||
site_type TEXT,
|
site_type TEXT,
|
||||||
prescreen_at TEXT
|
prescreen_at TEXT,
|
||||||
|
ip TEXT,
|
||||||
|
load_time_ms INTEGER
|
||||||
);
|
);
|
||||||
CREATE TABLE IF NOT EXISTS job_queue (
|
CREATE TABLE IF NOT EXISTS job_queue (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
@@ -84,6 +86,8 @@ _MIGRATIONS = [
|
|||||||
"ALTER TABLE enriched_domains ADD COLUMN niche TEXT",
|
"ALTER TABLE enriched_domains ADD COLUMN niche TEXT",
|
||||||
"ALTER TABLE enriched_domains ADD COLUMN site_type TEXT",
|
"ALTER TABLE enriched_domains ADD COLUMN site_type TEXT",
|
||||||
"ALTER TABLE enriched_domains ADD COLUMN prescreen_at TEXT",
|
"ALTER TABLE enriched_domains ADD COLUMN prescreen_at TEXT",
|
||||||
|
"ALTER TABLE enriched_domains ADD COLUMN ip TEXT",
|
||||||
|
"ALTER TABLE enriched_domains ADD COLUMN load_time_ms INTEGER",
|
||||||
]
|
]
|
||||||
|
|
||||||
# Index build state
|
# Index build state
|
||||||
@@ -315,7 +319,9 @@ async def get_stats():
|
|||||||
# ── Enrichment helpers ───────────────────────────────────────────────────────
|
# ── Enrichment helpers ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def get_enriched(min_score=0, cms=None, country=None, kit_digital=None,
|
async def get_enriched(min_score=0, cms=None, country=None, kit_digital=None,
|
||||||
ai_only=False, lead_quality=None, page=1, limit=100):
|
ai_only=False, lead_quality=None,
|
||||||
|
prescreen_status=None, niche=None, site_type=None,
|
||||||
|
page=1, limit=100):
|
||||||
offset = (page - 1) * limit
|
offset = (page - 1) * limit
|
||||||
conditions = ["score >= ?"]
|
conditions = ["score >= ?"]
|
||||||
params: list = [min_score]
|
params: list = [min_score]
|
||||||
@@ -333,6 +339,17 @@ async def get_enriched(min_score=0, cms=None, country=None, kit_digital=None,
|
|||||||
if lead_quality:
|
if lead_quality:
|
||||||
conditions.append("ai_lead_quality = ?")
|
conditions.append("ai_lead_quality = ?")
|
||||||
params.append(lead_quality.upper())
|
params.append(lead_quality.upper())
|
||||||
|
if prescreen_status == "none":
|
||||||
|
conditions.append("prescreen_status IS NULL")
|
||||||
|
elif prescreen_status:
|
||||||
|
conditions.append("prescreen_status = ?")
|
||||||
|
params.append(prescreen_status)
|
||||||
|
if niche:
|
||||||
|
conditions.append("niche = ?")
|
||||||
|
params.append(niche)
|
||||||
|
if site_type:
|
||||||
|
conditions.append("site_type = ?")
|
||||||
|
params.append(site_type)
|
||||||
where = "WHERE " + " AND ".join(conditions)
|
where = "WHERE " + " AND ".join(conditions)
|
||||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||||
db.row_factory = aiosqlite.Row
|
db.row_factory = aiosqlite.Row
|
||||||
|
|||||||
24
app/main.py
24
app/main.py
@@ -22,6 +22,7 @@ from app.db import (
|
|||||||
)
|
)
|
||||||
from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
|
from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
|
||||||
from app.scorer import run_scoring
|
from app.scorer import run_scoring
|
||||||
|
from app.validator import start_validator, stop_validator, get_validator_status
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -158,17 +159,40 @@ async def enriched(
|
|||||||
kit_digital: Optional[bool] = Query(None),
|
kit_digital: Optional[bool] = Query(None),
|
||||||
ai_only: bool = Query(False),
|
ai_only: bool = Query(False),
|
||||||
lead_quality: str = Query(None),
|
lead_quality: str = Query(None),
|
||||||
|
prescreen_status: str = Query(None),
|
||||||
|
niche: str = Query(None),
|
||||||
|
site_type: str = Query(None),
|
||||||
page: int = Query(1, ge=1),
|
page: int = Query(1, ge=1),
|
||||||
limit: int = Query(100, ge=1, le=1000),
|
limit: int = Query(100, ge=1, le=1000),
|
||||||
):
|
):
|
||||||
total, rows = await get_enriched(
|
total, rows = await get_enriched(
|
||||||
min_score=min_score, cms=cms, country=country,
|
min_score=min_score, cms=cms, country=country,
|
||||||
kit_digital=kit_digital, ai_only=ai_only, lead_quality=lead_quality,
|
kit_digital=kit_digital, ai_only=ai_only, lead_quality=lead_quality,
|
||||||
|
prescreen_status=prescreen_status, niche=niche, site_type=site_type,
|
||||||
page=page, limit=limit,
|
page=page, limit=limit,
|
||||||
)
|
)
|
||||||
return {"page": page, "limit": limit, "total": total, "results": rows}
|
return {"page": page, "limit": limit, "total": total, "results": rows}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Bulk Validator endpoints ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@app.post("/api/validator/start")
async def validator_start(tld: str = Query(None)):
    """Start the bulk validator and return its status.

    ``tld`` is an optional query parameter; an empty string is passed on
    as ``None`` (no TLD filter).
    """
    start_validator(tld_filter=tld or None)
    return get_validator_status()
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/validator/stop")
async def validator_stop():
    """Call ``stop_validator()`` and return a fixed ``{"status": "stopped"}`` body."""
    stop_validator()
    return {"status": "stopped"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/validator/status")
async def validator_status():
    """Return whatever ``get_validator_status()`` reports."""
    return get_validator_status()
|
||||||
|
|
||||||
|
|
||||||
# ── AI assessment endpoints ───────────────────────────────────────────────────
|
# ── AI assessment endpoints ───────────────────────────────────────────────────
|
||||||
|
|
||||||
@app.post("/api/prescreen/batch")
|
@app.post("/api/prescreen/batch")
|
||||||
|
|||||||
@@ -314,6 +314,7 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<div class="tabs">
|
<div class="tabs">
|
||||||
<div class="tab" :class="{active:tab==='browse'}" @click="tab='browse'">Browse & Filter</div>
|
<div class="tab" :class="{active:tab==='browse'}" @click="tab='browse'">Browse & Filter</div>
|
||||||
<div class="tab" :class="{active:tab==='enrich'}" @click="tab='enrich';loadQueue()">Enrichment</div>
|
<div class="tab" :class="{active:tab==='enrich'}" @click="tab='enrich';loadQueue()">Enrichment</div>
|
||||||
|
<div class="tab" :class="{active:tab==='validator'}" @click="tab='validator';loadValStatus()">Validator 🔬</div>
|
||||||
<div class="tab" :class="{active:tab==='pipeline'}" @click="tab='pipeline';loadPipeline()">Lead Pipeline</div>
|
<div class="tab" :class="{active:tab==='pipeline'}" @click="tab='pipeline';loadPipeline()">Lead Pipeline</div>
|
||||||
<div class="tab" :class="{active:tab==='leads'}" @click="tab='leads';loadLeads(true)">Leads 🤖</div>
|
<div class="tab" :class="{active:tab==='leads'}" @click="tab='leads';loadLeads(true)">Leads 🤖</div>
|
||||||
<div class="tab" :class="{active:tab==='chart'}" @click="tab='chart';renderChart()">TLD Chart</div>
|
<div class="tab" :class="{active:tab==='chart'}" @click="tab='chart';renderChart()">TLD Chart</div>
|
||||||
@@ -343,6 +344,40 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<label class="tog"><input type="checkbox" x-model="f.no_sld"><strong>No SLD</strong> <span>(skip com.es)</span></label>
|
<label class="tog"><input type="checkbox" x-model="f.no_sld"><strong>No SLD</strong> <span>(skip com.es)</span></label>
|
||||||
<label class="tog"><input type="checkbox" x-model="f.kit_digital_only"><strong style="color:var(--kd)">🏅 Kit Digital only</strong></label>
|
<label class="tog"><input type="checkbox" x-model="f.kit_digital_only"><strong style="color:var(--kd)">🏅 Kit Digital only</strong></label>
|
||||||
<label class="tog"><input type="checkbox" x-model="f.exclude_assessed"><strong>Hide assessed</strong></label>
|
<label class="tog"><input type="checkbox" x-model="f.exclude_assessed"><strong>Hide assessed</strong></label>
|
||||||
|
<div class="field"><label>Status</label>
|
||||||
|
<select x-model="f.prescreen_status" style="width:105px">
|
||||||
|
<option value="">Any</option>
|
||||||
|
<option value="live">● Live</option>
|
||||||
|
<option value="dead">● Dead</option>
|
||||||
|
<option value="parked">● Parked</option>
|
||||||
|
<option value="redirect">↗ Redirect</option>
|
||||||
|
<option value="none">Not checked</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="field"><label>Niche</label>
|
||||||
|
<select x-model="f.niche" style="width:130px">
|
||||||
|
<option value="">Any</option>
|
||||||
|
<option>automotive</option><option>beauty_cosmetics</option>
|
||||||
|
<option>travel_tourism</option><option>hospitality</option>
|
||||||
|
<option>restaurant_food</option><option>legal</option>
|
||||||
|
<option>medical_health</option><option>real_estate</option>
|
||||||
|
<option>technology</option><option>fashion_retail</option>
|
||||||
|
<option>finance</option><option>education</option>
|
||||||
|
<option>construction</option><option>sports</option>
|
||||||
|
<option>entertainment</option><option>agriculture</option>
|
||||||
|
<option>industrial</option><option>consulting</option><option>other</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="field"><label>Type</label>
|
||||||
|
<select x-model="f.site_type" style="width:120px">
|
||||||
|
<option value="">Any</option>
|
||||||
|
<option>corporate</option><option>ecommerce</option>
|
||||||
|
<option>blog</option><option>newspaper</option>
|
||||||
|
<option>landing_page</option><option>portfolio</option>
|
||||||
|
<option>directory</option><option>forum</option>
|
||||||
|
<option>informational</option><option>other</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div style="display:flex;gap:6px;margin-bottom:10px;flex-wrap:wrap;align-items:center">
|
<div style="display:flex;gap:6px;margin-bottom:10px;flex-wrap:wrap;align-items:center">
|
||||||
@@ -375,7 +410,7 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<th></th><th>Domain</th><th>Score</th><th>KD</th><th>AI</th>
|
<th></th><th>Domain</th><th>Score</th><th>KD</th><th>AI</th>
|
||||||
<th>Niche</th><th>Type</th>
|
<th>Niche</th><th>Type</th>
|
||||||
<th>Contact</th><th>CMS</th><th>SSL days</th>
|
<th>Contact</th><th>CMS</th><th>SSL days</th>
|
||||||
<th>Country</th><th>Live</th>
|
<th>Country</th><th>Status</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
@@ -410,7 +445,7 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<!-- Niche -->
|
<!-- Niche -->
|
||||||
<td>
|
<td>
|
||||||
<span x-show="row.niche" class="pill pni" x-text="row.niche"></span>
|
<span x-show="row.niche" class="pill pni" x-text="row.niche"></span>
|
||||||
<span x-show="!row.niche" :class="prescreenStatusIcon(row.prescreen_status)" :title="row.prescreen_status||''" x-text="prescreenStatusIcon(row.prescreen_status)?'●':'—'"></span>
|
<span x-show="!row.niche" style="color:var(--border)">—</span>
|
||||||
</td>
|
</td>
|
||||||
<!-- Type -->
|
<!-- Type -->
|
||||||
<td>
|
<td>
|
||||||
@@ -440,7 +475,11 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
</td>
|
</td>
|
||||||
<td x-text="row.ssl_expiry_days??'—'"></td>
|
<td x-text="row.ssl_expiry_days??'—'"></td>
|
||||||
<td x-text="row.ip_country??'—'"></td>
|
<td x-text="row.ip_country??'—'"></td>
|
||||||
<td><span class="pill" :class="row.is_live?'pg':'pp'" x-text="row.is_live?'Yes':'—'"></span></td>
|
<td style="text-align:center">
|
||||||
|
<span x-show="row.prescreen_status" :class="prescreenStatusIcon(row.prescreen_status)" :title="row.prescreen_status">●</span>
|
||||||
|
<span x-show="!row.prescreen_status && row.is_live" class="ps-live" title="live (from enricher)">●</span>
|
||||||
|
<span x-show="!row.prescreen_status && !row.is_live" style="color:var(--border)">—</span>
|
||||||
|
</td>
|
||||||
</tr>
|
</tr>
|
||||||
</template>
|
</template>
|
||||||
</tbody>
|
</tbody>
|
||||||
@@ -509,7 +548,48 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- ④ Lead Pipeline -->
|
<!-- ④ Validator -->
|
||||||
|
<div class="card" x-show="tab==='validator'">
|
||||||
|
<div class="ct">Bulk Domain Validator</div>
|
||||||
|
<div style="font-size:12px;color:var(--muted);margin-bottom:14px">
|
||||||
|
HTTP-checks the entire dataset to determine live/dead/parked/redirect status.
|
||||||
|
Extracts server type, IP, and load time. Skips already-validated domains.
|
||||||
|
Results appear as the Status column in Browse & Filter.
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Stats grid -->
|
||||||
|
<div class="esg" style="margin-bottom:12px">
|
||||||
|
<div class="esb"><div class="ev c1" x-text="(valSt.processed??0).toLocaleString()"></div><div class="el">Checked</div></div>
|
||||||
|
<div class="esb"><div class="ev ps-live" x-text="(valSt.live??0).toLocaleString()"></div><div class="el">Live</div></div>
|
||||||
|
<div class="esb"><div class="ev ps-dead" x-text="(valSt.dead??0).toLocaleString()"></div><div class="el">Dead</div></div>
|
||||||
|
<div class="esb"><div class="ev ps-parked" x-text="(valSt.parked??0).toLocaleString()"></div><div class="el">Parked</div></div>
|
||||||
|
<div class="esb"><div class="ev ps-redirect" x-text="(valSt.redirect??0).toLocaleString()"></div><div class="el">Redirect</div></div>
|
||||||
|
<div class="esb"><div class="ev c3" x-text="(valSt.rate??0).toFixed(1)"></div><div class="el">dom/sec</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Progress line -->
|
||||||
|
<div style="font-size:11px;color:var(--muted);margin-bottom:12px"
|
||||||
|
x-text="valSt.offset ? (valSt.offset??0).toLocaleString()+' rows scanned · '+(valSt.skipped??0).toLocaleString()+' already validated (skipped)' : 'Not started yet'">
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Controls -->
|
||||||
|
<div style="display:flex;gap:8px;align-items:flex-end;flex-wrap:wrap">
|
||||||
|
<div class="field">
|
||||||
|
<label>TLD filter <span style="font-weight:400;color:var(--muted)">(leave empty for all domains)</span></label>
|
||||||
|
<input type="text" x-model="valTld" placeholder="es or com or ro" style="width:180px" :disabled="valSt.running">
|
||||||
|
</div>
|
||||||
|
<button class="btn bs" :disabled="valSt.running" @click="startValidator()">▶ Start Validator</button>
|
||||||
|
<button class="btn bd" :disabled="!valSt.running" @click="stopValidator()">⏹ Stop</button>
|
||||||
|
<span x-show="valSt.running" style="font-size:11px;color:var(--accent2);padding-bottom:6px">⚡ Running…</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Live rate progress bar (only while running) -->
|
||||||
|
<div x-show="valSt.running" style="margin-top:14px">
|
||||||
|
<div class="pw"><div class="pb" style="width:100%;animation:pulse 2s infinite"></div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ⑤ Lead Pipeline -->
|
||||||
<div class="card" x-show="tab==='pipeline'">
|
<div class="card" x-show="tab==='pipeline'">
|
||||||
<div style="display:flex;justify-content:flex-end;margin-bottom:10px;gap:8px">
|
<div style="display:flex;justify-content:flex-end;margin-bottom:10px;gap:8px">
|
||||||
<button class="btn bg sm" @click="loadPipeline()">↻ Refresh</button>
|
<button class="btn bg sm" @click="loadPipeline()">↻ Refresh</button>
|
||||||
@@ -668,8 +748,10 @@ function app() {
|
|||||||
aiSt: {pending:0,running:0,done:0,failed:0,total:0},
|
aiSt: {pending:0,running:0,done:0,failed:0,total:0},
|
||||||
domains: [], selected: [], aiLang: 'ES',
|
domains: [], selected: [], aiLang: 'ES',
|
||||||
loading: false, page: 1, searchTotal: 0,
|
loading: false, page: 1, searchTotal: 0,
|
||||||
f: {tld:'',keyword:'',min_score:0,cms:'',live_only:false,alpha_only:false,no_sld:false,kit_digital_only:false,exclude_assessed:false,limit:'100'},
|
f: {tld:'',keyword:'',min_score:0,cms:'',live_only:false,alpha_only:false,no_sld:false,kit_digital_only:false,exclude_assessed:false,limit:'100',prescreen_status:'',niche:'',site_type:''},
|
||||||
qst: {}, customDomains: '',
|
qst: {}, customDomains: '',
|
||||||
|
valSt: {running:false,processed:0,live:0,dead:0,parked:0,redirect:0,skipped:0,offset:0,rate:0},
|
||||||
|
valTld: '',
|
||||||
leadsQ: {quality:'', country:'', limit:'50'},
|
leadsQ: {quality:'', country:'', limit:'50'},
|
||||||
leadsData: [], leadsTotal: 0, leadsPage: 1, leadsLoading: false,
|
leadsData: [], leadsTotal: 0, leadsPage: 1, leadsLoading: false,
|
||||||
prescreening: false,
|
prescreening: false,
|
||||||
@@ -692,6 +774,7 @@ function app() {
|
|||||||
this._lastAiDone = this.aiSt.done ?? 0;
|
this._lastAiDone = this.aiSt.done ?? 0;
|
||||||
}
|
}
|
||||||
if(this.tab==='enrich') this.loadQueue();
|
if(this.tab==='enrich') this.loadQueue();
|
||||||
|
if(this.tab==='validator') this.loadValStatus();
|
||||||
if(this.tab==='pipeline') this.loadPipeline();
|
if(this.tab==='pipeline') this.loadPipeline();
|
||||||
if(this.tab==='leads') this.loadLeads();
|
if(this.tab==='leads') this.loadLeads();
|
||||||
}, 3000);
|
}, 3000);
|
||||||
@@ -731,6 +814,10 @@ function app() {
|
|||||||
if(this.f.cms) rows = rows.filter(r=> r.cms===this.f.cms);
|
if(this.f.cms) rows = rows.filter(r=> r.cms===this.f.cms);
|
||||||
if(this.f.kit_digital_only) rows = rows.filter(r=> r.kit_digital);
|
if(this.f.kit_digital_only) rows = rows.filter(r=> r.kit_digital);
|
||||||
if(this.f.exclude_assessed) rows = rows.filter(r=> !r.ai_lead_quality);
|
if(this.f.exclude_assessed) rows = rows.filter(r=> !r.ai_lead_quality);
|
||||||
|
if(this.f.prescreen_status==='none') rows = rows.filter(r=> !r.prescreen_status);
|
||||||
|
else if(this.f.prescreen_status) rows = rows.filter(r=> r.prescreen_status===this.f.prescreen_status);
|
||||||
|
if(this.f.niche) rows = rows.filter(r=> r.niche===this.f.niche);
|
||||||
|
if(this.f.site_type) rows = rows.filter(r=> r.site_type===this.f.site_type);
|
||||||
this.domains = rows;
|
this.domains = rows;
|
||||||
} catch(e) {
|
} catch(e) {
|
||||||
this.domains = [];
|
this.domains = [];
|
||||||
@@ -740,7 +827,7 @@ function app() {
|
|||||||
},
|
},
|
||||||
|
|
||||||
selectAll() { this.selected = this.domains.map(d=>d.domain); },
|
selectAll() { this.selected = this.domains.map(d=>d.domain); },
|
||||||
resetFilters() { this.f={tld:'',keyword:'',min_score:0,cms:'',live_only:false,alpha_only:false,no_sld:false,kit_digital_only:false,exclude_assessed:false,limit:'100'}; },
|
resetFilters() { this.f={tld:'',keyword:'',min_score:0,cms:'',live_only:false,alpha_only:false,no_sld:false,kit_digital_only:false,exclude_assessed:false,limit:'100',prescreen_status:'',niche:'',site_type:''}; },
|
||||||
|
|
||||||
async enqueueSelected() {
|
async enqueueSelected() {
|
||||||
if(!this.selected.length) return;
|
if(!this.selected.length) return;
|
||||||
@@ -852,6 +939,22 @@ function app() {
|
|||||||
try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
|
try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
async loadValStatus() {
|
||||||
|
try { this.valSt = await fetch('/api/validator/status').then(r=>r.json()); } catch(e){}
|
||||||
|
},
|
||||||
|
async startValidator() {
|
||||||
|
const p = new URLSearchParams();
|
||||||
|
if(this.valTld.trim()) p.set('tld', this.valTld.trim());
|
||||||
|
await fetch('/api/validator/start'+(p.toString()? '?'+p : ''), {method:'POST'});
|
||||||
|
this.notify('Validator started', 'success');
|
||||||
|
await this.loadValStatus();
|
||||||
|
},
|
||||||
|
async stopValidator() {
|
||||||
|
await fetch('/api/validator/stop', {method:'POST'});
|
||||||
|
this.notify('Validator stopped', 'info');
|
||||||
|
await this.loadValStatus();
|
||||||
|
},
|
||||||
|
|
||||||
async restartAiWorker() { await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
|
async restartAiWorker() { await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
|
||||||
copyEmail() {
|
copyEmail() {
|
||||||
const subj = this.modal.ai.email_subject ? `Subject: ${this.modal.ai.email_subject}\n\n` : '';
|
const subj = this.modal.ai.email_subject ? `Subject: ${this.modal.ai.email_subject}\n\n` : '';
|
||||||
|
|||||||
292
app/validator.py
Normal file
292
app/validator.py
Normal file
@@ -0,0 +1,292 @@
|
|||||||
|
"""Bulk domain validator — fast HTTP checks for the entire dataset.
|
||||||
|
|
||||||
|
Reads domains from DuckDB in batches, skips already-validated ones,
|
||||||
|
performs concurrent HTTP checks, and saves prescreen_status + server +
|
||||||
|
ip + load_time_ms to enriched_domains.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import aiosqlite
|
||||||
|
import duckdb
|
||||||
|
|
||||||
|
from app.db import SQLITE_PATH, DUCKDB_PATH, index_status
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Size of the semaphore that bounds concurrent domain checks in
# _validator_loop; overridable through the VAL_CONCURRENCY env variable.
VAL_CONCURRENCY = int(os.getenv("VAL_CONCURRENCY", "50"))
# Number of domains requested from DuckDB per batch; overridable through
# the VAL_BATCH env variable.
VAL_BATCH = int(os.getenv("VAL_BATCH", "200"))

# Lower-case phrases; _check_domain marks a page "parked" when any of them
# occurs in the lower-cased first 20,000 characters of the response body.
PARKING_BODY_SIGNALS = [
    "domain is parked", "this domain is for sale", "buy this domain",
    "domain parking", "parked domain", "hugedomains.com", "sedo.com",
    "parkingcrew.com", "bodis.com", "dan.com", "afternic.com",
    "sedoparking.com", "undeveloped.com", "epik.com/domain",
    "this web page is parked", "domain has expired",
]
# Hosts compared against the final host after a cross-domain redirect; a
# match yields "parked" instead of "redirect".
PARKING_REDIRECT_HOSTS = {
    "sedo.com", "hugedomains.com", "dan.com", "afternic.com",
    "parkingcrew.com", "bodis.com", "undeveloped.com", "epik.com",
    "uniregistry.com", "sedoparking.com",
}

# Browser-like User-Agent string sent with every check.
_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)
# Default request headers handed to the httpx client in _check_domain.
_HEADERS = {
    "User-Agent": _UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

# Handle of the background task created by start_validator (None until then).
_val_task: Optional[asyncio.Task] = None
# Shared, mutable progress state; get_validator_status() returns a copy.
_val_stats: dict = {
    "running": False,    # True while the loop is (about to be) active
    "processed": 0,      # domains checked and saved
    "live": 0,           # per-status counters, keyed by prescreen_status
    "dead": 0,
    "parked": 0,
    "redirect": 0,
    "skipped": 0,        # domains that already had a prescreen_status
    "offset": 0,         # DuckDB row offset reached so far
    "rate": 0.0,         # rolling average of domains checked per second
    "tld_filter": None,  # TLD filter passed to the most recent start
}
|
||||||
|
|
||||||
|
|
||||||
|
def _same_domain(original: str, final_url: str) -> bool:
|
||||||
|
orig = original.lower().lstrip("www.").split(":")[0]
|
||||||
|
final = urlparse(final_url).netloc.lower().lstrip("www.")
|
||||||
|
return orig == final or final.endswith("." + orig) or orig.endswith("." + final)
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_ip(domain: str) -> Optional[str]:
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
return await loop.run_in_executor(None, socket.gethostbyname, domain)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def _check_domain(domain: str) -> dict:
    """Fetch ``http://<domain>`` once and classify the domain.

    Returns a dict with the keys ``domain``, ``prescreen_status``,
    ``status_code``, ``server``, ``ip`` and ``load_time_ms``.
    ``prescreen_status`` is one of:

    * ``"dead"``     - request failed, or final status is not 200/203
    * ``"parked"``   - redirected to a known parking host, or the body
                       contains a parking phrase
    * ``"redirect"`` - redirected to a different, non-parking domain
    * ``"live"``     - everything else

    Exceptions are never propagated: any failure is logged at debug level
    and reported as ``"dead"`` with the elapsed time in ``load_time_ms``.
    """
    result: dict = {
        "domain": domain,
        "prescreen_status": "dead",
        "status_code": None,
        "server": None,
        "ip": None,
        "load_time_ms": None,
    }
    t0 = time.monotonic()
    try:
        # NOTE: verify=False disables TLS certificate validation for the
        # redirect chain; only reachability is being judged here.
        async with httpx.AsyncClient(
            timeout=httpx.Timeout(connect=5, read=8, write=5, pool=10),
            follow_redirects=True,
            headers=_HEADERS,
            verify=False,
            max_redirects=5,
        ) as client:
            resp = await client.get(f"http://{domain}")

        result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
        result["status_code"] = resp.status_code
        result["server"] = (resp.headers.get("server") or "")[:100]

        # Resolve the IP for every domain that produced an HTTP response.
        result["ip"] = await _resolve_ip(domain)

        final_url = str(resp.url)
        # .hostname drops any port; removeprefix (not lstrip) drops exactly
        # one leading "www." instead of every leading "w"/"." character.
        final_host = (urlparse(final_url).hostname or "").removeprefix("www.")

        # Redirected to a different root domain?
        if not _same_domain(domain, final_url):
            # Match whole host labels: a plain substring test would flag
            # e.g. "jordan.com" because it contains "dan.com".
            lands_on_parking = any(
                final_host == ph or final_host.endswith("." + ph)
                for ph in PARKING_REDIRECT_HOSTS
            )
            result["prescreen_status"] = "parked" if lands_on_parking else "redirect"
            return result

        if resp.status_code not in (200, 203):
            return result  # dead

        html_lc = resp.text[:20_000].lower()
        if any(sig in html_lc for sig in PARKING_BODY_SIGNALS):
            result["prescreen_status"] = "parked"
            return result

        result["prescreen_status"] = "live"
        return result

    except Exception as e:
        logger.debug("Validator %s: %s", domain, e)
        result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
        return result
|
||||||
|
|
||||||
|
|
||||||
|
def _get_domains_batch(offset: int, limit: int, tld: Optional[str]) -> list[str]:
    """Read one page of domain names from the DuckDB ``domains`` table.

    Args:
        offset: Number of rows to skip.
        limit: Maximum number of rows to return.
        tld: Optional TLD filter; lower-cased and stripped of leading dots
            before being compared with the ``tld`` column.

    Returns:
        The domain names of the page. An empty list is returned both when
        the page is empty and when any error occurs (the error is logged).
    """
    # NOTE(review): the queries have no ORDER BY, so stable paging relies on
    # DuckDB returning rows in a consistent order between calls - confirm.
    try:
        conn = duckdb.connect(str(DUCKDB_PATH), read_only=True)
        try:
            conn.execute("SET threads=2")
            if tld:
                rows = conn.execute(
                    "SELECT domain FROM domains WHERE tld=? LIMIT ? OFFSET ?",
                    [tld.lower().lstrip("."), limit, offset],
                ).fetchall()
            else:
                rows = conn.execute(
                    "SELECT domain FROM domains LIMIT ? OFFSET ?",
                    [limit, offset],
                ).fetchall()
        finally:
            # Close even when a query raises, so the handle is not leaked.
            conn.close()
        return [r[0] for r in rows]
    except Exception as e:
        logger.error("Validator DuckDB error: %s", e)
        return []
|
||||||
|
|
||||||
|
|
||||||
|
async def _filter_unvalidated(domains: list[str]) -> list[str]:
|
||||||
|
"""Return only domains that don't have a prescreen_status set yet."""
|
||||||
|
if not domains:
|
||||||
|
return []
|
||||||
|
placeholders = ",".join("?" * len(domains))
|
||||||
|
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||||
|
async with db.execute(
|
||||||
|
f"SELECT domain FROM enriched_domains "
|
||||||
|
f"WHERE domain IN ({placeholders}) AND prescreen_status IS NOT NULL",
|
||||||
|
domains,
|
||||||
|
) as cur:
|
||||||
|
already = {r[0] async for r in cur}
|
||||||
|
return [d for d in domains if d not in already]
|
||||||
|
|
||||||
|
|
||||||
|
async def _save_batch(results: list[dict]):
|
||||||
|
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||||
|
for r in results:
|
||||||
|
await db.execute(
|
||||||
|
"""INSERT INTO enriched_domains
|
||||||
|
(domain, prescreen_status, status_code, server, ip, load_time_ms, prescreen_at)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
|
||||||
|
ON CONFLICT(domain) DO UPDATE SET
|
||||||
|
prescreen_status = excluded.prescreen_status,
|
||||||
|
status_code = COALESCE(excluded.status_code, status_code),
|
||||||
|
server = COALESCE(NULLIF(excluded.server,''), server),
|
||||||
|
ip = COALESCE(excluded.ip, ip),
|
||||||
|
load_time_ms = excluded.load_time_ms,
|
||||||
|
prescreen_at = excluded.prescreen_at""",
|
||||||
|
(
|
||||||
|
r["domain"], r["prescreen_status"], r.get("status_code"),
|
||||||
|
r.get("server"), r.get("ip"), r.get("load_time_ms"),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
|
||||||
|
|
||||||
|
async def _validator_loop(tld_filter: Optional[str]):
    """Background task: page through the dataset and validate each domain.

    Starts at ``_val_stats["offset"]``, waits up to 10 minutes (120 polls,
    5 s apart) for ``index_status()["ready"]``, then repeatedly:

    1. reads ``VAL_BATCH`` domains from DuckDB (in the default executor),
    2. drops those that already have a ``prescreen_status``,
    3. checks the rest with at most ``VAL_CONCURRENCY`` requests in flight,
    4. saves the results and updates the counters in ``_val_stats``.

    The loop ends when a batch comes back empty. The offset is advanced only
    after a batch has been fully handled, so a cancelled batch is re-read on
    the next start instead of being skipped. ``running`` and ``offset`` are
    always written back on exit; cancellation is logged and re-raised.
    """
    _val_stats["running"] = True
    offset = _val_stats["offset"]
    sem = asyncio.Semaphore(VAL_CONCURRENCY)
    rate_buf: list[float] = []

    async def _run(d: str) -> dict:
        async with sem:
            return await _check_domain(d)

    try:
        # Wait for DuckDB index to be ready (up to 10 minutes)
        for _ in range(120):
            if index_status()["ready"]:
                break
            logger.info("Validator: waiting for DuckDB index…")
            await asyncio.sleep(5)
        else:
            logger.error("Validator: DuckDB index never became ready")
            return

        loop = asyncio.get_running_loop()
        while True:
            batch = await loop.run_in_executor(
                None, _get_domains_batch, offset, VAL_BATCH, tld_filter
            )
            if not batch:
                logger.info("Validator: dataset complete at offset=%d", offset)
                break

            to_check = await _filter_unvalidated(batch)
            results: list[dict] = []
            t0 = time.monotonic()

            if to_check:
                raw = await asyncio.gather(
                    *(_run(d) for d in to_check), return_exceptions=True
                )
                results = [r for r in raw if isinstance(r, dict)]
                if len(results) != len(to_check):
                    logger.warning(
                        "Validator: %d checks raised and were not saved",
                        len(to_check) - len(results),
                    )
                await _save_batch(results)
            else:
                await asyncio.sleep(0)  # yield to event loop

            # Commit progress only now that the batch is persisted.
            _val_stats["skipped"] += len(batch) - len(to_check)
            offset += len(batch)
            _val_stats["offset"] = offset

            if not to_check:
                continue

            for r in results:
                _val_stats["processed"] += 1
                s = r.get("prescreen_status", "dead")
                _val_stats[s] = _val_stats.get(s, 0) + 1

            # Rolling average over the last 10 batches.
            elapsed = max(time.monotonic() - t0, 0.01)
            rate_buf.append(len(results) / elapsed)
            if len(rate_buf) > 10:
                rate_buf.pop(0)
            _val_stats["rate"] = round(sum(rate_buf) / len(rate_buf), 1)

            logger.info(
                "Validator off=%d proc=%d live=%d dead=%d parked=%d rate=%.1f/s",
                offset, _val_stats["processed"], _val_stats["live"],
                _val_stats["dead"], _val_stats["parked"], _val_stats["rate"],
            )

    except asyncio.CancelledError:
        logger.info("Validator cancelled at offset=%d", offset)
        raise
    except Exception as e:
        logger.error("Validator loop error: %s", e, exc_info=True)
    finally:
        _val_stats["running"] = False
        _val_stats["offset"] = offset
|
||||||
|
|
||||||
|
|
||||||
|
def get_validator_status() -> dict:
    """Return a shallow copy of the validator's progress counters."""
    snapshot = _val_stats.copy()
    return snapshot
|
||||||
|
|
||||||
|
|
||||||
|
def start_validator(tld_filter: Optional[str] = None):
    """Launch the validator background task unless one is already running.

    Counters and offset are reset when nothing has been processed yet or
    when *tld_filter* differs from the previous run's filter, because a
    stored offset only makes sense for the query it was produced by.
    Otherwise the run resumes from the stored offset.

    Must be called from a running event loop (uses ``asyncio.create_task``).
    """
    global _val_task
    if _val_task and not _val_task.done():
        return  # already running

    # Compare before the stored filter is overwritten below.
    filter_changed = tld_filter != _val_stats.get("tld_filter")
    _val_stats["running"] = True
    _val_stats["tld_filter"] = tld_filter

    if filter_changed or not _val_stats.get("processed"):
        _val_stats.update(
            processed=0, live=0, dead=0, parked=0,
            redirect=0, skipped=0, offset=0, rate=0.0,
        )
    _val_task = asyncio.create_task(_validator_loop(tld_filter))
    logger.info("Validator started (tld=%s, offset=%d)", tld_filter, _val_stats["offset"])
|
||||||
|
|
||||||
|
|
||||||
|
def stop_validator():
    """Mark the validator as not running and cancel its task if still active.

    The cancellation is only requested here; this function does not wait
    for the task to finish.
    """
    global _val_task, _val_stats
    _val_stats["running"] = False
    if _val_task and not _val_task.done():
        _val_task.cancel()
    logger.info("Validator stop requested")
|
||||||
Reference in New Issue
Block a user