diff --git a/app/beauty_ai.py b/app/beauty_ai.py index 2daec93..01b2eb0 100644 --- a/app/beauty_ai.py +++ b/app/beauty_ai.py @@ -128,9 +128,22 @@ BEAUTY_CATEGORIES = [ # ── Brand detection (fast pre-scan, no AI) ───────────────────────────────────── def detect_brands_in_text(text: str) -> list[str]: - """Find which brands from the universe appear in the scraped page text.""" + """Find which brands from the universe appear in the scraped page text. + + Short brands (≤5 chars) use word-boundary matching to avoid false positives + like 'ref' matching 'reference', 'prefer', 'refresh', etc. + """ tl = text.lower() - return [b for b in BEAUTY_BRANDS if b.lower() in tl][:60] + result = [] + for b in BEAUTY_BRANDS: + bl = b.lower() + if len(bl) <= 5: + if re.search(r'(? list[str]: @@ -247,6 +260,8 @@ Respond ONLY with valid JSON, no markdown, no text outside JSON: "dist_matches": ["OurBrand1","OurBrand2"], "contact_email": "email or empty string", "contact_phone": "phone or empty string", + "contact_whatsapp": "whatsapp link or empty string", + "contact_social": "primary social profile URL or empty string", "b2b_proposal": "1-2 sentence value proposition in Spanish referencing their categories and our matching brands", "outreach_subject": "short Spanish subject line referencing their business name", "outreach_email": "3-4 sentence ready-to-send email in Spanish. Mention their business, 1-2 specific brands from our portfolio that match their range, and a clear call to action (catálogo, muestra, llamada).", @@ -336,12 +351,27 @@ async def assess_beauty_domain(analysis: dict) -> dict: if not result.get("detected_brands") and detected: result["detected_brands"] = detected + # Always merge contact data directly from site_analyzer — more reliable + # than AI extraction since it uses regex against raw HTML + phones = analysis.get("phones", []) + whatsapp = analysis.get("whatsapp", []) + social_links = analysis.get("social_links", []) + if phones and not result.get("contact_phone"): + result["contact_phone"] = phones[0] + if whatsapp: + result["contact_whatsapp"] = "; ".join(whatsapp[:2]) + if social_links: + result["contact_social"] = "; ".join(social_links[:3]) + logger.info("Beauty AI %s → quality=%s, dist_matches=%s", domain, result.get("lead_quality"), result.get("dist_matches")) return result except Exception as e: logger.error("Beauty AI error %s: %s", domain, e) + phones = analysis.get("phones", []) + whatsapp = analysis.get("whatsapp", []) + social = analysis.get("social_links", []) return { "error": str(e)[:300], "is_relevant": False, @@ -349,4 +379,7 @@ async def assess_beauty_domain(analysis: dict) -> dict: "dist_matches": dist_match, "detected_brands": detected, "contact_email": "", + "contact_phone": phones[0] if phones else "", + "contact_whatsapp": "; ".join(whatsapp[:2]) if whatsapp else "", + "contact_social": "; ".join(social[:3]) if social else "", } diff --git a/app/beauty_main.py b/app/beauty_main.py index 31e12d8..7f57474 100644 --- a/app/beauty_main.py +++ b/app/beauty_main.py @@ -22,7 +22,7 @@ load_dotenv() from app.db import ( SQLITE_PATH, init_db, get_stats, get_domains, get_enriched, build_duckdb_index, index_status, - queue_beauty, get_beauty_queue_status, save_beauty_assessment, get_beauty_leads, + queue_beauty, requeue_beauty, get_beauty_queue_status, save_beauty_assessment, get_beauty_leads, save_prescreen_results, ) from app.validator import start_validator, stop_validator, get_validator_status @@ -277,6 +277,17 @@ async def beauty_assess_batch(body: dict): return {"queued": len(domains_list)} +@app.post("/api/beauty/reassess/batch") +async def beauty_reassess_batch(body: dict): + """Re-queue domains for fresh assessment, resetting any existing result.""" + domains_list = body.get("domains", []) + if not domains_list: + return JSONResponse({"error": "no domains provided"}, status_code=400) + await requeue_beauty(domains_list) + _start_beauty_worker() + return {"requeued": len(domains_list)} + + @app.post("/api/beauty/worker/restart") async def beauty_worker_restart(): _start_beauty_worker() diff --git a/app/db.py b/app/db.py index 21c63ea..7854ef3 100644 --- a/app/db.py +++ b/app/db.py @@ -540,6 +540,19 @@ async def queue_beauty(domains: list[str]): await db.commit() +async def requeue_beauty(domains: list[str]): + """Re-queue domains for fresh assessment even if already assessed.""" + async with aiosqlite.connect(SQLITE_PATH, timeout=30) as db: + await db.executemany( + """INSERT INTO beauty_queue (domain, status) + VALUES (?, 'pending') + ON CONFLICT(domain) DO UPDATE SET + status='pending', completed_at=NULL, error=NULL""", + [(d,) for d in domains], + ) + await db.commit() + + async def get_beauty_queue_status(): async with aiosqlite.connect(SQLITE_PATH, timeout=30) as db: async with db.execute("SELECT status, COUNT(*) FROM beauty_queue GROUP BY status") as cur: diff --git a/app/static/beauty/index.html b/app/static/beauty/index.html index 1cf42e5..57fb0d5 100644 --- a/app/static/beauty/index.html +++ b/app/static/beauty/index.html @@ -180,6 +180,10 @@ textarea{width:100%;resize:vertical;font-family:monospace;font-size:12px} Screening… + @@ -227,6 +231,7 @@ textarea{width:100%;resize:vertical;font-family:monospace;font-size:12px}
+
@@ -446,7 +458,8 @@ function app() { valSt: {running:false,processed:0,live:0,dead:0,error:0,parked:0,redirect:0,skipped:0,offset:0,rate:0}, valTld: '', valRescan: false, toasts: [], - prescreening: false, validating: false, + prescreening: false, validating: false, reassessing: false, + _loadGen: 0, // incremented on every loadDomains() call; stale responses are discarded exportQuality: '', exportCountry: '', f: {keyword:'', tld:'', prescreen_status:'live', niche:'beauty_cosmetics', site_type:'ecommerce', country:'', assessed:'', alpha_only:false, no_sld:false, limit:'100', page:1}, @@ -491,52 +504,72 @@ function app() { goSearch() { this.f.page=1; this.loadDomains(); }, async loadDomains() { + // Generation counter: if a newer call starts while this one is awaiting the + // network, this call's result is stale and must be discarded. This prevents + // auto-advance's background fetches from overwriting a fresh user-triggered + // search that completes after the stale fetch returns. + const gen = ++this._loadGen; this.loading = true; try { - const p = new URLSearchParams({page: this.f.page, limit: this.f.limit}); - if (this.f.keyword) p.set('keyword', this.f.keyword.trim()); - if (this.f.tld) p.set('tld', this.f.tld.trim()); - if (this.f.alpha_only) p.set('alpha_only', 'true'); - if (this.f.no_sld) p.set('no_sld', 'true'); + // Snapshot filter state NOW (before any await), so the URL we build + // matches the filters the user actually requested. + const snap = {...this.f}; + const p = new URLSearchParams({page: snap.page, limit: snap.limit}); + if (snap.keyword) p.set('keyword', snap.keyword.trim()); + if (snap.tld) p.set('tld', snap.tld.trim()); + if (snap.alpha_only) p.set('alpha_only', 'true'); + if (snap.no_sld) p.set('no_sld', 'true'); // 'none' (Not checked) = domains never in the pipeline → DuckDB search. // Any real status (live/dead/…), niche, site_type, country, or assessed // requires the SQLite enriched_domains table (all server-side). - const hasEnrichFilter = (this.f.prescreen_status && this.f.prescreen_status !== 'none') - || this.f.niche || this.f.site_type || this.f.country || this.f.assessed; + const hasEnrichFilter = (snap.prescreen_status && snap.prescreen_status !== 'none') + || snap.niche || snap.site_type || snap.country || snap.assessed; let endpoint; if (hasEnrichFilter) { - if (this.f.prescreen_status) p.set('prescreen_status', this.f.prescreen_status); - if (this.f.niche) p.set('niche', this.f.niche); - if (this.f.site_type) p.set('site_type', this.f.site_type); - if (this.f.country) p.set('country', this.f.country.trim().toUpperCase()); - if (this.f.assessed) p.set('assessed', this.f.assessed); + if (snap.prescreen_status) p.set('prescreen_status', snap.prescreen_status); + if (snap.niche) p.set('niche', snap.niche); + if (snap.site_type) p.set('site_type', snap.site_type); + if (snap.country) p.set('country', snap.country.trim().toUpperCase()); + if (snap.assessed) p.set('assessed', snap.assessed); endpoint = '/api/enriched'; } else { endpoint = '/api/domains'; } - const d = await fetch(endpoint + '?' + p).then(r=>r.json()); + const d = await fetch(endpoint + '?' + p).then(r => { + if (!r.ok) throw new Error(`Server error ${r.status}`); + return r.json(); + }); + + // Discard if a newer search was started while this one was in-flight + if (gen !== this._loadGen) return; + this.domainsTotal = d.total || 0; let rows = d.results || []; // 'Not checked': DuckDB returns all domains joined with enriched data; // keep only those with no prescreen_status yet (truly unprocessed). - if (this.f.prescreen_status === 'none') rows = rows.filter(r => !r.prescreen_status); + if (snap.prescreen_status === 'none') rows = rows.filter(r => !r.prescreen_status); // Auto-advance: current DuckDB page was fully processed → try next page // (prevents "0 results" after bulk-validating a page of Not checked domains) - if (rows.length === 0 && this.f.prescreen_status === 'none' - && (d.results||[]).length > 0 && this.f.page < 500) { - this.f.page++; - this.loading = false; - await this.loadDomains(); + if (rows.length === 0 && snap.prescreen_status === 'none' + && (d.results||[]).length > 0 && snap.page < 500) { + this.f.page = snap.page + 1; + // Do NOT await — start the next page search as a fresh call so the + // generation counter works correctly; this call ends immediately. + this.loadDomains(); return; } this.domains = rows; - } catch(e) { this.notify('Failed to load: '+e.message, 'error'); } - finally { this.loading = false; } + } catch(e) { + if (gen !== this._loadGen) return; + this.notify('Failed to load: '+e.message, 'error'); + } finally { + if (gen === this._loadGen) this.loading = false; + } }, async loadLeads() { @@ -644,6 +677,30 @@ function app() { } catch(e) { this.notify('Failed: '+e.message, 'error'); } }, + async reassessSelected() { + if (!this.selected.length || this.reassessing) return; + this.reassessing = true; + try { + const d = await fetch('/api/beauty/reassess/batch', { + method:'POST', headers:{'Content-Type':'application/json'}, + body: JSON.stringify({domains: this.selected}), + }).then(r=>r.json()); + this.notify(`Re-queued ${d.requeued} domains for fresh B2B assessment`, 'success'); + this.selected = []; + } catch(e) { this.notify('Re-assess failed: '+e.message, 'error'); } + finally { this.reassessing = false; } + }, + + async reassessOne(domain) { + try { + await fetch('/api/beauty/reassess/batch', { + method:'POST', headers:{'Content-Type':'application/json'}, + body: JSON.stringify({domains:[domain]}), + }); + this.notify(`${domain} re-queued for fresh assessment`, 'success'); + } catch(e) { this.notify('Failed: '+e.message, 'error'); } + }, + toggleLead(domain) { this.expandedLead = this.expandedLead===domain ? null : domain; },