diff --git a/app/beauty_ai.py b/app/beauty_ai.py index 01b2eb0..9c1839e 100644 --- a/app/beauty_ai.py +++ b/app/beauty_ai.py @@ -19,6 +19,18 @@ REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_7I7Feai78f9PzMOs20y5GVFKi REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions" AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3")) +# Contact extraction regexes (same patterns as site_analyzer) +_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") +_PHONE_RE = re.compile(r"(?:\+\d{1,3}[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}") + +# Pages that often contain company registration info (CIF/NIF, registered address, +# legal email) — not fetched by site_analyzer, but rich sources for B2B contact data +_LEGAL_PATHS = [ + "/aviso-legal", "/aviso_legal", "/legal", + "/politica-de-privacidad", "/politica_privacidad", "/privacidad", + "/quienes-somos", "/quienes_somos", "/nosotros", +] + _ai_sem: Optional[asyncio.Semaphore] = None def _sem() -> asyncio.Semaphore: @@ -182,91 +194,214 @@ async def _ddg_search(query: str) -> str: return "" +# ── Legal / about page scraper ──────────────────────────────────────────────── + +async def _scrape_legal_pages(domain: str) -> dict: + """Fetch legal and about pages not covered by site_analyzer. + + Spanish Aviso Legal pages legally must contain: company name (razón social), + CIF/NIF, registered address, and a contact email — making them the richest + source of verified B2B contact data. + + Returns: + emails: all unique emails found across all pages + phones: all unique phones found across all pages + legal_snippet: first 800 chars of the aviso legal page (company registration + info: razón social, CIF, domicilio, etc.) + """ + result: dict = {"emails": [], "phones": [], "legal_snippet": ""} + + async def _fetch(path: str) -> tuple[str, str | None]: + try: + async with httpx.AsyncClient( + timeout=8, follow_redirects=True, verify=False, + headers={"User-Agent": "Mozilla/5.0"}, + ) as c: + r = await c.get(f"https://{domain}{path}") + if r.status_code == 200: + return path, r.text + except Exception: + pass + return path, None + + pages = await asyncio.gather(*[_fetch(p) for p in _LEGAL_PATHS]) + + for path, html in pages: + if not html: + continue + try: + soup = BeautifulSoup(html, "html.parser") + # Extract from anchor tags + for a in soup.find_all("a", href=True): + href = a["href"] + if href.startswith("mailto:"): + em = href[7:].split("?")[0].strip().lower() + if em and em not in result["emails"]: + result["emails"].append(em) + elif href.startswith("tel:"): + ph = re.sub(r"[^\d+]", "", href[4:]) + if ph and ph not in result["phones"]: + result["phones"].append(ph) + # Regex scan full HTML for emails + for em in _EMAIL_RE.findall(html[:60000]): + em = em.lower() + if em not in result["emails"] and not any( + em.endswith(x) for x in (".png", ".jpg", ".css", ".js", ".svg") + ): + result["emails"].append(em) + # Regex scan visible text for phones + visible = soup.get_text(separator=" ", strip=True) + for ph in _PHONE_RE.findall(visible): + ph_c = re.sub(r"[\s\-]", "", ph) + if ph_c and ph_c not in result["phones"]: + result["phones"].append(ph_c) + # Capture legal snippet from the first legal page that resolves + if not result["legal_snippet"] and any( + k in path for k in ("aviso", "legal", "privacidad") + ): + result["legal_snippet"] = " ".join(visible.split()[:150]) + except Exception: + pass + + result["emails"] = list(dict.fromkeys(result["emails"]))[:8] + result["phones"] = list(dict.fromkeys(result["phones"]))[:6] + return result + + # ── Prompt builder ───────────────────────────────────────────────────────────── def _build_beauty_prompt(a: dict, detected_brands: list, dist_matches: list, - search_results: str = "") -> str: - contacts_block = [] - if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}") - if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}") - if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}") - contacts_str = "\n".join(contacts_block) or " Not found" + search_results: str = "", + extra_contacts: dict | None = None) -> str: + """Build the Gemini assessment prompt. - snippet = (a.get("visible_text_snippet") or "")[:1200] - title = a.get("page_title") or "" - meta = a.get("meta_description") or "" - country = a.get("ip_country") or "unknown" - cms = a.get("cms") or "unknown" - detected_str = ", ".join(detected_brands) if detected_brands else "none detected" - dist_str = ", ".join(dist_matches) if dist_matches else "none" + extra_contacts comes from _scrape_legal_pages() and adds emails/phones/company + info found in the aviso legal, privacy policy, and about pages. + """ + ec = extra_contacts or {} - return f"""You are a senior B2B sales analyst for a cosmetics distribution company operating in Europe. -Your task: evaluate whether this website is a viable B2B customer (retailer, multi-brand store, -e-commerce, distributor or chain that buys beauty products wholesale) and generate an outreach plan. + # Merge contact sources: site_analyzer (main page + contact pages) + legal pages + all_emails = list(dict.fromkeys((a.get("emails") or []) + (ec.get("emails") or [])))[:8] + all_phones = list(dict.fromkeys((a.get("phones") or []) + (ec.get("phones") or [])))[:6] + all_whatsapp = list(dict.fromkeys(a.get("whatsapp") or []))[:4] + all_social = list(dict.fromkeys(a.get("social_links") or []))[:6] -=== SITE DATA === -Domain: {a.get("domain")} -Country (IP): {country} -Title: {title} -Meta desc: {meta} -CMS: {cms} -Contact info: -{contacts_str} + def _fmt(lst: list) -> str: + return ", ".join(lst) if lst else "—" + + # Site technical signals + ssl_info = ("✓ valid" if a.get("ssl_valid") else "✗ invalid/missing") + analytics = ", ".join(a.get("analytics_present") or []) or "none detected" + word_count = a.get("word_count", 0) + load_ms = a.get("load_time_ms", 0) + copyright = a.get("copyright_year") or a.get("last_modified") or "unknown" + + snippet = (a.get("visible_text_snippet") or "")[:1600] + legal_snippet = (ec.get("legal_snippet") or "")[:800] + detected_str = ", ".join(detected_brands) if detected_brands else "none detected" + dist_str = ", ".join(dist_matches) if dist_matches else "none" + + return f"""You are a senior B2B sales analyst for a cosmetics distribution company +operating across Europe. Your task: thoroughly evaluate this website as a potential +wholesale B2B customer and produce a complete outreach dossier. + +=== BUSINESS PROFILE === +Domain: {a.get("domain")} +Country (IP): {a.get("ip_country") or "unknown"} +Region: {a.get("ip_region") or "unknown"} +Hosting (EU?): {a.get("eu_hosted")} | ISP/Org: {a.get("org") or a.get("isp") or "unknown"} +Page title: {a.get("page_title") or "—"} +H1: {a.get("h1_text") or "—"} +Meta desc: {(a.get("meta_description") or "—")[:200]} +CMS: {a.get("cms") or "unknown"} +Last updated: {copyright} + +=== TECHNICAL SIGNALS === +SSL: {ssl_info} +Load time: {load_ms}ms +Word count: {word_count} +Analytics: {analytics} +Mobile: {"yes" if a.get("has_mobile_viewport") else "no"} +Sitemap/Robots: sitemap={"yes" if a.get("has_sitemap") else "no"}, robots={"yes" if a.get("has_robots") else "no"} +GDPR/Privacy: cookie_tool={a.get("cookie_tool") or "none"}, privacy_policy={"yes" if a.get("has_privacy_policy") else "no"} + +=== ALL CONTACT CHANNELS === +Emails: {_fmt(all_emails)} +Phones: {_fmt(all_phones)} +WhatsApp: {_fmt(all_whatsapp)} +Social media: {_fmt(all_social)} + +=== LEGAL / COMPANY REGISTRATION INFO === +(extracted from aviso legal / política de privacidad — may contain razón social, CIF, address) +{legal_snippet or "Not found or page not accessible"} === PAGE CONTENT SAMPLE === {snippet} -=== BRANDS ALREADY DETECTED ON SITE === +=== BRANDS DETECTED ON SITE === {detected_str} === OUR PORTFOLIO BRANDS FOUND ON THEIR SITE === -(These brands we distribute — finding them means we're already in their market) +(brands we distribute that appear on their site — confirms shared market) {dist_str} === WEB SEARCH RESULTS === -{(search_results or "No results.")[:500]} +{(search_results or "No results available.")[:700]} -=== OUR DISTRIBUTION PORTFOLIO === +=== OUR FULL DISTRIBUTION PORTFOLIO === {', '.join(OUR_BRANDS)} === BEAUTY CATEGORIES WE COVER === {', '.join(BEAUTY_CATEGORIES)} === ASSESSMENT RULES === -1. Determine if this is a B2B prospect: retailer, pharmacy, parafarmacia, - perfumería, multi-brand beauty ecommerce, salon chain, supermarket beauty section, - or beauty products distributor based in Europe. -2. Identify which categories from our list they cover. -3. From the page content (even if brands list is empty), identify any beauty brands mentioned. -4. Match detected brands against our portfolio — this drives lead quality: - - HOT: 3+ of our portfolio brands detected, OR a large EU retailer clearly in our niche - - WARM: 1-2 portfolio brand matches, OR clear beauty retailer with good potential - - COLD: beauty-adjacent but weak match, OR can't confirm they buy wholesale - - NOT_RELEVANT: not a beauty business or not in Europe -5. Write all human text (proposal, email) in Spanish. -6. Keep JSON values concise (≤ 25 words each). +1. TARGET PROFILE: retailer, pharmacy, parafarmacia, perfumería, multi-brand beauty + ecommerce, salon chain, beauty distributor, or supermarket beauty section in Europe. +2. Identify ALL beauty brands mentioned anywhere on the page — go beyond the pre-detected + list above. Use product names, brand references in body text, alt text, etc. +3. Match brands against our portfolio. Lead quality is driven by portfolio overlap: + - HOT: 3+ portfolio brands detected, OR major EU beauty retailer clearly in our niche + - WARM: 1-2 portfolio brand matches, OR clear beauty multi-brand retailer with good reach + - COLD: beauty-adjacent but weak portfolio overlap, OR single-brand, OR unclear wholesale + - NOT_RELEVANT: not a beauty business, not in Europe, or clearly a consumer-only brand +4. Extract the BEST contact for outreach: + - Prefer business/commercial emails (info@, ventas@, compras@, admin@) over personal + - If WhatsApp exists, flag it — it's often the fastest channel in Spain/LatAm + - Check social media for direct messaging channels +5. Use the legal/company info to identify the official business name (razón social), + and if a CIF/NIF is visible, mention it in outreach_notes as it confirms legitimacy. +6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in Spanish. +7. The outreach_email must be a complete ready-to-send email: greeting, 2-3 body sentences + (reference their specific range, 1-2 matching portfolio brands, add value), clear CTA. -Respond ONLY with valid JSON, no markdown, no text outside JSON: +Respond ONLY with valid JSON, no markdown fences, no text outside the JSON object: {{ - "is_relevant": true/false, + "is_relevant": true, "lead_quality": "HOT|WARM|COLD|NOT_RELEVANT", - "lead_reasoning": "1-2 sentences why", - "business_type": "retailer|ecommerce|distributor|pharmacy|salon_chain|other", - "business_name": "name from title or domain", - "country_fiscal": "2-letter ISO or full name", - "countries_active": ["ES","FR"], + "summary": "2-3 sentence executive summary: what this business does, their product range, who their customers are, and their apparent scale", + "lead_reasoning": "2-3 sentences explaining the lead quality rating — reference specific brands found, categories covered, and portfolio overlap", + "business_type": "retailer|ecommerce|distributor|pharmacy|parafarmacia|salon_chain|perfumeria|other", + "business_name": "official business name from title, H1, or aviso legal", + "country_fiscal": "2-letter ISO", + "countries_active": ["ES"], "categories": ["Hair Care","Makeup"], - "detected_brands": ["brand1","brand2"], - "dist_matches": ["OurBrand1","OurBrand2"], - "contact_email": "email or empty string", - "contact_phone": "phone or empty string", - "contact_whatsapp": "whatsapp link or empty string", - "contact_social": "primary social profile URL or empty string", - "b2b_proposal": "1-2 sentence value proposition in Spanish referencing their categories and our matching brands", - "outreach_subject": "short Spanish subject line referencing their business name", - "outreach_email": "3-4 sentence ready-to-send email in Spanish. Mention their business, 1-2 specific brands from our portfolio that match their range, and a clear call to action (catálogo, muestra, llamada).", - "revenue_estimate": "unknown", - "outreach_notes": "brief context for sales rep" + "detected_brands": ["all beauty brands found on site — be thorough"], + "dist_matches": ["our portfolio brands found on their site"], + "partnership_signals": ["carries multi-brand","has wholesale section","stockist page","B2B portal"], + "pitch_angle": "1 punchy sentence in Spanish: the specific angle for this business (reference their range, a gap you fill, or the portfolio brands that match)", + "b2b_proposal": "2-3 sentence value proposition in Spanish: what we offer, why it fits their range, what differentiates our brands", + "outreach_subject": "specific Spanish subject line mentioning their business name and 1 relevant brand", + "outreach_email": "complete ready-to-send Spanish email: greeting + 3-4 body sentences referencing their specific product range and 1-2 portfolio brands that match + clear CTA (catálogo, muestra, llamada, pedido mínimo) + valediction. Do not use placeholders.", + "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown", + "best_contact_value": "the actual email/phone/URL to use — prefer commercial emails, then phone, then social", + "all_contacts": {{ + "emails": {json.dumps(all_emails)}, + "phones": {json.dumps(all_phones)}, + "whatsapp": {json.dumps(all_whatsapp)}, + "social": {json.dumps(all_social)} + }}, + "revenue_estimate": "unknown|<100k€|100k-500k€|500k-2M€|>2M€", + "outreach_notes": "2-3 sentences for the sales rep: timing, approach, red flags, CIF if found, any urgency signals" }}""" @@ -309,21 +444,31 @@ async def assess_beauty_domain(analysis: dict) -> dict: detected = detect_brands_in_text(text) dist_match = get_dist_matches(detected) - # Also search for company context + # Run DDG search and legal page scraping in parallel title = analysis.get("page_title") or "" biz_name = title.split("|")[0].split("-")[0].strip() or domain - search_results = await _ddg_search(f'"{biz_name}" {domain} beauty cosmetics wholesale contact') - logger.info("Beauty assess %s: %d brands detected, %d portfolio matches", - domain, len(detected), len(dist_match)) + search_results, extra_contacts = await asyncio.gather( + _ddg_search(f'"{biz_name}" {domain} cosmetics beauty wholesale B2B contacto'), + _scrape_legal_pages(domain), + ) + + logger.info( + "Beauty assess %s: %d brands, %d portfolio matches, " + "%d extra emails from legal pages", + domain, len(detected), len(dist_match), + len(extra_contacts.get("emails", [])), + ) payload = { "input": { - "prompt": _build_beauty_prompt(analysis, detected, dist_match, search_results), + "prompt": _build_beauty_prompt( + analysis, detected, dist_match, search_results, extra_contacts + ), "images": [], "videos": [], "top_p": 0.9, - "temperature": 0.15, + "temperature": 0.2, "thinking_level": "low", - "max_output_tokens": 2000, + "max_output_tokens": 4000, } } try: @@ -351,17 +496,45 @@ async def assess_beauty_domain(analysis: dict) -> dict: if not result.get("detected_brands") and detected: result["detected_brands"] = detected - # Always merge contact data directly from site_analyzer — more reliable - # than AI extraction since it uses regex against raw HTML - phones = analysis.get("phones", []) - whatsapp = analysis.get("whatsapp", []) - social_links = analysis.get("social_links", []) - if phones and not result.get("contact_phone"): - result["contact_phone"] = phones[0] - if whatsapp: - result["contact_whatsapp"] = "; ".join(whatsapp[:2]) - if social_links: - result["contact_social"] = "; ".join(social_links[:3]) + # Merge contact data directly from site_analyzer + legal pages — + # more reliable than AI extraction since it's regex against raw HTML. + # The AI's all_contacts field may already have the right data if it + # followed the schema; fill gaps from our own extraction. + all_emails = list(dict.fromkeys( + (analysis.get("emails") or []) + (extra_contacts.get("emails") or []) + ))[:8] + all_phones = list(dict.fromkeys( + (analysis.get("phones") or []) + (extra_contacts.get("phones") or []) + ))[:6] + all_whatsapp = list(dict.fromkeys(analysis.get("whatsapp") or []))[:4] + all_social = list(dict.fromkeys(analysis.get("social_links") or []))[:6] + + # Ensure all_contacts in result is always populated from our own data + if not result.get("all_contacts") or not isinstance(result.get("all_contacts"), dict): + result["all_contacts"] = {} + result["all_contacts"].setdefault("emails", []) + result["all_contacts"].setdefault("phones", []) + result["all_contacts"].setdefault("whatsapp", []) + result["all_contacts"].setdefault("social", []) + # Merge our extracted data into the AI's all_contacts + result["all_contacts"]["emails"] = list(dict.fromkeys( + result["all_contacts"]["emails"] + all_emails))[:8] + result["all_contacts"]["phones"] = list(dict.fromkeys( + result["all_contacts"]["phones"] + all_phones))[:6] + result["all_contacts"]["whatsapp"] = list(dict.fromkeys( + result["all_contacts"]["whatsapp"] + all_whatsapp))[:4] + result["all_contacts"]["social"] = list(dict.fromkeys( + result["all_contacts"]["social"] + all_social))[:6] + + # Fill top-level contact fields from merged data if AI left them blank + if not result.get("contact_email") and all_emails: + result["contact_email"] = all_emails[0] + if not result.get("contact_phone") and all_phones: + result["contact_phone"] = all_phones[0] + if not result.get("contact_whatsapp") and all_whatsapp: + result["contact_whatsapp"] = all_whatsapp[0] + if not result.get("contact_social") and all_social: + result["contact_social"] = all_social[0] logger.info("Beauty AI %s → quality=%s, dist_matches=%s", domain, result.get("lead_quality"), result.get("dist_matches")) @@ -369,17 +542,24 @@ async def assess_beauty_domain(analysis: dict) -> dict: except Exception as e: logger.error("Beauty AI error %s: %s", domain, e) - phones = analysis.get("phones", []) - whatsapp = analysis.get("whatsapp", []) - social = analysis.get("social_links", []) + all_emails = list(dict.fromkeys( + (analysis.get("emails") or []) + (extra_contacts.get("emails") or [])))[:8] + all_phones = list(dict.fromkeys( + (analysis.get("phones") or []) + (extra_contacts.get("phones") or [])))[:6] + all_whatsapp = list(dict.fromkeys(analysis.get("whatsapp") or []))[:4] + all_social = list(dict.fromkeys(analysis.get("social_links") or []))[:6] return { "error": str(e)[:300], "is_relevant": False, "lead_quality": "COLD", "dist_matches": dist_match, "detected_brands": detected, - "contact_email": "", - "contact_phone": phones[0] if phones else "", - "contact_whatsapp": "; ".join(whatsapp[:2]) if whatsapp else "", - "contact_social": "; ".join(social[:3]) if social else "", + "contact_email": all_emails[0] if all_emails else "", + "contact_phone": all_phones[0] if all_phones else "", + "contact_whatsapp": all_whatsapp[0] if all_whatsapp else "", + "contact_social": all_social[0] if all_social else "", + "all_contacts": { + "emails": all_emails, "phones": all_phones, + "whatsapp": all_whatsapp, "social": all_social, + }, } diff --git a/app/static/beauty/index.html b/app/static/beauty/index.html index b32771f..ba0c541 100644 --- a/app/static/beauty/index.html +++ b/app/static/beauty/index.html @@ -340,7 +340,7 @@ textarea{width:100%;resize:vertical;font-family:monospace;font-size:12px} —
+ x-text="(row._beauty||{}).best_contact_value||(row._beauty||{}).contact_email||row.emails||'—'">- + - None detected in scraped text + None detected
+ ++ Signals: + + + +
+