"""Beauty B2B AI assessment — cosmetics distribution lead qualification. Pre-scans scraped text for known brands, then sends a focused prompt to Gemini to evaluate fit as a B2B customer for a cosmetics distribution business. """ import asyncio import json import logging import os import re from typing import Optional import httpx from bs4 import BeautifulSoup logger = logging.getLogger(__name__) REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_7I7Feai78f9PzMOs20y5GVFKiLkgUWP463vZO") REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions" AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3")) # Contact extraction regexes (same patterns as site_analyzer) _EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") _PHONE_RE = re.compile(r"(?:\+\d{1,3}[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}") # Pages that often contain company registration info (CIF/NIF, registered address, # legal email) — not fetched by site_analyzer, but rich sources for B2B contact data _LEGAL_PATHS = [ "/aviso-legal", "/aviso_legal", "/legal", "/politica-de-privacidad", "/politica_privacidad", "/privacidad", "/quienes-somos", "/quienes_somos", "/nosotros", ] _ai_sem: Optional[asyncio.Semaphore] = None def _sem() -> asyncio.Semaphore: global _ai_sem if _ai_sem is None: _ai_sem = asyncio.Semaphore(AI_CONCURRENCY) return _ai_sem # ── Brand universe (market brands we can detect on client sites) ────────────── BEAUTY_BRANDS = [ "4711","7days","7th Heaven","A-derma","Abercrombie & Fitch","Abril Et Nature", "Acqua Di Parma","Actinica","Adidas","Adolfo Dominguez","Aesop","Agatha Ruiz De La Prada", "Agave","Agua Lavanda","Ahava","Air-wick","Aire Sevilla","Al Haramain","Albal","Alcantara", "Alejandro Sanz","Alfaparf Milano","Algasiv","Alma Secret","Alpecin","Alqvimia","Alterna", "Alvarez Gomez","Alyssa Ashley","Ambi Pur","American Crew","Amichi","Ana María Lajusticia", "Angel Schlesser","Anian","Annayake","Anne Möller","Anso","Antonio Banderas","Apisérum", "Apivita","Aqc Fragrances","Aquilea","Aramis","Ardell","Arganour","Ariel","Armaf", "Armand Basi","Artdeco","Artero","As I Am","Aseptine","Atashi","Atrix","Ausonia","Aussie", "Australian Gold","Autan","Aveda","Avena Kinesia","Avène","Axe","Axovital","Azalea", "Azzaro","Babaria","Babyliss","Barbie","Bare Minerals","Barulab","Batiste","Beaver", "Beconfident","Belcils","Bella Aurora","Benefit","Benton","Benzacare","Beter","Biafin", "Bio Ionic","Bio-oil","Bioderma","Biolage","Biotherm","Biovène","Biretix","Bobbi Brown", "Bouclème","Bourjois","Bperfect Cosmetics","Britney Spears","Bumble & Bumble","Burberry", "Bvlgari","Byly","Byphasse","Cacharel","Calvin Klein","Camomila Intea","Cantu","Carefree", "Carmex","Carolina Herrera","Carrera","Carthusia","Catrice","Caudalie","Cerave","Cerruti", "Cetaphil","Chanel","Chanson D'Eau","Chloé","Chopard","Christina Aguilera","Christophe Robin", "Clarins","Clean & Clear","Clinique","Coach","Cocosolis","Colab","Colgate","Collistar", "Color Wow","Comfort Zone","Comodynes","Compeed","Cosrx","Creed","Creme Of Nature", "Cristalinas","Crossmen","Crusellas","Cryopharma","Cumlaude Lab","Cutex","Cygnetic", "Daffoil","Darphin","Davidoff","Declaré","Delfy","Delisea","Denenes","Dentiblanc", "Dermalogica","Desensin","Dexeryl","Diadermine","Diesel","Diet Esthetic","Dior","Diptyque", "Dodot","Dolce & Gabbana","Donna Karan","Dove","Dr. Hauschka","Dr.jart+","Dr. Organic", "Dr. Rimpler","Dr. Tree","Drasanvi","Drunk Elephant","Dsquared2","Ducray","Durex", "Elancyl","Elegant Touch","Elemis","Elie Saab","Elizabeth Arden","Elizabeth Taylor", "Emilio Pucci","Endocare","Eric Favre","Escada","Essence","Essie","Estée Lauder", "Etat Libre D'Orange","Eucerin","Eudermin","Evax","Eve Lom","Eylure","Fa","Fairy","Fanola", "Farmatint","Farmavita","Farouk","Figuière","Fisiocrem","Flor De Mayo","Fluocaril","Foreo", "Forté Pharma","Foxy","Francis Kurkdjian","Frederic Malle","Frosch","Garnier","Ghd", "Gillette","Giorgi Line","Givenchy","Glam Of Sweden","Goldwell","Gosh","Goutal","Gritti", "Gucci","Guerlain","Guess By Marciano","Gummy","Hair Rituel By Sisley","Hairgum","Halita", "Halloween","Hansaplast","Hask","Hawaiian Tropic","Head & Shoulders","Heliocare", "Heno De Pravia","Herbal Essences","Hermès","Hidracel","Hollister","Hugo Boss", "I.c.o.n.","Ibizaloe","Iceberg","Idc Institute","Iroha","Isabelle Lancray","Isdin", "Issey Miyake","It Cosmetics","Ivybears","Jacadi","Jean Paul Gaultier","Jil Sander", "Jimmy Choo","Jo Malone","John Frieda","Johnson's Baby","Joico","Joop","Jordan","Jowaé", "Juicy Couture","Juliette Has A Gun","Just For Men","Juvena","Kaloo","Karl Lagerfeld", "Karseell","Katai","Kate Spade","Kativa","Kenzo","Kerasilk","Kerastase","Kevin Murphy", "Kevyn Aucoin","Kilian","Klorane","L'Anza","L'Occitane","L'Oréal Paris", "L'Oréal Professionnel","La Cabine","La Mer","La Prairie","La Roche Posay","La Toja", "Laboratoires Filorga","Lacer","Lacoste","Lactacyd","Lactovit","Lalique","Lancaster", "Lanvin","Lattafa","Laura Biagiotti","Le Petit Marseillais","Legrain","Lierac","Listerine", "Living Proof","Loewe","Lola Cosmetics","Lolita Lempicka","Lussoni","Lutsine E45", "M2 Beauté","Mac","Macadamia","Mad Beauty","Maria Nila","Marlies Möller","Martiderm", "Martinelia","Marvis","Matrix","Maui","Mavala","Max Factor","Maybelline","Melvita", "Mermade","Michael Kors","Milk Shake","Mix & Shout","Mixa","Moroccanoil","Moschino", "Mustela","Nabeel","Nanobrow","Nanoil","Nanolash","Narciso Rodriguez","Nars","Natur Vital", "Natura Bissé","Natural Honey","Naturalium","Naturtint","Nenuco","Neogen","Neoretin", "Neostrata","Neutrogena","Nivea","Nûby","Nuggela & Sulé","Nyx Professional Make Up", "Ogx","Olaplex","Olay","Old Spice","Olivia Garden","Opi","Oral-b","Oraldine","Orofluido", "Orlane","Oscar De La Renta","Pacha","Paese","Palette","Paloma Picasso","Paltons", "Pantene","Paranix","Parfums Saphir","Parlux","Payot","Phyto","Picu Baby","Pilexil", "Piz Buin","Plantur 39","Platanomelón","Polaar","Police","Polident","Ponds","Poseidon", "Postquam","Proraso","Puig","Purito","Rabanne","Raid","Ralph Lauren","Rated Green", "Real Techniques","Redenhair","Redist","Redken","Reebok","Ref","Refectocil","Relec", "Remescar","Rene Furterer","Revlon","Revolution Hair Care","Revolution Make Up", "Revolution Pro","Rexaline","Rexona","Rilastil","Rimmel London","Roberto Cavalli","Roc", "Rochas","Roger & Gallet","Roja Parfums","Rosacure","S3","Sabon","Salerm","Sally Hansen", "Salvatore Ferragamo","Sanex","Sarah Jessica Parker","Saryna Key","Satisfyer","Scalpers", "Scholl","Schwarzkopf","Scottex","Sebamed","Sebastian Professionals","Seche Vite", "Sensai","Sensilis","Sensodyne","Serge Lutens","Serumkind","Sesderma","Seven Cosmetics", "Sexy Hair","Shiseido","Shu Uemura","Sisley","Skeyndor","Skin Generics","Sleek", "Snp","Soap & Glory","Sol De Janeiro","Solgar","Somatoline Cosmetic","Sophie La Girafe", "Soria Natural","Steinhart","Stendhal Paris","Sterimar","Strivectin","Suavinex", "Suavipiel","Svr Laboratoire Dermatologique","Syoss","System Professional","Tabac", "Taky","Talika","Tampax","Tangle Teezer","Tanit","Teaology","Tena Lady","The Body Shop", "The Ordinary","The Wet Brush","Thermacare","Tiffany & Co","Tigi","Timotei", "Tiziana Terenzi","Tod's","Tom Ford","Tommy Hilfiger","Topicrem","Torriden","Tot Herba", "Tous","Trendy Hair","Tresemme","Trussardi","Tulipán Negro","Urban Decay","Uriage", "Usu Cosmetics","Vagisil","Valmont","Valquer","Vanderbilt","Vaseline","Veet","Vichy", "Victor","Victoria's Secret","Victorio & Lucchino","Vital Proteins","Vivra", "Voltage Cosmetics","Volumax","Waterpik","Waterwipes","Wella","Weleda", "Williams","Woodwick","Xerjoff","Xls Medical","Yankee Candle","Yari","Yotuel", "Youth Lab","Zadig & Voltaire","Ziaja", ] # Our distribution portfolio — the brands we sell to B2B clients OUR_BRANDS = [ "AIMX","Al Haramain","Apivita","Armaf","Aveda","Bouclème","Clarena", "Curly Girl Movement","Cutrin","Davines","Dr. Hauschka","FanPalm","Farmavita", "Flora Curl","GAMMA+","GHD","GOSH","ICON","Image Skincare","Instituto Español", "Janeke","Kay Pro","Kerasilk","Kyo","Label M","Lierac","Living Proof","Londa", "M2 Beauté","Malibu C","Maria Nila","Medik8","Misslyn","Mustela","Nesti Dante", "Nuxe","Obagi","Osmo","Payot","Philip B","Philip Martins","Phyto","Piz Buin", "Ramon Monegal","Redken","REF","Saryna Key","Sesderma","Skala Brasil","Skin1004", "Strivectin","Swissdent","Topicrem","Uriage","Vita Liberata","Waterclouds", "Wella","Youngblood Cosmetics", ] BEAUTY_CATEGORIES = [ "Perfumes","Facial Cosmetics","Makeup","Hair Care","Health","Body Cosmetics", "Hygiene","Kids & Babies","Sun Care","Eyewear","Home","Nutrition","Erotic","Fashion", ] # ── Brand detection (fast pre-scan, no AI) ───────────────────────────────────── def detect_brands_in_text(text: str) -> list[str]: """Find which brands from the universe appear in the scraped page text. Short brands (≤5 chars) use word-boundary matching to avoid false positives like 'ref' matching 'reference', 'prefer', 'refresh', etc. """ tl = text.lower() result = [] for b in BEAUTY_BRANDS: bl = b.lower() if len(bl) <= 5: if re.search(r'(? list[str]: """Return which detected brands are in our distribution portfolio.""" dl = {b.lower() for b in detected} return [b for b in OUR_BRANDS if b.lower() in dl] # ── DuckDuckGo search (contact/company lookup) ──────────────────────────────── async def _ddg_search(query: str) -> str: try: async with httpx.AsyncClient( timeout=10, follow_redirects=True, headers={"User-Agent": "Mozilla/5.0 (compatible; BeautyLeads/1.0)"}, ) as client: r = await client.get( "https://html.duckduckgo.com/html/", params={"q": query, "kl": "es-es"}, ) if r.status_code != 200: return "" soup = BeautifulSoup(r.text, "html.parser") parts = [] for res in soup.select(".result")[:4]: title = res.select_one(".result__a") snip = res.select_one(".result__snippet") url = res.select_one(".result__url") if snip: t = title.get_text(strip=True) if title else "" u = url.get_text(strip=True) if url else "" parts.append(f"[{u}] {t} — {snip.get_text(strip=True)}") return "\n".join(parts) except Exception as e: logger.debug("DDG search failed: %s", e) return "" # ── Legal / about page scraper ──────────────────────────────────────────────── async def _scrape_legal_pages(domain: str) -> dict: """Fetch legal and about pages not covered by site_analyzer. Spanish Aviso Legal pages legally must contain: company name (razón social), CIF/NIF, registered address, and a contact email — making them the richest source of verified B2B contact data. Returns: emails: all unique emails found across all pages phones: all unique phones found across all pages legal_snippet: first 800 chars of the aviso legal page (company registration info: razón social, CIF, domicilio, etc.) """ result: dict = {"emails": [], "phones": [], "legal_snippet": ""} async def _fetch(path: str) -> tuple[str, str | None]: try: async with httpx.AsyncClient( timeout=8, follow_redirects=True, verify=False, headers={"User-Agent": "Mozilla/5.0"}, ) as c: r = await c.get(f"https://{domain}{path}") if r.status_code == 200: return path, r.text except Exception: pass return path, None pages = await asyncio.gather(*[_fetch(p) for p in _LEGAL_PATHS]) for path, html in pages: if not html: continue try: soup = BeautifulSoup(html, "html.parser") # Extract from anchor tags for a in soup.find_all("a", href=True): href = a["href"] if href.startswith("mailto:"): em = href[7:].split("?")[0].strip().lower() if em and em not in result["emails"]: result["emails"].append(em) elif href.startswith("tel:"): ph = re.sub(r"[^\d+]", "", href[4:]) if ph and ph not in result["phones"]: result["phones"].append(ph) # Regex scan full HTML for emails for em in _EMAIL_RE.findall(html[:60000]): em = em.lower() if em not in result["emails"] and not any( em.endswith(x) for x in (".png", ".jpg", ".css", ".js", ".svg") ): result["emails"].append(em) # Regex scan visible text for phones visible = soup.get_text(separator=" ", strip=True) for ph in _PHONE_RE.findall(visible): ph_c = re.sub(r"[\s\-]", "", ph) if ph_c and ph_c not in result["phones"]: result["phones"].append(ph_c) # Capture legal snippet from the first legal page that resolves if not result["legal_snippet"] and any( k in path for k in ("aviso", "legal", "privacidad") ): result["legal_snippet"] = " ".join(visible.split()[:150]) except Exception: pass result["emails"] = list(dict.fromkeys(result["emails"]))[:8] result["phones"] = list(dict.fromkeys(result["phones"]))[:6] return result # ── Prompt builder ───────────────────────────────────────────────────────────── def _build_beauty_prompt(a: dict, detected_brands: list, dist_matches: list, search_results: str = "", extra_contacts: dict | None = None) -> str: """Build the Gemini assessment prompt. extra_contacts comes from _scrape_legal_pages() and adds emails/phones/company info found in the aviso legal, privacy policy, and about pages. """ ec = extra_contacts or {} # Merge contact sources: site_analyzer (main page + contact pages) + legal pages all_emails = list(dict.fromkeys((a.get("emails") or []) + (ec.get("emails") or [])))[:8] all_phones = list(dict.fromkeys((a.get("phones") or []) + (ec.get("phones") or [])))[:6] all_whatsapp = list(dict.fromkeys(a.get("whatsapp") or []))[:4] all_social = list(dict.fromkeys(a.get("social_links") or []))[:6] def _fmt(lst: list) -> str: return ", ".join(lst) if lst else "—" # Site technical signals ssl_info = ("✓ valid" if a.get("ssl_valid") else "✗ invalid/missing") analytics = ", ".join(a.get("analytics_present") or []) or "none detected" word_count = a.get("word_count", 0) load_ms = a.get("load_time_ms", 0) copyright = a.get("copyright_year") or a.get("last_modified") or "unknown" snippet = (a.get("visible_text_snippet") or "")[:1600] legal_snippet = (ec.get("legal_snippet") or "")[:800] detected_str = ", ".join(detected_brands) if detected_brands else "none detected" dist_str = ", ".join(dist_matches) if dist_matches else "none" return f"""You are a senior B2B sales analyst for a cosmetics distribution company operating across Europe. Your task: thoroughly evaluate this website as a potential wholesale B2B customer and produce a complete outreach dossier. === BUSINESS PROFILE === Domain: {a.get("domain")} Country (IP): {a.get("ip_country") or "unknown"} Region: {a.get("ip_region") or "unknown"} Hosting (EU?): {a.get("eu_hosted")} | ISP/Org: {a.get("org") or a.get("isp") or "unknown"} Page title: {a.get("page_title") or "—"} H1: {a.get("h1_text") or "—"} Meta desc: {(a.get("meta_description") or "—")[:200]} CMS: {a.get("cms") or "unknown"} Last updated: {copyright} === TECHNICAL SIGNALS === SSL: {ssl_info} Load time: {load_ms}ms Word count: {word_count} Analytics: {analytics} Mobile: {"yes" if a.get("has_mobile_viewport") else "no"} Sitemap/Robots: sitemap={"yes" if a.get("has_sitemap") else "no"}, robots={"yes" if a.get("has_robots") else "no"} GDPR/Privacy: cookie_tool={a.get("cookie_tool") or "none"}, privacy_policy={"yes" if a.get("has_privacy_policy") else "no"} === ALL CONTACT CHANNELS === Emails: {_fmt(all_emails)} Phones: {_fmt(all_phones)} WhatsApp: {_fmt(all_whatsapp)} Social media: {_fmt(all_social)} === LEGAL / COMPANY REGISTRATION INFO === (extracted from aviso legal / política de privacidad — may contain razón social, CIF, address) {legal_snippet or "Not found or page not accessible"} === PAGE CONTENT SAMPLE === {snippet} === BRANDS DETECTED ON SITE === {detected_str} === OUR PORTFOLIO BRANDS FOUND ON THEIR SITE === (brands we distribute that appear on their site — confirms shared market) {dist_str} === WEB SEARCH RESULTS === {(search_results or "No results available.")[:700]} === OUR FULL DISTRIBUTION PORTFOLIO === {', '.join(OUR_BRANDS)} === BEAUTY CATEGORIES WE COVER === {', '.join(BEAUTY_CATEGORIES)} === ASSESSMENT RULES === 1. TARGET PROFILE: We are looking for businesses that BUY BEAUTY PRODUCTS WHOLESALE to resell: retailers, pharmacies, parafarmacias, perfumerías, multi-brand beauty ecommerce, salon chains, supermarkets with beauty sections, beauty distributors — anywhere in Europe. 2. Identify ALL beauty brands anywhere on the page (body text, alt text, category names, product listings, brand pages). Go beyond the pre-detected list already provided above. 3. LEAD QUALITY — rate on BUSINESS TYPE first, portfolio overlap second: - HOT: Business type is clearly a multi-brand beauty reseller with professional/wholesale activity AND at least one of: ≥2 portfolio brands detected, evident professional lines, large catalogue (pharmacies, parafarmacia chains, pro salon distributors). Also HOT: any large-scale EU beauty retailer even without portfolio brand matches. - WARM: ANY genuine multi-brand beauty retailer or ecommerce that could buy wholesale — even if ZERO portfolio brands are currently detected. They are our TARGET MARKET: we want to introduce our brands to them. Pharmacies, perfumerías, beauty shops, multi-brand online stores → default WARM unless there is a clear disqualifier. When uncertain between WARM and COLD: choose WARM. - COLD: ONLY if clearly disqualified: single-brand D2C (sells only their own brand), beauty salon that doesn't sell products to end-consumers, personal influencer / blog, OR no evidence this is a purchasing business at all. - NOT_RELEVANT: No beauty/cosmetics connection, or clearly non-European. ⚠ CRITICAL: Portfolio brand absence NEVER alone justifies COLD. Our job is to introduce our brands to retailers who don't carry them yet. Rate on whether they COULD buy wholesale. 4. country_fiscal: use aviso legal if found; otherwise use the IP country shown above. NEVER leave country_fiscal empty — always provide a 2-letter ISO code. 5. Extract the BEST contact for outreach — check all data above: - Prefer commercial emails (info@, ventas@, compras@, pedidos@) over generic/personal - WhatsApp is often the fastest channel in Spain; flag it if present - Set best_contact_channel and best_contact_value explicitly 6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in SPANISH. 7. outreach_email must be a complete ready-to-send Spanish email: greeting + 3-4 sentences referencing their specific range + 1-2 of our portfolio brands that match + clear CTA (catálogo, muestra gratuita, llamada, primer pedido mínimo). No placeholders. Respond ONLY with valid JSON, no markdown fences, no text outside the JSON object: {{ "is_relevant": true, "lead_quality": "HOT|WARM|COLD|NOT_RELEVANT", "summary": "2-3 sentence executive summary: what this business does, their product range, who their customers are, and their apparent scale", "lead_reasoning": "2-3 sentences explaining the lead quality rating — reference specific brands found, categories covered, and portfolio overlap", "business_type": "retailer|ecommerce|distributor|pharmacy|parafarmacia|salon_chain|perfumeria|other", "business_name": "official business name from title, H1, or aviso legal", "country_fiscal": "2-letter ISO", "countries_active": ["ES"], "categories": ["Hair Care","Makeup"], "detected_brands": ["all beauty brands found on site — be thorough"], "dist_matches": ["our portfolio brands found on their site"], "partnership_signals": ["carries multi-brand","has wholesale section","stockist page","B2B portal"], "pitch_angle": "1 punchy sentence in Spanish: the specific angle for this business (reference their range, a gap you fill, or the portfolio brands that match)", "b2b_proposal": "2-3 sentence value proposition in Spanish: what we offer, why it fits their range, what differentiates our brands", "outreach_subject": "specific Spanish subject line mentioning their business name and 1 relevant brand", "outreach_email": "complete ready-to-send Spanish email: greeting + 3-4 body sentences referencing their specific product range and 1-2 portfolio brands that match + clear CTA (catálogo, muestra, llamada, pedido mínimo) + valediction. Do not use placeholders.", "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown", "best_contact_value": "the actual email/phone/URL to use — prefer commercial emails, then phone, then social", "all_contacts": {{ "emails": {json.dumps(all_emails)}, "phones": {json.dumps(all_phones)}, "whatsapp": {json.dumps(all_whatsapp)}, "social": {json.dumps(all_social)} }}, "revenue_estimate": "unknown|<100k€|100k-500k€|500k-2M€|>2M€", "outreach_notes": "2-3 sentences for the sales rep: timing, approach, red flags, CIF if found, any urgency signals" }}""" def _parse_beauty_output(raw: str) -> dict: text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip() m = re.search(r"\{[\s\S]+\}", text) if m: candidate = m.group(0) try: return json.loads(candidate) except json.JSONDecodeError: depth_obj = candidate.count("{") - candidate.count("}") depth_arr = candidate.count("[") - candidate.count("]") fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', candidate) fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj) try: return json.loads(fixed) except json.JSONDecodeError: pass logger.warning("Beauty AI parse failed, raw: %.300s", raw) return { "is_relevant": False, "lead_quality": "COLD", "business_name": "", "contact_email": "", "dist_matches": [], "parse_error": True, } # ── Main entry point ─────────────────────────────────────────────────────────── async def assess_beauty_domain(analysis: dict) -> dict: """Full beauty B2B assessment: brand scan + AI evaluation.""" async with _sem(): domain = analysis.get("domain", "") text = analysis.get("visible_text_snippet", "") or "" html_raw = text # use snippet; brands already extracted from full page in site_analyzer detected = detect_brands_in_text(text) dist_match = get_dist_matches(detected) # Run DDG search and legal page scraping in parallel title = analysis.get("page_title") or "" biz_name = title.split("|")[0].split("-")[0].strip() or domain search_results, extra_contacts = await asyncio.gather( _ddg_search(f'"{biz_name}" {domain} cosmetics beauty wholesale B2B contacto'), _scrape_legal_pages(domain), ) logger.info( "Beauty assess %s: %d brands, %d portfolio matches, " "%d extra emails from legal pages", domain, len(detected), len(dist_match), len(extra_contacts.get("emails", [])), ) payload = { "input": { "prompt": _build_beauty_prompt( analysis, detected, dist_match, search_results, extra_contacts ), "images": [], "videos": [], "top_p": 0.9, "temperature": 0.2, "thinking_level": "low", "max_output_tokens": 4000, } } try: async with httpx.AsyncClient(timeout=120) as client: resp = await client.post( REPLICATE_MODEL, headers={ "Authorization": f"Bearer {REPLICATE_TOKEN}", "Content-Type": "application/json", "Prefer": "wait", }, json=payload, ) resp.raise_for_status() data = resp.json() output = data.get("output", "") if isinstance(output, list): output = "".join(output) result = _parse_beauty_output(output) # Merge pre-scan data that AI might miss if not result.get("dist_matches") and dist_match: result["dist_matches"] = dist_match if not result.get("detected_brands") and detected: result["detected_brands"] = detected # Merge contact data directly from site_analyzer + legal pages — # more reliable than AI extraction since it's regex against raw HTML. # The AI's all_contacts field may already have the right data if it # followed the schema; fill gaps from our own extraction. all_emails = list(dict.fromkeys( (analysis.get("emails") or []) + (extra_contacts.get("emails") or []) ))[:8] all_phones = list(dict.fromkeys( (analysis.get("phones") or []) + (extra_contacts.get("phones") or []) ))[:6] all_whatsapp = list(dict.fromkeys(analysis.get("whatsapp") or []))[:4] all_social = list(dict.fromkeys(analysis.get("social_links") or []))[:6] # Ensure all_contacts in result is always populated from our own data if not result.get("all_contacts") or not isinstance(result.get("all_contacts"), dict): result["all_contacts"] = {} result["all_contacts"].setdefault("emails", []) result["all_contacts"].setdefault("phones", []) result["all_contacts"].setdefault("whatsapp", []) result["all_contacts"].setdefault("social", []) # Merge our extracted data into the AI's all_contacts result["all_contacts"]["emails"] = list(dict.fromkeys( result["all_contacts"]["emails"] + all_emails))[:8] result["all_contacts"]["phones"] = list(dict.fromkeys( result["all_contacts"]["phones"] + all_phones))[:6] result["all_contacts"]["whatsapp"] = list(dict.fromkeys( result["all_contacts"]["whatsapp"] + all_whatsapp))[:4] result["all_contacts"]["social"] = list(dict.fromkeys( result["all_contacts"]["social"] + all_social))[:6] # Fill top-level contact fields from merged data if AI left them blank if not result.get("contact_email") and all_emails: result["contact_email"] = all_emails[0] if not result.get("contact_phone") and all_phones: result["contact_phone"] = all_phones[0] if not result.get("contact_whatsapp") and all_whatsapp: result["contact_whatsapp"] = all_whatsapp[0] if not result.get("contact_social") and all_social: result["contact_social"] = all_social[0] # country_fiscal fallback — always provide a value fc = (result.get("country_fiscal") or "").strip() if not fc or fc.lower() in ("unknown", "n/a", "-"): result["country_fiscal"] = analysis.get("ip_country") or "" logger.info("Beauty AI %s → quality=%s, dist_matches=%s", domain, result.get("lead_quality"), result.get("dist_matches")) return result except Exception as e: logger.error("Beauty AI error %s: %s", domain, e) all_emails = list(dict.fromkeys( (analysis.get("emails") or []) + (extra_contacts.get("emails") or [])))[:8] all_phones = list(dict.fromkeys( (analysis.get("phones") or []) + (extra_contacts.get("phones") or [])))[:6] all_whatsapp = list(dict.fromkeys(analysis.get("whatsapp") or []))[:4] all_social = list(dict.fromkeys(analysis.get("social_links") or []))[:6] return { "error": str(e)[:300], "is_relevant": False, "lead_quality": "COLD", "dist_matches": dist_match, "detected_brands": detected, "contact_email": all_emails[0] if all_emails else "", "contact_phone": all_phones[0] if all_phones else "", "contact_whatsapp": all_whatsapp[0] if all_whatsapp else "", "contact_social": all_social[0] if all_social else "", "all_contacts": { "emails": all_emails, "phones": all_phones, "whatsapp": all_whatsapp, "social": all_social, }, }