Files
DomGod/app/beauty_ai.py
Malin dfd47743e3 fix: broader WhatsApp/social detection, generous assessment rules, overlay popup
- site_analyzer: scan onclick/data-href/data-url/data-link/data-action attrs
  on ALL tags for WhatsApp (wa.me, api.whatsapp, web.whatsapp, wa.link),
  tel: links, and social media URLs; raise dedup cap 5→8
- beauty_ai: rewrite lead quality rules — WARM for any genuine multi-brand
  retailer even with zero portfolio matches; portfolio absence NEVER justifies
  COLD alone; added country_fiscal fallback to ip_country
- index.html: assessPopup overlay modal on quality badge click in Browse tab;
  showAssessPopup() parses beauty_assessment JSON with all_contacts fallback;
  [x-cloak] CSS to prevent flash

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 10:37:36 +02:00

591 lines
30 KiB
Python

"""Beauty B2B AI assessment — cosmetics distribution lead qualification.
Pre-scans scraped text for known brands, then sends a focused prompt to Gemini
to evaluate fit as a B2B customer for a cosmetics distribution business.
"""
import asyncio
import json
import logging
import os
import re
from typing import Optional
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)

# SECURITY: the API token must come from the environment only.  A real-looking
# token used to be committed here as the fallback default; treat that value as
# leaked and rotate it.  With no REPLICATE_API_TOKEN set the token is empty and
# the Replicate request fails with an auth error instead of silently using a
# shared credential.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"

# Maximum number of assessments allowed to run at once (size of the semaphore
# built from this value).  A non-integer value falls back to the default of 3,
# and anything below 1 is clamped to 1: a zero-sized semaphore could never be
# acquired and a negative size is rejected by asyncio.Semaphore.
try:
    AI_CONCURRENCY = max(1, int(os.getenv("AI_CONCURRENCY", "3")))
except ValueError:
    AI_CONCURRENCY = 3
# Contact extraction regexes (per the original author: same patterns as
# site_analyzer — not verifiable from this file).
# _EMAIL_RE: local part of letters/digits/._%+- , an "@", a dotted host, and a
# final label of at least two ASCII letters.
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
# _PHONE_RE: optional "+<1-3 digit country code>" prefix, then nine digits whose
# first digit is 6-9, written as 3-3-3 groups optionally separated by a single
# whitespace character or hyphen.
# NOTE(review): the pattern has no digit boundaries, so it can also match nine
# digits inside a longer digit run (e.g. a barcode) — confirm this is acceptable.
_PHONE_RE = re.compile(r"(?:\+\d{1,3}[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
# Pages that often contain company registration info (CIF/NIF, registered address,
# legal email) — not fetched by site_analyzer, but rich sources for B2B contact data.
# Each entry is a URL path appended to "https://<domain>" by _scrape_legal_pages().
# Order matters: the legal snippet is taken from the first path in this list that
# loads and whose path contains "aviso", "legal" or "privacidad".
_LEGAL_PATHS = [
"/aviso-legal", "/aviso_legal", "/legal",
"/politica-de-privacidad", "/politica_privacidad", "/privacidad",
"/quienes-somos", "/quienes_somos", "/nosotros",
]
# Module-wide semaphore limiting concurrent assessments.  It starts as None and
# is built on the first call to _sem() rather than at import time.
_ai_sem: Optional[asyncio.Semaphore] = None


def _sem() -> asyncio.Semaphore:
    """Return the shared semaphore, creating it with AI_CONCURRENCY slots on first use."""
    global _ai_sem
    shared = _ai_sem
    if shared is None:
        shared = _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return shared
# ── Brand universe (market brands we can detect on client sites) ──────────────
# Display names of the brands that detect_brands_in_text() looks for in scraped
# page text.  Matching there is done on the lower-cased name, so the
# capitalisation used here only affects how a detected brand is reported.
BEAUTY_BRANDS = [
"4711","7days","7th Heaven","A-derma","Abercrombie & Fitch","Abril Et Nature",
"Acqua Di Parma","Actinica","Adidas","Adolfo Dominguez","Aesop","Agatha Ruiz De La Prada",
"Agave","Agua Lavanda","Ahava","Air-wick","Aire Sevilla","Al Haramain","Albal","Alcantara",
"Alejandro Sanz","Alfaparf Milano","Algasiv","Alma Secret","Alpecin","Alqvimia","Alterna",
"Alvarez Gomez","Alyssa Ashley","Ambi Pur","American Crew","Amichi","Ana María Lajusticia",
"Angel Schlesser","Anian","Annayake","Anne Möller","Anso","Antonio Banderas","Apisérum",
"Apivita","Aqc Fragrances","Aquilea","Aramis","Ardell","Arganour","Ariel","Armaf",
"Armand Basi","Artdeco","Artero","As I Am","Aseptine","Atashi","Atrix","Ausonia","Aussie",
"Australian Gold","Autan","Aveda","Avena Kinesia","Avène","Axe","Axovital","Azalea",
"Azzaro","Babaria","Babyliss","Barbie","Bare Minerals","Barulab","Batiste","Beaver",
"Beconfident","Belcils","Bella Aurora","Benefit","Benton","Benzacare","Beter","Biafin",
"Bio Ionic","Bio-oil","Bioderma","Biolage","Biotherm","Biovène","Biretix","Bobbi Brown",
"Bouclème","Bourjois","Bperfect Cosmetics","Britney Spears","Bumble & Bumble","Burberry",
"Bvlgari","Byly","Byphasse","Cacharel","Calvin Klein","Camomila Intea","Cantu","Carefree",
"Carmex","Carolina Herrera","Carrera","Carthusia","Catrice","Caudalie","Cerave","Cerruti",
"Cetaphil","Chanel","Chanson D'Eau","Chloé","Chopard","Christina Aguilera","Christophe Robin",
"Clarins","Clean & Clear","Clinique","Coach","Cocosolis","Colab","Colgate","Collistar",
"Color Wow","Comfort Zone","Comodynes","Compeed","Cosrx","Creed","Creme Of Nature",
"Cristalinas","Crossmen","Crusellas","Cryopharma","Cumlaude Lab","Cutex","Cygnetic",
"Daffoil","Darphin","Davidoff","Declaré","Delfy","Delisea","Denenes","Dentiblanc",
"Dermalogica","Desensin","Dexeryl","Diadermine","Diesel","Diet Esthetic","Dior","Diptyque",
"Dodot","Dolce & Gabbana","Donna Karan","Dove","Dr. Hauschka","Dr.jart+","Dr. Organic",
"Dr. Rimpler","Dr. Tree","Drasanvi","Drunk Elephant","Dsquared2","Ducray","Durex",
"Elancyl","Elegant Touch","Elemis","Elie Saab","Elizabeth Arden","Elizabeth Taylor",
"Emilio Pucci","Endocare","Eric Favre","Escada","Essence","Essie","Estée Lauder",
"Etat Libre D'Orange","Eucerin","Eudermin","Evax","Eve Lom","Eylure","Fa","Fairy","Fanola",
"Farmatint","Farmavita","Farouk","Figuière","Fisiocrem","Flor De Mayo","Fluocaril","Foreo",
"Forté Pharma","Foxy","Francis Kurkdjian","Frederic Malle","Frosch","Garnier","Ghd",
"Gillette","Giorgi Line","Givenchy","Glam Of Sweden","Goldwell","Gosh","Goutal","Gritti",
"Gucci","Guerlain","Guess By Marciano","Gummy","Hair Rituel By Sisley","Hairgum","Halita",
"Halloween","Hansaplast","Hask","Hawaiian Tropic","Head & Shoulders","Heliocare",
"Heno De Pravia","Herbal Essences","Hermès","Hidracel","Hollister","Hugo Boss",
"I.c.o.n.","Ibizaloe","Iceberg","Idc Institute","Iroha","Isabelle Lancray","Isdin",
"Issey Miyake","It Cosmetics","Ivybears","Jacadi","Jean Paul Gaultier","Jil Sander",
"Jimmy Choo","Jo Malone","John Frieda","Johnson's Baby","Joico","Joop","Jordan","Jowaé",
"Juicy Couture","Juliette Has A Gun","Just For Men","Juvena","Kaloo","Karl Lagerfeld",
"Karseell","Katai","Kate Spade","Kativa","Kenzo","Kerasilk","Kerastase","Kevin Murphy",
"Kevyn Aucoin","Kilian","Klorane","L'Anza","L'Occitane","L'Oréal Paris",
"L'Oréal Professionnel","La Cabine","La Mer","La Prairie","La Roche Posay","La Toja",
"Laboratoires Filorga","Lacer","Lacoste","Lactacyd","Lactovit","Lalique","Lancaster",
"Lanvin","Lattafa","Laura Biagiotti","Le Petit Marseillais","Legrain","Lierac","Listerine",
"Living Proof","Loewe","Lola Cosmetics","Lolita Lempicka","Lussoni","Lutsine E45",
"M2 Beauté","Mac","Macadamia","Mad Beauty","Maria Nila","Marlies Möller","Martiderm",
"Martinelia","Marvis","Matrix","Maui","Mavala","Max Factor","Maybelline","Melvita",
"Mermade","Michael Kors","Milk Shake","Mix & Shout","Mixa","Moroccanoil","Moschino",
"Mustela","Nabeel","Nanobrow","Nanoil","Nanolash","Narciso Rodriguez","Nars","Natur Vital",
"Natura Bissé","Natural Honey","Naturalium","Naturtint","Nenuco","Neogen","Neoretin",
"Neostrata","Neutrogena","Nivea","Nûby","Nuggela & Sulé","Nyx Professional Make Up",
"Ogx","Olaplex","Olay","Old Spice","Olivia Garden","Opi","Oral-b","Oraldine","Orofluido",
"Orlane","Oscar De La Renta","Pacha","Paese","Palette","Paloma Picasso","Paltons",
"Pantene","Paranix","Parfums Saphir","Parlux","Payot","Phyto","Picu Baby","Pilexil",
"Piz Buin","Plantur 39","Platanomelón","Polaar","Police","Polident","Ponds","Poseidon",
"Postquam","Proraso","Puig","Purito","Rabanne","Raid","Ralph Lauren","Rated Green",
"Real Techniques","Redenhair","Redist","Redken","Reebok","Ref","Refectocil","Relec",
"Remescar","Rene Furterer","Revlon","Revolution Hair Care","Revolution Make Up",
"Revolution Pro","Rexaline","Rexona","Rilastil","Rimmel London","Roberto Cavalli","Roc",
"Rochas","Roger & Gallet","Roja Parfums","Rosacure","S3","Sabon","Salerm","Sally Hansen",
"Salvatore Ferragamo","Sanex","Sarah Jessica Parker","Saryna Key","Satisfyer","Scalpers",
"Scholl","Schwarzkopf","Scottex","Sebamed","Sebastian Professionals","Seche Vite",
"Sensai","Sensilis","Sensodyne","Serge Lutens","Serumkind","Sesderma","Seven Cosmetics",
"Sexy Hair","Shiseido","Shu Uemura","Sisley","Skeyndor","Skin Generics","Sleek",
"Snp","Soap & Glory","Sol De Janeiro","Solgar","Somatoline Cosmetic","Sophie La Girafe",
"Soria Natural","Steinhart","Stendhal Paris","Sterimar","Strivectin","Suavinex",
"Suavipiel","Svr Laboratoire Dermatologique","Syoss","System Professional","Tabac",
"Taky","Talika","Tampax","Tangle Teezer","Tanit","Teaology","Tena Lady","The Body Shop",
"The Ordinary","The Wet Brush","Thermacare","Tiffany & Co","Tigi","Timotei",
"Tiziana Terenzi","Tod's","Tom Ford","Tommy Hilfiger","Topicrem","Torriden","Tot Herba",
"Tous","Trendy Hair","Tresemme","Trussardi","Tulipán Negro","Urban Decay","Uriage",
"Usu Cosmetics","Vagisil","Valmont","Valquer","Vanderbilt","Vaseline","Veet","Vichy",
"Victor","Victoria's Secret","Victorio & Lucchino","Vital Proteins","Vivra",
"Voltage Cosmetics","Volumax","Waterpik","Waterwipes","Wella","Weleda",
"Williams","Woodwick","Xerjoff","Xls Medical","Yankee Candle","Yari","Yotuel",
"Youth Lab","Zadig & Voltaire","Ziaja",
]
# Our distribution portfolio — the brands we sell to B2B clients.
# Used by get_dist_matches() (case-insensitive comparison against detected
# brands) and listed in full inside the assessment prompt.
# NOTE(review): many entries here (e.g. "Davines", "Nuxe", "Medik8", and "ICON"
# vs. the universe's "I.c.o.n.") have no counterpart in BEAUTY_BRANDS, so the
# regex pre-scan can never report them as portfolio matches — only the AI step
# can.  Confirm whether that is intended.
OUR_BRANDS = [
"AIMX","Al Haramain","Apivita","Armaf","Aveda","Bouclème","Clarena",
"Curly Girl Movement","Cutrin","Davines","Dr. Hauschka","FanPalm","Farmavita",
"Flora Curl","GAMMA+","GHD","GOSH","ICON","Image Skincare","Instituto Español",
"Janeke","Kay Pro","Kerasilk","Kyo","Label M","Lierac","Living Proof","Londa",
"M2 Beauté","Malibu C","Maria Nila","Medik8","Misslyn","Mustela","Nesti Dante",
"Nuxe","Obagi","Osmo","Payot","Philip B","Philip Martins","Phyto","Piz Buin",
"Ramon Monegal","Redken","REF","Saryna Key","Sesderma","Skala Brasil","Skin1004",
"Strivectin","Swissdent","Topicrem","Uriage","Vita Liberata","Waterclouds",
"Wella","Youngblood Cosmetics",
]
# Product categories we cover; joined into the "BEAUTY CATEGORIES WE COVER"
# section of the assessment prompt.
BEAUTY_CATEGORIES = [
"Perfumes","Facial Cosmetics","Makeup","Hair Care","Health","Body Cosmetics",
"Hygiene","Kids & Babies","Sun Care","Eyewear","Home","Nutrition","Erotic","Fashion",
]
# ── Brand detection (fast pre-scan, no AI) ─────────────────────────────────────
def detect_brands_in_text(text: str) -> list[str]:
    """Find which brands from the universe appear in the scraped page text.

    Matching is case-insensitive and requires that the brand is not embedded
    in a longer word: the characters immediately before and after the match
    must not be letters or digits (Unicode-aware, so accented letters count
    as letters too).  This applies to every brand, not only short ones —
    plain substring matching reported "Victor" for any page mentioning
    "Victoria's Secret" and "Essence" for "Herbal Essences", in the same way
    that "ref" would match "reference".

    Args:
        text: Visible page text; ``None`` or empty yields an empty list.

    Returns:
        Up to 60 matching names from ``BEAUTY_BRANDS``, in list order and
        with the capitalisation used in that list.
    """
    haystack = (text or "").lower()
    found: list[str] = []
    for brand in BEAUTY_BRANDS:
        needle = brand.lower()
        # Cheap pre-filter: most brands are absent from any given page, so the
        # regex below only runs for the few names that occur at all.
        if needle not in haystack:
            continue
        # [^\W_] is "any letter or digit"; the lookarounds therefore reject a
        # match that is glued to a neighbouring letter/digit on either side.
        pattern = r"(?<![^\W_])" + re.escape(needle) + r"(?![^\W_])"
        if re.search(pattern, haystack):
            found.append(brand)
            if len(found) == 60:
                break
    return found
def get_dist_matches(detected: list[str]) -> list[str]:
    """Return the portfolio brands (OUR_BRANDS) present in *detected*.

    Comparison is case-insensitive; the result keeps the order and spelling
    of OUR_BRANDS.
    """
    detected_lower = set()
    for name in detected:
        detected_lower.add(name.lower())

    matches = []
    for portfolio_brand in OUR_BRANDS:
        if portfolio_brand.lower() in detected_lower:
            matches.append(portfolio_brand)
    return matches
# ── DuckDuckGo search (contact/company lookup) ────────────────────────────────
async def _ddg_search(query: str) -> str:
    """Run *query* against DuckDuckGo's HTML endpoint and summarise the hits.

    Only the first four results are considered, and a result is kept only if
    it has a snippet element.  Each kept result becomes one line of the form
    "[url] title+snippet"; lines are joined with newlines.  Any non-200
    response or exception yields an empty string (failures are logged at
    debug level), so callers never have to handle errors.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (compatible; BeautyLeads/1.0)"}
    try:
        async with httpx.AsyncClient(
            timeout=10, follow_redirects=True, headers=request_headers,
        ) as http:
            response = await http.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},
            )
        if response.status_code != 200:
            return ""

        lines = []
        hits = BeautifulSoup(response.text, "html.parser").select(".result")
        for hit in hits[:4]:
            title_tag = hit.select_one(".result__a")
            snippet_tag = hit.select_one(".result__snippet")
            url_tag = hit.select_one(".result__url")
            if not snippet_tag:
                continue
            title_text = title_tag.get_text(strip=True) if title_tag else ""
            url_text = url_tag.get_text(strip=True) if url_tag else ""
            lines.append(f"[{url_text}] {title_text}{snippet_tag.get_text(strip=True)}")
        return "\n".join(lines)
    except Exception as e:
        logger.debug("DDG search failed: %s", e)
        return ""
# ── Legal / about page scraper ────────────────────────────────────────────────
async def _scrape_legal_pages(domain: str) -> dict:
    """Fetch legal and about pages not covered by site_analyzer.

    Spanish "aviso legal" pages are expected to carry the company name (razón
    social), CIF/NIF, registered address and a contact email, which makes
    them a rich source of B2B contact data.

    Every path in ``_LEGAL_PATHS`` is requested concurrently over HTTPS through
    one shared HTTP client.  The function is best-effort: a page that fails to
    download, does not answer 200, or cannot be parsed is skipped (and logged
    at debug level) instead of raising.

    Args:
        domain: Host name without scheme, e.g. ``"example.es"``.

    Returns:
        A dict with:
            emails: up to 8 unique lower-cased addresses (``mailto:`` links
                first, then regex hits in the first 60000 chars of the HTML).
            phones: up to 6 unique numbers (``tel:`` links and regex hits in
                the visible text, separators removed).
            legal_snippet: first 150 words of visible text from the first
                loaded page whose path contains "aviso", "legal" or
                "privacidad"; empty string if none loaded.
    """
    # dicts are used as insertion-ordered sets: first occurrence wins.
    emails: dict[str, None] = {}
    phones: dict[str, None] = {}
    legal_snippet = ""

    async def _fetch(client: httpx.AsyncClient, path: str) -> tuple[str, str | None]:
        try:
            r = await client.get(f"https://{domain}{path}")
            if r.status_code == 200:
                return path, r.text
        except Exception as exc:
            logger.debug("Legal page fetch failed for %s%s: %s", domain, path, exc)
        return path, None

    try:
        # One client for all paths: they target the same host, so connections
        # can be reused instead of building a client per request.
        # NOTE(security): verify=False disables TLS certificate validation.
        # Kept from the original so that sites with broken certificates can
        # still be scraped; the fetched content must be treated as untrusted.
        async with httpx.AsyncClient(
            timeout=8, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client:
            pages = await asyncio.gather(*[_fetch(client, p) for p in _LEGAL_PATHS])
    except Exception as exc:
        logger.debug("Legal page scraping aborted for %s: %s", domain, exc)
        pages = []

    for path, html in pages:
        if not html:
            continue
        try:
            soup = BeautifulSoup(html, "html.parser")
            # Explicit contact links are the most reliable source.
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if href.startswith("mailto:"):
                    em = href[7:].split("?")[0].strip().lower()
                    if em:
                        emails.setdefault(em)
                elif href.startswith("tel:"):
                    ph = re.sub(r"[^\d+]", "", href[4:])
                    if ph:
                        phones.setdefault(ph)
            # Regex scan of the raw HTML for emails; the suffix filter drops
            # asset names such as "logo@2x.png" that look like addresses.
            for em in _EMAIL_RE.findall(html[:60000]):
                em = em.lower()
                if not em.endswith((".png", ".jpg", ".css", ".js", ".svg")):
                    emails.setdefault(em)
            # Regex scan of the visible text for phones.
            visible = soup.get_text(separator=" ", strip=True)
            for ph in _PHONE_RE.findall(visible):
                ph_c = re.sub(r"[\s\-]", "", ph)
                if ph_c:
                    phones.setdefault(ph_c)
            # Keep the snippet from the first legal/privacy page that resolved.
            if not legal_snippet and any(
                k in path for k in ("aviso", "legal", "privacidad")
            ):
                legal_snippet = " ".join(visible.split()[:150])
        except Exception as exc:
            logger.debug("Legal page parse failed for %s%s: %s", domain, path, exc)

    return {
        "emails": list(emails)[:8],
        "phones": list(phones)[:6],
        "legal_snippet": legal_snippet,
    }
# ── Prompt builder ─────────────────────────────────────────────────────────────
def _build_beauty_prompt(a: dict, detected_brands: list, dist_matches: list,
                         search_results: str = "",
                         extra_contacts: dict | None = None) -> str:
    """Build the Gemini assessment prompt.

    Args:
        a: Site analysis dict.  Keys read here include domain, ip_country,
            ip_region, eu_hosted, org/isp, page_title, h1_text,
            meta_description, cms, copyright_year/last_modified, ssl_valid,
            load_time_ms, word_count, analytics_present, has_mobile_viewport,
            has_sitemap, has_robots, cookie_tool, has_privacy_policy, emails,
            phones, whatsapp, social_links and visible_text_snippet.  Missing
            keys fall back to placeholders such as "unknown" or "".
        detected_brands: Brand names found by the pre-scan.
        dist_matches: Portfolio brands among the detected ones.
        search_results: Web search text; only the first 700 characters are
            embedded.
        extra_contacts: Dict with "emails", "phones" and "legal_snippet" keys
            (the shape returned by _scrape_legal_pages()); adds contact data
            found in the aviso legal, privacy policy, and about pages.

    Returns:
        The complete prompt text, ending with the JSON schema the model is
        asked to fill in.
    """
    ec = extra_contacts or {}
    # Merge contact sources: site_analyzer (main page + contact pages) + legal pages.
    # dict.fromkeys() removes duplicates while keeping first-seen order; the
    # slices cap how many entries of each kind reach the prompt.
    all_emails = list(dict.fromkeys((a.get("emails") or []) + (ec.get("emails") or [])))[:8]
    all_phones = list(dict.fromkeys((a.get("phones") or []) + (ec.get("phones") or [])))[:6]
    all_whatsapp = list(dict.fromkeys(a.get("whatsapp") or []))[:4]
    all_social = list(dict.fromkeys(a.get("social_links") or []))[:6]

    def _fmt(lst: list) -> str:
        # Comma-separated rendering; an empty list becomes an empty string.
        return ", ".join(lst) if lst else ""

    # Site technical signals
    ssl_info = ("✓ valid" if a.get("ssl_valid") else "✗ invalid/missing")
    analytics = ", ".join(a.get("analytics_present") or []) or "none detected"
    word_count = a.get("word_count", 0)
    load_ms = a.get("load_time_ms", 0)
    # Shown as "Last updated": copyright year if present, else last_modified.
    copyright = a.get("copyright_year") or a.get("last_modified") or "unknown"
    # Page text and legal text are truncated to keep the prompt bounded.
    snippet = (a.get("visible_text_snippet") or "")[:1600]
    legal_snippet = (ec.get("legal_snippet") or "")[:800]
    detected_str = ", ".join(detected_brands) if detected_brands else "none detected"
    dist_str = ", ".join(dist_matches) if dist_matches else "none"
    # In the f-string below, doubled braces ({{ and }}) are literal JSON braces;
    # single-brace fields are interpolated.  The contact lists are embedded via
    # json.dumps() so the example "all_contacts" block is already valid JSON.
    return f"""You are a senior B2B sales analyst for a cosmetics distribution company
operating across Europe. Your task: thoroughly evaluate this website as a potential
wholesale B2B customer and produce a complete outreach dossier.
=== BUSINESS PROFILE ===
Domain: {a.get("domain")}
Country (IP): {a.get("ip_country") or "unknown"}
Region: {a.get("ip_region") or "unknown"}
Hosting (EU?): {a.get("eu_hosted")} | ISP/Org: {a.get("org") or a.get("isp") or "unknown"}
Page title: {a.get("page_title") or ""}
H1: {a.get("h1_text") or ""}
Meta desc: {(a.get("meta_description") or "")[:200]}
CMS: {a.get("cms") or "unknown"}
Last updated: {copyright}
=== TECHNICAL SIGNALS ===
SSL: {ssl_info}
Load time: {load_ms}ms
Word count: {word_count}
Analytics: {analytics}
Mobile: {"yes" if a.get("has_mobile_viewport") else "no"}
Sitemap/Robots: sitemap={"yes" if a.get("has_sitemap") else "no"}, robots={"yes" if a.get("has_robots") else "no"}
GDPR/Privacy: cookie_tool={a.get("cookie_tool") or "none"}, privacy_policy={"yes" if a.get("has_privacy_policy") else "no"}
=== ALL CONTACT CHANNELS ===
Emails: {_fmt(all_emails)}
Phones: {_fmt(all_phones)}
WhatsApp: {_fmt(all_whatsapp)}
Social media: {_fmt(all_social)}
=== LEGAL / COMPANY REGISTRATION INFO ===
(extracted from aviso legal / política de privacidad — may contain razón social, CIF, address)
{legal_snippet or "Not found or page not accessible"}
=== PAGE CONTENT SAMPLE ===
{snippet}
=== BRANDS DETECTED ON SITE ===
{detected_str}
=== OUR PORTFOLIO BRANDS FOUND ON THEIR SITE ===
(brands we distribute that appear on their site — confirms shared market)
{dist_str}
=== WEB SEARCH RESULTS ===
{(search_results or "No results available.")[:700]}
=== OUR FULL DISTRIBUTION PORTFOLIO ===
{', '.join(OUR_BRANDS)}
=== BEAUTY CATEGORIES WE COVER ===
{', '.join(BEAUTY_CATEGORIES)}
=== ASSESSMENT RULES ===
1. TARGET PROFILE: We are looking for businesses that BUY BEAUTY PRODUCTS WHOLESALE to
resell: retailers, pharmacies, parafarmacias, perfumerías, multi-brand beauty ecommerce,
salon chains, supermarkets with beauty sections, beauty distributors — anywhere in Europe.
2. Identify ALL beauty brands anywhere on the page (body text, alt text, category names,
product listings, brand pages). Go beyond the pre-detected list already provided above.
3. LEAD QUALITY — rate on BUSINESS TYPE first, portfolio overlap second:
- HOT: Business type is clearly a multi-brand beauty reseller with professional/wholesale
activity AND at least one of: ≥2 portfolio brands detected, evident professional
lines, large catalogue (pharmacies, parafarmacia chains, pro salon distributors).
Also HOT: any large-scale EU beauty retailer even without portfolio brand matches.
- WARM: ANY genuine multi-brand beauty retailer or ecommerce that could buy wholesale —
even if ZERO portfolio brands are currently detected. They are our TARGET MARKET:
we want to introduce our brands to them. Pharmacies, perfumerías, beauty shops,
multi-brand online stores → default WARM unless there is a clear disqualifier.
When uncertain between WARM and COLD: choose WARM.
- COLD: ONLY if clearly disqualified: single-brand D2C (sells only their own brand),
beauty salon that doesn't sell products to end-consumers, personal influencer /
blog, OR no evidence this is a purchasing business at all.
- NOT_RELEVANT: No beauty/cosmetics connection, or clearly non-European.
⚠ CRITICAL: Portfolio brand absence NEVER alone justifies COLD. Our job is to introduce
our brands to retailers who don't carry them yet. Rate on whether they COULD buy wholesale.
4. country_fiscal: use aviso legal if found; otherwise use the IP country shown above.
NEVER leave country_fiscal empty — always provide a 2-letter ISO code.
5. Extract the BEST contact for outreach — check all data above:
- Prefer commercial emails (info@, ventas@, compras@, pedidos@) over generic/personal
- WhatsApp is often the fastest channel in Spain; flag it if present
- Set best_contact_channel and best_contact_value explicitly
6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in SPANISH.
7. outreach_email must be a complete ready-to-send Spanish email: greeting + 3-4 sentences
referencing their specific range + 1-2 of our portfolio brands that match + clear CTA
(catálogo, muestra gratuita, llamada, primer pedido mínimo). No placeholders.
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON object:
{{
"is_relevant": true,
"lead_quality": "HOT|WARM|COLD|NOT_RELEVANT",
"summary": "2-3 sentence executive summary: what this business does, their product range, who their customers are, and their apparent scale",
"lead_reasoning": "2-3 sentences explaining the lead quality rating — reference specific brands found, categories covered, and portfolio overlap",
"business_type": "retailer|ecommerce|distributor|pharmacy|parafarmacia|salon_chain|perfumeria|other",
"business_name": "official business name from title, H1, or aviso legal",
"country_fiscal": "2-letter ISO",
"countries_active": ["ES"],
"categories": ["Hair Care","Makeup"],
"detected_brands": ["all beauty brands found on site — be thorough"],
"dist_matches": ["our portfolio brands found on their site"],
"partnership_signals": ["carries multi-brand","has wholesale section","stockist page","B2B portal"],
"pitch_angle": "1 punchy sentence in Spanish: the specific angle for this business (reference their range, a gap you fill, or the portfolio brands that match)",
"b2b_proposal": "2-3 sentence value proposition in Spanish: what we offer, why it fits their range, what differentiates our brands",
"outreach_subject": "specific Spanish subject line mentioning their business name and 1 relevant brand",
"outreach_email": "complete ready-to-send Spanish email: greeting + 3-4 body sentences referencing their specific product range and 1-2 portfolio brands that match + clear CTA (catálogo, muestra, llamada, pedido mínimo) + valediction. Do not use placeholders.",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual email/phone/URL to use — prefer commercial emails, then phone, then social",
"all_contacts": {{
"emails": {json.dumps(all_emails)},
"phones": {json.dumps(all_phones)},
"whatsapp": {json.dumps(all_whatsapp)},
"social": {json.dumps(all_social)}
}},
"revenue_estimate": "unknown|<100k€|100k-500k€|500k-2M€|>2M€",
"outreach_notes": "2-3 sentences for the sales rep: timing, approach, red flags, CIF if found, any urgency signals"
}}"""
def _json_repair_candidates(fragment: str) -> list[str]:
    """Return repaired versions of a JSON text that starts with "{" but is cut off.

    The scan tracks string literals, so braces, brackets and commas inside
    strings are ignored.  Candidates, most complete first:
      * the first complete top-level object, if the text continues after it;
      * the whole text with its open containers closed (only when it ends
        right after a finished value);
      * the text cut back to the last comma outside a string — i.e. after the
        last complete member — with the containers open at that point closed.
    An empty list means the text is structurally broken, not just truncated.
    """
    open_closers: list[str] = []
    in_string = False
    escaped = False
    cut_at = -1
    cut_closers: list[str] = []
    for pos, ch in enumerate(fragment):
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            open_closers.append("}")
        elif ch == "[":
            open_closers.append("]")
        elif ch in "}]":
            if not open_closers or open_closers.pop() != ch:
                return []
            if not open_closers:
                return [fragment[:pos + 1]]
        elif ch == ",":
            cut_at = pos
            cut_closers = open_closers.copy()

    candidates: list[str] = []
    tail = fragment.rstrip()
    if not in_string and tail[-1:] in ("}", "]", '"'):
        candidates.append(tail + "".join(reversed(open_closers)))
    if cut_at >= 0:
        candidates.append(fragment[:cut_at] + "".join(reversed(cut_closers)))
    return candidates


def _parse_beauty_output(raw: str) -> dict:
    """Extract the assessment JSON object from the model's raw text output.

    Markdown code fences are stripped first.  Parsing is then attempted on,
    in order:
      1. the span from the first "{" to the last "}";
      2. that same span with missing "]" / "}" appended by simple counting;
      3. string-aware repairs of a truncated object (see
         ``_json_repair_candidates``), which also cover output that was cut
         off before any closing brace was produced.
    Steps 1 and 2 reproduce what earlier versions accepted, so anything that
    parsed before still yields the same result.

    Args:
        raw: Model output text; ``None`` or empty is treated as unparseable.

    Returns:
        The parsed dict — possibly missing trailing fields if the output was
        truncated — or a fallback dict marked ``"parse_error": True`` when
        nothing could be recovered.
    """
    text = re.sub(r"```(?:json)?", "", raw or "").strip().rstrip("`").strip()
    start = text.find("{")
    if start != -1:
        fragment = text[start:]
        attempts: list[str] = []
        end = fragment.rfind("}")
        # end >= 2 keeps the old rule that a bare "{}" is not a usable answer.
        if end >= 2:
            candidate = fragment[:end + 1]
            attempts.append(candidate)
            depth_obj = candidate.count("{") - candidate.count("}")
            depth_arr = candidate.count("[") - candidate.count("]")
            attempts.append(
                candidate + "]" * max(0, depth_arr) + "}" * max(0, depth_obj)
            )
        attempts.extend(_json_repair_candidates(fragment))
        for attempt in attempts:
            try:
                parsed = json.loads(attempt)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed
    logger.warning("Beauty AI parse failed, raw: %.300s", raw)
    return {
        "is_relevant": False,
        "lead_quality": "COLD",
        "business_name": "",
        "contact_email": "",
        "dist_matches": [],
        "parse_error": True,
    }
# ── Main entry point ───────────────────────────────────────────────────────────
def _merge_unique(*sources, limit: int) -> list[str]:
    """Concatenate contact lists into one de-duplicated list of at most *limit* items.

    First-seen order is kept.  A bare string counts as a one-item list; any
    other non-list source (e.g. ``None`` from the model's JSON) and any
    non-string or empty item is ignored instead of raising.
    """
    merged: dict[str, None] = {}
    for source in sources:
        if isinstance(source, str):
            source = [source]
        elif not isinstance(source, (list, tuple)):
            continue
        for item in source:
            if isinstance(item, str) and item:
                merged.setdefault(item)
    return list(merged)[:limit]


async def assess_beauty_domain(analysis: dict) -> dict:
    """Full beauty B2B assessment: brand scan + AI evaluation.

    Steps, all performed while holding the shared concurrency semaphore:
      1. pre-scan ``analysis["visible_text_snippet"]`` for known brands;
      2. run the web search and the legal-page scrape concurrently;
      3. send the assembled prompt to the Replicate-hosted model;
      4. merge our own brand and contact findings into the model's answer.

    Args:
        analysis: Site analysis dict; this function reads domain, page_title,
            visible_text_snippet, emails, phones, whatsapp, social_links and
            ip_country, and passes the whole dict to the prompt builder.

    Returns:
        The model's assessment dict, with ``all_contacts``, the top-level
        ``contact_*`` fields, ``dist_matches``, ``detected_brands`` and
        ``country_fiscal`` filled from our own data where the model left them
        empty.  If the request or processing fails, a COLD / not-relevant
        dict carrying an ``"error"`` message plus the pre-scan brands and
        contacts is returned instead; errors of that stage are not raised.
    """
    async with _sem():
        domain = analysis.get("domain", "")
        text = analysis.get("visible_text_snippet", "") or ""
        detected = detect_brands_in_text(text)
        dist_match = get_dist_matches(detected)

        # Run DDG search and legal page scraping in parallel
        title = analysis.get("page_title") or ""
        biz_name = title.split("|")[0].split("-")[0].strip() or domain
        search_results, extra_contacts = await asyncio.gather(
            _ddg_search(f'"{biz_name}" {domain} cosmetics beauty wholesale B2B contacto'),
            _scrape_legal_pages(domain),
        )
        logger.info(
            "Beauty assess %s: %d brands, %d portfolio matches, "
            "%d extra emails from legal pages",
            domain, len(detected), len(dist_match),
            len(extra_contacts.get("emails", [])),
        )

        # Our own contact data (site_analyzer + legal pages), computed once so
        # the success path and the error path return the same values.
        all_emails = _merge_unique(
            analysis.get("emails"), extra_contacts.get("emails"), limit=8)
        all_phones = _merge_unique(
            analysis.get("phones"), extra_contacts.get("phones"), limit=6)
        all_whatsapp = _merge_unique(analysis.get("whatsapp"), limit=4)
        all_social = _merge_unique(analysis.get("social_links"), limit=6)

        payload = {
            "input": {
                "prompt": _build_beauty_prompt(
                    analysis, detected, dist_match, search_results, extra_contacts
                ),
                "images": [], "videos": [],
                "top_p": 0.9,
                "temperature": 0.2,
                "thinking_level": "low",
                "max_output_tokens": 4000,
            }
        }
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(
                    REPLICATE_MODEL,
                    headers={
                        "Authorization": f"Bearer {REPLICATE_TOKEN}",
                        "Content-Type": "application/json",
                        "Prefer": "wait",
                    },
                    json=payload,
                )
                resp.raise_for_status()
                data = resp.json()
            output = data.get("output", "")
            if isinstance(output, list):
                output = "".join(output)
            result = _parse_beauty_output(output)

            # Merge pre-scan data that AI might miss
            if not result.get("dist_matches") and dist_match:
                result["dist_matches"] = dist_match
            if not result.get("detected_brands") and detected:
                result["detected_brands"] = detected

            # Merge our regex-extracted contacts into the AI's all_contacts.
            # The AI's entries come first; _merge_unique tolerates null or
            # malformed lists so one bad field cannot discard the assessment.
            contacts = result.get("all_contacts")
            if not isinstance(contacts, dict):
                contacts = result["all_contacts"] = {}
            for key, ours, cap in (
                ("emails", all_emails, 8),
                ("phones", all_phones, 6),
                ("whatsapp", all_whatsapp, 4),
                ("social", all_social, 6),
            ):
                contacts[key] = _merge_unique(contacts.get(key), ours, limit=cap)

            # Fill top-level contact fields from merged data if AI left them blank
            if not result.get("contact_email") and all_emails:
                result["contact_email"] = all_emails[0]
            if not result.get("contact_phone") and all_phones:
                result["contact_phone"] = all_phones[0]
            if not result.get("contact_whatsapp") and all_whatsapp:
                result["contact_whatsapp"] = all_whatsapp[0]
            if not result.get("contact_social") and all_social:
                result["contact_social"] = all_social[0]

            # country_fiscal fallback — use the IP country when the AI gave
            # nothing usable (missing, blank, placeholder, or not a string).
            fc = result.get("country_fiscal")
            fc = fc.strip() if isinstance(fc, str) else ""
            if not fc or fc.lower() in ("unknown", "n/a", "-"):
                result["country_fiscal"] = analysis.get("ip_country") or ""

            logger.info("Beauty AI %s → quality=%s, dist_matches=%s",
                        domain, result.get("lead_quality"), result.get("dist_matches"))
            return result
        except Exception as e:
            logger.error("Beauty AI error %s: %s", domain, e)
            return {
                "error": str(e)[:300],
                "is_relevant": False,
                "lead_quality": "COLD",
                "dist_matches": dist_match,
                "detected_brands": detected,
                "contact_email": all_emails[0] if all_emails else "",
                "contact_phone": all_phones[0] if all_phones else "",
                "contact_whatsapp": all_whatsapp[0] if all_whatsapp else "",
                "contact_social": all_social[0] if all_social else "",
                "all_contacts": {
                    "emails": all_emails, "phones": all_phones,
                    "whatsapp": all_whatsapp, "social": all_social,
                },
            }