- site_analyzer: scan onclick/data-href/data-url/data-link/data-action attrs on ALL tags for WhatsApp (wa.me, api.whatsapp, web.whatsapp, wa.link), tel: links, and social media URLs; raise dedup cap 5→8 - beauty_ai: rewrite lead quality rules — WARM for any genuine multi-brand retailer even with zero portfolio matches; portfolio absence NEVER justifies COLD alone; added country_fiscal fallback to ip_country - index.html: assessPopup overlay modal on quality badge click in Browse tab; showAssessPopup() parses beauty_assessment JSON with all_contacts fallback; [x-cloak] CSS to prevent flash Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
591 lines
30 KiB
Python
591 lines
30 KiB
Python
"""Beauty B2B AI assessment — cosmetics distribution lead qualification.
|
|
|
|
Pre-scans scraped text for known brands, then sends a focused prompt to Gemini
|
|
to evaluate fit as a B2B customer for a cosmetics distribution business.
|
|
"""
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
|
|
logger = logging.getLogger(__name__)

# SECURITY: the API token must come from the environment only. A real token
# was previously committed here as the fallback value; that credential should
# be treated as leaked and revoked. With no token configured the Replicate
# request fails with an authentication error, which assess_beauty_domain()
# already reports through its error path.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")
# Replicate predictions endpoint for the hosted Gemini model.
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
# Maximum number of assessments allowed to run concurrently (see _sem()).
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))
|
|
|
|
# Contact extraction regexes (same patterns as site_analyzer)
_EMAIL_RE = re.compile(
    r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"
)
# Optional international prefix, then nine digits starting with 6-9 and
# grouped 3-3-3 with optional space/hyphen separators.
_PHONE_RE = re.compile(
    r"(?:\+\d{1,3}[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}"
)

# Pages that often contain company registration info (CIF/NIF, registered address,
# legal email) — not fetched by site_analyzer, but rich sources for B2B contact data
_LEGAL_PATHS = [
    # legal notice
    "/aviso-legal",
    "/aviso_legal",
    "/legal",
    # privacy policy
    "/politica-de-privacidad",
    "/politica_privacidad",
    "/privacidad",
    # about us
    "/quienes-somos",
    "/quienes_somos",
    "/nosotros",
]
|
|
|
|
# Shared semaphore bounding concurrent assessments; built on first use.
_ai_sem: Optional[asyncio.Semaphore] = None


def _sem() -> asyncio.Semaphore:
    """Return the module-wide semaphore, creating it on the first call.

    The semaphore is sized by AI_CONCURRENCY and cached in ``_ai_sem`` so
    every caller shares the same instance.
    """
    global _ai_sem
    if _ai_sem is not None:
        return _ai_sem
    _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return _ai_sem
|
|
|
|
|
|
# ── Brand universe (market brands we can detect on client sites) ──────────────
|
|
|
|
# Market brand names scanned for by detect_brands_in_text(), kept in
# alphabetical order. Matching is case-insensitive, so the casing here only
# affects how a detected brand is reported.
BEAUTY_BRANDS = [
    "4711","7days","7th Heaven","A-derma","Abercrombie & Fitch","Abril Et Nature",
    "Acqua Di Parma","Actinica","Adidas","Adolfo Dominguez","Aesop","Agatha Ruiz De La Prada",
    "Agave","Agua Lavanda","Ahava","Air-wick","Aire Sevilla","Al Haramain","Albal","Alcantara",
    "Alejandro Sanz","Alfaparf Milano","Algasiv","Alma Secret","Alpecin","Alqvimia","Alterna",
    "Alvarez Gomez","Alyssa Ashley","Ambi Pur","American Crew","Amichi","Ana María Lajusticia",
    "Angel Schlesser","Anian","Annayake","Anne Möller","Anso","Antonio Banderas","Apisérum",
    "Apivita","Aqc Fragrances","Aquilea","Aramis","Ardell","Arganour","Ariel","Armaf",
    "Armand Basi","Artdeco","Artero","As I Am","Aseptine","Atashi","Atrix","Ausonia","Aussie",
    "Australian Gold","Autan","Aveda","Avena Kinesia","Avène","Axe","Axovital","Azalea",
    "Azzaro","Babaria","Babyliss","Barbie","Bare Minerals","Barulab","Batiste","Beaver",
    "Beconfident","Belcils","Bella Aurora","Benefit","Benton","Benzacare","Beter","Biafin",
    "Bio Ionic","Bio-oil","Bioderma","Biolage","Biotherm","Biovène","Biretix","Bobbi Brown",
    "Bouclème","Bourjois","Bperfect Cosmetics","Britney Spears","Bumble & Bumble","Burberry",
    "Bvlgari","Byly","Byphasse","Cacharel","Calvin Klein","Camomila Intea","Cantu","Carefree",
    "Carmex","Carolina Herrera","Carrera","Carthusia","Catrice","Caudalie","Cerave","Cerruti",
    "Cetaphil","Chanel","Chanson D'Eau","Chloé","Chopard","Christina Aguilera","Christophe Robin",
    "Clarins","Clean & Clear","Clinique","Coach","Cocosolis","Colab","Colgate","Collistar",
    "Color Wow","Comfort Zone","Comodynes","Compeed","Cosrx","Creed","Creme Of Nature",
    "Cristalinas","Crossmen","Crusellas","Cryopharma","Cumlaude Lab","Cutex","Cygnetic",
    "Daffoil","Darphin","Davidoff","Declaré","Delfy","Delisea","Denenes","Dentiblanc",
    "Dermalogica","Desensin","Dexeryl","Diadermine","Diesel","Diet Esthetic","Dior","Diptyque",
    "Dodot","Dolce & Gabbana","Donna Karan","Dove","Dr. Hauschka","Dr.jart+","Dr. Organic",
    "Dr. Rimpler","Dr. Tree","Drasanvi","Drunk Elephant","Dsquared2","Ducray","Durex",
    "Elancyl","Elegant Touch","Elemis","Elie Saab","Elizabeth Arden","Elizabeth Taylor",
    "Emilio Pucci","Endocare","Eric Favre","Escada","Essence","Essie","Estée Lauder",
    "Etat Libre D'Orange","Eucerin","Eudermin","Evax","Eve Lom","Eylure","Fa","Fairy","Fanola",
    "Farmatint","Farmavita","Farouk","Figuière","Fisiocrem","Flor De Mayo","Fluocaril","Foreo",
    "Forté Pharma","Foxy","Francis Kurkdjian","Frederic Malle","Frosch","Garnier","Ghd",
    "Gillette","Giorgi Line","Givenchy","Glam Of Sweden","Goldwell","Gosh","Goutal","Gritti",
    "Gucci","Guerlain","Guess By Marciano","Gummy","Hair Rituel By Sisley","Hairgum","Halita",
    "Halloween","Hansaplast","Hask","Hawaiian Tropic","Head & Shoulders","Heliocare",
    "Heno De Pravia","Herbal Essences","Hermès","Hidracel","Hollister","Hugo Boss",
    "I.c.o.n.","Ibizaloe","Iceberg","Idc Institute","Iroha","Isabelle Lancray","Isdin",
    "Issey Miyake","It Cosmetics","Ivybears","Jacadi","Jean Paul Gaultier","Jil Sander",
    "Jimmy Choo","Jo Malone","John Frieda","Johnson's Baby","Joico","Joop","Jordan","Jowaé",
    "Juicy Couture","Juliette Has A Gun","Just For Men","Juvena","Kaloo","Karl Lagerfeld",
    "Karseell","Katai","Kate Spade","Kativa","Kenzo","Kerasilk","Kerastase","Kevin Murphy",
    "Kevyn Aucoin","Kilian","Klorane","L'Anza","L'Occitane","L'Oréal Paris",
    "L'Oréal Professionnel","La Cabine","La Mer","La Prairie","La Roche Posay","La Toja",
    "Laboratoires Filorga","Lacer","Lacoste","Lactacyd","Lactovit","Lalique","Lancaster",
    "Lanvin","Lattafa","Laura Biagiotti","Le Petit Marseillais","Legrain","Lierac","Listerine",
    "Living Proof","Loewe","Lola Cosmetics","Lolita Lempicka","Lussoni","Lutsine E45",
    "M2 Beauté","Mac","Macadamia","Mad Beauty","Maria Nila","Marlies Möller","Martiderm",
    "Martinelia","Marvis","Matrix","Maui","Mavala","Max Factor","Maybelline","Melvita",
    "Mermade","Michael Kors","Milk Shake","Mix & Shout","Mixa","Moroccanoil","Moschino",
    "Mustela","Nabeel","Nanobrow","Nanoil","Nanolash","Narciso Rodriguez","Nars","Natur Vital",
    "Natura Bissé","Natural Honey","Naturalium","Naturtint","Nenuco","Neogen","Neoretin",
    "Neostrata","Neutrogena","Nivea","Nûby","Nuggela & Sulé","Nyx Professional Make Up",
    "Ogx","Olaplex","Olay","Old Spice","Olivia Garden","Opi","Oral-b","Oraldine","Orofluido",
    "Orlane","Oscar De La Renta","Pacha","Paese","Palette","Paloma Picasso","Paltons",
    "Pantene","Paranix","Parfums Saphir","Parlux","Payot","Phyto","Picu Baby","Pilexil",
    "Piz Buin","Plantur 39","Platanomelón","Polaar","Police","Polident","Ponds","Poseidon",
    "Postquam","Proraso","Puig","Purito","Rabanne","Raid","Ralph Lauren","Rated Green",
    "Real Techniques","Redenhair","Redist","Redken","Reebok","Ref","Refectocil","Relec",
    "Remescar","Rene Furterer","Revlon","Revolution Hair Care","Revolution Make Up",
    "Revolution Pro","Rexaline","Rexona","Rilastil","Rimmel London","Roberto Cavalli","Roc",
    "Rochas","Roger & Gallet","Roja Parfums","Rosacure","S3","Sabon","Salerm","Sally Hansen",
    "Salvatore Ferragamo","Sanex","Sarah Jessica Parker","Saryna Key","Satisfyer","Scalpers",
    "Scholl","Schwarzkopf","Scottex","Sebamed","Sebastian Professionals","Seche Vite",
    "Sensai","Sensilis","Sensodyne","Serge Lutens","Serumkind","Sesderma","Seven Cosmetics",
    "Sexy Hair","Shiseido","Shu Uemura","Sisley","Skeyndor","Skin Generics","Sleek",
    "Snp","Soap & Glory","Sol De Janeiro","Solgar","Somatoline Cosmetic","Sophie La Girafe",
    "Soria Natural","Steinhart","Stendhal Paris","Sterimar","Strivectin","Suavinex",
    "Suavipiel","Svr Laboratoire Dermatologique","Syoss","System Professional","Tabac",
    "Taky","Talika","Tampax","Tangle Teezer","Tanit","Teaology","Tena Lady","The Body Shop",
    "The Ordinary","The Wet Brush","Thermacare","Tiffany & Co","Tigi","Timotei",
    "Tiziana Terenzi","Tod's","Tom Ford","Tommy Hilfiger","Topicrem","Torriden","Tot Herba",
    "Tous","Trendy Hair","Tresemme","Trussardi","Tulipán Negro","Urban Decay","Uriage",
    "Usu Cosmetics","Vagisil","Valmont","Valquer","Vanderbilt","Vaseline","Veet","Vichy",
    "Victor","Victoria's Secret","Victorio & Lucchino","Vital Proteins","Vivra",
    "Voltage Cosmetics","Volumax","Waterpik","Waterwipes","Wella","Weleda",
    "Williams","Woodwick","Xerjoff","Xls Medical","Yankee Candle","Yari","Yotuel",
    "Youth Lab","Zadig & Voltaire","Ziaja",
]
|
|
|
|
# Our distribution portfolio — the brands we sell to B2B clients
|
|
# One entry per line; get_dist_matches() returns brands using the spelling
# and order given here.
OUR_BRANDS = [
    "AIMX",
    "Al Haramain",
    "Apivita",
    "Armaf",
    "Aveda",
    "Bouclème",
    "Clarena",
    "Curly Girl Movement",
    "Cutrin",
    "Davines",
    "Dr. Hauschka",
    "FanPalm",
    "Farmavita",
    "Flora Curl",
    "GAMMA+",
    "GHD",
    "GOSH",
    "ICON",
    "Image Skincare",
    "Instituto Español",
    "Janeke",
    "Kay Pro",
    "Kerasilk",
    "Kyo",
    "Label M",
    "Lierac",
    "Living Proof",
    "Londa",
    "M2 Beauté",
    "Malibu C",
    "Maria Nila",
    "Medik8",
    "Misslyn",
    "Mustela",
    "Nesti Dante",
    "Nuxe",
    "Obagi",
    "Osmo",
    "Payot",
    "Philip B",
    "Philip Martins",
    "Phyto",
    "Piz Buin",
    "Ramon Monegal",
    "Redken",
    "REF",
    "Saryna Key",
    "Sesderma",
    "Skala Brasil",
    "Skin1004",
    "Strivectin",
    "Swissdent",
    "Topicrem",
    "Uriage",
    "Vita Liberata",
    "Waterclouds",
    "Wella",
    "Youngblood Cosmetics",
]
|
|
|
|
# Product categories listed verbatim in the assessment prompt.
BEAUTY_CATEGORIES = [
    "Perfumes",
    "Facial Cosmetics",
    "Makeup",
    "Hair Care",
    "Health",
    "Body Cosmetics",
    "Hygiene",
    "Kids & Babies",
    "Sun Care",
    "Eyewear",
    "Home",
    "Nutrition",
    "Erotic",
    "Fashion",
]
|
|
|
|
|
|
# ── Brand detection (fast pre-scan, no AI) ─────────────────────────────────────
|
|
|
|
def detect_brands_in_text(text: str) -> list[str]:
|
|
"""Find which brands from the universe appear in the scraped page text.
|
|
|
|
Short brands (≤5 chars) use word-boundary matching to avoid false positives
|
|
like 'ref' matching 'reference', 'prefer', 'refresh', etc.
|
|
"""
|
|
tl = text.lower()
|
|
result = []
|
|
for b in BEAUTY_BRANDS:
|
|
bl = b.lower()
|
|
if len(bl) <= 5:
|
|
if re.search(r'(?<![a-zA-Z0-9])' + re.escape(bl) + r'(?![a-zA-Z0-9])', tl):
|
|
result.append(b)
|
|
else:
|
|
if bl in tl:
|
|
result.append(b)
|
|
return result[:60]
|
|
|
|
|
|
def get_dist_matches(detected: list[str]) -> list[str]:
    """Return the portfolio brands that occur in *detected*.

    Comparison is case-insensitive; the result follows the order and
    spelling of OUR_BRANDS rather than that of *detected*.
    """
    detected_lower = set(map(str.lower, detected))
    matches = []
    for brand in OUR_BRANDS:
        if brand.lower() in detected_lower:
            matches.append(brand)
    return matches
|
|
|
|
|
|
# ── DuckDuckGo search (contact/company lookup) ────────────────────────────────
|
|
|
|
async def _ddg_search(query: str) -> str:
    """Run *query* against DuckDuckGo's HTML endpoint and summarise the hits.

    Only the first four results are considered, and only those that have a
    snippet are kept. Each kept result becomes one line formatted as
    ``[url] title — snippet``.

    Returns:
        The lines joined with newlines, or ``""`` on a non-200 response or
        any exception (the failure is logged at debug level).
    """
    client_options = {
        "timeout": 10,
        "follow_redirects": True,
        "headers": {"User-Agent": "Mozilla/5.0 (compatible; BeautyLeads/1.0)"},
    }
    try:
        async with httpx.AsyncClient(**client_options) as client:
            response = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},
            )
        if response.status_code != 200:
            return ""

        soup = BeautifulSoup(response.text, "html.parser")
        lines = []
        for hit in soup.select(".result")[:4]:
            snippet_el = hit.select_one(".result__snippet")
            if not snippet_el:
                continue
            title_el = hit.select_one(".result__a")
            url_el = hit.select_one(".result__url")
            title_text = title_el.get_text(strip=True) if title_el else ""
            url_text = url_el.get_text(strip=True) if url_el else ""
            lines.append(f"[{url_text}] {title_text} — {snippet_el.get_text(strip=True)}")
        return "\n".join(lines)
    except Exception as e:
        logger.debug("DDG search failed: %s", e)
        return ""
|
|
|
|
|
|
# ── Legal / about page scraper ────────────────────────────────────────────────
|
|
|
|
async def _scrape_legal_pages(domain: str) -> dict:
    """Fetch legal and about pages not covered by site_analyzer.

    Spanish Aviso Legal pages legally must contain: company name (razón social),
    CIF/NIF, registered address, and a contact email — making them the richest
    source of verified B2B contact data.

    All paths in _LEGAL_PATHS are requested concurrently over HTTPS through a
    single shared client. The lookup is best-effort: a page that fails to
    load or parse is skipped (logged at debug level) and never raises.

    Args:
        domain: Bare host name, e.g. ``"example.es"``. An empty value returns
            the empty result without making any request.

    Returns:
        A dict with:
            emails: up to 8 unique lower-cased emails found across all pages
            phones: up to 6 unique phones found across all pages
            legal_snippet: first 150 whitespace-separated words of the first
                aviso legal / legal / privacidad page that returned HTTP 200
                (company registration info: razón social, CIF, domicilio, etc.)
    """
    result: dict = {"emails": [], "phones": [], "legal_snippet": ""}
    if not domain:
        return result

    async def _fetch(client: httpx.AsyncClient, path: str) -> tuple[str, str | None]:
        # Returns (path, html), with html None on any non-200 status or error.
        try:
            r = await client.get(f"https://{domain}{path}")
            if r.status_code == 200:
                return path, r.text
        except Exception as exc:
            logger.debug("Legal page fetch failed %s%s: %s", domain, path, exc)
        return path, None

    try:
        # One client for all paths so connections to the host are reused.
        # NOTE(review): verify=False disables TLS certificate validation; kept
        # from the original, presumably to tolerate misconfigured small-shop
        # sites — confirm this is still wanted.
        async with httpx.AsyncClient(
            timeout=8, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0"},
        ) as client:
            pages = await asyncio.gather(*[_fetch(client, p) for p in _LEGAL_PATHS])
    except Exception as exc:
        logger.debug("Legal page scrape failed for %s: %s", domain, exc)
        return result

    image_suffixes = (".png", ".jpg", ".css", ".js", ".svg")

    for path, html in pages:
        if not html:
            continue
        try:
            soup = BeautifulSoup(html, "html.parser")

            # Explicit mailto:/tel: links (scheme matched case-insensitively).
            for a in soup.find_all("a", href=True):
                href = a["href"]
                scheme = href[:7].lower()
                if scheme.startswith("mailto:"):
                    em = href[7:].split("?")[0].strip().lower()
                    if em:
                        result["emails"].append(em)
                elif scheme.startswith("tel:"):
                    ph = re.sub(r"[^\d+]", "", href[4:])
                    if ph:
                        result["phones"].append(ph)

            # Regex scan of the raw HTML (first 60k chars) for emails; asset
            # names such as "logo@2x.png" match the pattern and are dropped.
            for em in _EMAIL_RE.findall(html[:60000]):
                em = em.lower()
                if not em.endswith(image_suffixes):
                    result["emails"].append(em)

            # Regex scan of the visible text for phones.
            visible = soup.get_text(separator=" ", strip=True)
            for ph in _PHONE_RE.findall(visible):
                ph_c = re.sub(r"[\s\-]", "", ph)
                if ph_c:
                    result["phones"].append(ph_c)

            # Capture the legal snippet from the first legal page that resolves.
            if not result["legal_snippet"] and any(
                k in path for k in ("aviso", "legal", "privacidad")
            ):
                result["legal_snippet"] = " ".join(visible.split()[:150])
        except Exception as exc:
            # Keep whatever was extracted before the failure.
            logger.debug("Legal page parse failed %s%s: %s", domain, path, exc)

    # Order-preserving de-duplication, then cap.
    result["emails"] = list(dict.fromkeys(result["emails"]))[:8]
    result["phones"] = list(dict.fromkeys(result["phones"]))[:6]
    return result
|
|
|
|
|
|
# ── Prompt builder ─────────────────────────────────────────────────────────────
|
|
|
|
def _build_beauty_prompt(a: dict, detected_brands: list, dist_matches: list,
                         search_results: str = "",
                         extra_contacts: dict | None = None) -> str:
    """Assemble the lead-qualification prompt sent to the model.

    Args:
        a: Site analysis dict; read with ``.get`` for domain, hosting, page
            metadata, technical signals, contacts and the visible text snippet.
        detected_brands: Brands found by the pre-scan.
        dist_matches: Portfolio brands among the detected ones.
        search_results: Web search summary; truncated to 700 characters.
        extra_contacts: Output of _scrape_legal_pages(); contributes emails,
            phones and the legal snippet found on the aviso legal, privacy
            policy and about pages.

    Returns:
        The full prompt text, ending with the JSON schema the model must fill.
    """
    legal = extra_contacts or {}

    def _dedup(items: list, cap: int) -> list:
        # Order-preserving de-duplication, truncated to *cap* entries.
        return list(dict.fromkeys(items))[:cap]

    def _joined(items: list) -> str:
        return ", ".join(items) if items else "—"

    def _yes_no(key: str) -> str:
        return "yes" if a.get(key) else "no"

    # Merge contact sources: site_analyzer (main page + contact pages) + legal pages
    all_emails = _dedup((a.get("emails") or []) + (legal.get("emails") or []), 8)
    all_phones = _dedup((a.get("phones") or []) + (legal.get("phones") or []), 6)
    all_whatsapp = _dedup(a.get("whatsapp") or [], 4)
    all_social = _dedup(a.get("social_links") or [], 6)

    # Site technical signals
    if a.get("ssl_valid"):
        ssl_info = "✓ valid"
    else:
        ssl_info = "✗ invalid/missing"
    analytics = ", ".join(a.get("analytics_present") or []) or "none detected"
    word_count = a.get("word_count", 0)
    load_ms = a.get("load_time_ms", 0)
    last_updated = a.get("copyright_year") or a.get("last_modified") or "unknown"

    snippet = (a.get("visible_text_snippet") or "")[:1600]
    legal_snippet = (legal.get("legal_snippet") or "")[:800]
    detected_str = ", ".join(detected_brands) if detected_brands else "none detected"
    dist_str = ", ".join(dist_matches) if dist_matches else "none"

    return f"""You are a senior B2B sales analyst for a cosmetics distribution company
operating across Europe. Your task: thoroughly evaluate this website as a potential
wholesale B2B customer and produce a complete outreach dossier.

=== BUSINESS PROFILE ===
Domain: {a.get("domain")}
Country (IP): {a.get("ip_country") or "unknown"}
Region: {a.get("ip_region") or "unknown"}
Hosting (EU?): {a.get("eu_hosted")} | ISP/Org: {a.get("org") or a.get("isp") or "unknown"}
Page title: {a.get("page_title") or "—"}
H1: {a.get("h1_text") or "—"}
Meta desc: {(a.get("meta_description") or "—")[:200]}
CMS: {a.get("cms") or "unknown"}
Last updated: {last_updated}

=== TECHNICAL SIGNALS ===
SSL: {ssl_info}
Load time: {load_ms}ms
Word count: {word_count}
Analytics: {analytics}
Mobile: {_yes_no("has_mobile_viewport")}
Sitemap/Robots: sitemap={_yes_no("has_sitemap")}, robots={_yes_no("has_robots")}
GDPR/Privacy: cookie_tool={a.get("cookie_tool") or "none"}, privacy_policy={_yes_no("has_privacy_policy")}

=== ALL CONTACT CHANNELS ===
Emails: {_joined(all_emails)}
Phones: {_joined(all_phones)}
WhatsApp: {_joined(all_whatsapp)}
Social media: {_joined(all_social)}

=== LEGAL / COMPANY REGISTRATION INFO ===
(extracted from aviso legal / política de privacidad — may contain razón social, CIF, address)
{legal_snippet or "Not found or page not accessible"}

=== PAGE CONTENT SAMPLE ===
{snippet}

=== BRANDS DETECTED ON SITE ===
{detected_str}

=== OUR PORTFOLIO BRANDS FOUND ON THEIR SITE ===
(brands we distribute that appear on their site — confirms shared market)
{dist_str}

=== WEB SEARCH RESULTS ===
{(search_results or "No results available.")[:700]}

=== OUR FULL DISTRIBUTION PORTFOLIO ===
{', '.join(OUR_BRANDS)}

=== BEAUTY CATEGORIES WE COVER ===
{', '.join(BEAUTY_CATEGORIES)}

=== ASSESSMENT RULES ===
1. TARGET PROFILE: We are looking for businesses that BUY BEAUTY PRODUCTS WHOLESALE to
resell: retailers, pharmacies, parafarmacias, perfumerías, multi-brand beauty ecommerce,
salon chains, supermarkets with beauty sections, beauty distributors — anywhere in Europe.

2. Identify ALL beauty brands anywhere on the page (body text, alt text, category names,
product listings, brand pages). Go beyond the pre-detected list already provided above.

3. LEAD QUALITY — rate on BUSINESS TYPE first, portfolio overlap second:
- HOT: Business type is clearly a multi-brand beauty reseller with professional/wholesale
activity AND at least one of: ≥2 portfolio brands detected, evident professional
lines, large catalogue (pharmacies, parafarmacia chains, pro salon distributors).
Also HOT: any large-scale EU beauty retailer even without portfolio brand matches.
- WARM: ANY genuine multi-brand beauty retailer or ecommerce that could buy wholesale —
even if ZERO portfolio brands are currently detected. They are our TARGET MARKET:
we want to introduce our brands to them. Pharmacies, perfumerías, beauty shops,
multi-brand online stores → default WARM unless there is a clear disqualifier.
When uncertain between WARM and COLD: choose WARM.
- COLD: ONLY if clearly disqualified: single-brand D2C (sells only their own brand),
beauty salon that doesn't sell products to end-consumers, personal influencer /
blog, OR no evidence this is a purchasing business at all.
- NOT_RELEVANT: No beauty/cosmetics connection, or clearly non-European.

⚠ CRITICAL: Portfolio brand absence NEVER alone justifies COLD. Our job is to introduce
our brands to retailers who don't carry them yet. Rate on whether they COULD buy wholesale.

4. country_fiscal: use aviso legal if found; otherwise use the IP country shown above.
NEVER leave country_fiscal empty — always provide a 2-letter ISO code.

5. Extract the BEST contact for outreach — check all data above:
- Prefer commercial emails (info@, ventas@, compras@, pedidos@) over generic/personal
- WhatsApp is often the fastest channel in Spain; flag it if present
- Set best_contact_channel and best_contact_value explicitly

6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in SPANISH.

7. outreach_email must be a complete ready-to-send Spanish email: greeting + 3-4 sentences
referencing their specific range + 1-2 of our portfolio brands that match + clear CTA
(catálogo, muestra gratuita, llamada, primer pedido mínimo). No placeholders.

Respond ONLY with valid JSON, no markdown fences, no text outside the JSON object:
{{
"is_relevant": true,
"lead_quality": "HOT|WARM|COLD|NOT_RELEVANT",
"summary": "2-3 sentence executive summary: what this business does, their product range, who their customers are, and their apparent scale",
"lead_reasoning": "2-3 sentences explaining the lead quality rating — reference specific brands found, categories covered, and portfolio overlap",
"business_type": "retailer|ecommerce|distributor|pharmacy|parafarmacia|salon_chain|perfumeria|other",
"business_name": "official business name from title, H1, or aviso legal",
"country_fiscal": "2-letter ISO",
"countries_active": ["ES"],
"categories": ["Hair Care","Makeup"],
"detected_brands": ["all beauty brands found on site — be thorough"],
"dist_matches": ["our portfolio brands found on their site"],
"partnership_signals": ["carries multi-brand","has wholesale section","stockist page","B2B portal"],
"pitch_angle": "1 punchy sentence in Spanish: the specific angle for this business (reference their range, a gap you fill, or the portfolio brands that match)",
"b2b_proposal": "2-3 sentence value proposition in Spanish: what we offer, why it fits their range, what differentiates our brands",
"outreach_subject": "specific Spanish subject line mentioning their business name and 1 relevant brand",
"outreach_email": "complete ready-to-send Spanish email: greeting + 3-4 body sentences referencing their specific product range and 1-2 portfolio brands that match + clear CTA (catálogo, muestra, llamada, pedido mínimo) + valediction. Do not use placeholders.",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual email/phone/URL to use — prefer commercial emails, then phone, then social",
"all_contacts": {{
"emails": {json.dumps(all_emails)},
"phones": {json.dumps(all_phones)},
"whatsapp": {json.dumps(all_whatsapp)},
"social": {json.dumps(all_social)}
}},
"revenue_estimate": "unknown|<100k€|100k-500k€|500k-2M€|>2M€",
"outreach_notes": "2-3 sentences for the sales rep: timing, approach, red flags, CIF if found, any urgency signals"
}}"""
|
|
|
|
|
|
def _parse_beauty_output(raw: str) -> dict:
    """Extract the JSON object from the model's raw text output.

    Markdown code fences are stripped, then the span from the first ``{`` to
    the last ``}`` is parsed. If that fails, one repair is attempted for
    truncated output: a dangling trailing ``"key": value`` fragment is
    dropped and any unbalanced ``[`` / ``{`` are closed.

    Returns:
        The parsed dict, or a fallback dict flagged with ``parse_error`` (and
        rated COLD / not relevant) when nothing parseable is found; the
        failure is logged with the first 300 characters of *raw*.
    """
    cleaned = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
    match = re.search(r"\{[\s\S]+\}", cleaned)
    candidate = match.group(0) if match else None

    if candidate is not None:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass

        # Bracket balance is counted over the raw text, string contents included.
        open_braces = candidate.count("{") - candidate.count("}")
        open_brackets = candidate.count("[") - candidate.count("]")
        repaired = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', candidate)
        repaired += "]" * max(0, open_brackets) + "}" * max(0, open_braces)
        try:
            return json.loads(repaired)
        except json.JSONDecodeError:
            pass

    logger.warning("Beauty AI parse failed, raw: %.300s", raw)
    fallback = {
        "is_relevant": False,
        "lead_quality": "COLD",
        "business_name": "",
        "contact_email": "",
        "dist_matches": [],
        "parse_error": True,
    }
    return fallback
|
|
|
|
|
|
# ── Main entry point ───────────────────────────────────────────────────────────
|
|
|
|
# Per-channel caps applied to every merged contact list.
_CONTACT_CAPS = {"emails": 8, "phones": 6, "whatsapp": 4, "social": 6}

# Top-level result field -> all_contacts key it is filled from.
_CONTACT_FIELDS = (
    ("contact_email", "emails"),
    ("contact_phone", "phones"),
    ("contact_whatsapp", "whatsapp"),
    ("contact_social", "social"),
)


def _merge_contact_sources(analysis: dict, extra_contacts: dict) -> dict:
    """Combine site_analyzer and legal-page contacts, de-duplicated and capped."""
    sources = {
        "emails": (analysis.get("emails") or []) + (extra_contacts.get("emails") or []),
        "phones": (analysis.get("phones") or []) + (extra_contacts.get("phones") or []),
        "whatsapp": analysis.get("whatsapp") or [],
        "social": analysis.get("social_links") or [],
    }
    return {
        key: list(dict.fromkeys(sources[key]))[:cap]
        for key, cap in _CONTACT_CAPS.items()
    }


def _coerce_contact_list(value) -> list:
    """Turn a model-supplied contact value into a list of hashable scalars."""
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        # Nested objects / nulls cannot be de-duplicated and are dropped.
        return [v for v in value if isinstance(v, (str, int, float))]
    return []


async def assess_beauty_domain(analysis: dict) -> dict:
    """Full beauty B2B assessment: brand scan + AI evaluation.

    Runs under the shared AI semaphore. The page snippet is pre-scanned for
    brands, a web search and the legal-page scrape run in parallel, and the
    resulting prompt is posted to the Replicate model endpoint.

    Args:
        analysis: site_analyzer output for one domain (read with ``.get``).

    Returns:
        The model's parsed assessment, enriched with pre-scan brands, merged
        contact lists, top-level ``contact_*`` fields and a ``country_fiscal``
        fallback to the IP country. If the request or the post-processing
        fails, a COLD / not-relevant dict carrying ``error`` plus the brands
        and contacts gathered locally is returned instead; the function does
        not raise for those failures.
    """
    async with _sem():
        domain = analysis.get("domain", "")
        # Brands are scanned on the snippet only.
        text = analysis.get("visible_text_snippet", "") or ""

        detected = detect_brands_in_text(text)
        dist_match = get_dist_matches(detected)

        # Run DDG search and legal page scraping in parallel
        title = analysis.get("page_title") or ""
        biz_name = title.split("|")[0].split("-")[0].strip() or domain
        search_results, extra_contacts = await asyncio.gather(
            _ddg_search(f'"{biz_name}" {domain} cosmetics beauty wholesale B2B contacto'),
            _scrape_legal_pages(domain),
        )

        logger.info(
            "Beauty assess %s: %d brands, %d portfolio matches, "
            "%d extra emails from legal pages",
            domain, len(detected), len(dist_match),
            len(extra_contacts.get("emails", [])),
        )

        # Our own regex-extracted contacts; used on both the success and the
        # error path, so they are computed once up front.
        contacts = _merge_contact_sources(analysis, extra_contacts)

        payload = {
            "input": {
                "prompt": _build_beauty_prompt(
                    analysis, detected, dist_match, search_results, extra_contacts
                ),
                "images": [], "videos": [],
                "top_p": 0.9,
                "temperature": 0.2,
                "thinking_level": "low",
                "max_output_tokens": 4000,
            }
        }
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(
                    REPLICATE_MODEL,
                    headers={
                        "Authorization": f"Bearer {REPLICATE_TOKEN}",
                        "Content-Type": "application/json",
                        "Prefer": "wait",
                    },
                    json=payload,
                )
                resp.raise_for_status()
                data = resp.json()

            # "output" may be absent or null (e.g. the prediction has not
            # finished); treat that as empty text so the parser returns its
            # parse_error fallback instead of raising.
            output = data.get("output") or ""
            if isinstance(output, list):
                output = "".join(str(part) for part in output)
            elif not isinstance(output, str):
                output = str(output)

            result = _parse_beauty_output(output)
            # Merge pre-scan data that AI might miss
            if not result.get("dist_matches") and dist_match:
                result["dist_matches"] = dist_match
            if not result.get("detected_brands") and detected:
                result["detected_brands"] = detected

            # Merge contact data directly from site_analyzer + legal pages —
            # more reliable than AI extraction since it's regex against raw HTML.
            # The model's lists come first, ours fill the gaps. Its values are
            # coerced so that a null/str/nested entry cannot raise and throw
            # away an otherwise valid assessment.
            ai_contacts = result.get("all_contacts")
            if not isinstance(ai_contacts, dict):
                ai_contacts = {}
            for key, cap in _CONTACT_CAPS.items():
                combined = _coerce_contact_list(ai_contacts.get(key)) + contacts[key]
                ai_contacts[key] = list(dict.fromkeys(combined))[:cap]
            result["all_contacts"] = ai_contacts

            # Fill top-level contact fields from merged data if AI left them blank
            for field, key in _CONTACT_FIELDS:
                if not result.get(field) and contacts[key]:
                    result[field] = contacts[key][0]

            # country_fiscal fallback — always provide a value
            fc = str(result.get("country_fiscal") or "").strip()
            if not fc or fc.lower() in ("unknown", "n/a", "-"):
                result["country_fiscal"] = analysis.get("ip_country") or ""

            logger.info("Beauty AI %s → quality=%s, dist_matches=%s",
                        domain, result.get("lead_quality"), result.get("dist_matches"))
            return result

        except Exception as e:
            logger.error("Beauty AI error %s: %s", domain, e)
            return {
                "error": str(e)[:300],
                "is_relevant": False,
                "lead_quality": "COLD",
                "dist_matches": dist_match,
                "detected_brands": detected,
                "contact_email": contacts["emails"][0] if contacts["emails"] else "",
                "contact_phone": contacts["phones"][0] if contacts["phones"] else "",
                "contact_whatsapp": contacts["whatsapp"][0] if contacts["whatsapp"] else "",
                "contact_social": contacts["social"][0] if contacts["social"] else "",
                "all_contacts": {
                    "emails": contacts["emails"], "phones": contacts["phones"],
                    "whatsapp": contacts["whatsapp"], "social": contacts["social"],
                },
            }
|