# DomGod/app/beauty_ai.py

"""Beauty B2B AI assessment — cosmetics distribution lead qualification.

Pre-scans scraped text for known brands, then sends a focused prompt to Gemini
to evaluate fit as a B2B customer for a cosmetics distribution business.
"""
import asyncio
import json
import logging
import os
import re
from typing import Optional

import httpx
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# SECURITY: the API token must come from the environment only. A real token was
# previously committed here as the fallback default; it should be treated as
# compromised and rotated. With no token configured the value is "" and the
# Replicate request is expected to be rejected (handled by the caller's except).
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
# Number of assessments allowed to run at once. Clamped to >= 1 because a
# semaphore created with 0 permits would block every caller forever.
AI_CONCURRENCY  = max(1, int(os.getenv("AI_CONCURRENCY") or "3"))

# Contact extraction regexes (same patterns as site_analyzer)
# _EMAIL_RE: local-part@domain.tld with a TLD of two or more ASCII letters. It is
# loose enough to match asset names such as "logo@2x.png"; _scrape_legal_pages()
# filters those out by file suffix.
_EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
# _PHONE_RE: optional "+<1-3 digits>" prefix, then nine digits starting with 6-9,
# grouped 3-3-3 with an optional single space or hyphen between groups. There are
# no boundary anchors, so it can also match inside a longer run of digits.
# NOTE(review): the 6-9 leading digit presumably targets Spanish numbers — confirm.
_PHONE_RE = re.compile(r"(?:\+\d{1,3}[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")

# Pages that often contain company registration info (CIF/NIF, registered address,
# legal email) — not fetched by site_analyzer, but rich sources for B2B contact data.
# Each path is appended to "https://<domain>" by _scrape_legal_pages().
_LEGAL_PATHS = [
    "/aviso-legal", "/aviso_legal", "/legal",
    "/politica-de-privacidad", "/politica_privacidad", "/privacidad",
    "/quienes-somos", "/quienes_somos", "/nosotros",
]

# Module-wide semaphore, built on first use by _sem().
_ai_sem: Optional[asyncio.Semaphore] = None


def _sem() -> asyncio.Semaphore:
    """Return the shared semaphore, creating it with AI_CONCURRENCY permits on first call."""
    global _ai_sem
    if _ai_sem is not None:
        return _ai_sem
    _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return _ai_sem


# ── Brand universe (market brands we can detect on client sites) ──────────────

# Scanned by detect_brands_in_text(): matching is case-insensitive, entries of
# five characters or fewer are matched as whole tokens and longer entries as
# plain substrings. The spelling used here is what that function returns.
BEAUTY_BRANDS = [
    "4711","7days","7th Heaven","A-derma","Abercrombie & Fitch","Abril Et Nature",
    "Acqua Di Parma","Actinica","Adidas","Adolfo Dominguez","Aesop","Agatha Ruiz De La Prada",
    "Agave","Agua Lavanda","Ahava","Air-wick","Aire Sevilla","Al Haramain","Albal","Alcantara",
    "Alejandro Sanz","Alfaparf Milano","Algasiv","Alma Secret","Alpecin","Alqvimia","Alterna",
    "Alvarez Gomez","Alyssa Ashley","Ambi Pur","American Crew","Amichi","Ana María Lajusticia",
    "Angel Schlesser","Anian","Annayake","Anne Möller","Anso","Antonio Banderas","Apisérum",
    "Apivita","Aqc Fragrances","Aquilea","Aramis","Ardell","Arganour","Ariel","Armaf",
    "Armand Basi","Artdeco","Artero","As I Am","Aseptine","Atashi","Atrix","Ausonia","Aussie",
    "Australian Gold","Autan","Aveda","Avena Kinesia","Avène","Axe","Axovital","Azalea",
    "Azzaro","Babaria","Babyliss","Barbie","Bare Minerals","Barulab","Batiste","Beaver",
    "Beconfident","Belcils","Bella Aurora","Benefit","Benton","Benzacare","Beter","Biafin",
    "Bio Ionic","Bio-oil","Bioderma","Biolage","Biotherm","Biovène","Biretix","Bobbi Brown",
    "Bouclème","Bourjois","Bperfect Cosmetics","Britney Spears","Bumble & Bumble","Burberry",
    "Bvlgari","Byly","Byphasse","Cacharel","Calvin Klein","Camomila Intea","Cantu","Carefree",
    "Carmex","Carolina Herrera","Carrera","Carthusia","Catrice","Caudalie","Cerave","Cerruti",
    "Cetaphil","Chanel","Chanson D'Eau","Chloé","Chopard","Christina Aguilera","Christophe Robin",
    "Clarins","Clean & Clear","Clinique","Coach","Cocosolis","Colab","Colgate","Collistar",
    "Color Wow","Comfort Zone","Comodynes","Compeed","Cosrx","Creed","Creme Of Nature",
    "Cristalinas","Crossmen","Crusellas","Cryopharma","Cumlaude Lab","Cutex","Cygnetic",
    "Daffoil","Darphin","Davidoff","Declaré","Delfy","Delisea","Denenes","Dentiblanc",
    "Dermalogica","Desensin","Dexeryl","Diadermine","Diesel","Diet Esthetic","Dior","Diptyque",
    "Dodot","Dolce & Gabbana","Donna Karan","Dove","Dr. Hauschka","Dr.jart+","Dr. Organic",
    "Dr. Rimpler","Dr. Tree","Drasanvi","Drunk Elephant","Dsquared2","Ducray","Durex",
    "Elancyl","Elegant Touch","Elemis","Elie Saab","Elizabeth Arden","Elizabeth Taylor",
    "Emilio Pucci","Endocare","Eric Favre","Escada","Essence","Essie","Estée Lauder",
    "Etat Libre D'Orange","Eucerin","Eudermin","Evax","Eve Lom","Eylure","Fa","Fairy","Fanola",
    "Farmatint","Farmavita","Farouk","Figuière","Fisiocrem","Flor De Mayo","Fluocaril","Foreo",
    "Forté Pharma","Foxy","Francis Kurkdjian","Frederic Malle","Frosch","Garnier","Ghd",
    "Gillette","Giorgi Line","Givenchy","Glam Of Sweden","Goldwell","Gosh","Goutal","Gritti",
    "Gucci","Guerlain","Guess By Marciano","Gummy","Hair Rituel By Sisley","Hairgum","Halita",
    "Halloween","Hansaplast","Hask","Hawaiian Tropic","Head & Shoulders","Heliocare",
    "Heno De Pravia","Herbal Essences","Hermès","Hidracel","Hollister","Hugo Boss",
    "I.c.o.n.","Ibizaloe","Iceberg","Idc Institute","Iroha","Isabelle Lancray","Isdin",
    "Issey Miyake","It Cosmetics","Ivybears","Jacadi","Jean Paul Gaultier","Jil Sander",
    "Jimmy Choo","Jo Malone","John Frieda","Johnson's Baby","Joico","Joop","Jordan","Jowaé",
    "Juicy Couture","Juliette Has A Gun","Just For Men","Juvena","Kaloo","Karl Lagerfeld",
    "Karseell","Katai","Kate Spade","Kativa","Kenzo","Kerasilk","Kerastase","Kevin Murphy",
    "Kevyn Aucoin","Kilian","Klorane","L'Anza","L'Occitane","L'Oréal Paris",
    "L'Oréal Professionnel","La Cabine","La Mer","La Prairie","La Roche Posay","La Toja",
    "Laboratoires Filorga","Lacer","Lacoste","Lactacyd","Lactovit","Lalique","Lancaster",
    "Lanvin","Lattafa","Laura Biagiotti","Le Petit Marseillais","Legrain","Lierac","Listerine",
    "Living Proof","Loewe","Lola Cosmetics","Lolita Lempicka","Lussoni","Lutsine E45",
    "M2 Beauté","Mac","Macadamia","Mad Beauty","Maria Nila","Marlies Möller","Martiderm",
    "Martinelia","Marvis","Matrix","Maui","Mavala","Max Factor","Maybelline","Melvita",
    "Mermade","Michael Kors","Milk Shake","Mix & Shout","Mixa","Moroccanoil","Moschino",
    "Mustela","Nabeel","Nanobrow","Nanoil","Nanolash","Narciso Rodriguez","Nars","Natur Vital",
    "Natura Bissé","Natural Honey","Naturalium","Naturtint","Nenuco","Neogen","Neoretin",
    "Neostrata","Neutrogena","Nivea","Nûby","Nuggela & Sulé","Nyx Professional Make Up",
    "Ogx","Olaplex","Olay","Old Spice","Olivia Garden","Opi","Oral-b","Oraldine","Orofluido",
    "Orlane","Oscar De La Renta","Pacha","Paese","Palette","Paloma Picasso","Paltons",
    "Pantene","Paranix","Parfums Saphir","Parlux","Payot","Phyto","Picu Baby","Pilexil",
    "Piz Buin","Plantur 39","Platanomelón","Polaar","Police","Polident","Ponds","Poseidon",
    "Postquam","Proraso","Puig","Purito","Rabanne","Raid","Ralph Lauren","Rated Green",
    "Real Techniques","Redenhair","Redist","Redken","Reebok","Ref","Refectocil","Relec",
    "Remescar","Rene Furterer","Revlon","Revolution Hair Care","Revolution Make Up",
    "Revolution Pro","Rexaline","Rexona","Rilastil","Rimmel London","Roberto Cavalli","Roc",
    "Rochas","Roger & Gallet","Roja Parfums","Rosacure","S3","Sabon","Salerm","Sally Hansen",
    "Salvatore Ferragamo","Sanex","Sarah Jessica Parker","Saryna Key","Satisfyer","Scalpers",
    "Scholl","Schwarzkopf","Scottex","Sebamed","Sebastian Professionals","Seche Vite",
    "Sensai","Sensilis","Sensodyne","Serge Lutens","Serumkind","Sesderma","Seven Cosmetics",
    "Sexy Hair","Shiseido","Shu Uemura","Sisley","Skeyndor","Skin Generics","Sleek",
    "Snp","Soap & Glory","Sol De Janeiro","Solgar","Somatoline Cosmetic","Sophie La Girafe",
    "Soria Natural","Steinhart","Stendhal Paris","Sterimar","Strivectin","Suavinex",
    "Suavipiel","Svr Laboratoire Dermatologique","Syoss","System Professional","Tabac",
    "Taky","Talika","Tampax","Tangle Teezer","Tanit","Teaology","Tena Lady","The Body Shop",
    "The Ordinary","The Wet Brush","Thermacare","Tiffany & Co","Tigi","Timotei",
    "Tiziana Terenzi","Tod's","Tom Ford","Tommy Hilfiger","Topicrem","Torriden","Tot Herba",
    "Tous","Trendy Hair","Tresemme","Trussardi","Tulipán Negro","Urban Decay","Uriage",
    "Usu Cosmetics","Vagisil","Valmont","Valquer","Vanderbilt","Vaseline","Veet","Vichy",
    "Victor","Victoria's Secret","Victorio & Lucchino","Vital Proteins","Vivra",
    "Voltage Cosmetics","Volumax","Waterpik","Waterwipes","Wella","Weleda",
    "Williams","Woodwick","Xerjoff","Xls Medical","Yankee Candle","Yari","Yotuel",
    "Youth Lab","Zadig & Voltaire","Ziaja",
]

# Our distribution portfolio — the brands we sell to B2B clients.
# Used in two places: get_dist_matches() compares these names (lower-cased)
# against the pre-scan result, and _build_beauty_prompt() lists them verbatim
# in the prompt.
# NOTE(review): the pre-scan only looks for names in BEAUTY_BRANDS, so portfolio
# entries with no counterpart there (e.g. "Davines", "Nuxe") can only be reported
# by the AI, never by the pre-scan.
OUR_BRANDS = [
    "AIMX","Al Haramain","Apivita","Armaf","Aveda","Bouclème","Clarena",
    "Curly Girl Movement","Cutrin","Davines","Dr. Hauschka","FanPalm","Farmavita",
    "Flora Curl","GAMMA+","GHD","GOSH","ICON","Image Skincare","Instituto Español",
    "Janeke","Kay Pro","Kerasilk","Kyo","Label M","Lierac","Living Proof","Londa",
    "M2 Beauté","Malibu C","Maria Nila","Medik8","Misslyn","Mustela","Nesti Dante",
    "Nuxe","Obagi","Osmo","Payot","Philip B","Philip Martins","Phyto","Piz Buin",
    "Ramon Monegal","Redken","REF","Saryna Key","Sesderma","Skala Brasil","Skin1004",
    "Strivectin","Swissdent","Topicrem","Uriage","Vita Liberata","Waterclouds",
    "Wella","Youngblood Cosmetics",
]

# Product categories listed in the prompt under "BEAUTY CATEGORIES WE COVER".
BEAUTY_CATEGORIES = [
    "Perfumes","Facial Cosmetics","Makeup","Hair Care","Health","Body Cosmetics",
    "Hygiene","Kids & Babies","Sun Care","Eyewear","Home","Nutrition","Erotic","Fashion",
]


# ── Brand detection (fast pre-scan, no AI) ─────────────────────────────────────

def detect_brands_in_text(text: str) -> list[str]:
    """Find which brands from the universe appear in the scraped page text.

    Short brands (≤5 chars) use word-boundary matching to avoid false positives
    like 'ref' matching 'reference', 'prefer', 'refresh', etc.
    """
    tl = text.lower()
    result = []
    for b in BEAUTY_BRANDS:
        bl = b.lower()
        if len(bl) <= 5:
            if re.search(r'(?<![a-zA-Z0-9])' + re.escape(bl) + r'(?![a-zA-Z0-9])', tl):
                result.append(b)
        else:
            if bl in tl:
                result.append(b)
    return result[:60]


def get_dist_matches(detected: list[str]) -> list[str]:
    """Return which detected brands are in our distribution portfolio."""
    dl = {b.lower() for b in detected}
    return [b for b in OUR_BRANDS if b.lower() in dl]


# ── DuckDuckGo search (contact/company lookup) ────────────────────────────────

async def _ddg_search(query: str) -> str:
    """Search DuckDuckGo's HTML endpoint and return the top hits as plain text.

    Looks at the first four ``.result`` elements; each one that has a snippet
    becomes a line of the form ``[url] title — snippet``. Best-effort: a
    non-200 response or any exception returns an empty string.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (compatible; BeautyLeads/1.0)"}
    try:
        async with httpx.AsyncClient(
            timeout=10, follow_redirects=True, headers=request_headers,
        ) as client:
            response = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},
            )
            if response.status_code != 200:
                return ""

            lines = []
            hits = BeautifulSoup(response.text, "html.parser").select(".result")[:4]
            for hit in hits:
                snippet_node = hit.select_one(".result__snippet")
                if not snippet_node:
                    continue  # results without a snippet are skipped
                title_node = hit.select_one(".result__a")
                url_node = hit.select_one(".result__url")
                title_text = title_node.get_text(strip=True) if title_node else ""
                url_text = url_node.get_text(strip=True) if url_node else ""
                lines.append(
                    f"[{url_text}] {title_text} — {snippet_node.get_text(strip=True)}"
                )
            return "\n".join(lines)
    except Exception as exc:
        logger.debug("DDG search failed: %s", exc)
        return ""


# ── Legal / about page scraper ────────────────────────────────────────────────

async def _scrape_legal_pages(domain: str) -> dict:
    """Fetch legal and about pages not covered by site_analyzer.

    Spanish Aviso Legal pages legally must contain: company name (razón social),
    CIF/NIF, registered address, and a contact email — making them the richest
    source of verified B2B contact data.

    Returns:
        emails:        all unique emails found across all pages
        phones:        all unique phones found across all pages
        legal_snippet: first 800 chars of the aviso legal page (company registration
                       info: razón social, CIF, domicilio, etc.)
    """
    result: dict = {"emails": [], "phones": [], "legal_snippet": ""}

    async def _fetch(path: str) -> tuple[str, str | None]:
        try:
            async with httpx.AsyncClient(
                timeout=8, follow_redirects=True, verify=False,
                headers={"User-Agent": "Mozilla/5.0"},
            ) as c:
                r = await c.get(f"https://{domain}{path}")
                if r.status_code == 200:
                    return path, r.text
        except Exception:
            pass
        return path, None

    pages = await asyncio.gather(*[_fetch(p) for p in _LEGAL_PATHS])

    for path, html in pages:
        if not html:
            continue
        try:
            soup = BeautifulSoup(html, "html.parser")
            # Extract from anchor tags
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if href.startswith("mailto:"):
                    em = href[7:].split("?")[0].strip().lower()
                    if em and em not in result["emails"]:
                        result["emails"].append(em)
                elif href.startswith("tel:"):
                    ph = re.sub(r"[^\d+]", "", href[4:])
                    if ph and ph not in result["phones"]:
                        result["phones"].append(ph)
            # Regex scan full HTML for emails
            for em in _EMAIL_RE.findall(html[:60000]):
                em = em.lower()
                if em not in result["emails"] and not any(
                    em.endswith(x) for x in (".png", ".jpg", ".css", ".js", ".svg")
                ):
                    result["emails"].append(em)
            # Regex scan visible text for phones
            visible = soup.get_text(separator=" ", strip=True)
            for ph in _PHONE_RE.findall(visible):
                ph_c = re.sub(r"[\s\-]", "", ph)
                if ph_c and ph_c not in result["phones"]:
                    result["phones"].append(ph_c)
            # Capture legal snippet from the first legal page that resolves
            if not result["legal_snippet"] and any(
                k in path for k in ("aviso", "legal", "privacidad")
            ):
                result["legal_snippet"] = " ".join(visible.split()[:150])
        except Exception:
            pass

    result["emails"] = list(dict.fromkeys(result["emails"]))[:8]
    result["phones"] = list(dict.fromkeys(result["phones"]))[:6]
    return result


# ── Prompt builder ─────────────────────────────────────────────────────────────

def _build_beauty_prompt(a: dict, detected_brands: list, dist_matches: list,
                         search_results: str = "",
                         extra_contacts: dict | None = None) -> str:
    """Build the Gemini assessment prompt.

    extra_contacts comes from _scrape_legal_pages() and adds emails/phones/company
    info found in the aviso legal, privacy policy, and about pages.

    Args:
        a: Site analysis dict. Read via ``.get`` for profile fields (domain,
            ip_country, page_title, ...), technical signals (ssl_valid,
            load_time_ms, ...), contact lists (emails, phones, whatsapp,
            social_links) and ``visible_text_snippet``; missing keys fall back
            to placeholders such as "unknown" or "—".
        detected_brands: Brands found by the pre-scan; shown as a comma list.
        dist_matches: Portfolio brands among the detected ones.
        search_results: Web search text; only the first 700 characters are used.
        extra_contacts: Optional dict with "emails", "phones", "legal_snippet".

    Returns:
        The full prompt text, ending with the JSON schema the model must fill in.
        The schema's "all_contacts" lists are pre-filled with the merged contacts.
    """
    ec = extra_contacts or {}

    # Merge contact sources: site_analyzer (main page + contact pages) + legal pages
    # dict.fromkeys de-duplicates while keeping first-seen order; each list is capped.
    all_emails   = list(dict.fromkeys((a.get("emails") or []) + (ec.get("emails") or [])))[:8]
    all_phones   = list(dict.fromkeys((a.get("phones") or []) + (ec.get("phones") or [])))[:6]
    all_whatsapp = list(dict.fromkeys(a.get("whatsapp") or []))[:4]
    all_social   = list(dict.fromkeys(a.get("social_links") or []))[:6]

    def _fmt(lst: list) -> str:
        # Comma-joined list, or an em dash when there is nothing to show.
        return ", ".join(lst) if lst else "—"

    # Site technical signals
    ssl_info    = ("✓ valid" if a.get("ssl_valid") else "✗ invalid/missing")
    analytics   = ", ".join(a.get("analytics_present") or []) or "none detected"
    word_count  = a.get("word_count", 0)
    load_ms     = a.get("load_time_ms", 0)
    # Shown as "Last updated": copyright year if present, else last_modified.
    copyright   = a.get("copyright_year") or a.get("last_modified") or "unknown"

    # Free-text inputs are truncated to keep the prompt size bounded.
    snippet       = (a.get("visible_text_snippet") or "")[:1600]
    legal_snippet = (ec.get("legal_snippet") or "")[:800]
    detected_str  = ", ".join(detected_brands) if detected_brands else "none detected"
    dist_str      = ", ".join(dist_matches)     if dist_matches    else "none"

    # In the f-string below, "{{" and "}}" are literal braces of the JSON schema.
    return f"""You are a senior B2B sales analyst for a cosmetics distribution company
operating across Europe. Your task: thoroughly evaluate this website as a potential
wholesale B2B customer and produce a complete outreach dossier.

=== BUSINESS PROFILE ===
Domain:          {a.get("domain")}
Country (IP):    {a.get("ip_country") or "unknown"}
Region:          {a.get("ip_region") or "unknown"}
Hosting (EU?):   {a.get("eu_hosted")}  |  ISP/Org: {a.get("org") or a.get("isp") or "unknown"}
Page title:      {a.get("page_title") or "—"}
H1:              {a.get("h1_text") or "—"}
Meta desc:       {(a.get("meta_description") or "—")[:200]}
CMS:             {a.get("cms") or "unknown"}
Last updated:    {copyright}

=== TECHNICAL SIGNALS ===
SSL:             {ssl_info}
Load time:       {load_ms}ms
Word count:      {word_count}
Analytics:       {analytics}
Mobile:          {"yes" if a.get("has_mobile_viewport") else "no"}
Sitemap/Robots:  sitemap={"yes" if a.get("has_sitemap") else "no"}, robots={"yes" if a.get("has_robots") else "no"}
GDPR/Privacy:    cookie_tool={a.get("cookie_tool") or "none"}, privacy_policy={"yes" if a.get("has_privacy_policy") else "no"}

=== ALL CONTACT CHANNELS ===
Emails:          {_fmt(all_emails)}
Phones:          {_fmt(all_phones)}
WhatsApp:        {_fmt(all_whatsapp)}
Social media:    {_fmt(all_social)}

=== LEGAL / COMPANY REGISTRATION INFO ===
(extracted from aviso legal / política de privacidad — may contain razón social, CIF, address)
{legal_snippet or "Not found or page not accessible"}

=== PAGE CONTENT SAMPLE ===
{snippet}

=== BRANDS DETECTED ON SITE ===
{detected_str}

=== OUR PORTFOLIO BRANDS FOUND ON THEIR SITE ===
(brands we distribute that appear on their site — confirms shared market)
{dist_str}

=== WEB SEARCH RESULTS ===
{(search_results or "No results available.")[:700]}

=== OUR FULL DISTRIBUTION PORTFOLIO ===
{', '.join(OUR_BRANDS)}

=== BEAUTY CATEGORIES WE COVER ===
{', '.join(BEAUTY_CATEGORIES)}

=== ASSESSMENT RULES ===
1. TARGET PROFILE: We are looking for businesses that BUY BEAUTY PRODUCTS WHOLESALE to
   resell: retailers, pharmacies, parafarmacias, perfumerías, multi-brand beauty ecommerce,
   salon chains, supermarkets with beauty sections, beauty distributors — anywhere in Europe.

2. Identify ALL beauty brands anywhere on the page (body text, alt text, category names,
   product listings, brand pages). Go beyond the pre-detected list already provided above.

3. LEAD QUALITY — rate on BUSINESS TYPE first, portfolio overlap second:
   - HOT:  Business type is clearly a multi-brand beauty reseller with professional/wholesale
           activity AND at least one of: ≥2 portfolio brands detected, evident professional
           lines, large catalogue (pharmacies, parafarmacia chains, pro salon distributors).
           Also HOT: any large-scale EU beauty retailer even without portfolio brand matches.
   - WARM: ANY genuine multi-brand beauty retailer or ecommerce that could buy wholesale —
           even if ZERO portfolio brands are currently detected. They are our TARGET MARKET:
           we want to introduce our brands to them. Pharmacies, perfumerías, beauty shops,
           multi-brand online stores → default WARM unless there is a clear disqualifier.
           When uncertain between WARM and COLD: choose WARM.
   - COLD: ONLY if clearly disqualified: single-brand D2C (sells only their own brand),
           beauty salon that doesn't sell products to end-consumers, personal influencer /
           blog, OR no evidence this is a purchasing business at all.
   - NOT_RELEVANT: No beauty/cosmetics connection, or clearly non-European.

   ⚠ CRITICAL: Portfolio brand absence NEVER alone justifies COLD. Our job is to introduce
   our brands to retailers who don't carry them yet. Rate on whether they COULD buy wholesale.

4. country_fiscal: use aviso legal if found; otherwise use the IP country shown above.
   NEVER leave country_fiscal empty — always provide a 2-letter ISO code.

5. Extract the BEST contact for outreach — check all data above:
   - Prefer commercial emails (info@, ventas@, compras@, pedidos@) over generic/personal
   - WhatsApp is often the fastest channel in Spain; flag it if present
   - Set best_contact_channel and best_contact_value explicitly

6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in SPANISH.

7. outreach_email must be a complete ready-to-send Spanish email: greeting + 3-4 sentences
   referencing their specific range + 1-2 of our portfolio brands that match + clear CTA
   (catálogo, muestra gratuita, llamada, primer pedido mínimo). No placeholders.

Respond ONLY with valid JSON, no markdown fences, no text outside the JSON object:
{{
  "is_relevant": true,
  "lead_quality": "HOT|WARM|COLD|NOT_RELEVANT",
  "summary": "2-3 sentence executive summary: what this business does, their product range, who their customers are, and their apparent scale",
  "lead_reasoning": "2-3 sentences explaining the lead quality rating — reference specific brands found, categories covered, and portfolio overlap",
  "business_type": "retailer|ecommerce|distributor|pharmacy|parafarmacia|salon_chain|perfumeria|other",
  "business_name": "official business name from title, H1, or aviso legal",
  "country_fiscal": "2-letter ISO",
  "countries_active": ["ES"],
  "categories": ["Hair Care","Makeup"],
  "detected_brands": ["all beauty brands found on site — be thorough"],
  "dist_matches": ["our portfolio brands found on their site"],
  "partnership_signals": ["carries multi-brand","has wholesale section","stockist page","B2B portal"],
  "pitch_angle": "1 punchy sentence in Spanish: the specific angle for this business (reference their range, a gap you fill, or the portfolio brands that match)",
  "b2b_proposal": "2-3 sentence value proposition in Spanish: what we offer, why it fits their range, what differentiates our brands",
  "outreach_subject": "specific Spanish subject line mentioning their business name and 1 relevant brand",
  "outreach_email": "complete ready-to-send Spanish email: greeting + 3-4 body sentences referencing their specific product range and 1-2 portfolio brands that match + clear CTA (catálogo, muestra, llamada, pedido mínimo) + valediction. Do not use placeholders.",
  "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
  "best_contact_value": "the actual email/phone/URL to use — prefer commercial emails, then phone, then social",
  "all_contacts": {{
    "emails": {json.dumps(all_emails)},
    "phones": {json.dumps(all_phones)},
    "whatsapp": {json.dumps(all_whatsapp)},
    "social": {json.dumps(all_social)}
  }},
  "revenue_estimate": "unknown|<100k€|100k-500k€|500k-2M€|>2M€",
  "outreach_notes": "2-3 sentences for the sales rep: timing, approach, red flags, CIF if found, any urgency signals"
}}"""


def _parse_beauty_output(raw: str) -> dict:
    text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
    m = re.search(r"\{[\s\S]+\}", text)
    if m:
        candidate = m.group(0)
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            depth_obj = candidate.count("{") - candidate.count("}")
            depth_arr = candidate.count("[") - candidate.count("]")
            fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', candidate)
            fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj)
            try:
                return json.loads(fixed)
            except json.JSONDecodeError:
                pass
    logger.warning("Beauty AI parse failed, raw: %.300s", raw)
    return {
        "is_relevant": False,
        "lead_quality": "COLD",
        "business_name": "",
        "contact_email": "",
        "dist_matches": [],
        "parse_error": True,
    }


# ── Main entry point ───────────────────────────────────────────────────────────

async def assess_beauty_domain(analysis: dict) -> dict:
    """Full beauty B2B assessment: brand scan + AI evaluation.

    Runs under the shared concurrency semaphore. Steps: pre-scan the text
    snippet for known brands, run the web search and legal-page scrape in
    parallel, send the prompt to the Replicate model, parse the reply, then
    back-fill brands, contacts and ``country_fiscal`` from our own data.

    Args:
        analysis: Site analysis dict (domain, page_title, visible_text_snippet,
            emails, phones, whatsapp, social_links, ip_country, ...).

    Returns:
        The AI result dict enriched with our contacts. If the request or
        post-processing fails, a fallback dict with an ``error`` message,
        ``lead_quality`` "COLD" and the contacts/brands we extracted ourselves.
        This function does not raise for AI/network failures.
    """
    async with _sem():
        domain = analysis.get("domain", "")
        # Only the snippet is scanned here, not the full page HTML.
        text = analysis.get("visible_text_snippet", "") or ""

        detected   = detect_brands_in_text(text)
        dist_match = get_dist_matches(detected)

        # Run DDG search and legal page scraping in parallel
        title     = analysis.get("page_title") or ""
        biz_name  = title.split("|")[0].split("-")[0].strip() or domain
        search_results, extra_contacts = await asyncio.gather(
            _ddg_search(f'"{biz_name}" {domain} cosmetics beauty wholesale B2B contacto'),
            _scrape_legal_pages(domain),
        )

        logger.info(
            "Beauty assess %s: %d brands, %d portfolio matches, "
            "%d extra emails from legal pages",
            domain, len(detected), len(dist_match),
            len(extra_contacts.get("emails", [])),
        )

        # Contacts from site_analyzer + legal pages: regex against raw HTML, so
        # more reliable than whatever the AI echoes back.
        own = _own_contacts(analysis, extra_contacts)

        payload = {
            "input": {
                "prompt": _build_beauty_prompt(
                    analysis, detected, dist_match, search_results, extra_contacts
                ),
                "images": [], "videos": [],
                "top_p": 0.9,
                "temperature": 0.2,
                "thinking_level": "low",
                "max_output_tokens": 4000,
            }
        }
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(
                    REPLICATE_MODEL,
                    headers={
                        "Authorization": f"Bearer {REPLICATE_TOKEN}",
                        "Content-Type":  "application/json",
                        "Prefer":        "wait",
                    },
                    json=payload,
                )
                resp.raise_for_status()
                data = resp.json()

            output = data.get("output", "")
            if isinstance(output, list):
                output = "".join(output)
            if not isinstance(output, str):
                # e.g. "output": null when the prediction did not finish or failed
                # within the synchronous wait; report that instead of a TypeError.
                raise RuntimeError(
                    f"Replicate returned no text output "
                    f"(status={data.get('status')!r}, error={data.get('error')!r})"
                )

            result = _parse_beauty_output(output)
            # Merge pre-scan data that AI might miss
            if not result.get("dist_matches") and dist_match:
                result["dist_matches"] = dist_match
            if not result.get("detected_brands") and detected:
                result["detected_brands"] = detected

            # all_contacts: keep what the AI returned (sanitised — the reply is
            # untrusted and may hold null/strings instead of lists) and append
            # our own data, de-duplicated and capped per channel.
            ai_contacts = result.get("all_contacts")
            merged = dict(ai_contacts) if isinstance(ai_contacts, dict) else {}
            for key, limit in _CONTACT_LIMITS.items():
                merged[key] = list(dict.fromkeys(
                    _str_list(merged.get(key)) + own[key]))[:limit]
            result["all_contacts"] = merged

            # Fill top-level contact fields from our data if AI left them blank
            for field, key in _TOP_LEVEL_CONTACT_FIELDS:
                if not result.get(field) and own[key]:
                    result[field] = own[key][0]

            # country_fiscal fallback — always provide a value
            fc = result.get("country_fiscal")
            fc = fc.strip() if isinstance(fc, str) else ""
            if not fc or fc.lower() in ("unknown", "n/a", "-"):
                result["country_fiscal"] = analysis.get("ip_country") or ""

            logger.info("Beauty AI %s → quality=%s, dist_matches=%s",
                        domain, result.get("lead_quality"), result.get("dist_matches"))
            return result

        except Exception as e:
            logger.error("Beauty AI error %s: %s", domain, e)
            fallback = {
                "error": str(e)[:300],
                "is_relevant": False,
                "lead_quality": "COLD",
                "dist_matches": dist_match,
                "detected_brands": detected,
            }
            for field, key in _TOP_LEVEL_CONTACT_FIELDS:
                fallback[field] = own[key][0] if own[key] else ""
            fallback["all_contacts"] = own
            return fallback


# Maximum number of entries kept per contact channel.
_CONTACT_LIMITS = {"emails": 8, "phones": 6, "whatsapp": 4, "social": 6}

# (top-level result field, contact channel it is filled from)
_TOP_LEVEL_CONTACT_FIELDS = (
    ("contact_email", "emails"),
    ("contact_phone", "phones"),
    ("contact_whatsapp", "whatsapp"),
    ("contact_social", "social"),
)


def _str_list(value) -> list:
    """Coerce an AI-supplied value to a list of non-empty strings."""
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, (list, tuple)):
        return []
    return [v for v in value if isinstance(v, str) and v]


def _own_contacts(analysis: dict, extra_contacts: dict) -> dict:
    """Merge contacts from the site analysis and the legal pages, per channel."""
    sources = {
        "emails": (analysis.get("emails") or []) + (extra_contacts.get("emails") or []),
        "phones": (analysis.get("phones") or []) + (extra_contacts.get("phones") or []),
        "whatsapp": analysis.get("whatsapp") or [],
        "social": analysis.get("social_links") or [],
    }
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return {
        key: list(dict.fromkeys(sources[key]))[:limit]
        for key, limit in _CONTACT_LIMITS.items()
    }