From dfd47743e35d357dcfe51683b774b06da914d810 Mon Sep 17 00:00:00 2001 From: Malin Date: Wed, 13 May 2026 10:37:36 +0200 Subject: [PATCH] fix: broader WhatsApp/social detection, generous assessment rules, overlay popup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - site_analyzer: scan onclick/data-href/data-url/data-link/data-action attrs on ALL tags for WhatsApp (wa.me, api.whatsapp, web.whatsapp, wa.link), tel: links, and social media URLs; raise dedup cap 5→8 - beauty_ai: rewrite lead quality rules — WARM for any genuine multi-brand retailer even with zero portfolio matches; portfolio absence NEVER justifies COLD alone; added country_fiscal fallback to ip_country - index.html: assessPopup overlay modal on quality badge click in Browse tab; showAssessPopup() parses beauty_assessment JSON with all_contacts fallback; [x-cloak] CSS to prevent flash Co-Authored-By: Claude Sonnet 4.6 --- app/beauty_ai.py | 61 ++++++++++----- app/site_analyzer.py | 51 +++++++++++-- app/static/beauty/index.html | 140 ++++++++++++++++++++++++++++++++++- 3 files changed, 227 insertions(+), 25 deletions(-) diff --git a/app/beauty_ai.py b/app/beauty_ai.py index 9c1839e..74f0f96 100644 --- a/app/beauty_ai.py +++ b/app/beauty_ai.py @@ -355,24 +355,44 @@ Social media: {_fmt(all_social)} {', '.join(BEAUTY_CATEGORIES)} === ASSESSMENT RULES === -1. TARGET PROFILE: retailer, pharmacy, parafarmacia, perfumería, multi-brand beauty - ecommerce, salon chain, beauty distributor, or supermarket beauty section in Europe. -2. Identify ALL beauty brands mentioned anywhere on the page — go beyond the pre-detected - list above. Use product names, brand references in body text, alt text, etc. -3. Match brands against our portfolio. Lead quality is driven by portfolio overlap: - - HOT: 3+ portfolio brands detected, OR major EU beauty retailer clearly in our niche - - WARM: 1-2 portfolio brand matches, OR clear beauty multi-brand retailer with good reach - - COLD: beauty-adjacent but weak portfolio overlap, OR single-brand, OR unclear wholesale - - NOT_RELEVANT: not a beauty business, not in Europe, or clearly a consumer-only brand -4. Extract the BEST contact for outreach: - - Prefer business/commercial emails (info@, ventas@, compras@, admin@) over personal - - If WhatsApp exists, flag it — it's often the fastest channel in Spain/LatAm - - Check social media for direct messaging channels -5. Use the legal/company info to identify the official business name (razón social), - and if a CIF/NIF is visible, mention it in outreach_notes as it confirms legitimacy. -6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in Spanish. -7. The outreach_email must be a complete ready-to-send email: greeting, 2-3 body sentences - (reference their specific range, 1-2 matching portfolio brands, add value), clear CTA. +1. TARGET PROFILE: We are looking for businesses that BUY BEAUTY PRODUCTS WHOLESALE to + resell: retailers, pharmacies, parafarmacias, perfumerías, multi-brand beauty ecommerce, + salon chains, supermarkets with beauty sections, beauty distributors — anywhere in Europe. + +2. Identify ALL beauty brands anywhere on the page (body text, alt text, category names, + product listings, brand pages). Go beyond the pre-detected list already provided above. + +3. LEAD QUALITY — rate on BUSINESS TYPE first, portfolio overlap second: + - HOT: Business type is clearly a multi-brand beauty reseller with professional/wholesale + activity AND at least one of: ≥2 portfolio brands detected, evident professional + lines, large catalogue (pharmacies, parafarmacia chains, pro salon distributors). + Also HOT: any large-scale EU beauty retailer even without portfolio brand matches. + - WARM: ANY genuine multi-brand beauty retailer or ecommerce that could buy wholesale — + even if ZERO portfolio brands are currently detected. They are our TARGET MARKET: + we want to introduce our brands to them. Pharmacies, perfumerías, beauty shops, + multi-brand online stores → default WARM unless there is a clear disqualifier. + When uncertain between WARM and COLD: choose WARM. + - COLD: ONLY if clearly disqualified: single-brand D2C (sells only their own brand), + beauty salon that doesn't sell products to end-consumers, personal influencer / + blog, OR no evidence this is a purchasing business at all. + - NOT_RELEVANT: No beauty/cosmetics connection, or clearly non-European. + + ⚠ CRITICAL: Portfolio brand absence NEVER alone justifies COLD. Our job is to introduce + our brands to retailers who don't carry them yet. Rate on whether they COULD buy wholesale. + +4. country_fiscal: use aviso legal if found; otherwise use the IP country shown above. + NEVER leave country_fiscal empty — always provide a 2-letter ISO code. + +5. Extract the BEST contact for outreach — check all data above: + - Prefer commercial emails (info@, ventas@, compras@, pedidos@) over generic/personal + - WhatsApp is often the fastest channel in Spain; flag it if present + - Set best_contact_channel and best_contact_value explicitly + +6. Write summary, pitch_angle, b2b_proposal, outreach_subject, and outreach_email in SPANISH. + +7. outreach_email must be a complete ready-to-send Spanish email: greeting + 3-4 sentences + referencing their specific range + 1-2 of our portfolio brands that match + clear CTA + (catálogo, muestra gratuita, llamada, primer pedido mínimo). No placeholders. Respond ONLY with valid JSON, no markdown fences, no text outside the JSON object: {{ @@ -536,6 +556,11 @@ async def assess_beauty_domain(analysis: dict) -> dict: if not result.get("contact_social") and all_social: result["contact_social"] = all_social[0] + # country_fiscal fallback — always provide a value + fc = (result.get("country_fiscal") or "").strip() + if not fc or fc.lower() in ("unknown", "n/a", "-"): + result["country_fiscal"] = analysis.get("ip_country") or "" + logger.info("Beauty AI %s → quality=%s, dist_matches=%s", domain, result.get("lead_quality"), result.get("dist_matches")) return result diff --git a/app/site_analyzer.py b/app/site_analyzer.py index 74f9c52..60e53d2 100644 --- a/app/site_analyzer.py +++ b/app/site_analyzer.py @@ -316,8 +316,22 @@ async def _analyze_site_inner(domain: str) -> dict: result["has_gmb"] = any(sig.lower() in hl for sig in GMB_SCHEMA_SIGNALS) # ── Contacts ────────────────────────────────────────────────────────── - for a in soup.find_all("a", href=True): - href = a["href"] + # Pattern for WhatsApp links that appear inside onclick/data-* attrs + _WA_ATTR_RE = re.compile( + r'(https?://(?:wa\.me|api\.whatsapp\.com/send|web\.whatsapp\.com/send' + r'|wa\.link)[^\s\'"\\>]{0,80})', + re.I, + ) + + def _add_whatsapp(raw: str): + m = _WA_ATTR_RE.search(raw) + url = m.group(1) if m else raw[:80] + url = url.rstrip("'\"\\)") + if url and url not in result["whatsapp"]: + result["whatsapp"].append(url) + + for tag in soup.find_all("a", href=True): + href = tag["href"] if href.startswith("mailto:"): em = href[7:].split("?")[0].strip().lower() if em and em not in result["emails"]: @@ -326,9 +340,8 @@ async def _analyze_site_inner(domain: str) -> dict: ph = re.sub(r"[^\d+]", "", href[4:]) if ph and ph not in result["phones"]: result["phones"].append(ph) - elif "wa.me" in href or "api.whatsapp.com" in href: - if href not in result["whatsapp"]: - result["whatsapp"].append(href[:80]) + elif any(x in href for x in ("wa.me", "api.whatsapp", "wa.link", "web.whatsapp")): + _add_whatsapp(href) else: for sd in SOCIAL_DOM: if sd in href.lower(): @@ -336,6 +349,32 @@ async def _analyze_site_inner(domain: str) -> dict: if clean not in result["social_links"]: result["social_links"].append(clean) break + + # Broader scan: WhatsApp / tel links hidden in onclick, data-href, data-url, etc. + for tag in soup.find_all(True): + for attr in ("onclick", "data-href", "data-url", "data-link", "data-action"): + val = tag.get(attr) or "" + if not val: + continue + # WhatsApp in attribute value + if any(x in val for x in ("wa.me", "api.whatsapp", "wa.link", "web.whatsapp")): + _add_whatsapp(val) + # tel: in attribute value + m_tel = re.search(r"tel:([\d\s\+\-\(\)]{6,20})", val) + if m_tel: + ph = re.sub(r"[^\d+]", "", m_tel.group(1)) + if ph and ph not in result["phones"]: + result["phones"].append(ph) + # Social media links in attribute value + for sd in SOCIAL_DOM: + if sd in val.lower(): + url_m = re.search(r"https?://[^\s'\"\\)]{10,120}", val) + if url_m: + clean = url_m.group(0).split("?")[0].rstrip("/") + if clean not in result["social_links"]: + result["social_links"].append(clean) + break + for em in EMAIL_RE.findall(html[:80000]): em = em.lower() if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]): @@ -345,7 +384,7 @@ async def _analyze_site_inner(domain: str) -> dict: if ph_c not in result["phones"]: result["phones"].append(ph_c) for k in ["emails", "phones", "whatsapp", "social_links"]: - result[k] = list(dict.fromkeys(result[k]))[:5] + result[k] = list(dict.fromkeys(result[k]))[:8] # ── CMS ─────────────────────────────────────────────────────────────── CMS_SIGS = { diff --git a/app/static/beauty/index.html b/app/static/beauty/index.html index ba0c541..a29935b 100644 --- a/app/static/beauty/index.html +++ b/app/static/beauty/index.html @@ -88,6 +88,7 @@ tr:hover td{background:rgba(232,121,160,.04)} input[type=checkbox]{width:14px;height:14px;accent-color:var(--accent);cursor:pointer} textarea{width:100%;resize:vertical;font-family:monospace;font-size:12px} .section-pad{padding:0 24px} +[x-cloak]{display:none!important} @@ -225,7 +226,10 @@ textarea{width:100%;resize:vertical;font-family:monospace;font-size:12px} - + @@ -489,6 +493,120 @@ textarea{width:100%;resize:vertical;font-family:monospace;font-size:12px} + +
+
+ + + +
+
+