diff --git a/app/db.py b/app/db.py index 0b012b3..5d53a9d 100644 --- a/app/db.py +++ b/app/db.py @@ -378,7 +378,9 @@ async def save_ai_assessment(domain: str, assessment: dict, site_analysis: dict domain, ), ) - # Also update contact_info + kit_digital from site_analysis if available + # Update contact_info + kit_digital from site_analysis if available. + # Gemini's kit_digital_confirmed is the authoritative verdict — it can + # override a false-positive from the heuristic scanner. if site_analysis: contacts = { "emails": site_analysis.get("emails", []), @@ -386,12 +388,15 @@ async def save_ai_assessment(domain: str, assessment: dict, site_analysis: dict "whatsapp": site_analysis.get("whatsapp", []), "social": site_analysis.get("social_links", []), } + # Prefer Gemini's explicit verdict; fall back to heuristic if null + ai_kit = assessment.get("kit_digital_confirmed") + kit_val = int(ai_kit) if ai_kit is not None else int(site_analysis.get("kit_digital", False)) await db.execute( """UPDATE enriched_domains SET kit_digital=?, kit_digital_signals=?, contact_info=? WHERE domain=?""", ( - int(site_analysis.get("kit_digital", False)), + kit_val, _json.dumps(site_analysis.get("kit_digital_signals", [])), _json.dumps(contacts), domain, diff --git a/app/enricher.py b/app/enricher.py index 5ce4c85..4d9c293 100644 --- a/app/enricher.py +++ b/app/enricher.py @@ -62,44 +62,34 @@ def detect_cms(html: str, headers: dict) -> Optional[str]: # ── Kit Digital detection ──────────────────────────────────────────────────── -KIT_IMG_PATS = [ - "digitalizadores", "kit-digital", "kitdigital", "kit_digital", - "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation", - "prtr", "plan-recuperacion", "planderecuperacion", - "acelerapyme", "logo-ue", "recovery-eu", "cofinanciado", -] -KIT_TEXT_PATS = [ - "kit digital", "agente digitalizador", "agentes digitalizadores", - "fondos europeos", "next generation eu", "nextgenerationeu", - "plan de recuperación", "plan de recuperacion", - "plan de digitalización", "digitalización pymes", - "prtr", "financiado por la unión europea", - "red.es/kit-digital", "acelerapyme.es", -] -KIT_LINK_PATS = ["acelerapyme", "red.es", "kit-digital", "kitdigital"] +KIT_STRONG_IMG = ["kit-digital", "kitdigital", "kit_digital", "agente-digitalizador", "agente_digitalizador"] +KIT_STRONG_TEXT = ["kit digital", "agente digitalizador", "agentes digitalizadores"] +KIT_STRONG_LINK = ["acelerapyme.es", "red.es/kit-digital", "kit-digital.red.es"] def detect_kit_digital(soup, html: str) -> tuple[bool, list]: signals = [] - hl = html.lower() + vl = soup.get_text().lower() for img in soup.find_all("img"): combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower() - for p in KIT_IMG_PATS: + for p in KIT_STRONG_IMG: if p in combined: signals.append(f"img:{p}") break - for p in KIT_TEXT_PATS: - if p in hl: + for p in KIT_STRONG_TEXT: + if p in vl: signals.append(f"text:{p}") for a in soup.find_all("a", href=True): href = a["href"].lower() - if any(p in href for p in KIT_LINK_PATS): - signals.append(f"link:{href[:60]}") + for p in KIT_STRONG_LINK: + if p in href: + signals.append(f"link:{href[:60]}") + break - signals = list(dict.fromkeys(signals))[:15] + signals = list(dict.fromkeys(signals))[:10] return len(signals) > 0, signals diff --git a/app/replicate_ai.py b/app/replicate_ai.py index aaa6e8e..5428f98 100644 --- a/app/replicate_ai.py +++ b/app/replicate_ai.py @@ -38,6 +38,8 @@ def _build_prompt(a: dict) -> str: lorem_str = ", ".join(a.get("lorem_matches") or []) or "none" ph_str = ", ".join(a.get("placeholder_matches") or []) or "none" snippet = (a.get("visible_text_snippet") or "")[:2000] + social_str = ", ".join(a.get("social_links") or []) or "none detected" + gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected" eu_hosted = a.get("eu_hosted") hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown") @@ -86,10 +88,16 @@ Inputs without labels: {a.get("inputs_without_labels")} Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str} Placeholder: {a.get("has_placeholder")} → {ph_str} -=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) === -Detected: {a.get("kit_digital")} +=== KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) === +Heuristic detected: {a.get("kit_digital")} {kd_str} +=== GOOGLE MY BUSINESS === +GMB/Business Profile: {gmb_str} + +=== SOCIAL MEDIA === +Profiles found on site: {social_str} + === CONTACT CHANNELS === {contacts_str} @@ -99,7 +107,8 @@ Detected: {a.get("kit_digital")} === INSTRUCTIONS === The client sells: web redesign, SEO, hosting migration, SSL renewal, security audits, GDPR compliance, accessibility fixes, Google Ads, -maintenance contracts, AI tools for SMEs. +maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation, +social media management (Instagram, Facebook, LinkedIn, TikTok). Respond ONLY with valid JSON, no markdown fences, no text outside the JSON: {{ @@ -112,6 +121,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON: "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps", "accessibility_issues": ["specific a11y problems found"], "kit_digital_confirmed": true/false, + "has_gmb": true/false, + "has_social_media": true/false, "kit_digital_reasoning": "1 sentence", "is_local_sme": true/false, "lead_quality": "HOT|WARM|COLD", diff --git a/app/scorer.py b/app/scorer.py index 92f591d..4b429c2 100644 --- a/app/scorer.py +++ b/app/scorer.py @@ -1,15 +1,16 @@ import os +import json import aiosqlite from app.db import SQLITE_PATH KNOWN_CMS = {"wordpress", "joomla", "drupal", "wix", "squarespace", "shopify", "prestashop", "magento", "typo3", "opencart"} -TARGET_COUNTRIES = set(os.getenv("TARGET_COUNTRIES", "ES,GB,DE,FR").split(",")) +TARGET_COUNTRIES = set(os.getenv("TARGET_COUNTRIES", "ES,GB,DE,FR,RO,PT,AD,IT").split(",")) LOCAL_BIZ_KEYWORDS = { "restaurant", "cafe", "shop", "store", "salon", "plumber", "electrician", "dentist", "clinic", "garage", "hotel", "bakery", "bar", "gym", "spa", - "fontanero", "electricista", "dentista", "clínica", "taller", "hotel", - "panadería", "peluquería", "tienda", + "fontanero", "electricista", "dentista", "clínica", "taller", + "panadería", "peluquería", "tienda", "abogado", "gestor", "inmobili", } @@ -21,29 +22,53 @@ def local_biz_keywords(title: str | None) -> bool: def score(domain_row: dict) -> int: - s = 0 - if domain_row.get("is_live"): - s += 20 + # Dead sites are unreachable — cap at 5 regardless of other signals + if not domain_row.get("is_live") and not domain_row.get("reachable"): + return 5 + + s = 20 # Live site base + ssl_days = domain_row.get("ssl_expiry_days") if ssl_days is not None and ssl_days < 30: - s += 15 + s += 15 # SSL expiring / expired — urgent upsell if not domain_row.get("ssl_valid"): - s += 15 + s += 15 # No valid SSL + cms = (domain_row.get("cms") or "").lower() if cms in KNOWN_CMS: - s += 15 + s += 15 # Known CMS — maintenance / migration opportunity + if not domain_row.get("has_mx"): - s += 10 + s += 10 # No email = needs professional email setup + if domain_row.get("ip_country") in TARGET_COUNTRIES: s += 10 + server = (domain_row.get("server") or "").lower() if "shared" in server: - s += 10 + s += 5 + if local_biz_keywords(domain_row.get("page_title")): s += 5 - # Kit Digital: proven buyer of IT services + + # Kit Digital: proven buyer of IT services (Gemini-confirmed takes precedence) if domain_row.get("kit_digital"): s += 20 + + # Social media / GMB presence signals + try: + ci = json.loads(domain_row.get("contact_info") or "{}") + has_social = bool(ci.get("social")) + has_contact = bool(ci.get("emails") or ci.get("phones") or ci.get("whatsapp")) + except Exception: + has_social = False + has_contact = False + + if not has_social: + s += 5 # No social = opportunity to build presence + if has_contact: + s += 3 # Reachable lead — we can actually pitch them + return min(s, 100) diff --git a/app/site_analyzer.py b/app/site_analyzer.py index ef3d57f..f8a4f45 100644 --- a/app/site_analyzer.py +++ b/app/site_analyzer.py @@ -76,16 +76,26 @@ WEBMASTER = { "yandex": ["yandex-verification"], } -# ── Kit Digital ─────────────────────────────────────────────────────────────── -KIT_IMG_PATS = [ - "digitalizadores", "kit-digital", "kitdigital", "kit_digital", - "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation", - "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado", +# ── Kit Digital — require SPECIFIC signals, not generic EU logos ─────────────── +# These patterns are unambiguously Kit Digital programme markers +KIT_STRONG_IMG = ["kit-digital", "kitdigital", "kit_digital", "agente-digitalizador", "agente_digitalizador"] +KIT_STRONG_TEXT = ["kit digital", "agente digitalizador", "agentes digitalizadores"] +KIT_STRONG_LINK = ["acelerapyme.es", "red.es/kit-digital", "kit-digital.red.es"] + +# ── Google My Business / Business Profile ──────────────────────────────────── +GMB_URL_SIGNALS = [ + "maps.googleapis.com/maps/api", # embedded Google Map widget + "google.com/maps/place", # link to GMB Place page + "maps.google.com", + "g.page/", + "maps.app.goo.gl", + "goo.gl/maps", + "business.google.com", ] -KIT_TEXT_PATS = [ - "kit digital", "agente digitalizador", "fondos europeos", - "next generation eu", "nextgenerationeu", "plan de recuperación", - "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme", +GMB_SCHEMA_SIGNALS = [ + '"@type":"LocalBusiness"', + '"@type": "LocalBusiness"', + "schema.org/LocalBusiness", ] EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") @@ -155,6 +165,8 @@ async def _analyze_site_inner(domain: str) -> dict: "inputs_without_labels": 0, # Kit Digital "kit_digital": False, "kit_digital_signals": [], + # Google My Business + "has_gmb": False, "gmb_url": None, # Contacts "emails": [], "phones": [], "whatsapp": [], "social_links": [], "error": None, @@ -267,25 +279,40 @@ async def _analyze_site_inner(domain: str) -> dict: if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby") ) - # ── Kit Digital ─────────────────────────────────────────────────────── + # ── Kit Digital (specific signals only — generic EU logos excluded) ────── kd_signals = [] for img in soup.find_all("img"): comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower() - for p in KIT_IMG_PATS: + for p in KIT_STRONG_IMG: if p in comb: kd_signals.append(f"img:{p}") break - for p in KIT_TEXT_PATS: - if p in hl: + for p in KIT_STRONG_TEXT: + if p in vl: kd_signals.append(f"text:{p}") for a in soup.find_all("a", href=True): href = a["href"].lower() - if "acelerapyme" in href or "red.es" in href or "kit-digital" in href: - kd_signals.append(f"link:{href[:50]}") + for p in KIT_STRONG_LINK: + if p in href: + kd_signals.append(f"link:{href[:60]}") + break kd_signals = list(dict.fromkeys(kd_signals))[:10] result["kit_digital"] = len(kd_signals) > 0 result["kit_digital_signals"] = kd_signals + # ── Google My Business ──────────────────────────────────────────────── + for a in soup.find_all("a", href=True): + href_g = a["href"] + for sig in GMB_URL_SIGNALS: + if sig in href_g: + result["has_gmb"] = True + result["gmb_url"] = href_g[:120] + break + if result["has_gmb"]: + break + if not result["has_gmb"]: + result["has_gmb"] = any(sig.lower() in hl for sig in GMB_SCHEMA_SIGNALS) + # ── Contacts ────────────────────────────────────────────────────────── for a in soup.find_all("a", href=True): href = a["href"] diff --git a/app/static/index.html b/app/static/index.html index 158adb7..459c6da 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -91,12 +91,21 @@ tr:hover td{background:rgba(255,255,255,.025)} .score{display:inline-block;padding:1px 6px;border-radius:5px;font-weight:800;font-size:11px;min-width:28px;text-align:center} /* Contact chips */ -.contact-chips{display:flex;flex-wrap:wrap;gap:3px} +.contact-chips{display:flex;flex-wrap:wrap;gap:3px;align-items:center} .chip{display:inline-flex;align-items:center;gap:3px;padding:1px 6px;border-radius:4px;font-size:10px;background:var(--surface2);border:1px solid var(--border);color:var(--muted);max-width:160px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap} .chip.email{border-color:#00d4aa33;color:var(--accent2)} .chip.phone{border-color:#6c63ff33;color:var(--accent)} .chip.wa{border-color:#22c55e33;color:#4ade80} -.chip.social{border-color:#f59e0b33;color:var(--kd)} +/* Social platform icon badges */ +.sicon{display:inline-flex;align-items:center;justify-content:center;width:18px;height:18px;border-radius:4px;font-size:9px;font-weight:900;text-decoration:none;flex-shrink:0;line-height:1} +.sicon.fb{background:#1877f2;color:#fff} +.sicon.ig{background:linear-gradient(135deg,#f09433 0%,#e6683c 25%,#dc2743 50%,#cc2366 75%,#bc1888 100%);color:#fff} +.sicon.li{background:#0a66c2;color:#fff} +.sicon.tw{background:#000;color:#fff} +.sicon.tt{background:#010101;color:#fff} +.sicon.yt{background:#ff0000;color:#fff} +.sicon.gmb{background:#4285f4;color:#fff} +.sicon.other{background:var(--surface2);border:1px solid var(--border);color:var(--muted)} /* Tooltip */ [title]{cursor:help} @@ -182,6 +191,8 @@ tr:hover td{background:rgba(255,255,255,.025)}