fix: make AI worker crash-proof + add GDPR/hosting/accessibility analysis

AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — no exception handler around while loop
- Added try/except around entire loop body with exc_info logging
- Added watchdog task that restarts dead workers every 10 seconds
  (loop + watchdog pattern sketched below)
- ensure_workers_alive() called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not closure) — avoids
  subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint: shows worker alive status, task exception,
  last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com for ASN, org, ISP, host country
- EU hosting detection (27 EU + EEA + adequacy countries)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda,
  Borlabs, CookieYes, Complianz, Usercentrics + text signals
- Privacy policy and GDPR text presence
- Accessibility: html lang missing, images without alt count,
  skip nav link, empty links, inputs without labels (usage sketch below)
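
For orientation, a hypothetical call showing where the new signals land
(the import path app.site_analyzer is assumed; key names match the diff below):

    import asyncio
    from app.site_analyzer import analyze_site  # assumed module path

    result = asyncio.run(analyze_site("example.com"))
    print(result["ip"], result["asn"], result["eu_hosted"])       # hosting
    print(result["cookie_tool"], result["has_privacy_policy"])    # GDPR
    print(result["html_lang"], result["images_missing_alt"])      # accessibility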

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance,
  accessibility_issues[] (plausible shape sketched below)
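
The response schema itself is not part of this diff; a plausible shape for
the three new fields, inferred only from the names above:

    gemini_output = {
        # ...existing assessment fields...
        "hosting_notes": "OVH (AS16276), IP in FR, EU-hosted",        # free text
        "gdpr_compliance": "cookiebot banner, privacy policy found",  # free text
        "accessibility_issues": [                                     # list of strings
            "html lang attribute missing",
            "14 images without alt text",
        ],
    }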

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
commit 60c9b495ae (parent 5ad8259c75)
Date: 2026-04-13 18:01:34 +02:00
10 changed files with 409 additions and 205 deletions


@@ -1,8 +1,9 @@
"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
"""Deep site analysis: content quality, SEO, hosting, GDPR, accessibility."""
import asyncio
import re
import time
import logging
import socket
from typing import Optional
import httpx
@@ -10,39 +11,72 @@ from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
# ── EU countries (hosting check) ─────────────────────────────────────────────
EU_COUNTRIES = {
'AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR',
'HU','IE','IT','LV','LT','LU','MT','NL','PL','PT','RO','SK',
'SI','ES','SE',
'NO','IS','LI', # EEA
'CH','GB','AD', # adequacy / adjacent
}
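# NB: NO/IS/LI are EEA and CH/GB/AD hold EU adequacy decisions; none are EU
# members, but eu_hosted treats all of the above as in-scope.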
# ── Content quality ───────────────────────────────────────────────────────────
LOREM_PHRASES = [
"lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
"nulla id nibh", "aenean dignissim", "aliquam tincidunt",
"vestibulum commodo", "fusce nunc lacus", "consectetuer",
"cras ornare tristique", "ntulla nec ante", "risus id metus",
"praesent placerat", "fusce pellentesque", "suscipit nibh",
"integer vitae libero", "felis quis tortor",
"integer vitae libero", "felis quis tortor", "dolor sit amet",
]
PLACEHOLDER_PHRASES = [
"under construction", "coming soon", "sample page",
"this is a demo", "default post", "hello world",
"test post", "uncategorized",
"this is a demo", "hello world", "test content",
"default post", "uncategorized", "demo content",
]
# ── Analytics & webmaster tags ───────────────────────────────────────────────
# ── Cookie / GDPR consent tools ───────────────────────────────────────────────
COOKIE_TOOLS = {
"cookiebot": ["cookiebot.com", "CookieConsent", "CybotCookiebot"],
"onetrust": ["onetrust", "otBannerSdk"],
"cookiepro": ["cookiepro.com"],
"osano": ["osano.com"],
"iubenda": ["iubenda.com"],
"borlabs": ["borlabs-cookie"],
"complianz": ["complianz"],
"cookieyes": ["cookieyes.com", "cookie-law-info"],
"usercentrics": ["usercentrics.com"],
"quantcast": ["quantcast.com/cmp"],
}
COOKIE_TEXT_SIGNALS = [
"accept cookies", "acepta las cookies", "we use cookies", "usamos cookies",
"cookie policy", "política de cookies", "cookie settings", "manage cookies",
"aceptar todas", "rechazar cookies",
]
PRIVACY_SIGNALS = [
"privacy policy", "política de privacidad", "aviso legal",
"privacy notice", "data protection",
]
GDPR_TEXT_SIGNALS = [
"rgpd", "gdpr", "reglamento general de protección",
"lopd", "protección de datos", "responsable del tratamiento",
]
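# Tool signatures above are matched as lowercase substrings of the raw HTML;
# the cookie/privacy/GDPR text signals are matched against visible text only.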
# ── Analytics / webmaster ─────────────────────────────────────────────────────
ANALYTICS = {
"google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
"google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
"facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
"hotjar": ["static.hotjar.com"],
"clarity": ["clarity.ms/tag"],
"google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
"google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
"facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
"hotjar": ["static.hotjar.com"],
"clarity": ["clarity.ms/tag"],
}
WEBMASTER = {
"google_search_console": ['google-site-verification'],
"bing_webmaster": ['msvalidate.01'],
"yandex": ['yandex-verification'],
"google_search_console": ["google-site-verification"],
"bing_webmaster": ["msvalidate.01"],
"yandex": ["yandex-verification"],
}
# ── Kit Digital ───────────────────────────────────────────────────────────────
KIT_IMG_PATS = [
"digitalizadores", "kit-digital", "kitdigital", "kit_digital",
"fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
@@ -56,72 +90,99 @@ KIT_TEXT_PATS = [
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com",
"twitter.com", "x.com", "tiktok.com", "youtube.com"]
async def _get_hosting_info(domain: str) -> dict:
"""Resolve IP, then look up ASN / org / country via ip-api.com."""
info = {"ip": None, "asn": None, "org": None, "isp": None,
"ip_country": None, "ip_region": None, "eu_hosted": None}
try:
loop = asyncio.get_running_loop()
ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
info["ip"] = ip
async with httpx.AsyncClient(timeout=6) as client:
r = await client.get(
f"http://ip-api.com/json/{ip}",
params={"fields": "status,country,countryCode,regionName,org,as,isp"},
)
if r.status_code == 200:
d = r.json()
if d.get("status") == "success":
info.update({
"asn": d.get("as"),
"org": d.get("org"),
"isp": d.get("isp"),
"ip_country": d.get("countryCode"),
"ip_region": d.get("regionName"),
"eu_hosted": d.get("countryCode") in EU_COUNTRIES,
})
except Exception as e:
logger.debug("Hosting lookup failed for %s: %s", domain, e)
return info
async def analyze_site(domain: str) -> dict:
"""Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
result = {
"domain": domain,
"reachable": False,
"load_time_ms": None,
"status_code": None,
"final_url": None,
"page_size_kb": None,
"server": None,
"cms": None,
"ssl_valid": False,
"ssl_expiry_days": None,
"reachable": False, "load_time_ms": None, "status_code": None,
"final_url": None, "page_size_kb": None, "server": None, "cms": None,
# Hosting
"ip": None, "asn": None, "org": None, "isp": None,
"ip_country": None, "ip_region": None, "eu_hosted": None,
# SSL
"ssl_valid": False, "ssl_expiry_days": None,
# Content quality
"has_lorem_ipsum": False,
"lorem_matches": [],
"has_placeholder": False,
"placeholder_matches": [],
"word_count": 0,
"image_count": 0,
"broken_images": 0,
"script_count": 0,
"has_lorem_ipsum": False, "lorem_matches": [],
"has_placeholder": False, "placeholder_matches": [],
"word_count": 0, "image_count": 0, "script_count": 0,
"has_mobile_viewport": False,
"page_title": None,
"meta_description": None,
"h1_text": None,
"page_title": None, "meta_description": None, "h1_text": None,
"visible_text_snippet": "",
# SEO / webmaster
"has_sitemap": False,
"has_robots": False,
"robots_disallows_google": False,
"analytics_present": [],
"webmaster_verified": [],
"canonical_url": None,
"og_title": None,
# SEO
"has_sitemap": False, "has_robots": False, "robots_disallows_google": False,
"analytics_present": [], "webmaster_verified": [],
"canonical_url": None, "og_title": None,
# GDPR / cookies
"cookie_tool": None, "has_cookie_notice": False,
"has_privacy_policy": False, "has_gdpr_text": False,
# Accessibility
"html_lang": None, "images_missing_alt": 0,
"has_skip_nav": False, "empty_links": 0,
"inputs_without_labels": 0,
# Kit Digital
"kit_digital": False,
"kit_digital_signals": [],
"kit_digital": False, "kit_digital_signals": [],
# Contacts
"emails": [],
"phones": [],
"whatsapp": [],
"social_links": [],
# Errors
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
"error": None,
}
# ── Fetch + hosting (parallel) ────────────────────────────────────────────
async def _fetch():
t0 = time.monotonic()
try:
async with httpx.AsyncClient(
timeout=15, follow_redirects=True, verify=False,
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
) as client:
resp = await client.get(f"https://{domain}")
if resp.status_code >= 400:
resp = await client.get(f"http://{domain}")
return resp, int((time.monotonic() - t0) * 1000)
except Exception:
return None, int((time.monotonic() - t0) * 1000)
(resp, load_ms), hosting = await asyncio.gather(_fetch(), _get_hosting_info(domain))
result.update(hosting)
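# Hosting info is merged even when the fetch fails, so unreachable
# domains still get IP/ASN/country data.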
result["load_time_ms"] = load_ms
if resp is None:
result["error"] = "Failed to fetch site"
else:
html = resp.text
result.update({
"reachable": resp.status_code < 400,
"load_time_ms": load_ms,
"status_code": resp.status_code,
"final_url": str(resp.url),
"page_size_kb": round(len(resp.content) / 1024, 1),
@@ -131,46 +192,42 @@ async def analyze_site(domain: str) -> dict:
soup = BeautifulSoup(html, "html.parser")
hl = html.lower()
# ── Basic metadata ────────────────────────────────────────────────────
result["html_lang"] = (soup.find("html") or {}).get("lang")
t = soup.find("title")
result["page_title"] = t.get_text(strip=True)[:200] if t else None
md = soup.find("meta", attrs={"name": "description"})
result["meta_description"] = (md.get("content") or "")[:300] if md else None
h1 = soup.find("h1")
result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
# Mobile viewport
result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
# Canonical + OG
c = soup.find("link", rel="canonical")
result["canonical_url"] = c.get("href") if c else None
og = soup.find("meta", property="og:title")
result["og_title"] = og.get("content") if og else None
# ── Visible text ──────────────────────────────────────────────────────
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
visible = soup.get_text(separator=" ", strip=True)
vl = visible.lower()
words = visible.split()
result["word_count"] = len(words)
result["visible_text_snippet"] = " ".join(words[:600])
# ── Content quality ───────────────────────────────────────────────────
lorem_hits = [p for p in LOREM_PHRASES if p in vl]
result["has_lorem_ipsum"] = len(lorem_hits) > 0
result["lorem_matches"] = lorem_hits[:5]
result["has_lorem_ipsum"] = len(lorem_hits) > 0
result["lorem_matches"] = lorem_hits[:6]
ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
result["has_placeholder"] = len(ph_hits) > 0
result["has_placeholder"] = len(ph_hits) > 0
result["placeholder_matches"] = ph_hits[:3]
# Images & scripts
imgs = soup.find_all("img")
result["image_count"] = len(imgs)
result["image_count"] = len(imgs)
result["script_count"] = len(soup.find_all("script", src=True))
# ── Analytics / webmaster ─────────────────────────────────────────────
for name, sigs in ANALYTICS.items():
if any(s.lower() in hl for s in sigs):
result["analytics_present"].append(name)
@@ -178,12 +235,42 @@ async def analyze_site(domain: str) -> dict:
if any(s.lower() in hl for s in sigs):
result["webmaster_verified"].append(name)
# ── GDPR / cookies ────────────────────────────────────────────────────
for tool, sigs in COOKIE_TOOLS.items():
if any(s.lower() in hl for s in sigs):
result["cookie_tool"] = tool
result["has_cookie_notice"] = True
break
if not result["has_cookie_notice"]:
result["has_cookie_notice"] = any(s in vl for s in COOKIE_TEXT_SIGNALS)
result["has_privacy_policy"] = any(s in vl for s in PRIVACY_SIGNALS) or bool(
soup.find("a", href=lambda h: h and ("privacidad" in h.lower() or "privacy" in h.lower()))
)
result["has_gdpr_text"] = any(s in vl for s in GDPR_TEXT_SIGNALS)
# ── Accessibility ─────────────────────────────────────────────────────
result["images_missing_alt"] = sum(
1 for img in imgs if not img.get("alt") and img.get("alt") != ""
)
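# alt="" is treated as intentionally decorative; only a truly missing
# alt attribute is counted against the page.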
result["has_skip_nav"] = bool(
soup.find("a", href=lambda h: h and h.lower() in ("#main", "#content", "#maincontent", "#skip"))
)
result["empty_links"] = sum(
1 for a in soup.find_all("a") if not a.get_text(strip=True) and not a.find("img")
)
all_inputs = soup.find_all("input", type=lambda t: t is None or t not in ("hidden", "submit", "button"))
labeled_ids = {lbl.get("for") for lbl in soup.find_all("label") if lbl.get("for")}
result["inputs_without_labels"] = sum(
1 for inp in all_inputs
if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby")
)
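# Limitation: inputs implicitly labeled by a wrapping <label> element
# are not detected by this check.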
# ── Kit Digital ───────────────────────────────────────────────────────
kd_signals = []
for img in soup.find_all("img"):
comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
for p in KIT_IMG_PATS:
if p in comb:
kd_signals.append(f"img:{p}")
break
for p in KIT_TEXT_PATS:
@@ -194,10 +281,10 @@ async def analyze_site(domain: str) -> dict:
if "acelerapyme" in href or "red.es" in href or "kit-digital" in href:
kd_signals.append(f"link:{href[:50]}")
kd_signals = list(dict.fromkeys(kd_signals))[:10]
result["kit_digital"] = len(kd_signals) > 0
result["kit_digital"] = len(kd_signals) > 0
result["kit_digital_signals"] = kd_signals
# ── Contacts ──────────────────────────────────────────────────────────
for a in soup.find_all("a", href=True):
href = a["href"]
if href.startswith("mailto:"):
@@ -220,25 +307,36 @@ async def analyze_site(domain: str) -> dict:
break
for em in EMAIL_RE.findall(html[:80000]):
em = em.lower()
if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]):
result["emails"].append(em)
for ph in PHONE_RE.findall(visible):
ph_c = re.sub(r"[\s\-]", "", ph)
if ph_c not in result["phones"]:
result["phones"].append(ph_c)
# Cap
for k in ["emails", "phones", "whatsapp", "social_links"]:
result[k] = list(dict.fromkeys(result[k]))[:5]
# ── CMS ───────────────────────────────────────────────────────────────
CMS_SIGS = {
"wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
"joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
"drupal": ["/sites/default/files/", "Drupal.settings"],
"wix": ["static.wixstatic.com", "X-Wix-"],
"squarespace": ["squarespace.com", "X-Squarespace-"],
"shopify": ["cdn.shopify.com", "Shopify.theme"],
"prestashop": ["PrestaShop", "/modules/prestashop"],
"magento": ["Mage.Cookies", "X-Magento-"],
"typo3": ["typo3temp", "TYPO3 CMS"],
"opencart": ["route=common/home", "OpenCart"],
}
combined_check = (html[:60000] + " ".join(f"{k}:{v}" for k, v in resp.headers.items())).lower()
for cms, sigs in CMS_SIGS.items():
if any(s.lower() in combined_check for s in sigs):
result["cms"] = cms
break
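# First matching signature wins, so keep the more distinctive CMS
# signatures earlier in CMS_SIGS.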
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
async def _get(url):
try:
async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
r = await c.get(url)
@@ -247,24 +345,22 @@ async def analyze_site(domain: str) -> dict:
return None
sitemap_txt, robots_txt = await asyncio.gather(
_check_url(f"https://{domain}/sitemap.xml"),
_check_url(f"https://{domain}/robots.txt"),
_get(f"https://{domain}/sitemap.xml"),
_get(f"https://{domain}/robots.txt"),
)
result["has_sitemap"] = sitemap_txt is not None
result["has_robots"] = robots_txt is not None
result["has_robots"] = robots_txt is not None
if robots_txt:
robots_lower = robots_txt.lower()
result["robots_disallows_google"] = (
"disallow: /" in robots_lower and "googlebot" in robots_lower
)
rl = robots_txt.lower()
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
# ── SSL ───────────────────────────────────────────────────────────────────
import ssl as _ssl
try:
def _ssl_check():
import datetime as _dt
ctx = _ssl.create_default_context()
with socket.create_connection((domain, 443), timeout=5) as s:
with ctx.wrap_socket(s, server_hostname=domain) as ss:
cert = ss.getpeercert()
exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")