fix: AI worker crash-proof + GDPR/hosting/accessibility analysis

AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — no exception handler around while loop
- Added try/except around entire loop body with exc_info logging
- Added watchdog task that checks every 10 seconds and restarts dead
  workers (pattern sketched after this list)
- ensure_workers_alive() called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not closure) — avoids
  subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint: shows worker alive status, task exception,
  last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com for ASN, org, ISP, host country
- EU hosting detection (27 EU member states + EEA + adequacy countries;
  lookup sketched after this list)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda,
  Borlabs, CookieYes, Complianz, Usercentrics + text signals
- Privacy policy and GDPR text presence
- Accessibility: html lang missing, images without alt count,
  skip nav link, empty links, inputs without labels
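
For context, the hosting lookup is essentially this, condensed from `_get_hosting_info` in the diff below (the ip-api.com field list is the one the analyzer requests; the country set is abridged here):

```python
import asyncio
import socket

import httpx

# Abridged country set; site_analyzer.py carries the full EU + EEA + adequacy list.
EU_COUNTRIES = {"ES", "FR", "DE", "IT", "PT", "NL", "IE", "NO", "CH", "GB"}


async def hosting_info(domain: str) -> dict:
    """Resolve the IP off the event loop, then ask ip-api.com who hosts it."""
    loop = asyncio.get_event_loop()
    ip = await loop.run_in_executor(None, socket.gethostbyname, domain)  # blocking call, so use executor
    async with httpx.AsyncClient(timeout=6) as client:
        r = await client.get(
            f"http://ip-api.com/json/{ip}",  # free tier is HTTP-only
            params={"fields": "status,country,countryCode,regionName,org,as,isp"},
        )
    d = r.json()
    return {
        "ip": ip,
        "asn": d.get("as"),  # e.g. "AS16276 OVH SAS"
        "org": d.get("org"),
        "isp": d.get("isp"),
        "ip_country": d.get("countryCode"),
        "eu_hosted": d.get("countryCode") in EU_COUNTRIES,
    }

# asyncio.run(hosting_info("example.com"))
```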

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance,
  accessibility_issues[] (example after this list)
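
```python
# Hypothetical, trimmed Gemini response showing only the three new fields
# (illustrative values, not real model output):
assessment_excerpt = {
    "hosting_notes": "AS16276 OVH SAS, hosted in FR: EU hosted, no concerns",
    "gdpr_compliance": "Cookiebot banner present, privacy policy linked; no obvious gaps",
    "accessibility_issues": [
        "html lang attribute missing",
        "14 images without alt text",
        "3 form inputs without labels",
    ],
}
```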

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 18:01:34 +02:00
parent 5ad8259c75
commit 60c9b495ae
10 changed files with 409 additions and 205 deletions

5 binary files not shown.

View File: app/enricher.py

```diff
@@ -338,14 +338,42 @@ async def worker_loop():
 # ── AI assessment worker ──────────────────────────────────────────────────────
-async def ai_worker_loop():
+async def _assess_one(domain: str) -> None:
+    """Process a single AI assessment — safe to call concurrently."""
     from app.replicate_ai import assess_domain as gemini_assess
     from app.site_analyzer import analyze_site
+    logger.info("AI: starting analysis for %s", domain)
+    try:
+        analysis = await analyze_site(domain)
+        logger.info("AI: site analyzed %s (reachable=%s, words=%s)",
+                    domain, analysis.get("reachable"), analysis.get("word_count"))
+        assessment = await gemini_assess(analysis)
+        logger.info("AI: Gemini done %s → quality=%s",
+                    domain, assessment.get("lead_quality"))
+        await save_ai_assessment(domain, assessment, site_analysis=analysis)
+        logger.info("AI: saved %s", domain)
+    except Exception as e:
+        logger.error("AI: failed %s: %s", domain, e, exc_info=True)
+        try:
+            async with aiosqlite.connect(SQLITE_PATH) as db:
+                await db.execute(
+                    "UPDATE ai_queue SET status='failed', completed_at=datetime('now'), error=? WHERE domain=?",
+                    (str(e)[:400], domain),
+                )
+                await db.commit()
+        except Exception:
+            pass
+
+
+async def ai_worker_loop():
+    logger.info("AI worker loop starting")
     while True:
+        rows = []
+        try:
             async with aiosqlite.connect(SQLITE_PATH) as db:
                 async with db.execute(
-                    "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
+                    "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 5"
                 ) as cur:
                     rows = await cur.fetchall()
                 if rows:
@@ -354,37 +382,44 @@ async def ai_worker_loop():
                         [(r[0],) for r in rows],
                     )
                     await db.commit()
+                    logger.info("AI worker: picked up %d jobs: %s",
+                                len(rows), [r[0] for r in rows])
+        except Exception as e:
+            logger.error("AI worker DB error: %s", e, exc_info=True)
+            await asyncio.sleep(5)
+            continue
+
         if not rows:
             await asyncio.sleep(3)
             continue
-        async def assess_one(domain: str):
-            try:
-                # Always do a fresh deep scrape — no pre-enrichment required
-                analysis = await analyze_site(domain)
-                assessment = await gemini_assess(analysis)
-                await save_ai_assessment(domain, assessment, site_analysis=analysis)
-                logger.info("AI done: %s → %s", domain, assessment.get("lead_quality"))
-            except Exception as e:
-                async with aiosqlite.connect(SQLITE_PATH) as db:
-                    await db.execute(
-                        "UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
-                        (domain,),
-                    )
-                    await db.commit()
-                logger.error("AI worker error %s: %s", domain, e)
-
-        # AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
-        await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
+        # Run assessments concurrently (semaphore in replicate_ai enforces AI_CONCURRENCY)
+        results = await asyncio.gather(
+            *[_assess_one(r[0]) for r in rows],
+            return_exceptions=True,
+        )
+        for r, exc in zip(rows, results):
+            if isinstance(exc, Exception):
+                logger.error("AI task exception for %s: %s", r[0], exc, exc_info=exc)


 def start_worker():
     global _worker_task, _ai_worker_task
     if _worker_task is None or _worker_task.done():
         _worker_task = asyncio.create_task(worker_loop())
+        logger.info("Enrichment worker started")
     if _ai_worker_task is None or _ai_worker_task.done():
+        if _ai_worker_task is not None and _ai_worker_task.done():
+            exc = _ai_worker_task.exception() if not _ai_worker_task.cancelled() else None
+            if exc:
+                logger.error("AI worker died with: %s", exc, exc_info=exc)
         _ai_worker_task = asyncio.create_task(ai_worker_loop())
+        logger.info("AI worker started/restarted")
+
+
+def ensure_workers_alive():
+    """Restart workers if they've died — call periodically."""
+    start_worker()


 def pause_worker():
```

View File (FastAPI app: lifespan + API routes)

```diff
@@ -20,7 +20,7 @@ from app.db import (
     queue_domains, get_queue_status, build_duckdb_index, index_status,
     queue_ai, get_ai_queue_status, save_ai_assessment,
 )
-from app.enricher import start_worker, pause_worker, resume_worker, is_running
+from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
 from app.scorer import run_scoring

 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -61,13 +61,20 @@ async def download_parquet():
     logger.info("Parquet download complete")


+async def _watchdog():
+    """Restart workers if they die — checks every 10 seconds."""
+    while True:
+        await asyncio.sleep(10)
+        ensure_workers_alive()
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     await download_parquet()
     await init_db()
+    # Build DuckDB index in background — queries still work (slower) while building
     asyncio.create_task(build_duckdb_index())
     start_worker()
+    asyncio.create_task(_watchdog())
     logger.info("DomGod ready on port 6677")
     yield
@@ -167,9 +174,43 @@ async def ai_assess_batch(body: dict):
     if not domains_list:
         return JSONResponse({"error": "no domains provided"}, status_code=400)
     await queue_ai(domains_list)
+    ensure_workers_alive()  # ensure AI worker is alive when jobs are queued
     return {"queued": len(domains_list)}


+@app.post("/api/ai/worker/restart")
+async def ai_worker_restart():
+    ensure_workers_alive()
+    return {"status": "restarted"}
+
+
+@app.get("/api/ai/debug")
+async def ai_debug():
+    """Returns worker state + last 10 queue entries for troubleshooting."""
+    from app.enricher import _ai_worker_task
+    task_alive = _ai_worker_task is not None and not _ai_worker_task.done()
+    task_exc = None
+    if _ai_worker_task and _ai_worker_task.done() and not _ai_worker_task.cancelled():
+        try:
+            task_exc = str(_ai_worker_task.exception())
+        except Exception:
+            pass
+    async with aiosqlite.connect(SQLITE_PATH) as db:
+        db.row_factory = aiosqlite.Row
+        async with db.execute(
+            "SELECT domain, status, created_at, completed_at, error FROM ai_queue ORDER BY created_at DESC LIMIT 10"
+        ) as cur:
+            recent = [dict(r) async for r in cur]
+    return {
+        "ai_worker_alive": task_alive,
+        "ai_worker_exception": task_exc,
+        "recent_queue": recent,
+        "queue_status": await get_ai_queue_status(),
+    }
+
+
 @app.get("/api/ai/status")
 async def ai_status():
     return await get_ai_queue_status()
```
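
A quick way to exercise the two new endpoints from a script (assumes a local instance on port 6677, per the lifespan log line; the response keys match the ai_debug handler above):

```python
import httpx

BASE = "http://localhost:6677"  # port from the lifespan log line

# Poll worker health; keys match the ai_debug handler.
d = httpx.get(f"{BASE}/api/ai/debug").json()
print("worker alive:", d["ai_worker_alive"], "| last exception:", d["ai_worker_exception"])
for row in d["recent_queue"]:
    print(f'{row["domain"]:30} {row["status"]:10} err={row["error"] or "-"}')

if not d["ai_worker_alive"]:
    httpx.post(f"{BASE}/api/ai/worker/restart")
```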

View File: app/replicate_ai.py

```diff
@@ -25,7 +25,6 @@ def _sem() -> asyncio.Semaphore:

 def _build_prompt(a: dict) -> str:
-    """Build the Gemini prompt from a full site analysis dict."""
     contacts_block = []
     if a.get("emails"):      contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
     if a.get("phones"):      contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
@@ -33,80 +32,98 @@ def _build_prompt(a: dict) -> str:
     if a.get("social_links"):contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
     contacts_str = "\n".join(contacts_block) or " None found"
-    kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
-    analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
-    webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
+    kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None"
+    analytics = ", ".join(a.get("analytics_present") or []) or "none"
+    webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
     lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
-    placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
-
-    text_snippet = (a.get("visible_text_snippet") or "")[:2000]
+    ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
+    snippet = (a.get("visible_text_snippet") or "")[:2000]
+    eu_hosted = a.get("eu_hosted")
+    hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")

-    return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
+    return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.

-=== TECHNICAL SNAPSHOT ===
+=== TECHNICAL ===
 Domain: {a.get("domain")}
 Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
+Final URL: {a.get("final_url")}
 Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
-SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
-Mobile viewport: {a.get("has_mobile_viewport")}
-Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
+SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
+Mobile: viewport={a.get("has_mobile_viewport")}
+Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}

-=== SEO & INDEXING SIGNALS ===
-Page title: {a.get("page_title") or "missing"}
-H1: {a.get("h1_text") or "missing"}
-Meta description: {a.get("meta_description") or "missing"}
-Canonical URL: {a.get("canonical_url") or "not set"}
-Sitemap.xml: {a.get("has_sitemap")}
-Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
-Analytics: {analytics_str}
-Webmaster verified:{webmaster_str}
+=== HOSTING & INFRASTRUCTURE ===
+IP: {a.get("ip") or "unknown"}
+ASN: {a.get("asn") or "unknown"}
+Organisation: {a.get("org") or "unknown"}
+ISP: {a.get("isp") or "unknown"}
+Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
+EU hosted: {hosting_flag}
+
+=== SEO & INDEXING ===
+Title: {a.get("page_title") or "MISSING"}
+H1: {a.get("h1_text") or "MISSING"}
+Meta desc: {a.get("meta_description") or "MISSING"}
+Canonical: {a.get("canonical_url") or "not set"}
+Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
+Analytics: {analytics}
+Webmaster: {webmaster}
+
+=== GDPR & LEGAL COMPLIANCE ===
+Cookie tool: {a.get("cookie_tool") or "none detected"}
+Cookie notice: {a.get("has_cookie_notice")}
+Privacy policy: {a.get("has_privacy_policy")}
+GDPR text: {a.get("has_gdpr_text")}
+
+=== ACCESSIBILITY (quick scan) ===
+HTML lang attr: {a.get("html_lang") or "MISSING"}
+Images missing alt: {a.get("images_missing_alt")}
+Skip navigation link: {a.get("has_skip_nav")}
+Empty links: {a.get("empty_links")}
+Inputs without labels: {a.get("inputs_without_labels")}

 === CONTENT QUALITY ===
-Lorem ipsum found: {a.get("has_lorem_ipsum")} — matches: {lorem_str}
-Placeholder text: {a.get("has_placeholder")} — matches: {placeholder_str}
+Lorem ipsum: {a.get("has_lorem_ipsum")} — {lorem_str}
+Placeholder: {a.get("has_placeholder")} — {ph_str}

-=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
+=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
 Detected: {a.get("kit_digital")}
-Signals:
 {kd_str}

 === CONTACT CHANNELS ===
 {contacts_str}

-=== PAGE TEXT SAMPLE (first 2000 chars) ===
-{text_snippet}
+=== PAGE TEXT SAMPLE ===
+{snippet}

-=== TASK ===
-Analyse this site for IT services upsell potential. The client sells:
-web design/redesign, SEO, hosting migration, SSL renewal, security audits,
-maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
+=== INSTRUCTIONS ===
+The client sells: web redesign, SEO, hosting migration, SSL renewal,
+security audits, GDPR compliance, accessibility fixes, Google Ads,
+maintenance contracts, AI tools for SMEs.

-Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
+Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
 {{
-"summary": "2-3 sentence executive summary of the site's current state",
-"site_quality_score": <0-10 integer>,
-"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
-"performance_notes": "comment on load time, page size, mobile readiness",
-"seo_status": "brief SEO assessment — indexing signals, missing elements",
+"summary": "2-3 sentence executive summary of the site's state",
+"site_quality_score": <0-10>,
+"content_issues": ["specific issues found in page content"],
+"performance_notes": "load time, size, mobile assessment",
+"seo_status": "SEO health — what's missing or broken",
+"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
+"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
+"accessibility_issues": ["specific a11y problems found"],
 "kit_digital_confirmed": true/false,
-"kit_digital_reasoning": "1 sentence — why confirmed or not",
+"kit_digital_reasoning": "1 sentence",
 "is_local_sme": true/false,
 "lead_quality": "HOT|WARM|COLD",
-"lead_reasoning": "1-2 sentences on why",
+"lead_reasoning": "1-2 sentences",
 "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
-"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
-"all_contacts": {{
-  "emails": [],
-  "phones": [],
-  "whatsapp": [],
-  "social": []
-}},
-"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
+"best_contact_value": "actual email/phone/URL or empty string",
+"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
+"pitch_angle": "1 cold-outreach sentence in Spanish",
 "services_needed": ["service1","service2"],
-"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
-"outreach_notes": "Key context for the sales rep"
+"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
+"outreach_notes": "sales rep context"
 }}"""


 def _parse_output(raw: str) -> dict:
```
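
_parse_output's body sits outside this hunk; for reference, a tolerant parser for this kind of prompt typically looks like the following (a hypothetical sketch, not the actual implementation):

```python
import json
import re


def parse_model_json(raw: str) -> dict:
    """Tolerant JSON extraction (hypothetical sketch, not the real _parse_output)."""
    text = raw.strip()
    # Strip ```json ... ``` fences in case the model ignores the "no fences" instruction.
    text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text)
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span.
        m = re.search(r"\{.*\}", text, re.DOTALL)
        if m:
            return json.loads(m.group(0))
        raise
```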

View File: app/site_analyzer.py

```diff
@@ -1,8 +1,9 @@
-"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
+"""Deep site analysis: content quality, SEO, hosting, GDPR, accessibility."""
 import asyncio
 import re
 import time
 import logging
+import socket
 from typing import Optional

 import httpx
@@ -10,25 +11,58 @@ from bs4 import BeautifulSoup

 logger = logging.getLogger(__name__)

-# ── Content quality ───────────────────────────────────────────────────────────
+# ── EU countries (hosting check) ─────────────────────────────────────────────
+EU_COUNTRIES = {
+    'AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR',
+    'HU','IE','IT','LV','LT','LU','MT','NL','PL','PT','RO','SK',
+    'SI','ES','SE',
+    'NO','IS','LI',  # EEA
+    'CH','GB','AD',  # adequacy / adjacent
+}
+
+# ── Content quality ───────────────────────────────────────────────────────────
 LOREM_PHRASES = [
     "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
     "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
     "vestibulum commodo", "fusce nunc lacus", "consectetuer",
     "cras ornare tristique", "ntulla nec ante", "risus id metus",
     "praesent placerat", "fusce pellentesque", "suscipit nibh",
-    "integer vitae libero", "felis quis tortor",
+    "integer vitae libero", "felis quis tortor", "dolor sit amet",
 ]
 PLACEHOLDER_PHRASES = [
     "under construction", "coming soon", "sample page",
-    "this is a demo", "default post", "hello world",
-    "test post", "uncategorized",
+    "this is a demo", "hello world", "test content",
+    "default post", "uncategorized", "demo content",
 ]

-# ── Analytics & webmaster tags ───────────────────────────────────────────────
+# ── Cookie / GDPR consent tools ───────────────────────────────────────────────
+COOKIE_TOOLS = {
+    "cookiebot": ["cookiebot.com", "CookieConsent", "CybotCookiebot"],
+    "onetrust": ["onetrust", "otBannerSdk"],
+    "cookiepro": ["cookiepro.com"],
+    "osano": ["osano.com"],
+    "iubenda": ["iubenda.com"],
+    "borlabs": ["borlabs-cookie"],
+    "complianz": ["complianz"],
+    "cookieyes": ["cookieyes.com", "cookie-law-info"],
+    "usercentrics": ["usercentrics.com"],
+    "quantcast": ["quantcast.com/cmp"],
+}
+COOKIE_TEXT_SIGNALS = [
+    "accept cookies", "acepta las cookies", "we use cookies", "usamos cookies",
+    "cookie policy", "política de cookies", "cookie settings", "manage cookies",
+    "aceptar todas", "rechazar cookies",
+]
+PRIVACY_SIGNALS = [
+    "privacy policy", "política de privacidad", "aviso legal",
+    "privacy notice", "data protection",
+]
+GDPR_TEXT_SIGNALS = [
+    "rgpd", "gdpr", "reglamento general de protección",
+    "lopd", "protección de datos", "responsable del tratamiento",
+]
+
+# ── Analytics / webmaster ─────────────────────────────────────────────────────
 ANALYTICS = {
     "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
     "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
@@ -36,13 +70,13 @@ ANALYTICS = {
     "hotjar": ["static.hotjar.com"],
     "clarity": ["clarity.ms/tag"],
 }
 WEBMASTER = {
-    "google_search_console": ['google-site-verification'],
-    "bing_webmaster": ['msvalidate.01'],
-    "yandex": ['yandex-verification'],
+    "google_search_console": ["google-site-verification"],
+    "bing_webmaster": ["msvalidate.01"],
+    "yandex": ["yandex-verification"],
 }

+# ── Kit Digital ───────────────────────────────────────────────────────────────
 KIT_IMG_PATS = [
     "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
     "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
@@ -56,59 +90,78 @@ KIT_TEXT_PATS = [
 EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
 PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
-SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
+SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com",
+              "twitter.com", "x.com", "tiktok.com", "youtube.com"]
+
+
+async def _get_hosting_info(domain: str) -> dict:
+    """Resolve IP, then look up ASN / org / country via ip-api.com."""
+    info = {"ip": None, "asn": None, "org": None, "isp": None,
+            "ip_country": None, "ip_region": None, "eu_hosted": None}
+    try:
+        loop = asyncio.get_event_loop()
+        ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
+        info["ip"] = ip
+        async with httpx.AsyncClient(timeout=6) as client:
+            r = await client.get(
+                f"http://ip-api.com/json/{ip}",
+                params={"fields": "status,country,countryCode,regionName,org,as,isp"},
+            )
+            if r.status_code == 200:
+                d = r.json()
+                if d.get("status") == "success":
+                    info.update({
+                        "asn": d.get("as"),
+                        "org": d.get("org"),
+                        "isp": d.get("isp"),
+                        "ip_country": d.get("countryCode"),
+                        "ip_region": d.get("regionName"),
+                        "eu_hosted": d.get("countryCode") in EU_COUNTRIES,
+                    })
+    except Exception as e:
+        logger.debug("Hosting lookup failed for %s: %s", domain, e)
+    return info
+

 async def analyze_site(domain: str) -> dict:
-    """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
     result = {
         "domain": domain,
-        "reachable": False,
-        "load_time_ms": None,
-        "status_code": None,
-        "final_url": None,
-        "page_size_kb": None,
-        "server": None,
-        "cms": None,
-        "ssl_valid": False,
-        "ssl_expiry_days": None,
+        "reachable": False, "load_time_ms": None, "status_code": None,
+        "final_url": None, "page_size_kb": None, "server": None, "cms": None,
+        # Hosting
+        "ip": None, "asn": None, "org": None, "isp": None,
+        "ip_country": None, "ip_region": None, "eu_hosted": None,
+        # SSL
+        "ssl_valid": False, "ssl_expiry_days": None,
         # Content quality
-        "has_lorem_ipsum": False,
-        "lorem_matches": [],
-        "has_placeholder": False,
-        "placeholder_matches": [],
-        "word_count": 0,
-        "image_count": 0,
-        "broken_images": 0,
-        "script_count": 0,
+        "has_lorem_ipsum": False, "lorem_matches": [],
+        "has_placeholder": False, "placeholder_matches": [],
+        "word_count": 0, "image_count": 0, "script_count": 0,
         "has_mobile_viewport": False,
-        "page_title": None,
-        "meta_description": None,
-        "h1_text": None,
+        "page_title": None, "meta_description": None, "h1_text": None,
         "visible_text_snippet": "",
-        # SEO / webmaster
-        "has_sitemap": False,
-        "has_robots": False,
-        "robots_disallows_google": False,
-        "analytics_present": [],
-        "webmaster_verified": [],
-        "canonical_url": None,
-        "og_title": None,
+        # SEO
+        "has_sitemap": False, "has_robots": False, "robots_disallows_google": False,
+        "analytics_present": [], "webmaster_verified": [],
+        "canonical_url": None, "og_title": None,
+        # GDPR / cookies
+        "cookie_tool": None, "has_cookie_notice": False,
+        "has_privacy_policy": False, "has_gdpr_text": False,
+        # Accessibility
+        "html_lang": None, "images_missing_alt": 0,
+        "has_skip_nav": False, "empty_links": 0,
+        "inputs_without_labels": 0,
         # Kit Digital
-        "kit_digital": False,
-        "kit_digital_signals": [],
+        "kit_digital": False, "kit_digital_signals": [],
         # Contacts
-        "emails": [],
-        "phones": [],
-        "whatsapp": [],
-        "social_links": [],
-        # Errors
+        "emails": [], "phones": [], "whatsapp": [], "social_links": [],
         "error": None,
     }

-    # ── Fetch main page ───────────────────────────────────────────────────────
-    try:
-        t0 = time.monotonic()
+    # ── Fetch + hosting (parallel) ────────────────────────────────────────────
+    async def _fetch():
+        t0 = time.monotonic()
+        try:
             async with httpx.AsyncClient(
                 timeout=15, follow_redirects=True, verify=False,
                 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
@@ -116,12 +169,20 @@ async def analyze_site(domain: str) -> dict:
                 resp = await client.get(f"https://{domain}")
                 if resp.status_code >= 400:
                     resp = await client.get(f"http://{domain}")
+                return resp, int((time.monotonic() - t0) * 1000)
+        except Exception:
+            return None, int((time.monotonic() - t0) * 1000)

-        load_ms = int((time.monotonic() - t0) * 1000)
+    (resp, load_ms), hosting = await asyncio.gather(_fetch(), _get_hosting_info(domain))
+    result.update(hosting)
+    result["load_time_ms"] = load_ms
+    if resp is None:
+        result["error"] = "Failed to fetch site"
+    else:
         html = resp.text
         result.update({
             "reachable": resp.status_code < 400,
-            "load_time_ms": load_ms,
             "status_code": resp.status_code,
             "final_url": str(resp.url),
             "page_size_kb": round(len(resp.content) / 1024, 1),
@@ -131,46 +192,42 @@ async def analyze_site(domain: str) -> dict:
         soup = BeautifulSoup(html, "html.parser")
         hl = html.lower()

-        # Title, meta
-        title_tag = soup.find("title")
-        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
-        meta_desc = soup.find("meta", attrs={"name": "description"})
-        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
+        # ── Basic metadata ────────────────────────────────────────────────────
+        result["html_lang"] = (soup.find("html") or {}).get("lang")
+        t = soup.find("title")
+        result["page_title"] = t.get_text(strip=True)[:200] if t else None
+        md = soup.find("meta", attrs={"name": "description"})
+        result["meta_description"] = (md.get("content") or "")[:300] if md else None
         h1 = soup.find("h1")
         result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
-        # Mobile viewport
         result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
-
-        # Canonical + OG
-        canon = soup.find("link", rel="canonical")
-        result["canonical_url"] = canon.get("href") if canon else None
+        c = soup.find("link", rel="canonical")
+        result["canonical_url"] = c.get("href") if c else None
         og = soup.find("meta", property="og:title")
         result["og_title"] = og.get("content") if og else None

-        # Visible text
+        # ── Visible text ──────────────────────────────────────────────────────
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
-        visible_text = soup.get_text(separator=" ", strip=True)
-        words = visible_text.split()
+        visible = soup.get_text(separator=" ", strip=True)
+        vl = visible.lower()
+        words = visible.split()
         result["word_count"] = len(words)
-        result["visible_text_snippet"] = " ".join(words[:500])
+        result["visible_text_snippet"] = " ".join(words[:600])

-        # Lorem ipsum / placeholder detection
-        vl = visible_text.lower()
+        # ── Content quality ───────────────────────────────────────────────────
         lorem_hits = [p for p in LOREM_PHRASES if p in vl]
         result["has_lorem_ipsum"] = len(lorem_hits) > 0
-        result["lorem_matches"] = lorem_hits[:5]
+        result["lorem_matches"] = lorem_hits[:6]
         ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
         result["has_placeholder"] = len(ph_hits) > 0
         result["placeholder_matches"] = ph_hits[:3]

-        # Images & scripts
         imgs = soup.find_all("img")
         result["image_count"] = len(imgs)
         result["script_count"] = len(soup.find_all("script", src=True))

-        # Analytics / webmaster tags
+        # ── Analytics / webmaster ─────────────────────────────────────────────
         for name, sigs in ANALYTICS.items():
             if any(s.lower() in hl for s in sigs):
                 result["analytics_present"].append(name)
@@ -178,12 +235,42 @@ async def analyze_site(domain: str) -> dict:
             if any(s.lower() in hl for s in sigs):
                 result["webmaster_verified"].append(name)

-        # Kit Digital
+        # ── GDPR / cookies ────────────────────────────────────────────────────
+        for tool, sigs in COOKIE_TOOLS.items():
+            if any(s.lower() in hl for s in sigs):
+                result["cookie_tool"] = tool
+                result["has_cookie_notice"] = True
+                break
+        if not result["has_cookie_notice"]:
+            result["has_cookie_notice"] = any(s in vl for s in COOKIE_TEXT_SIGNALS)
+        result["has_privacy_policy"] = any(s in vl for s in PRIVACY_SIGNALS) or bool(
+            soup.find("a", href=lambda h: h and ("privacidad" in h.lower() or "privacy" in h.lower()))
+        )
+        result["has_gdpr_text"] = any(s in vl for s in GDPR_TEXT_SIGNALS)
+
+        # ── Accessibility ─────────────────────────────────────────────────────
+        result["images_missing_alt"] = sum(
+            1 for img in imgs if not img.get("alt") and img.get("alt") != ""
+        )
+        result["has_skip_nav"] = bool(
+            soup.find("a", href=lambda h: h and h.lower() in ("#main", "#content", "#maincontent", "#skip"))
+        )
+        result["empty_links"] = sum(
+            1 for a in soup.find_all("a") if not a.get_text(strip=True) and not a.find("img")
+        )
+        all_inputs = soup.find_all("input", type=lambda t: t not in ("hidden", "submit", "button", None) or t is None)
+        labeled_ids = {lbl.get("for") for lbl in soup.find_all("label") if lbl.get("for")}
+        result["inputs_without_labels"] = sum(
+            1 for inp in all_inputs
+            if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby")
+        )
+
+        # ── Kit Digital ───────────────────────────────────────────────────────
         kd_signals = []
-        for img in imgs:
-            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
+        for img in soup.find_all("img"):
+            comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
             for p in KIT_IMG_PATS:
-                if p in combined:
+                if p in comb:
                     kd_signals.append(f"img:{p}")
                     break
         for p in KIT_TEXT_PATS:
@@ -197,7 +284,7 @@ async def analyze_site(domain: str) -> dict:
         result["kit_digital"] = len(kd_signals) > 0
         result["kit_digital_signals"] = kd_signals

-        # Contacts
+        # ── Contacts ──────────────────────────────────────────────────────────
         for a in soup.find_all("a", href=True):
             href = a["href"]
             if href.startswith("mailto:"):
@@ -222,23 +309,34 @@ async def analyze_site(domain: str) -> dict:
                 em = em.lower()
                 if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]):
                     result["emails"].append(em)
-        for ph in PHONE_RE.findall(visible_text):
+        for ph in PHONE_RE.findall(visible):
             ph_c = re.sub(r"[\s\-]", "", ph)
             if ph_c not in result["phones"]:
                 result["phones"].append(ph_c)
-        # Cap
         for k in ["emails", "phones", "whatsapp", "social_links"]:
             result[k] = list(dict.fromkeys(result[k]))[:5]

-        # CMS
-        from app.enricher import detect_cms
-        result["cms"] = detect_cms(html, dict(resp.headers))
-
-    except Exception as e:
-        result["error"] = str(e)[:300]
+        # ── CMS ───────────────────────────────────────────────────────────────
+        CMS_SIGS = {
+            "wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
+            "joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
+            "drupal": ["/sites/default/files/", "Drupal.settings"],
+            "wix": ["static.wixstatic.com", "X-Wix-"],
+            "squarespace": ["squarespace.com", "X-Squarespace-"],
+            "shopify": ["cdn.shopify.com", "Shopify.theme"],
+            "prestashop": ["PrestaShop", "/modules/prestashop"],
+            "magento": ["Mage.Cookies", "X-Magento-"],
+            "typo3": ["typo3temp", "TYPO3 CMS"],
+            "opencart": ["route=common/home", "OpenCart"],
+        }
+        combined_check = html[:60000] + " ".join(f"{k}:{v}" for k, v in resp.headers.items())
+        for cms, sigs in CMS_SIGS.items():
+            if any(s.lower() in combined_check.lower() for s in sigs):
+                result["cms"] = cms
+                break

     # ── Sitemap & robots (parallel) ───────────────────────────────────────────
-    async def _check_url(url: str) -> Optional[str]:
+    async def _get(url):
         try:
             async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                 r = await c.get(url)
@@ -247,24 +345,22 @@ async def analyze_site(domain: str) -> dict:
             return None

     sitemap_txt, robots_txt = await asyncio.gather(
-        _check_url(f"https://{domain}/sitemap.xml"),
-        _check_url(f"https://{domain}/robots.txt"),
+        _get(f"https://{domain}/sitemap.xml"),
+        _get(f"https://{domain}/robots.txt"),
     )
     result["has_sitemap"] = sitemap_txt is not None
     result["has_robots"] = robots_txt is not None
     if robots_txt:
-        robots_lower = robots_txt.lower()
-        result["robots_disallows_google"] = (
-            "disallow: /" in robots_lower and "googlebot" in robots_lower
-        )
+        rl = robots_txt.lower()
+        result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl

     # ── SSL ───────────────────────────────────────────────────────────────────
-    import ssl as _ssl, socket as _socket
+    import ssl as _ssl
     try:
         def _ssl_check():
             import datetime as _dt
             ctx = _ssl.create_default_context()
-            with _socket.create_connection((domain, 443), timeout=5) as s:
+            with socket.create_connection((domain, 443), timeout=5) as s:
                 with ctx.wrap_socket(s, server_hostname=domain) as ss:
                     cert = ss.getpeercert()
                     exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
```
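
A one-off smoke test of the new analyzer signals (a sketch; run from the project root so app.site_analyzer imports):

```python
import asyncio
import json

from app.site_analyzer import analyze_site

KEYS = (
    "reachable", "cms", "ip", "asn", "ip_country", "eu_hosted",
    "cookie_tool", "has_cookie_notice", "has_privacy_policy",
    "html_lang", "images_missing_alt", "inputs_without_labels",
)

async def main():
    a = await analyze_site("example.com")
    print(json.dumps({k: a[k] for k in KEYS}, indent=2, ensure_ascii=False))

asyncio.run(main())
```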

View File (dashboard HTML template, Alpine.js UI)

```diff
@@ -179,7 +179,9 @@ tr:hover td{background:rgba(255,255,255,.025)}
             <div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
             <div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
             <div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
-            <div class="mrow"><span class="mlabel">SEO status</span><span x-text="modal.ai.seo_status||'—'"></span></div>
+            <div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
+            <div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>
+            <div class="mrow"><span class="mlabel">GDPR</span><span :style="(!modal.sa?.has_cookie_notice)?'color:var(--danger)':''" x-text="modal.ai.gdpr_compliance||'—'"></span></div>

             <!-- Content issues -->
             <div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
@@ -225,6 +227,14 @@ tr:hover td{background:rgba(255,255,255,.025)}
               <div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
             </div>

+            <!-- Accessibility issues -->
+            <div x-show="(modal.ai.accessibility_issues||[]).length>0" style="margin:8px 0">
+              <div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Accessibility Issues</div>
+              <template x-for="issue in (modal.ai.accessibility_issues||[])">
+                <div style="font-size:12px;color:var(--warn);padding:2px 0"><span x-text="issue"></span></div>
+              </template>
+            </div>
+
             <div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
             <div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
@@ -431,7 +441,11 @@ tr:hover td{background:rgba(255,255,255,.025)}
           <div style="font-size:12px;color:var(--muted);margin-bottom:8px">
             Auto-assesses enriched domains via Gemini. Detects Kit Digital confirmation, extracts best contact channel, writes pitch.
           </div>
+          <div style="display:flex;gap:6px;flex-wrap:wrap">
             <button class="btn bai" @click="aiAssessAllKD()">🤖 AI Assess all Kit Digital domains</button>
+            <button class="btn bg sm" @click="restartAiWorker()">↺ Restart AI worker</button>
+            <a class="btn bg sm" href="/api/ai/debug" target="_blank" style="text-decoration:none">🔍 Debug AI queue</a>
+          </div>
         </div>

         <div style="margin-top:18px;padding-top:14px;border-top:1px solid var(--border)">
@@ -616,6 +630,7 @@ function app() {
       try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
     },
+    async restartAiWorker() { await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
     async startEnrich() { await fetch('/api/enrich/resume',{method:'POST'}); this.notify('Worker started','success'); await this.loadQueue(); },
     async pauseEnrich() { await fetch('/api/enrich/pause',{method:'POST'}); this.notify('Worker paused','success'); await this.loadQueue(); },
     async retryFailed() { await fetch('/api/enrich/retry',{method:'POST'}); this.notify('Retrying failed','success'); await this.loadQueue(); },
```