feat: deep site analysis engine + fix AI assess for any domain

site_analyzer.py (new):
- Fresh scrape with timing, page size, server, CMS detection
- Lorem ipsum detection (17 phrases incl. user's example)
- Placeholder content detection (hello world, sample page, etc.)
- Analytics: GA4, GTM, Facebook Pixel, Hotjar, Clarity
- Webmaster: Google Search Console, Bing, Yandex verification tags
- sitemap.xml and robots.txt check + Googlebot block detection
- Mobile viewport check, word count, image/script count
- Full contact extraction: emails, phones, WhatsApp, social links
- Kit Digital signal detection

AI worker fix:
- No longer requires pre-enrichment — works on ANY selected domain
- Does fresh site_analyzer scrape then calls Gemini with full context
- Stores site_analysis JSON alongside AI assessment
- Upserts into enriched_domains even if domain was never enriched

Gemini prompt now includes:
- Complete technical snapshot (load time, size, server, SSL)
- Full SEO signals (sitemap, robots, analytics, webmaster verified)
- Content quality (lorem ipsum matches, placeholder matches)
- Kit Digital signals
- All extracted contacts
- 500-word page text sample
- Outputs: summary, site_quality_score/10, content_issues[],
  urgency_signals[], performance_notes, seo_status,
  best_contact_channel+value, all_contacts, ES pitch,
  services_needed, outreach_notes

UI: rich AI modal with summary banner, quality grid, content issues,
    urgency signals, full contact list, technical snapshot

Fixes: correct Replicate token, ai_queue status='running' bug

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:46:01 +02:00
parent faca4b6e1a
commit 5ad8259c75
7 changed files with 530 additions and 111 deletions

View File

@@ -35,7 +35,8 @@ CREATE TABLE IF NOT EXISTS enriched_domains (
ai_pitch TEXT,
ai_contact_channel TEXT,
ai_contact_value TEXT,
ai_assessed_at TEXT
ai_assessed_at TEXT,
site_analysis TEXT
);
CREATE TABLE IF NOT EXISTS job_queue (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -71,6 +72,7 @@ _MIGRATIONS = [
"ALTER TABLE enriched_domains ADD COLUMN ai_contact_channel TEXT",
"ALTER TABLE enriched_domains ADD COLUMN ai_contact_value TEXT",
"ALTER TABLE enriched_domains ADD COLUMN ai_assessed_at TEXT",
"ALTER TABLE enriched_domains ADD COLUMN site_analysis TEXT",
"CREATE TABLE IF NOT EXISTS ai_queue (domain TEXT PRIMARY KEY, status TEXT DEFAULT 'pending', created_at TEXT DEFAULT (datetime('now')), completed_at TEXT, error TEXT)",
]
@@ -352,13 +354,19 @@ async def get_ai_queue_status():
}
async def save_ai_assessment(domain: str, assessment: dict):
async def save_ai_assessment(domain: str, assessment: dict, site_analysis: dict = None):
import json as _json
async with aiosqlite.connect(SQLITE_PATH) as db:
# Upsert into enriched_domains (domain may not exist yet if assessed before full enrichment)
await db.execute(
"""INSERT INTO enriched_domains (domain) VALUES (?) ON CONFLICT(domain) DO NOTHING""",
(domain,),
)
await db.execute(
"""UPDATE enriched_domains SET
ai_assessment=?, ai_lead_quality=?, ai_pitch=?,
ai_contact_channel=?, ai_contact_value=?, ai_assessed_at=datetime('now')
ai_contact_channel=?, ai_contact_value=?, ai_assessed_at=datetime('now'),
site_analysis=?
WHERE domain=?""",
(
_json.dumps(assessment),
@@ -366,6 +374,26 @@ async def save_ai_assessment(domain: str, assessment: dict):
assessment.get("pitch_angle"),
assessment.get("best_contact_channel"),
assessment.get("best_contact_value"),
_json.dumps(site_analysis) if site_analysis else None,
domain,
),
)
# Also update contact_info + kit_digital from site_analysis if available
if site_analysis:
contacts = {
"emails": site_analysis.get("emails", []),
"phones": site_analysis.get("phones", []),
"whatsapp": site_analysis.get("whatsapp", []),
"social": site_analysis.get("social_links", []),
}
await db.execute(
"""UPDATE enriched_domains SET
kit_digital=?, kit_digital_signals=?, contact_info=?
WHERE domain=?""",
(
int(site_analysis.get("kit_digital", False)),
_json.dumps(site_analysis.get("kit_digital_signals", [])),
_json.dumps(contacts),
domain,
),
)

View File

@@ -13,7 +13,7 @@ import dns.resolver
import aiosqlite
from bs4 import BeautifulSoup
from app.db import SQLITE_PATH, queue_ai, save_ai_assessment, get_ai_queue_status
from app.db import SQLITE_PATH, queue_ai, save_ai_assessment
from app.scorer import score
logger = logging.getLogger(__name__)
@@ -340,17 +340,17 @@ async def worker_loop():
async def ai_worker_loop():
from app.replicate_ai import assess_domain as gemini_assess
from app.site_analyzer import analyze_site
while True:
async with aiosqlite.connect(SQLITE_PATH) as db:
async with db.execute(
"SELECT domain FROM ai_queue WHERE status='pending' LIMIT 20"
"SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
) as cur:
rows = await cur.fetchall()
# Mark as running
if rows:
await db.executemany(
"UPDATE ai_queue SET status='running', created_at=created_at WHERE domain=?",
"UPDATE ai_queue SET status='running' WHERE domain=?",
[(r[0],) for r in rows],
)
await db.commit()
@@ -361,16 +361,11 @@ async def ai_worker_loop():
async def assess_one(domain: str):
try:
async with aiosqlite.connect(SQLITE_PATH) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM enriched_domains WHERE domain=?", (domain,)
) as cur:
row = await cur.fetchone()
if not row:
return
assessment = await gemini_assess(dict(row))
await save_ai_assessment(domain, assessment)
# Always do a fresh deep scrape — no pre-enrichment required
analysis = await analyze_site(domain)
assessment = await gemini_assess(analysis)
await save_ai_assessment(domain, assessment, site_analysis=analysis)
logger.info("AI done: %s%s", domain, assessment.get("lead_quality"))
except Exception as e:
async with aiosqlite.connect(SQLITE_PATH) as db:
await db.execute(
@@ -380,6 +375,7 @@ async def ai_worker_loop():
await db.commit()
logger.error("AI worker error %s: %s", domain, e)
# AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)

View File

@@ -177,22 +177,16 @@ async def ai_status():
@app.post("/api/ai/assess/single")
async def ai_assess_single(body: dict):
"""Immediate (blocking) AI assessment of a single domain."""
"""Immediate (blocking) AI assessment — does fresh scrape, no pre-enrichment needed."""
domain = body.get("domain")
if not domain:
return JSONResponse({"error": "no domain"}, status_code=400)
from app.site_analyzer import analyze_site
from app.replicate_ai import assess_domain as gemini_assess
async with aiosqlite.connect(SQLITE_PATH) as db:
db.row_factory = aiosqlite.Row
async with db.execute(
"SELECT * FROM enriched_domains WHERE domain=?", (domain,)
) as cur:
row = await cur.fetchone()
if not row:
return JSONResponse({"error": "domain not yet enriched"}, status_code=404)
assessment = await gemini_assess(dict(row))
await save_ai_assessment(domain, assessment)
return assessment
analysis = await analyze_site(domain)
assessment = await gemini_assess(analysis)
await save_ai_assessment(domain, assessment, site_analysis=analysis)
return {**assessment, "site_analysis": analysis}
@app.get("/api/export")

View File

@@ -1,4 +1,4 @@
"""Replicate / Gemini integration for domain lead assessment."""
"""Replicate / Gemini integration — deep site assessment."""
import asyncio
import json
import logging
@@ -10,7 +10,7 @@ import httpx
logger = logging.getLogger(__name__)
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_6kV2NWMQyPVB9JILHJprrXJJh4vWazA22Osyj")
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_7I7Feai78f9PzMOs20y5GVFKiLkgUWP463vZO") # override via env
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))
@@ -24,66 +24,92 @@ def _sem() -> asyncio.Semaphore:
return _ai_sem
def _build_prompt(row: dict) -> str:
kit_signals = row.get("kit_digital_signals") or "[]"
try:
sigs = json.loads(kit_signals)
kit_block = "\n".join(f" - {s}" for s in sigs) if sigs else " None detected"
except Exception:
kit_block = f" {kit_signals}"
def _build_prompt(a: dict) -> str:
"""Build the Gemini prompt from a full site analysis dict."""
contacts_block = []
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
contacts_str = "\n".join(contacts_block) or " None found"
contact_raw = row.get("contact_info") or "{}"
try:
contacts = json.loads(contact_raw)
except Exception:
contacts = {}
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
contact_block = []
if contacts.get("emails"):
contact_block.append(f" Emails: {', '.join(contacts['emails'][:3])}")
if contacts.get("phones"):
contact_block.append(f" Phones: {', '.join(contacts['phones'][:3])}")
if contacts.get("whatsapp"):
contact_block.append(f" WhatsApp: {', '.join(contacts['whatsapp'][:2])}")
if contacts.get("social"):
contact_block.append(f" Social: {', '.join(contacts['social'][:4])}")
contact_str = "\n".join(contact_block) if contact_block else " None found"
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
return f"""You are a sales intelligence analyst evaluating Spanish SME websites for IT services upsell.
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
DOMAIN DATA:
- Domain: {row.get("domain")}
- Page title: {row.get("page_title") or "N/A"}
- CMS: {row.get("cms") or "unknown"}
- Server: {row.get("server") or "unknown"}
- Country: {row.get("ip_country") or "unknown"}
- SSL valid: {row.get("ssl_valid")}, expires in {row.get("ssl_expiry_days") or "?"} days
- Has email (MX): {bool(row.get("has_mx"))}
- Is live: {bool(row.get("is_live"))}
- Kit Digital signals found on page:
{kit_block}
- Contact channels found on page:
{contact_str}
=== TECHNICAL SNAPSHOT ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Final URL: {a.get("final_url")}
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
Mobile viewport: {a.get("has_mobile_viewport")}
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
Kit Digital is a Spanish government program (up to €12k grants for SME digitalization). Sites that received it MUST display EU/digitalizadores logos. These businesses have proven they invest in IT services and may need follow-up: new website, SEO, hosting migration, security, maintenance contracts.
=== SEO & INDEXING SIGNALS ===
Page title: {a.get("page_title") or "missing"}
H1: {a.get("h1_text") or "missing"}
Meta description: {a.get("meta_description") or "missing"}
Canonical URL: {a.get("canonical_url") or "not set"}
Sitemap.xml: {a.get("has_sitemap")}
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
Analytics: {analytics_str}
Webmaster verified:{webmaster_str}
Assess this lead and respond ONLY with valid JSON (no markdown, no explanation outside the JSON):
=== CONTENT QUALITY ===
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
Detected: {a.get("kit_digital")}
Signals:
{kd_str}
=== CONTACT CHANNELS ===
{contacts_str}
=== PAGE TEXT SAMPLE (first 2000 chars) ===
{text_snippet}
=== TASK ===
Analyse this site for IT services upsell potential. The client sells:
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
{{
"is_local_sme": true/false,
"summary": "2-3 sentence executive summary of the site's current state",
"site_quality_score": <0-10 integer>,
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
"performance_notes": "comment on load time, page size, mobile readiness",
"seo_status": "brief SEO assessment — indexing signals, missing elements",
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence explaining why or why not",
"kit_digital_reasoning": "1 sentence — why confirmed or not",
"is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences on why this is a good/bad lead for IT services sales",
"lead_reasoning": "1-2 sentences on why",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual email/phone/URL to use, or empty string",
"pitch_angle": "One concrete opening sentence for a cold email or call in Spanish",
"services_likely_needed": ["service1", "service2"],
"outreach_notes": "Any useful context for the sales rep (language, business type, urgency)"
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
"all_contacts": {{
"emails": [],
"phones": [],
"whatsapp": [],
"social": []
}},
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
"services_needed": ["service1", "service2"],
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
"outreach_notes": "Key context for the sales rep"
}}"""
def _parse_output(raw: str) -> dict:
"""Extract JSON from Gemini text output."""
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
m = re.search(r"\{[\s\S]+\}", text)
if m:
@@ -91,8 +117,9 @@ def _parse_output(raw: str) -> dict:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
return {
"raw": raw[:500],
"summary": raw[:400],
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
@@ -100,22 +127,22 @@ def _parse_output(raw: str) -> dict:
}
async def assess_domain(row: dict) -> dict:
"""Call Gemini via Replicate to assess a domain. Returns parsed assessment dict."""
async def assess_domain(analysis: dict) -> dict:
"""Call Gemini with the full site analysis. Returns parsed assessment."""
async with _sem():
payload = {
"input": {
"prompt": _build_prompt(row),
"prompt": _build_prompt(analysis),
"images": [],
"videos": [],
"top_p": 0.9,
"temperature": 0.2,
"thinking_level": "low",
"max_output_tokens": 1024,
"max_output_tokens": 2048,
}
}
try:
async with httpx.AsyncClient(timeout=90) as client:
async with httpx.AsyncClient(timeout=120) as client:
resp = await client.post(
REPLICATE_MODEL,
headers={
@@ -133,10 +160,15 @@ async def assess_domain(row: dict) -> dict:
output = "".join(output)
result = _parse_output(output)
logger.info("AI %s%s / contact: %s",
row.get("domain"), result.get("lead_quality"), result.get("best_contact_channel"))
logger.info("AI %s%s (quality %s)",
analysis.get("domain"), result.get("lead_quality"), result.get("site_quality_score"))
return result
except Exception as e:
logger.error("Replicate error %s: %s", row.get("domain"), e)
return {"error": str(e)[:300], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": ""}
logger.error("Replicate error %s: %s", analysis.get("domain"), e)
return {
"error": str(e)[:300],
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
}

277
app/site_analyzer.py Normal file
View File

@@ -0,0 +1,277 @@
"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
import asyncio
import re
import time
import logging
from typing import Optional
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
# ── Content quality ───────────────────────────────────────────────────────────
# Lorem-ipsum filler fragments, matched case-insensitively against the page's
# visible text in analyze_site().
LOREM_PHRASES = [
    "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
    "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
    "vestibulum commodo", "fusce nunc lacus", "consectetuer",
    "cras ornare tristique",
    "nulla nec ante",  # fixed typo ("ntulla nec ante" could never match real filler)
    "risus id metus",
    "praesent placerat", "fusce pellentesque", "suscipit nibh",
    "integer vitae libero", "felis quis tortor",
]
# Tell-tale strings of never-finished sites (CMS defaults, stub pages).
PLACEHOLDER_PHRASES = [
    "under construction", "coming soon", "sample page",
    "this is a demo", "default post", "hello world",
    "test post", "uncategorized",
]
# ── Analytics & webmaster tags ────────────────────────────────────────────────
# Each signature is lower-cased and substring-matched against the raw HTML.
ANALYTICS = {
    # NOTE: the previous bare "G-" signature matched virtually every page
    # (CSS classes like "g-recaptcha", "g-4"); require the GA4 gtag loader
    # URL with a measurement-id prefix instead.
    "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "gtag/js?id=g-"],
    "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
    "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
    "hotjar": ["static.hotjar.com"],
    "clarity": ["clarity.ms/tag"],
}
# Meta-tag names proving the owner verified the site with a search engine.
WEBMASTER = {
    "google_search_console": ['google-site-verification'],
    "bing_webmaster": ['msvalidate.01'],
    "yandex": ['yandex-verification'],
}
# Kit Digital (Spanish SME digitalization grant) markers: grantees must show
# EU / digitalizador logos, so both image paths/alts and page text are scanned.
KIT_IMG_PATS = [
    "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
    "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
    "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado",
]
KIT_TEXT_PATS = [
    "kit digital", "agente digitalizador", "fondos europeos",
    "next generation eu", "nextgenerationeu", "plan de recuperación",
    "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme",
]
# Contact extraction: generic emails, Spanish phone numbers (optional +34,
# mobile/landline prefixes 6/7/8/9), and recognised social-link domains.
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
async def analyze_site(domain: str) -> dict:
    """Fetch and deeply analyse a site's homepage.

    A single GET (HTTPS, falling back to plain HTTP on a >= 400 response)
    drives all on-page signals: content quality (lorem ipsum / placeholder
    text), SEO meta tags, analytics & webmaster verification tags, Kit
    Digital markers and contact extraction.  sitemap.xml and robots.txt are
    then probed in parallel, and the TLS certificate is inspected in a
    worker thread (blocking socket I/O).

    Args:
        domain: bare host name, e.g. ``"example.es"`` (no scheme).

    Returns:
        A flat dict in which every key is always present; ``error`` carries
        the exception text when the main page could not be fetched/parsed.
    """
    result = {
        "domain": domain,
        "reachable": False,
        "load_time_ms": None,
        "status_code": None,
        "final_url": None,
        "page_size_kb": None,
        "server": None,
        "cms": None,
        "ssl_valid": False,
        "ssl_expiry_days": None,
        # Content quality
        "has_lorem_ipsum": False,
        "lorem_matches": [],
        "has_placeholder": False,
        "placeholder_matches": [],
        "word_count": 0,
        "image_count": 0,
        "broken_images": 0,  # reserved — not computed in this pass
        "script_count": 0,
        "has_mobile_viewport": False,
        "page_title": None,
        "meta_description": None,
        "h1_text": None,
        "visible_text_snippet": "",
        # SEO / webmaster
        "has_sitemap": False,
        "has_robots": False,
        "robots_disallows_google": False,
        "analytics_present": [],
        "webmaster_verified": [],
        "canonical_url": None,
        "og_title": None,
        # Kit Digital
        "kit_digital": False,
        "kit_digital_signals": [],
        # Contacts
        "emails": [],
        "phones": [],
        "whatsapp": [],
        "social_links": [],
        # Errors
        "error": None,
    }

    # ── Fetch main page ───────────────────────────────────────────────────
    try:
        t0 = time.monotonic()
        # verify=False is deliberate: a broken certificate is itself a lead
        # signal here, and the certificate is validated separately below.
        async with httpx.AsyncClient(
            timeout=15, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
        ) as client:
            resp = await client.get(f"https://{domain}")
            if resp.status_code >= 400:
                # HTTPS answered with an error — retry over plain HTTP.
                resp = await client.get(f"http://{domain}")
        # Total wall time, including a possible HTTP retry.
        load_ms = int((time.monotonic() - t0) * 1000)
        html = resp.text
        result.update({
            "reachable": resp.status_code < 400,
            "load_time_ms": load_ms,
            "status_code": resp.status_code,
            "final_url": str(resp.url),
            "page_size_kb": round(len(resp.content) / 1024, 1),
            "server": resp.headers.get("server"),
        })

        soup = BeautifulSoup(html, "html.parser")
        hl = html.lower()

        # Title / meta description / H1
        title_tag = soup.find("title")
        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
        meta_desc = soup.find("meta", attrs={"name": "description"})
        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
        h1 = soup.find("h1")
        result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None

        # Mobile viewport
        result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))

        # Canonical + OpenGraph title
        canon = soup.find("link", rel="canonical")
        result["canonical_url"] = canon.get("href") if canon else None
        og = soup.find("meta", property="og:title")
        result["og_title"] = og.get("content") if og else None

        # External-script count MUST be taken before decompose() below, which
        # removes every <script> from the soup (the old order made it always 0).
        result["script_count"] = len(soup.find_all("script", src=True))

        # Visible text (scripts/styles stripped from the soup only; the raw
        # `html` string stays intact for the signature/email scans below).
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        visible_text = soup.get_text(separator=" ", strip=True)
        words = visible_text.split()
        result["word_count"] = len(words)
        result["visible_text_snippet"] = " ".join(words[:500])

        # Lorem ipsum / placeholder detection on lower-cased visible text
        vl = visible_text.lower()
        lorem_hits = [p for p in LOREM_PHRASES if p in vl]
        result["has_lorem_ipsum"] = bool(lorem_hits)
        result["lorem_matches"] = lorem_hits[:5]
        ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
        result["has_placeholder"] = bool(ph_hits)
        result["placeholder_matches"] = ph_hits[:3]

        # Images
        imgs = soup.find_all("img")
        result["image_count"] = len(imgs)

        # Analytics / webmaster tags (substring scan over raw lower-cased HTML)
        for name, sigs in ANALYTICS.items():
            if any(s.lower() in hl for s in sigs):
                result["analytics_present"].append(name)
        for name, sigs in WEBMASTER.items():
            if any(s.lower() in hl for s in sigs):
                result["webmaster_verified"].append(name)

        # Kit Digital signals: logo images, page text, outbound links
        kd_signals = []
        for img in imgs:
            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
            for p in KIT_IMG_PATS:
                if p in combined:
                    kd_signals.append(f"img:{p}")
                    break
        for p in KIT_TEXT_PATS:
            if p in hl:
                kd_signals.append(f"text:{p}")
        for a in soup.find_all("a", href=True):
            href = a["href"].lower()
            if "acelerapyme" in href or "red.es" in href or "kit-digital" in href:
                kd_signals.append(f"link:{href[:50]}")
        kd_signals = list(dict.fromkeys(kd_signals))[:10]  # order-preserving dedupe
        result["kit_digital"] = bool(kd_signals)
        result["kit_digital_signals"] = kd_signals

        # Contact extraction from anchors
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("mailto:"):
                em = href[7:].split("?")[0].strip().lower()
                if em and em not in result["emails"]:
                    result["emails"].append(em)
            elif href.startswith("tel:"):
                ph = re.sub(r"[^\d+]", "", href[4:])
                if ph and ph not in result["phones"]:
                    result["phones"].append(ph)
            elif "wa.me" in href or "api.whatsapp.com" in href:
                if href not in result["whatsapp"]:
                    result["whatsapp"].append(href[:80])
            else:
                for sd in SOCIAL_DOM:
                    if sd in href.lower():
                        clean = href.split("?")[0].rstrip("/")
                        if clean not in result["social_links"]:
                            result["social_links"].append(clean)
                        break

        # Plain-text fallbacks (first 80 KB only, to bound regex cost)
        for em in EMAIL_RE.findall(html[:80000]):
            em = em.lower()
            # Skip asset paths that merely look like emails (logo@2x.png …)
            if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
                result["emails"].append(em)
        for ph in PHONE_RE.findall(visible_text):
            ph_c = re.sub(r"[\s\-]", "", ph)
            if ph_c not in result["phones"]:
                result["phones"].append(ph_c)

        # Dedupe (order-preserving) and cap every contact list at 5 entries
        for k in ["emails", "phones", "whatsapp", "social_links"]:
            result[k] = list(dict.fromkeys(result[k]))[:5]

        # CMS fingerprinting (local import — presumably avoids a circular
        # module import with app.enricher; confirm before moving to top level)
        from app.enricher import detect_cms
        result["cms"] = detect_cms(html, dict(resp.headers))
    except Exception as e:
        result["error"] = str(e)[:300]

    # ── Sitemap & robots (parallel) ───────────────────────────────────────
    async def _check_url(url: str) -> Optional[str]:
        """Return the body of *url* on HTTP 200, else None (best-effort)."""
        try:
            async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                r = await c.get(url)
                return r.text if r.status_code == 200 else None
        except Exception:
            return None

    sitemap_txt, robots_txt = await asyncio.gather(
        _check_url(f"https://{domain}/sitemap.xml"),
        _check_url(f"https://{domain}/robots.txt"),
    )
    result["has_sitemap"] = sitemap_txt is not None
    result["has_robots"] = robots_txt is not None
    if robots_txt:
        # Use the stdlib parser so per-agent sections are honoured; the old
        # substring test ("disallow: /" + "googlebot" anywhere in the file)
        # false-positived on e.g. "Disallow: /images".
        from urllib import robotparser
        try:
            rp = robotparser.RobotFileParser()
            rp.parse(robots_txt.splitlines())
            result["robots_disallows_google"] = not rp.can_fetch("Googlebot", f"https://{domain}/")
        except Exception:
            pass

    # ── SSL certificate (blocking socket work → thread executor) ──────────
    import ssl as _ssl, socket as _socket
    try:
        def _ssl_check():
            import datetime as _dt
            ctx = _ssl.create_default_context()
            with _socket.create_connection((domain, 443), timeout=5) as s:
                with ctx.wrap_socket(s, server_hostname=domain) as ss:
                    cert = ss.getpeercert()
            exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
            # (exp - now).days is days-until-expiry directly; the previous
            # (now - exp).days * -1 was off by one because timedelta.days
            # floors toward minus infinity for negative deltas.
            return True, (exp - _dt.datetime.utcnow()).days
        result["ssl_valid"], result["ssl_expiry_days"] = await asyncio.get_running_loop().run_in_executor(None, _ssl_check)
    except Exception:
        # Handshake failure / missing cert / timeout — leave the defaults.
        pass
    return result

View File

@@ -136,11 +136,10 @@ tr:hover td{background:rgba(255,255,255,.025)}
/* AI detail modal */
.modal-bg{position:fixed;inset:0;background:#000a;z-index:300;display:flex;align-items:center;justify-content:center}
.modal{background:var(--surface);border:1px solid var(--border);border-radius:var(--r);padding:20px;max-width:500px;width:90%;max-height:80vh;overflow-y:auto}
.modal h2{font-size:16px;font-weight:800;margin-bottom:12px}
.modal .row{display:flex;gap:8px;margin-bottom:8px;font-size:13px}
.modal .label{color:var(--muted);min-width:110px;font-size:12px}
.modal .val{color:var(--text)}
.modal{background:var(--surface);border:1px solid var(--border);border-radius:var(--r);padding:18px;max-width:560px;width:95%;max-height:88vh;overflow-y:auto}
.modal h2{font-size:15px;font-weight:800}
.mrow{display:flex;gap:8px;margin-bottom:6px;font-size:12px;line-height:1.4}
.mlabel{color:var(--muted);min-width:90px;font-size:11px;padding-top:1px;flex-shrink:0}
@media(max-width:700px){.pipeline{grid-template-columns:1fr}.sg{grid-template-columns:1fr 1fr}}
</style>
@@ -153,15 +152,99 @@ tr:hover td{background:rgba(255,255,255,.025)}
<!-- AI Detail Modal -->
<div class="modal-bg" x-show="modal.open" @click.self="modal.open=false" x-cloak>
<div class="modal" @click.stop>
<h2>AI Assessment — <span style="color:var(--accent2)" x-text="modal.domain"></span></h2>
<div class="row"><span class="label">Lead quality</span><span class="val"><span class="pill" :class="aiPillClass(modal.data.lead_quality)" x-text="modal.data.lead_quality || '—'"></span></span></div>
<div class="row"><span class="label">Kit Digital</span><span class="val" x-text="modal.data.kit_digital_confirmed ? '✅ Confirmed' : '❌ Not confirmed'"></span></div>
<div class="row"><span class="label">KD reasoning</span><span class="val" x-text="modal.data.kit_digital_reasoning || '—'"></span></div>
<div class="row"><span class="label">Lead reasoning</span><span class="val" x-text="modal.data.lead_reasoning || '—'"></span></div>
<div class="row"><span class="label">Best channel</span><span class="val" x-text="(modal.data.best_contact_channel || '—') + (modal.data.best_contact_value ? ': ' + modal.data.best_contact_value : '')"></span></div>
<div class="row"><span class="label">Pitch</span><span class="val" style="font-style:italic;color:var(--accent2)" x-text="modal.data.pitch_angle || '—'"></span></div>
<div class="row"><span class="label">Services needed</span><span class="val" x-text="(modal.data.services_likely_needed || []).join(', ') || '—'"></span></div>
<div class="row"><span class="label">Outreach notes</span><span class="val" x-text="modal.data.outreach_notes || '—'"></span></div>
<div style="display:flex;justify-content:space-between;align-items:flex-start;margin-bottom:12px">
<h2>AI Report — <span style="color:var(--accent2)" x-text="modal.domain"></span></h2>
<button class="btn bg sm" @click="modal.open=false"></button>
</div>
<!-- Summary banner -->
<div x-show="modal.ai.summary" style="background:var(--surface2);border-radius:6px;padding:10px 12px;margin-bottom:12px;font-size:12px;line-height:1.5;color:var(--text)" x-text="modal.ai.summary"></div>
<!-- Lead + quality -->
<div style="display:grid;grid-template-columns:1fr 1fr 1fr;gap:8px;margin-bottom:12px">
<div style="background:var(--surface2);border-radius:6px;padding:8px;text-align:center">
<div style="font-size:10px;color:var(--muted);margin-bottom:3px">LEAD</div>
<span class="pill" :class="aiPillClass(modal.ai.lead_quality)" x-text="modal.ai.lead_quality||'—'"></span>
</div>
<div style="background:var(--surface2);border-radius:6px;padding:8px;text-align:center">
<div style="font-size:10px;color:var(--muted);margin-bottom:3px">SITE QUALITY</div>
<span class="score" :style="qualityBg(modal.ai.site_quality_score)" x-text="(modal.ai.site_quality_score??'—')+'/10'"></span>
</div>
<div style="background:var(--surface2);border-radius:6px;padding:8px;text-align:center">
<div style="font-size:10px;color:var(--muted);margin-bottom:3px">KIT DIGITAL</div>
<span x-text="modal.ai.kit_digital_confirmed ? '✅ Yes' : '❌ No'" style="font-size:13px;font-weight:700"></span>
</div>
</div>
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
<div class="mrow"><span class="mlabel">SEO status</span><span x-text="modal.ai.seo_status||'—'"></span></div>
<!-- Content issues -->
<div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Content Issues</div>
<template x-for="issue in (modal.ai.content_issues||[])">
<div style="font-size:12px;color:var(--danger);padding:2px 0"><span x-text="issue"></span></div>
</template>
</div>
<!-- Urgency signals -->
<div x-show="(modal.ai.urgency_signals||[]).length>0" style="margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Urgency Signals</div>
<template x-for="sig in (modal.ai.urgency_signals||[])">
<div style="font-size:12px;color:var(--warn);padding:2px 0">🔴 <span x-text="sig"></span></div>
</template>
</div>
<!-- Contact -->
<div style="background:var(--surface2);border-radius:6px;padding:10px;margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:6px">Best Contact</div>
<div style="font-size:13px;font-weight:700;color:var(--accent2)" x-text="(modal.ai.best_contact_channel||'unknown').toUpperCase()"></div>
<div style="font-size:12px;color:var(--text);margin-top:2px;word-break:break-all" x-text="modal.ai.best_contact_value||'—'"></div>
<!-- All contacts from site_analysis -->
<div x-show="modal.sa" style="margin-top:8px;display:flex;flex-wrap:wrap;gap:4px">
<template x-for="em in (modal.sa?.emails||[])">
<a :href="'mailto:'+em" class="chip email" x-text="em"></a>
</template>
<template x-for="ph in (modal.sa?.phones||[])">
<a :href="'tel:'+ph" class="chip phone" x-text="ph"></a>
</template>
<template x-for="wa in (modal.sa?.whatsapp||[])">
<a :href="wa" target="_blank" class="chip wa">💬 WhatsApp</a>
</template>
<template x-for="s in (modal.sa?.social_links||[]).slice(0,3)">
<a :href="s" target="_blank" class="chip social" x-text="s.replace('https://','').split('/')[0]"></a>
</template>
</div>
</div>
<!-- Pitch -->
<div style="background:#6c63ff15;border:1px solid #6c63ff33;border-radius:6px;padding:10px;margin:8px 0">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Cold Pitch (ES)</div>
<div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
</div>
<div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
<div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
<!-- Site analysis tech snapshot -->
<div x-show="modal.sa" style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:6px">Technical Snapshot</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:4px;font-size:11px">
<div>Load time: <b x-text="(modal.sa?.load_time_ms||'—')+'ms'"></b></div>
<div>Page size: <b x-text="(modal.sa?.page_size_kb||'—')+'KB'"></b></div>
<div>CMS: <b x-text="modal.sa?.cms||'unknown'"></b></div>
<div>Server: <b x-text="modal.sa?.server||'—'"></b></div>
<div>Sitemap: <b x-text="modal.sa?.has_sitemap?'✅':'❌'"></b></div>
<div>Robots: <b x-text="modal.sa?.has_robots?'✅':'❌'"></b></div>
<div>Analytics: <b x-text="(modal.sa?.analytics_present||[]).join(', ')||'none'"></b></div>
<div>Mobile: <b x-text="modal.sa?.has_mobile_viewport?'✅':'❌'"></b></div>
<div>Lorem ipsum: <b :style="modal.sa?.has_lorem_ipsum?'color:var(--danger)':''" x-text="modal.sa?.has_lorem_ipsum?'⚠ YES':'No'"></b></div>
<div>Words: <b x-text="modal.sa?.word_count||'—'"></b></div>
</div>
</div>
<button class="btn bg" style="margin-top:14px;width:100%" @click="modal.open=false">Close</button>
</div>
</div>
@@ -436,7 +519,7 @@ function app() {
qst: {}, customDomains: '',
pipeline: {hot:{count:0,samples:[]},warm:{count:0,samples:[]},cold:{count:0,samples:[]}},
toast: {show:false,msg:'',type:'success'},
modal: {open:false,domain:'',data:{}},
modal: {open:false, domain:'', ai:{}, sa:null},
_chart: null, _poll: null, _toastTimer: null,
async init() {
@@ -556,11 +639,20 @@ function app() {
openModal(row) {
this.modal.domain = row.domain;
try { this.modal.data = row.ai_assessment ? JSON.parse(row.ai_assessment) : {}; }
catch(e) { this.modal.data = {}; }
try { this.modal.ai = row.ai_assessment ? JSON.parse(row.ai_assessment) : {}; }
catch(e) { this.modal.ai = {}; }
try { this.modal.sa = row.site_analysis ? JSON.parse(row.site_analysis) : null; }
catch(e) { this.modal.sa = null; }
this.modal.open = true;
},
qualityBg(s) {
if(s==null) return 'background:#333;color:#888';
if(s>=8) return 'background:#00d4aa22;color:var(--accent2)';
if(s>=5) return 'background:#ffb34722;color:var(--warn)';
return 'background:#ff4f6d22;color:var(--danger)';
},
scoreBg(s) {
if(s==null) return 'background:#333;color:#888';
if(s>=80) return 'background:#ff4f6d22;color:#ff4f6d';

View File

@@ -13,6 +13,6 @@ services:
- SCORE_THRESHOLD=60
- TARGET_TLDS=es,com,net
- TARGET_COUNTRIES=ES,GB,DE,FR,RO,PT,AD,IT
- REPLICATE_API_TOKEN=r8_6kV2NWMQyPVB9JILHJprrXJJh4vWazA22Osyj
- REPLICATE_API_TOKEN=r8_7I7Feai78f9PzMOs20y5GVFKiLkgUWP463vZO
- AI_CONCURRENCY=3
restart: unless-stopped