feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini

- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets
  injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for
  additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 08:22:14 +02:00
parent dad910b6b0
commit d62e4e986e
3 changed files with 118 additions and 4 deletions

View File

@@ -7,6 +7,7 @@ import re
from typing import Optional
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
@@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore:
return _ai_sem
def _build_prompt(a: dict) -> str:
async def _ddg_search(query: str, max_results: int = 4) -> str:
    """DuckDuckGo HTML search — returns top snippet text, empty string on failure.

    Best-effort GET against DuckDuckGo's HTML endpoint, scraping up to
    *max_results* organic result entries.  Any network or parsing error is
    swallowed (logged at DEBUG only) so a failed search never blocks the
    Gemini call that consumes these snippets.

    Args:
        query: Raw search query string (sent as the ``q`` parameter).
        max_results: Maximum number of result snippets to include
            (default 4, matching the original hard-coded cap).

    Returns:
        Newline-joined ``[url] title — snippet`` lines, or ``""`` on any
        failure / non-200 response / empty result set.
    """
    try:
        async with httpx.AsyncClient(
            timeout=10, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            r = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},  # kl=es-es biases results to Spain
            )
            if r.status_code != 200:
                return ""
            soup = BeautifulSoup(r.text, "html.parser")
            parts = []
            for res in soup.select(".result")[:max_results]:
                title = res.select_one(".result__a")
                snip = res.select_one(".result__snippet")
                url = res.select_one(".result__url")
                if snip:  # a snippet is the minimum useful payload; skip snippet-less hits
                    t = title.get_text(strip=True) if title else ""
                    u = url.get_text(strip=True) if url else ""
                    # Bug fix: separator between title and snippet — previously they
                    # ran together ("{t}{snippet}") in the text handed to the LLM.
                    parts.append(f"[{u}] {t} — {snip.get_text(strip=True)}")
            return "\n".join(parts)
    except Exception as e:  # broad by design: search is strictly best-effort
        logger.debug("DDG search failed: %s", e)
        return ""
def _build_prompt(a: dict, search_results: str = "") -> str:
contacts_block = []
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
@@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str:
snippet = (a.get("visible_text_snippet") or "")[:2000]
social_str = ", ".join(a.get("social_links") or []) or "none detected"
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
copyright_yr = a.get("copyright_year") or "not found"
last_mod = a.get("last_modified") or "not found"
eu_hosted = a.get("eu_hosted")
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
@@ -84,9 +116,11 @@ Skip navigation link: {a.get("has_skip_nav")}
Empty links: {a.get("empty_links")}
Inputs without labels: {a.get("inputs_without_labels")}
=== CONTENT QUALITY ===
=== CONTENT QUALITY & FRESHNESS ===
Lorem ipsum: {a.get("has_lorem_ipsum")}{lorem_str}
Placeholder: {a.get("has_placeholder")}{ph_str}
Copyright year: {copyright_yr}
Last-Modified: {last_mod}
=== KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
Heuristic detected: {a.get("kit_digital")}
@@ -104,12 +138,22 @@ Profiles found on site: {social_str}
=== PAGE TEXT SAMPLE ===
{snippet}
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
{search_results if search_results else "No search results available."}
=== INSTRUCTIONS ===
The client sells: web redesign, SEO, hosting migration, SSL renewal,
security audits, GDPR compliance, accessibility fixes, Google Ads,
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
social media management (Instagram, Facebook, LinkedIn, TikTok).
IMPORTANT — use the WEB SEARCH RESULTS above to:
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
2. Identify the business owner name if available.
3. Populate best_contact_value with a real phone/email you found.
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
{{
"summary": "2-3 sentence executive summary of the site's state",
@@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
"accessibility_issues": ["specific a11y problems found"],
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
"kit_digital_confirmed": true/false,
"has_gmb": true/false,
"has_social_media": true/false,
@@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict:
async def assess_domain(analysis: dict) -> dict:
"""Call Gemini with the full site analysis. Returns parsed assessment."""
async with _sem():
# Build search query from domain / page title for contact lookup
domain = analysis.get("domain", "")
title = analysis.get("page_title") or ""
biz_name = title.split("|")[0].split("-")[0].strip() or domain
search_query = f'"{biz_name}" {domain} contacto telefono email'
search_results = await _ddg_search(search_query)
logger.info("DDG search for %s%d chars", domain, len(search_results))
payload = {
"input": {
"prompt": _build_prompt(analysis),
"prompt": _build_prompt(analysis, search_results),
"images": [],
"videos": [],
"top_p": 0.9,

View File

@@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict:
"has_gmb": False, "gmb_url": None,
# Contacts
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
# Age / freshness
"copyright_year": None, "last_modified": None,
"error": None,
}
@@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict:
result["cms"] = cms
break
# ── Last-Modified / copyright year ────────────────────────────────────
lm = (resp.headers.get("last-modified") or
(soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or
(soup.find("meta", property="article:modified_time") or {}).get("content"))
if lm:
result["last_modified"] = str(lm)[:30]
footer_el = (soup.find("footer") or
soup.find(id=re.compile(r"footer", re.I)) or
soup.find(class_=re.compile(r"footer", re.I)))
search_text = footer_el.get_text() if footer_el else visible[-600:]
cp = re.search(r"(?:©|&copy;|copyright)\s*[\d\-]*\s*(20\d{2})", search_text, re.I)
if not cp:
cp = re.search(r"(20\d{2})\s*[-]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I)
if cp:
result["copyright_year"] = cp.group(1) or cp.group(2)
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
async def _get(url):
try:
@@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict:
except Exception:
return None
sitemap_txt, robots_txt = await asyncio.gather(
async def _get_contact_page():
    """Return the HTML of the first contact/about page that responds, else None.

    Tries Spanish-first paths in order and stops at the first successful
    fetch; relies on the enclosing scope's ``_get`` helper and ``domain``.
    """
    candidate_paths = ("/contacto", "/contact", "/contactanos", "/sobre-nosotros")
    for path in candidate_paths:
        body = await _get(f"https://{domain}{path}")
        if body:
            return body
    return None
sitemap_txt, robots_txt, contact_html = await asyncio.gather(
_get(f"https://{domain}/sitemap.xml"),
_get(f"https://{domain}/robots.txt"),
_get_contact_page(),
)
result["has_sitemap"] = sitemap_txt is not None
result["has_robots"] = robots_txt is not None
@@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict:
rl = robots_txt.lower()
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
# Merge contacts from /contacto page
if contact_html:
try:
csoup = BeautifulSoup(contact_html, "html.parser")
for a in csoup.find_all("a", href=True):
href = a["href"]
if href.startswith("mailto:"):
em = href[7:].split("?")[0].strip().lower()
if em and em not in result["emails"]:
result["emails"].append(em)
elif href.startswith("tel:"):
ph = re.sub(r"[^\d+]", "", href[4:])
if ph and ph not in result["phones"]:
result["phones"].append(ph)
elif "wa.me" in href or "api.whatsapp.com" in href:
if href not in result["whatsapp"]:
result["whatsapp"].append(href[:80])
ctext = csoup.get_text()
for em in EMAIL_RE.findall(contact_html[:60000]):
em = em.lower()
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]):
result["emails"].append(em)
for ph in PHONE_RE.findall(ctext):
ph_c = re.sub(r"[\s\-]", "", ph)
if ph_c not in result["phones"]:
result["phones"].append(ph_c)
for k in ["emails", "phones", "whatsapp"]:
result[k] = list(dict.fromkeys(result[k]))[:5]
except Exception:
pass
# ── SSL ───────────────────────────────────────────────────────────────────
import ssl as _ssl
try:

View File

@@ -187,6 +187,8 @@ tr:hover td{background:rgba(255,255,255,.025)}
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
<div class="mrow"><span class="mlabel">CMS</span><span x-text="modal.ai.cms_detected || modal.sa?.cms || '—'"></span></div>
<div class="mrow"><span class="mlabel">Last updated</span><span :style="(modal.ai.site_last_updated&&parseInt(modal.ai.site_last_updated)<2021)?'color:var(--danger)':''" x-text="modal.ai.site_last_updated || (modal.sa?.copyright_year ? 'Copyright '+modal.sa.copyright_year : '—')"></span></div>
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
<div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
<div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>