feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini

- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets are injected into the prompt so Gemini can find phone/email not on the homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 08:22:14 +02:00
parent dad910b6b0
commit d62e4e986e
3 changed files with 118 additions and 4 deletions
--- a/app/replicate_ai.py
+++ b/app/replicate_ai.py
@@ -7,6 +7,7 @@ import re
 from typing import Optional

 import httpx
+from bs4 import BeautifulSoup

 logger = logging.getLogger(__name__)

@@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore:
    return _ai_sem


-def _build_prompt(a: dict) -> str:
+async def _ddg_search(query: str) -> str:
+    """DuckDuckGo HTML search — returns top snippet text, empty string on failure."""
+    try:
+        async with httpx.AsyncClient(
+            timeout=10, follow_redirects=True,
+            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
+        ) as client:
+            r = await client.get(
+                "https://html.duckduckgo.com/html/",
+                params={"q": query, "kl": "es-es"},
+            )
+            if r.status_code != 200:
+                return ""
+            soup = BeautifulSoup(r.text, "html.parser")
+            parts = []
+            for res in soup.select(".result")[:4]:
+                title = res.select_one(".result__a")
+                snip  = res.select_one(".result__snippet")
+                url   = res.select_one(".result__url")
+                if snip:
+                    t = title.get_text(strip=True) if title else ""
+                    u = url.get_text(strip=True)   if url   else ""
+                    parts.append(f"[{u}] {t} — {snip.get_text(strip=True)}")
+            return "\n".join(parts)
+    except Exception as e:
+        logger.debug("DDG search failed: %s", e)
+        return ""
+
+
+def _build_prompt(a: dict, search_results: str = "") -> str:
    contacts_block = []
    if a.get("emails"):      contacts_block.append(f"  Emails:    {', '.join(a['emails'][:3])}")
    if a.get("phones"):      contacts_block.append(f"  Phones:    {', '.join(a['phones'][:3])}")
@@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str:
    snippet     = (a.get("visible_text_snippet") or "")[:2000]
    social_str  = ", ".join(a.get("social_links") or []) or "none detected"
    gmb_str     = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
+    copyright_yr = a.get("copyright_year") or "not found"
+    last_mod    = a.get("last_modified") or "not found"

    eu_hosted   = a.get("eu_hosted")
    hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
@@ -84,9 +116,11 @@ Skip navigation link:  {a.get("has_skip_nav")}
 Empty links:           {a.get("empty_links")}
 Inputs without labels: {a.get("inputs_without_labels")}

-=== CONTENT QUALITY ===
+=== CONTENT QUALITY & FRESHNESS ===
 Lorem ipsum:     {a.get("has_lorem_ipsum")}  →  {lorem_str}
 Placeholder:     {a.get("has_placeholder")}  →  {ph_str}
+Copyright year:  {copyright_yr}
+Last-Modified:   {last_mod}

 === KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
 Heuristic detected: {a.get("kit_digital")}
@@ -104,12 +138,22 @@ Profiles found on site: {social_str}
 === PAGE TEXT SAMPLE ===
 {snippet}

+=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
+{search_results if search_results else "No search results available."}
+
 === INSTRUCTIONS ===
 The client sells: web redesign, SEO, hosting migration, SSL renewal,
 security audits, GDPR compliance, accessibility fixes, Google Ads,
 maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
 social media management (Instagram, Facebook, LinkedIn, TikTok).

+IMPORTANT — use the WEB SEARCH RESULTS above to:
+1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
+2. Identify the business owner name if available.
+3. Populate best_contact_value with a real phone/email you found.
+4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
+5. Determine the actual CMS from code signals and visible text (not just the heuristic).
+
 Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
 {{
  "summary": "2-3 sentence executive summary of the site's state",
@@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
  "hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
  "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
  "accessibility_issues": ["specific a11y problems found"],
+  "cms_detected": "wordpress|wix|squarespace|custom|unknown",
+  "site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
  "kit_digital_confirmed": true/false,
  "has_gmb": true/false,
  "has_social_media": true/false,
@@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict:
 async def assess_domain(analysis: dict) -> dict:
    """Call Gemini with the full site analysis. Returns parsed assessment."""
    async with _sem():
+        # Build search query from domain / page title for contact lookup
+        domain = analysis.get("domain", "")
+        title  = analysis.get("page_title") or ""
+        biz_name = title.split("|")[0].split("-")[0].strip() or domain
+        search_query = f'"{biz_name}" {domain} contacto telefono email'
+        search_results = await _ddg_search(search_query)
+        logger.info("DDG search for %s → %d chars", domain, len(search_results))
+
        payload = {
            "input": {
-                "prompt": _build_prompt(analysis),
+                "prompt": _build_prompt(analysis, search_results),
                "images":  [],
                "videos":  [],
                "top_p":   0.9,