feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini

- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 08:22:14 +02:00
parent dad910b6b0
commit d62e4e986e
3 changed files with 118 additions and 4 deletions
--- a/app/replicate_ai.py
+++ b/app/replicate_ai.py
@@ -7,6 +7,7 @@ import re
 from typing import Optional
 import httpx
 from bs4 import BeautifulSoup
 logger = logging.getLogger(__name__)
@@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore:
    return _ai_sem
-def _build_prompt(a: dict) -> str:
+async def _ddg_search(query: str) -> str:
    """DuckDuckGo HTML search — returns top snippet text, empty string on failure."""
    # Best-effort helper: a non-200 response or any exception yields "" instead
    # of raising, so callers always receive a string.
    try:
        async with httpx.AsyncClient(
            timeout=10, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            # NOTE(review): "kl" presumably selects the es-es region/locale for
            # results — confirm against DuckDuckGo's parameter documentation.
            r = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},
            )
            if r.status_code != 200:
                return ""
            soup = BeautifulSoup(r.text, "html.parser")
            parts = []
            # Only the first four result containers are examined.
            for res in soup.select(".result")[:4]:
                title = res.select_one(".result__a")
                snip  = res.select_one(".result__snippet")
                url   = res.select_one(".result__url")
                # A result without a snippet element is skipped entirely;
                # a missing title or URL just becomes an empty string.
                if snip:
                    t = title.get_text(strip=True) if title else ""
                    u = url.get_text(strip=True)   if url   else ""
                    parts.append(f"[{u}] {t} — {snip.get_text(strip=True)}")
            # One "[url] title — snippet" line per kept result, newline-joined.
            return "\n".join(parts)
    except Exception as e:
        # Failures are logged at debug level only and reported as "no results".
        logger.debug("DDG search failed: %s", e)
        return ""
 def _build_prompt(a: dict, search_results: str = "") -> str:
    contacts_block = []
    if a.get("emails"):      contacts_block.append(f"  Emails:    {', '.join(a['emails'][:3])}")
    if a.get("phones"):      contacts_block.append(f"  Phones:    {', '.join(a['phones'][:3])}")
@@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str:
    snippet     = (a.get("visible_text_snippet") or "")[:2000]
    social_str  = ", ".join(a.get("social_links") or []) or "none detected"
    gmb_str     = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
    copyright_yr = a.get("copyright_year") or "not found"
    last_mod    = a.get("last_modified") or "not found"
    eu_hosted   = a.get("eu_hosted")
    hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
@@ -84,9 +116,11 @@ Skip navigation link:  {a.get("has_skip_nav")}
 Empty links:           {a.get("empty_links")}
 Inputs without labels: {a.get("inputs_without_labels")}
-=== CONTENT QUALITY ===
+=== CONTENT QUALITY & FRESHNESS ===
 Lorem ipsum:     {a.get("has_lorem_ipsum")}  →  {lorem_str}
 Placeholder:     {a.get("has_placeholder")}  →  {ph_str}
 Copyright year:  {copyright_yr}
 Last-Modified:   {last_mod}
 === KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
 Heuristic detected: {a.get("kit_digital")}
@@ -104,12 +138,22 @@ Profiles found on site: {social_str}
 === PAGE TEXT SAMPLE ===
 {snippet}
 === WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
 {search_results if search_results else "No search results available."}
 === INSTRUCTIONS ===
 The client sells: web redesign, SEO, hosting migration, SSL renewal,
 security audits, GDPR compliance, accessibility fixes, Google Ads,
 maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
 social media management (Instagram, Facebook, LinkedIn, TikTok).
 IMPORTANT — use the WEB SEARCH RESULTS above to:
 1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
 2. Identify the business owner name if available.
 3. Populate best_contact_value with a real phone/email you found.
 4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
 5. Determine the actual CMS from code signals and visible text (not just the heuristic).
 Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
 {{
  "summary": "2-3 sentence executive summary of the site's state",
@@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
  "hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
  "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
  "accessibility_issues": ["specific a11y problems found"],
  "cms_detected": "wordpress|wix|squarespace|custom|unknown",
  "site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
  "kit_digital_confirmed": true/false,
  "has_gmb": true/false,
  "has_social_media": true/false,
@@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict:
 async def assess_domain(analysis: dict) -> dict:
    """Call Gemini with the full site analysis. Returns parsed assessment."""
    async with _sem():
        # Build search query from domain / page title for contact lookup
        domain = analysis.get("domain", "")
        title  = analysis.get("page_title") or ""
        biz_name = title.split("|")[0].split("-")[0].strip() or domain
        search_query = f'"{biz_name}" {domain} contacto telefono email'
        search_results = await _ddg_search(search_query)
        logger.info("DDG search for %s → %d chars", domain, len(search_results))
        payload = {
            "input": {
-                "prompt": _build_prompt(analysis),
+                "prompt": _build_prompt(analysis, search_results),
                "images":  [],
                "videos":  [],
                "top_p":   0.9,
--- a/app/site_analyzer.py
+++ b/app/site_analyzer.py
@@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict:
        "has_gmb": False, "gmb_url": None,
        # Contacts
        "emails": [], "phones": [], "whatsapp": [], "social_links": [],
        # Age / freshness
        "copyright_year": None, "last_modified": None,
        "error": None,
    }
@@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict:
                result["cms"] = cms
                break
        # ── Last-Modified / copyright year ────────────────────────────────────
        lm = (resp.headers.get("last-modified") or
              (soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or
              (soup.find("meta", property="article:modified_time") or {}).get("content"))
        if lm:
            result["last_modified"] = str(lm)[:30]
        footer_el = (soup.find("footer") or
                     soup.find(id=re.compile(r"footer", re.I)) or
                     soup.find(class_=re.compile(r"footer", re.I)))
        search_text = footer_el.get_text() if footer_el else visible[-600:]
        cp = re.search(r"(?:©|&copy;|copyright)\s*[\d\-–]*\s*(20\d{2})", search_text, re.I)
        if not cp:
            cp = re.search(r"(20\d{2})\s*[-–]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I)
        if cp:
            result["copyright_year"] = cp.group(1) or cp.group(2)
    # ── Sitemap & robots (parallel) ───────────────────────────────────────────
    async def _get(url):
        try:
@@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict:
        except Exception:
            return None
-    sitemap_txt, robots_txt = await asyncio.gather(
+    async def _get_contact_page():
        # Try each candidate contact/about path in order, awaiting one request
        # at a time, and return the first truthy result from _get().
        # Falls through to None when none of the paths yields content.
        # NOTE(review): what _get() returns on success is not visible here —
        # presumably the response body text; confirm against its definition.
        for path in ("/contacto", "/contact", "/contactanos", "/sobre-nosotros"):
            txt = await _get(f"https://{domain}{path}")
            if txt:
                return txt
        return None
    sitemap_txt, robots_txt, contact_html = await asyncio.gather(
        _get(f"https://{domain}/sitemap.xml"),
        _get(f"https://{domain}/robots.txt"),
        _get_contact_page(),
    )
    result["has_sitemap"] = sitemap_txt is not None
    result["has_robots"]  = robots_txt is not None
@@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict:
        rl = robots_txt.lower()
        result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
    # Merge contacts from /contacto page
    if contact_html:
        try:
            csoup = BeautifulSoup(contact_html, "html.parser")
            for a in csoup.find_all("a", href=True):
                href = a["href"]
                if href.startswith("mailto:"):
                    em = href[7:].split("?")[0].strip().lower()
                    if em and em not in result["emails"]:
                        result["emails"].append(em)
                elif href.startswith("tel:"):
                    ph = re.sub(r"[^\d+]", "", href[4:])
                    if ph and ph not in result["phones"]:
                        result["phones"].append(ph)
                elif "wa.me" in href or "api.whatsapp.com" in href:
                    if href not in result["whatsapp"]:
                        result["whatsapp"].append(href[:80])
            ctext = csoup.get_text()
            for em in EMAIL_RE.findall(contact_html[:60000]):
                em = em.lower()
                if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]):
                    result["emails"].append(em)
            for ph in PHONE_RE.findall(ctext):
                ph_c = re.sub(r"[\s\-]", "", ph)
                if ph_c not in result["phones"]:
                    result["phones"].append(ph_c)
            for k in ["emails", "phones", "whatsapp"]:
                result[k] = list(dict.fromkeys(result[k]))[:5]
        except Exception:
            pass
    # ── SSL ───────────────────────────────────────────────────────────────────
    import ssl as _ssl
    try:
--- a/app/static/index.html
+++ b/app/static/index.html
@@ -187,6 +187,8 @@ tr:hover td{background:rgba(255,255,255,.025)}
    <div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
    <div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
    <div class="mrow"><span class="mlabel">CMS</span><span x-text="modal.ai.cms_detected || modal.sa?.cms || '—'"></span></div>
    <div class="mrow"><span class="mlabel">Last updated</span><span :style="(modal.ai.site_last_updated&&parseInt(modal.ai.site_last_updated)<2021)?'color:var(--danger)':''" x-text="modal.ai.site_last_updated || (modal.sa?.copyright_year ? 'Copyright '+modal.sa.copyright_year : '—')"></span></div>
    <div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
    <div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
    <div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>