feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini

- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets
  injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for
  additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 08:22:14 +02:00
parent dad910b6b0
commit d62e4e986e
3 changed files with 118 additions and 4 deletions

View File

@@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict:
"has_gmb": False, "gmb_url": None,
# Contacts
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
# Age / freshness
"copyright_year": None, "last_modified": None,
"error": None,
}
@@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict:
result["cms"] = cms
break
# ── Last-Modified / copyright year ────────────────────────────────────
lm = (resp.headers.get("last-modified") or
(soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or
(soup.find("meta", property="article:modified_time") or {}).get("content"))
if lm:
result["last_modified"] = str(lm)[:30]
footer_el = (soup.find("footer") or
soup.find(id=re.compile(r"footer", re.I)) or
soup.find(class_=re.compile(r"footer", re.I)))
search_text = footer_el.get_text() if footer_el else visible[-600:]
cp = re.search(r"(?:©|&copy;|copyright)\s*[\d\-]*\s*(20\d{2})", search_text, re.I)
if not cp:
cp = re.search(r"(20\d{2})\s*[-]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I)
if cp:
result["copyright_year"] = cp.group(1) or cp.group(2)
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
async def _get(url):
try:
@@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict:
except Exception:
return None
sitemap_txt, robots_txt = await asyncio.gather(
async def _get_contact_page():
    """Return the HTML of the first reachable contact/about page, or None.

    Candidate paths are tried strictly in priority order (Spanish first),
    one at a time, so the highest-priority page that responds wins and no
    unnecessary extra requests are made.
    """
    candidate_paths = ("/contacto", "/contact", "/contactanos", "/sobre-nosotros")
    for candidate in candidate_paths:
        body = await _get(f"https://{domain}{candidate}")
        # _get yields None (or empty) on failure; keep trying lower-priority paths.
        if body:
            return body
    return None
sitemap_txt, robots_txt, contact_html = await asyncio.gather(
_get(f"https://{domain}/sitemap.xml"),
_get(f"https://{domain}/robots.txt"),
_get_contact_page(),
)
result["has_sitemap"] = sitemap_txt is not None
result["has_robots"] = robots_txt is not None
@@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict:
rl = robots_txt.lower()
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
# Merge contacts from /contacto page
if contact_html:
try:
csoup = BeautifulSoup(contact_html, "html.parser")
for a in csoup.find_all("a", href=True):
href = a["href"]
if href.startswith("mailto:"):
em = href[7:].split("?")[0].strip().lower()
if em and em not in result["emails"]:
result["emails"].append(em)
elif href.startswith("tel:"):
ph = re.sub(r"[^\d+]", "", href[4:])
if ph and ph not in result["phones"]:
result["phones"].append(ph)
elif "wa.me" in href or "api.whatsapp.com" in href:
if href not in result["whatsapp"]:
result["whatsapp"].append(href[:80])
ctext = csoup.get_text()
for em in EMAIL_RE.findall(contact_html[:60000]):
em = em.lower()
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]):
result["emails"].append(em)
for ph in PHONE_RE.findall(ctext):
ph_c = re.sub(r"[\s\-]", "", ph)
if ph_c not in result["phones"]:
result["phones"].append(ph_c)
for k in ["emails", "phones", "whatsapp"]:
result[k] = list(dict.fromkeys(result[k]))[:5]
except Exception:
pass
# ── SSL ───────────────────────────────────────────────────────────────────
import ssl as _ssl
try: