feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini

- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets
  injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for
  additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 08:22:14 +02:00
parent dad910b6b0
commit d62e4e986e
3 changed files with 118 additions and 4 deletions

View File

@@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict:
"has_gmb": False, "gmb_url": None,
# Contacts
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
# Age / freshness
"copyright_year": None, "last_modified": None,
"error": None,
}
@@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict:
result["cms"] = cms
break
# ── Last-Modified / copyright year ────────────────────────────────────
lm = (resp.headers.get("last-modified") or
(soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or
(soup.find("meta", property="article:modified_time") or {}).get("content"))
if lm:
result["last_modified"] = str(lm)[:30]
footer_el = (soup.find("footer") or
soup.find(id=re.compile(r"footer", re.I)) or
soup.find(class_=re.compile(r"footer", re.I)))
search_text = footer_el.get_text() if footer_el else visible[-600:]
cp = re.search(r"(?:©|&copy;|copyright)\s*[\d\-]*\s*(20\d{2})", search_text, re.I)
if not cp:
cp = re.search(r"(20\d{2})\s*[-]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I)
if cp:
result["copyright_year"] = cp.group(1) or cp.group(2)
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
async def _get(url):
try:
@@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict:
except Exception:
return None
sitemap_txt, robots_txt = await asyncio.gather(
async def _get_contact_page():
    """Return the HTML of the first reachable contact/about page, or None.

    Candidate paths are tried strictly in priority order (Spanish first),
    one at a time, so the highest-priority page that responds wins and no
    unnecessary extra requests are made.
    """
    candidate_paths = ("/contacto", "/contact", "/contactanos", "/sobre-nosotros")
    for candidate in candidate_paths:
        body = await _get(f"https://{domain}{candidate}")
        # _get yields None (or empty) on failure; keep trying lower-priority paths.
        if body:
            return body
    return None
sitemap_txt, robots_txt, contact_html = await asyncio.gather(
_get(f"https://{domain}/sitemap.xml"),
_get(f"https://{domain}/robots.txt"),
_get_contact_page(),
)
result["has_sitemap"] = sitemap_txt is not None
result["has_robots"] = robots_txt is not None
@@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict:
rl = robots_txt.lower()
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
# Merge contacts from /contacto page
if contact_html:
try:
csoup = BeautifulSoup(contact_html, "html.parser")
for a in csoup.find_all("a", href=True):
href = a["href"]
if href.startswith("mailto:"):
em = href[7:].split("?")[0].strip().lower()
if em and em not in result["emails"]:
result["emails"].append(em)
elif href.startswith("tel:"):
ph = re.sub(r"[^\d+]", "", href[4:])
if ph and ph not in result["phones"]:
result["phones"].append(ph)
elif "wa.me" in href or "api.whatsapp.com" in href:
if href not in result["whatsapp"]:
result["whatsapp"].append(href[:80])
ctext = csoup.get_text()
for em in EMAIL_RE.findall(contact_html[:60000]):
em = em.lower()
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]):
result["emails"].append(em)
for ph in PHONE_RE.findall(ctext):
ph_c = re.sub(r"[\s\-]", "", ph)
if ph_c not in result["phones"]:
result["phones"].append(ph_c)
for k in ["emails", "phones", "whatsapp"]:
result[k] = list(dict.fromkeys(result[k]))[:5]
except Exception:
pass
# ── SSL ───────────────────────────────────────────────────────────────────
import ssl as _ssl
try: