feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini

- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets
  injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for
  additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 08:22:14 +02:00
parent dad910b6b0
commit d62e4e986e
3 changed files with 118 additions and 4 deletions

View File

@@ -7,6 +7,7 @@ import re
from typing import Optional
import httpx
from bs4 import BeautifulSoup
logger = logging.getLogger(__name__)
@@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore:
return _ai_sem
def _build_prompt(a: dict) -> str:
async def _ddg_search(query: str) -> str:
    """Run a best-effort DuckDuckGo HTML search and return the top snippets.

    Only the first four result nodes are inspected; results without a
    usable snippet are skipped, so fewer than four lines may be returned.
    Each line has the form ``[<displayed url>] <title> — <snippet>``.

    Args:
        query: Free-text search query. A blank query returns ``""`` without
            making any network request.

    Returns:
        The formatted result lines joined by newlines, or ``""`` when the
        query is blank, the response status is not 200, nothing usable was
        found, or any error occurred.
    """
    if not query or not query.strip():
        return ""

    def _text(node) -> str:
        # get_text(strip=True) strips every text fragment and then joins them
        # with no separator, so inline markup inside a snippet would fuse
        # neighbouring words/digits together. Collapsing whitespace on the
        # raw text keeps the original word boundaries intact.
        return " ".join(node.get_text().split()) if node is not None else ""

    try:
        async with httpx.AsyncClient(
            timeout=10,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            r = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},
            )
            if r.status_code != 200:
                return ""
            soup = BeautifulSoup(r.text, "html.parser")
            parts = []
            for res in soup.select(".result")[:4]:
                snippet = _text(res.select_one(".result__snippet"))
                if not snippet:
                    continue
                title = _text(res.select_one(".result__a"))
                shown_url = _text(res.select_one(".result__url"))
                # Explicit delimiter so the title does not run into the snippet.
                header = f"[{shown_url}] {title}".rstrip()
                parts.append(f"{header} — {snippet}")
            return "\n".join(parts)
    except Exception as e:
        # Deliberately broad: the search is optional context for the prompt,
        # so any network or parsing failure degrades to "no results".
        logger.debug("DDG search failed: %s", e)
        return ""
def _build_prompt(a: dict, search_results: str = "") -> str:
contacts_block = []
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
@@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str:
snippet = (a.get("visible_text_snippet") or "")[:2000]
social_str = ", ".join(a.get("social_links") or []) or "none detected"
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
copyright_yr = a.get("copyright_year") or "not found"
last_mod = a.get("last_modified") or "not found"
eu_hosted = a.get("eu_hosted")
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
@@ -84,9 +116,11 @@ Skip navigation link: {a.get("has_skip_nav")}
Empty links: {a.get("empty_links")}
Inputs without labels: {a.get("inputs_without_labels")}
=== CONTENT QUALITY ===
=== CONTENT QUALITY & FRESHNESS ===
Lorem ipsum: {a.get("has_lorem_ipsum")}{lorem_str}
Placeholder: {a.get("has_placeholder")}{ph_str}
Copyright year: {copyright_yr}
Last-Modified: {last_mod}
=== KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
Heuristic detected: {a.get("kit_digital")}
@@ -104,12 +138,22 @@ Profiles found on site: {social_str}
=== PAGE TEXT SAMPLE ===
{snippet}
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
{search_results if search_results else "No search results available."}
=== INSTRUCTIONS ===
The client sells: web redesign, SEO, hosting migration, SSL renewal,
security audits, GDPR compliance, accessibility fixes, Google Ads,
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
social media management (Instagram, Facebook, LinkedIn, TikTok).
IMPORTANT — use the WEB SEARCH RESULTS above to:
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
2. Identify the business owner name if available.
3. Populate best_contact_value with a real phone/email you found.
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
{{
"summary": "2-3 sentence executive summary of the site's state",
@@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
"accessibility_issues": ["specific a11y problems found"],
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
"kit_digital_confirmed": true/false,
"has_gmb": true/false,
"has_social_media": true/false,
@@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict:
async def assess_domain(analysis: dict) -> dict:
"""Call Gemini with the full site analysis. Returns parsed assessment."""
async with _sem():
# Build search query from domain / page title for contact lookup
domain = analysis.get("domain", "")
title = analysis.get("page_title") or ""
biz_name = title.split("|")[0].split("-")[0].strip() or domain
search_query = f'"{biz_name}" {domain} contacto telefono email'
search_results = await _ddg_search(search_query)
logger.info("DDG search for %s%d chars", domain, len(search_results))
payload = {
"input": {
"prompt": _build_prompt(analysis),
"prompt": _build_prompt(analysis, search_results),
"images": [],
"videos": [],
"top_p": 0.9,