feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini
- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import re
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore:
|
|||||||
return _ai_sem
|
return _ai_sem
|
||||||
|
|
||||||
|
|
||||||
def _build_prompt(a: dict) -> str:
|
async def _ddg_search(query: str) -> str:
    """Run a DuckDuckGo HTML search and return the leading result snippets.

    Only the first four ``.result`` entries of the response page are
    inspected; entries without a ``.result__snippet`` element are skipped.
    Each kept entry becomes one line of the form ``[url] title — snippet``
    and the lines are joined with newlines.

    Args:
        query: Search string sent as the ``q`` parameter.

    Returns:
        The formatted snippet lines, or an empty string when the response
        status is not 200 or any exception is raised along the way.
    """

    def _text_of(node) -> str:
        # Title and URL are optional in a result; missing ones render as "".
        return node.get_text(strip=True) if node else ""

    try:
        async with httpx.AsyncClient(
            timeout=10,
            follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            response = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": "es-es"},
            )
            if response.status_code != 200:
                return ""

            page = BeautifulSoup(response.text, "html.parser")
            lines = []
            for hit in page.select(".result")[:4]:
                heading = hit.select_one(".result__a")
                snippet = hit.select_one(".result__snippet")
                link = hit.select_one(".result__url")
                if not snippet:
                    continue
                shown_title = _text_of(heading)
                shown_url = _text_of(link)
                lines.append(
                    f"[{shown_url}] {shown_title} — {snippet.get_text(strip=True)}"
                )
            return "\n".join(lines)
    except Exception as e:
        # Best-effort lookup: a failed search must never break the caller.
        logger.debug("DDG search failed: %s", e)
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _build_prompt(a: dict, search_results: str = "") -> str:
|
||||||
contacts_block = []
|
contacts_block = []
|
||||||
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
||||||
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
||||||
@@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str:
|
|||||||
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||||
social_str = ", ".join(a.get("social_links") or []) or "none detected"
|
social_str = ", ".join(a.get("social_links") or []) or "none detected"
|
||||||
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
|
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
|
||||||
|
copyright_yr = a.get("copyright_year") or "not found"
|
||||||
|
last_mod = a.get("last_modified") or "not found"
|
||||||
|
|
||||||
eu_hosted = a.get("eu_hosted")
|
eu_hosted = a.get("eu_hosted")
|
||||||
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
|
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
|
||||||
@@ -84,9 +116,11 @@ Skip navigation link: {a.get("has_skip_nav")}
|
|||||||
Empty links: {a.get("empty_links")}
|
Empty links: {a.get("empty_links")}
|
||||||
Inputs without labels: {a.get("inputs_without_labels")}
|
Inputs without labels: {a.get("inputs_without_labels")}
|
||||||
|
|
||||||
=== CONTENT QUALITY ===
|
=== CONTENT QUALITY & FRESHNESS ===
|
||||||
Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str}
|
Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str}
|
||||||
Placeholder: {a.get("has_placeholder")} → {ph_str}
|
Placeholder: {a.get("has_placeholder")} → {ph_str}
|
||||||
|
Copyright year: {copyright_yr}
|
||||||
|
Last-Modified: {last_mod}
|
||||||
|
|
||||||
=== KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
|
=== KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
|
||||||
Heuristic detected: {a.get("kit_digital")}
|
Heuristic detected: {a.get("kit_digital")}
|
||||||
@@ -104,12 +138,22 @@ Profiles found on site: {social_str}
|
|||||||
=== PAGE TEXT SAMPLE ===
|
=== PAGE TEXT SAMPLE ===
|
||||||
{snippet}
|
{snippet}
|
||||||
|
|
||||||
|
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
|
||||||
|
{search_results if search_results else "No search results available."}
|
||||||
|
|
||||||
=== INSTRUCTIONS ===
|
=== INSTRUCTIONS ===
|
||||||
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
||||||
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
||||||
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
|
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
|
||||||
social media management (Instagram, Facebook, LinkedIn, TikTok).
|
social media management (Instagram, Facebook, LinkedIn, TikTok).
|
||||||
|
|
||||||
|
IMPORTANT — use the WEB SEARCH RESULTS above to:
|
||||||
|
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
|
||||||
|
2. Identify the business owner name if available.
|
||||||
|
3. Populate best_contact_value with a real phone/email you found.
|
||||||
|
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
|
||||||
|
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
|
||||||
|
|
||||||
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||||
{{
|
{{
|
||||||
"summary": "2-3 sentence executive summary of the site's state",
|
"summary": "2-3 sentence executive summary of the site's state",
|
||||||
@@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
|||||||
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
||||||
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
||||||
"accessibility_issues": ["specific a11y problems found"],
|
"accessibility_issues": ["specific a11y problems found"],
|
||||||
|
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
|
||||||
|
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
|
||||||
"kit_digital_confirmed": true/false,
|
"kit_digital_confirmed": true/false,
|
||||||
"has_gmb": true/false,
|
"has_gmb": true/false,
|
||||||
"has_social_media": true/false,
|
"has_social_media": true/false,
|
||||||
@@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict:
|
|||||||
async def assess_domain(analysis: dict) -> dict:
|
async def assess_domain(analysis: dict) -> dict:
|
||||||
"""Call Gemini with the full site analysis. Returns parsed assessment."""
|
"""Call Gemini with the full site analysis. Returns parsed assessment."""
|
||||||
async with _sem():
|
async with _sem():
|
||||||
|
# Build search query from domain / page title for contact lookup
|
||||||
|
domain = analysis.get("domain", "")
|
||||||
|
title = analysis.get("page_title") or ""
|
||||||
|
biz_name = title.split("|")[0].split("-")[0].strip() or domain
|
||||||
|
search_query = f'"{biz_name}" {domain} contacto telefono email'
|
||||||
|
search_results = await _ddg_search(search_query)
|
||||||
|
logger.info("DDG search for %s → %d chars", domain, len(search_results))
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"input": {
|
"input": {
|
||||||
"prompt": _build_prompt(analysis),
|
"prompt": _build_prompt(analysis, search_results),
|
||||||
"images": [],
|
"images": [],
|
||||||
"videos": [],
|
"videos": [],
|
||||||
"top_p": 0.9,
|
"top_p": 0.9,
|
||||||
|
|||||||
@@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict:
|
|||||||
"has_gmb": False, "gmb_url": None,
|
"has_gmb": False, "gmb_url": None,
|
||||||
# Contacts
|
# Contacts
|
||||||
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
|
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
|
||||||
|
# Age / freshness
|
||||||
|
"copyright_year": None, "last_modified": None,
|
||||||
"error": None,
|
"error": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict:
|
|||||||
result["cms"] = cms
|
result["cms"] = cms
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# ── Last-Modified / copyright year ────────────────────────────────────
|
||||||
|
lm = (resp.headers.get("last-modified") or
|
||||||
|
(soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or
|
||||||
|
(soup.find("meta", property="article:modified_time") or {}).get("content"))
|
||||||
|
if lm:
|
||||||
|
result["last_modified"] = str(lm)[:30]
|
||||||
|
|
||||||
|
footer_el = (soup.find("footer") or
|
||||||
|
soup.find(id=re.compile(r"footer", re.I)) or
|
||||||
|
soup.find(class_=re.compile(r"footer", re.I)))
|
||||||
|
search_text = footer_el.get_text() if footer_el else visible[-600:]
|
||||||
|
cp = re.search(r"(?:©|©|copyright)\s*[\d\-–]*\s*(20\d{2})", search_text, re.I)
|
||||||
|
if not cp:
|
||||||
|
cp = re.search(r"(20\d{2})\s*[-–]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I)
|
||||||
|
if cp:
|
||||||
|
result["copyright_year"] = cp.group(1) or cp.group(2)
|
||||||
|
|
||||||
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
|
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
|
||||||
async def _get(url):
|
async def _get(url):
|
||||||
try:
|
try:
|
||||||
@@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict:
|
|||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
sitemap_txt, robots_txt = await asyncio.gather(
|
async def _get_contact_page():
|
||||||
|
for path in ("/contacto", "/contact", "/contactanos", "/sobre-nosotros"):
|
||||||
|
txt = await _get(f"https://{domain}{path}")
|
||||||
|
if txt:
|
||||||
|
return txt
|
||||||
|
return None
|
||||||
|
|
||||||
|
sitemap_txt, robots_txt, contact_html = await asyncio.gather(
|
||||||
_get(f"https://{domain}/sitemap.xml"),
|
_get(f"https://{domain}/sitemap.xml"),
|
||||||
_get(f"https://{domain}/robots.txt"),
|
_get(f"https://{domain}/robots.txt"),
|
||||||
|
_get_contact_page(),
|
||||||
)
|
)
|
||||||
result["has_sitemap"] = sitemap_txt is not None
|
result["has_sitemap"] = sitemap_txt is not None
|
||||||
result["has_robots"] = robots_txt is not None
|
result["has_robots"] = robots_txt is not None
|
||||||
@@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict:
|
|||||||
rl = robots_txt.lower()
|
rl = robots_txt.lower()
|
||||||
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
|
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
|
||||||
|
|
||||||
|
# Merge contacts from /contacto page
|
||||||
|
if contact_html:
|
||||||
|
try:
|
||||||
|
csoup = BeautifulSoup(contact_html, "html.parser")
|
||||||
|
for a in csoup.find_all("a", href=True):
|
||||||
|
href = a["href"]
|
||||||
|
if href.startswith("mailto:"):
|
||||||
|
em = href[7:].split("?")[0].strip().lower()
|
||||||
|
if em and em not in result["emails"]:
|
||||||
|
result["emails"].append(em)
|
||||||
|
elif href.startswith("tel:"):
|
||||||
|
ph = re.sub(r"[^\d+]", "", href[4:])
|
||||||
|
if ph and ph not in result["phones"]:
|
||||||
|
result["phones"].append(ph)
|
||||||
|
elif "wa.me" in href or "api.whatsapp.com" in href:
|
||||||
|
if href not in result["whatsapp"]:
|
||||||
|
result["whatsapp"].append(href[:80])
|
||||||
|
ctext = csoup.get_text()
|
||||||
|
for em in EMAIL_RE.findall(contact_html[:60000]):
|
||||||
|
em = em.lower()
|
||||||
|
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]):
|
||||||
|
result["emails"].append(em)
|
||||||
|
for ph in PHONE_RE.findall(ctext):
|
||||||
|
ph_c = re.sub(r"[\s\-]", "", ph)
|
||||||
|
if ph_c not in result["phones"]:
|
||||||
|
result["phones"].append(ph_c)
|
||||||
|
for k in ["emails", "phones", "whatsapp"]:
|
||||||
|
result[k] = list(dict.fromkeys(result[k]))[:5]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# ── SSL ───────────────────────────────────────────────────────────────────
|
# ── SSL ───────────────────────────────────────────────────────────────────
|
||||||
import ssl as _ssl
|
import ssl as _ssl
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -187,6 +187,8 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
|
|
||||||
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
|
||||||
|
<div class="mrow"><span class="mlabel">CMS</span><span x-text="modal.ai.cms_detected || modal.sa?.cms || '—'"></span></div>
|
||||||
|
<div class="mrow"><span class="mlabel">Last updated</span><span :style="(modal.ai.site_last_updated&&parseInt(modal.ai.site_last_updated)<2021)?'color:var(--danger)':''" x-text="modal.ai.site_last_updated || (modal.sa?.copyright_year ? 'Copyright '+modal.sa.copyright_year : '—')"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>
|
<div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>
|
||||||
|
|||||||
Reference in New Issue
Block a user