feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini
- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ import re
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore:
|
||||
return _ai_sem
|
||||
|
||||
|
||||
def _build_prompt(a: dict) -> str:
|
||||
async def _ddg_search(query: str, *, max_results: int = 4, region: str = "es-es") -> str:
    """Query DuckDuckGo's HTML endpoint and return the top result snippets.

    This is a best-effort helper: any failure (network error, non-200
    response, unexpected markup) yields an empty string instead of raising,
    so the caller can proceed without search context.

    Args:
        query: Search query sent as the ``q`` parameter.
        max_results: Maximum number of result entries to consider.
        region: Value for DuckDuckGo's ``kl`` (region/language) parameter.

    Returns:
        One line per result that has a snippet, formatted as
        ``[url] title — snippet`` and joined with newlines; ``""`` when
        nothing was found or the search failed.
    """

    def _clean(node) -> str:
        # get_text(strip=True) strips every text fragment and joins them with
        # an empty separator, so markup such as "call <b>912</b> now" becomes
        # "call912now". Take the full text and collapse whitespace instead;
        # this keeps word boundaries and leaves highlighted emails/phones
        # (e.g. "info@<b>acme</b>.es") intact.
        return " ".join(node.get_text().split()) if node else ""

    try:
        async with httpx.AsyncClient(
            timeout=10, follow_redirects=True,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            r = await client.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query, "kl": region},
            )
            if r.status_code != 200:
                logger.debug("DDG search returned HTTP %s", r.status_code)
                return ""
            soup = BeautifulSoup(r.text, "html.parser")
            parts = []
            for res in soup.select(".result")[:max(max_results, 0)]:
                snip = res.select_one(".result__snippet")
                if not snip:
                    # Entries without a snippet carry nothing useful for the prompt.
                    continue
                t = _clean(res.select_one(".result__a"))
                u = _clean(res.select_one(".result__url"))
                parts.append(f"[{u}] {t} — {_clean(snip)}")
            return "\n".join(parts)
    except Exception as e:
        # Deliberate catch-all: search is optional enrichment and must never
        # break the assessment pipeline.
        logger.debug("DDG search failed: %s", e)
        return ""
|
||||
|
||||
|
||||
def _build_prompt(a: dict, search_results: str = "") -> str:
|
||||
contacts_block = []
|
||||
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
||||
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
||||
@@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str:
|
||||
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||
social_str = ", ".join(a.get("social_links") or []) or "none detected"
|
||||
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
|
||||
copyright_yr = a.get("copyright_year") or "not found"
|
||||
last_mod = a.get("last_modified") or "not found"
|
||||
|
||||
eu_hosted = a.get("eu_hosted")
|
||||
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
|
||||
@@ -84,9 +116,11 @@ Skip navigation link: {a.get("has_skip_nav")}
|
||||
Empty links: {a.get("empty_links")}
|
||||
Inputs without labels: {a.get("inputs_without_labels")}
|
||||
|
||||
=== CONTENT QUALITY ===
|
||||
=== CONTENT QUALITY & FRESHNESS ===
|
||||
Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str}
|
||||
Placeholder: {a.get("has_placeholder")} → {ph_str}
|
||||
Copyright year: {copyright_yr}
|
||||
Last-Modified: {last_mod}
|
||||
|
||||
=== KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) ===
|
||||
Heuristic detected: {a.get("kit_digital")}
|
||||
@@ -104,12 +138,22 @@ Profiles found on site: {social_str}
|
||||
=== PAGE TEXT SAMPLE ===
|
||||
{snippet}
|
||||
|
||||
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
|
||||
{search_results if search_results else "No search results available."}
|
||||
|
||||
=== INSTRUCTIONS ===
|
||||
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
||||
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
||||
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
|
||||
social media management (Instagram, Facebook, LinkedIn, TikTok).
|
||||
|
||||
IMPORTANT — use the WEB SEARCH RESULTS above to:
|
||||
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
|
||||
2. Identify the business owner name if available.
|
||||
3. Populate best_contact_value with a real phone/email you found.
|
||||
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
|
||||
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
|
||||
|
||||
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||
{{
|
||||
"summary": "2-3 sentence executive summary of the site's state",
|
||||
@@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
||||
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
||||
"accessibility_issues": ["specific a11y problems found"],
|
||||
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
|
||||
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
|
||||
"kit_digital_confirmed": true/false,
|
||||
"has_gmb": true/false,
|
||||
"has_social_media": true/false,
|
||||
@@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict:
|
||||
async def assess_domain(analysis: dict) -> dict:
|
||||
"""Call Gemini with the full site analysis. Returns parsed assessment."""
|
||||
async with _sem():
|
||||
# Build search query from domain / page title for contact lookup
|
||||
domain = analysis.get("domain", "")
|
||||
title = analysis.get("page_title") or ""
|
||||
biz_name = title.split("|")[0].split("-")[0].strip() or domain
|
||||
search_query = f'"{biz_name}" {domain} contacto telefono email'
|
||||
search_results = await _ddg_search(search_query)
|
||||
logger.info("DDG search for %s → %d chars", domain, len(search_results))
|
||||
|
||||
payload = {
|
||||
"input": {
|
||||
"prompt": _build_prompt(analysis),
|
||||
"prompt": _build_prompt(analysis, search_results),
|
||||
"images": [],
|
||||
"videos": [],
|
||||
"top_p": 0.9,
|
||||
|
||||
Reference in New Issue
Block a user