feat: web search for contacts, copyright year, contact page scan, CMS/age from Gemini
- replicate_ai: DDG pre-search runs before every Gemini call; top 4 snippets injected into prompt so Gemini can find phone/email not on homepage
- replicate_ai: explicit instructions to use search results for contact lookup
- replicate_ai: new output fields cms_detected + site_last_updated
- site_analyzer: copyright year extracted from footer (© / copyright pattern)
- site_analyzer: Last-Modified from HTTP header + OG meta tag
- site_analyzer: scans /contacto /contact /contactanos /sobre-nosotros for additional emails/phones (parallel with sitemap/robots fetch)
- index.html: modal shows CMS (AI-detected), Last Updated (red if pre-2021)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict:
|
||||
"has_gmb": False, "gmb_url": None,
|
||||
# Contacts
|
||||
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
|
||||
# Age / freshness
|
||||
"copyright_year": None, "last_modified": None,
|
||||
"error": None,
|
||||
}
|
||||
|
||||
@@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict:
|
||||
result["cms"] = cms
|
||||
break
|
||||
|
||||
# ── Last-Modified / copyright year ────────────────────────────────────
|
||||
lm = (resp.headers.get("last-modified") or
|
||||
(soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or
|
||||
(soup.find("meta", property="article:modified_time") or {}).get("content"))
|
||||
if lm:
|
||||
result["last_modified"] = str(lm)[:30]
|
||||
|
||||
footer_el = (soup.find("footer") or
|
||||
soup.find(id=re.compile(r"footer", re.I)) or
|
||||
soup.find(class_=re.compile(r"footer", re.I)))
|
||||
search_text = footer_el.get_text() if footer_el else visible[-600:]
|
||||
cp = re.search(r"(?:©|copyright)\s*[\d\-–]*\s*(20\d{2})", search_text, re.I)
|
||||
if not cp:
|
||||
cp = re.search(r"(20\d{2})\s*[-–]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I)
|
||||
if cp:
|
||||
result["copyright_year"] = cp.group(1) or cp.group(2)
|
||||
|
||||
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
|
||||
async def _get(url):
|
||||
try:
|
||||
@@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict:
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
sitemap_txt, robots_txt = await asyncio.gather(
|
||||
async def _get_contact_page():
    """Fetch the first reachable contact/about page for the domain.

    Tries a fixed list of common Spanish/English contact paths in order
    and returns the raw response text of the first one that yields a
    truthy body; returns None when none of them respond.
    """
    candidate_paths = ("/contacto", "/contact", "/contactanos", "/sobre-nosotros")
    for candidate in candidate_paths:
        body = await _get(f"https://{domain}{candidate}")
        if body:
            return body
    return None
|
||||
|
||||
sitemap_txt, robots_txt, contact_html = await asyncio.gather(
|
||||
_get(f"https://{domain}/sitemap.xml"),
|
||||
_get(f"https://{domain}/robots.txt"),
|
||||
_get_contact_page(),
|
||||
)
|
||||
result["has_sitemap"] = sitemap_txt is not None
|
||||
result["has_robots"] = robots_txt is not None
|
||||
@@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict:
|
||||
rl = robots_txt.lower()
|
||||
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
|
||||
|
||||
# Merge contacts from /contacto page
|
||||
if contact_html:
|
||||
try:
|
||||
csoup = BeautifulSoup(contact_html, "html.parser")
|
||||
for a in csoup.find_all("a", href=True):
|
||||
href = a["href"]
|
||||
if href.startswith("mailto:"):
|
||||
em = href[7:].split("?")[0].strip().lower()
|
||||
if em and em not in result["emails"]:
|
||||
result["emails"].append(em)
|
||||
elif href.startswith("tel:"):
|
||||
ph = re.sub(r"[^\d+]", "", href[4:])
|
||||
if ph and ph not in result["phones"]:
|
||||
result["phones"].append(ph)
|
||||
elif "wa.me" in href or "api.whatsapp.com" in href:
|
||||
if href not in result["whatsapp"]:
|
||||
result["whatsapp"].append(href[:80])
|
||||
ctext = csoup.get_text()
|
||||
for em in EMAIL_RE.findall(contact_html[:60000]):
|
||||
em = em.lower()
|
||||
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]):
|
||||
result["emails"].append(em)
|
||||
for ph in PHONE_RE.findall(ctext):
|
||||
ph_c = re.sub(r"[\s\-]", "", ph)
|
||||
if ph_c not in result["phones"]:
|
||||
result["phones"].append(ph_c)
|
||||
for k in ["emails", "phones", "whatsapp"]:
|
||||
result[k] = list(dict.fromkeys(result[k]))[:5]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ── SSL ───────────────────────────────────────────────────────────────────
|
||||
import ssl as _ssl
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user