feat: deep site analysis engine + fix AI assessment for any domain
site_analyzer.py (new):
- Fresh scrape with timing, page size, server, CMS detection
- Lorem ipsum detection (16 phrases incl. user's example)
- Placeholder content detection (hello world, sample page, etc.)
- Analytics: GA4, GTM, Facebook Pixel, Hotjar, Clarity
- Webmaster: Google Search Console, Bing, Yandex verification tags
- sitemap.xml and robots.txt check + Googlebot block detection
- Mobile viewport check, word count, image/script count
- Full contact extraction: emails, phones, WhatsApp, social links
- Kit Digital signal detection
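Minimal usage sketch for reviewers (illustrative caller only; analyze_site
and the result keys are real, everything else is example scaffolding):

    import asyncio
    from app.site_analyzer import analyze_site

    async def main():
        analysis = await analyze_site("example.com")
        print(analysis["cms"], analysis["word_count"], analysis["has_lorem_ipsum"])
        print("contacts:", analysis["emails"], analysis["phones"])

    asyncio.run(main())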
AI worker fix:
- No longer requires pre-enrichment; works on ANY selected domain
- Does fresh site_analyzer scrape then calls Gemini with full context
- Stores site_analysis JSON alongside AI assessment
- Upserts into enriched_domains even if domain was never enriched
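Rough shape of the fixed worker path (sketch only; the helper names below
are placeholders, not the actual implementation):

    async def assess_domain(domain: str) -> dict:
        analysis = await analyze_site(domain)                   # fresh scrape, no pre-enrichment
        assessment = await call_gemini(build_prompt(analysis))  # placeholder Gemini helper
        upsert_enriched_domain(domain, site_analysis=analysis,  # placeholder upsert; inserts
                               ai_assessment=assessment)        # even if never enriched
        return assessment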
Gemini prompt now includes:
- Complete technical snapshot (load time, size, server, SSL)
- Full SEO signals (sitemap, robots, analytics, webmaster verified)
- Content quality (lorem ipsum matches, placeholder matches)
- Kit Digital signals
- All extracted contacts
- 500-word page text sample
- Outputs: summary, site_quality_score/10, content_issues[],
urgency_signals[], performance_notes, seo_status,
best_contact_channel+value, all_contacts, ES pitch,
services_needed, outreach_notes
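The response Gemini is asked to produce looks roughly like this (shown as a
Python dict; all field values invented for illustration, and the split of
best_contact_channel+value into two keys is an assumption):

    {
        "summary": "Default WordPress install, lorem ipsum copy throughout.",
        "site_quality_score": 3,                      # out of 10
        "content_issues": ["lorem ipsum on homepage"],
        "urgency_signals": ["SSL expires in 12 days"],
        "performance_notes": "slow load, heavy page",
        "seo_status": "no sitemap, no analytics, not verified",
        "best_contact_channel": "whatsapp",
        "best_contact_value": "https://wa.me/34600000000",
        "all_contacts": {"emails": [], "phones": ["+34600000000"]},
        "pitch": "Hola, hemos visto que su web ...",  # Spanish-language pitch
        "services_needed": ["content rewrite", "seo setup"],
        "outreach_notes": "contact via WhatsApp first",
    }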
UI: rich AI modal with summary banner, quality grid, content issues,
urgency signals, full contact list, technical snapshot
Fixes: correct Replicate token, ai_queue status='running' bug
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
app/site_analyzer.py (new file)
@@ -0,0 +1,277 @@
"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
|
||||
import asyncio
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Content quality ───────────────────────────────────────────────────────────

LOREM_PHRASES = [
    "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
    "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
    "vestibulum commodo", "fusce nunc lacus", "consectetuer",
    "cras ornare tristique", "nulla nec ante", "risus id metus",
    "praesent placerat", "fusce pellentesque", "suscipit nibh",
    "integer vitae libero", "felis quis tortor",
]

PLACEHOLDER_PHRASES = [
    "under construction", "coming soon", "sample page",
    "this is a demo", "default post", "hello world",
    "test post", "uncategorized",
]

# ── Analytics & webmaster tags ────────────────────────────────────────────────

ANALYTICS = {
    # Signatures are matched case-insensitively as substrings of the raw HTML.
    # A bare "G-" would match almost any hyphenated text, so the gtag.js URL
    # is used to catch GA4 measurement-ID embeds instead.
    "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "googletagmanager.com/gtag/js"],
    "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
    "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
    "hotjar": ["static.hotjar.com"],
    "clarity": ["clarity.ms/tag"],
}

WEBMASTER = {
    "google_search_console": ["google-site-verification"],
    "bing_webmaster": ["msvalidate.01"],
    "yandex": ["yandex-verification"],
}

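# For orientation, a typical GA4 embed that the signatures above would catch
# (illustrative markup, not taken from any scanned site):
#   <script async src="https://www.googletagmanager.com/gtag/js?id=G-XXXX"></script>
#   <script>gtag('config', 'G-XXXX');</script>
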
KIT_IMG_PATS = [
    "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
    "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
    "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado",
]
KIT_TEXT_PATS = [
    "kit digital", "agente digitalizador", "fondos europeos",
    "next generation eu", "nextgenerationeu", "plan de recuperación",
    "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme",
]

EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
# Spanish numbers: optional +34 prefix, then nine digits starting 6-9.
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?[6789]\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]


async def analyze_site(domain: str) -> dict:
    """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
    result = {
        "domain": domain,
        "reachable": False,
        "load_time_ms": None,
        "status_code": None,
        "final_url": None,
        "page_size_kb": None,
        "server": None,
        "cms": None,
        "ssl_valid": False,
        "ssl_expiry_days": None,
        # Content quality
        "has_lorem_ipsum": False,
        "lorem_matches": [],
        "has_placeholder": False,
        "placeholder_matches": [],
        "word_count": 0,
        "image_count": 0,
        "broken_images": 0,
        "script_count": 0,
        "has_mobile_viewport": False,
        "page_title": None,
        "meta_description": None,
        "h1_text": None,
        "visible_text_snippet": "",
        # SEO / webmaster
        "has_sitemap": False,
        "has_robots": False,
        "robots_disallows_google": False,
        "analytics_present": [],
        "webmaster_verified": [],
        "canonical_url": None,
        "og_title": None,
        # Kit Digital
        "kit_digital": False,
        "kit_digital_signals": [],
        # Contacts
        "emails": [],
        "phones": [],
        "whatsapp": [],
        "social_links": [],
        # Errors
        "error": None,
    }

    # ── Fetch main page ───────────────────────────────────────────────────────
    try:
        t0 = time.monotonic()
        async with httpx.AsyncClient(
            timeout=15, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
        ) as client:
            # HTTPS first; fall back to plain HTTP on connection errors or
            # error responses, so HTTP-only sites still get analysed.
            try:
                resp = await client.get(f"https://{domain}")
            except httpx.HTTPError:
                resp = await client.get(f"http://{domain}")
            if resp.status_code >= 400:
                resp = await client.get(f"http://{domain}")

        load_ms = int((time.monotonic() - t0) * 1000)
        html = resp.text
        result.update({
            "reachable": resp.status_code < 400,
            "load_time_ms": load_ms,
            "status_code": resp.status_code,
            "final_url": str(resp.url),
            "page_size_kb": round(len(resp.content) / 1024, 1),
            "server": resp.headers.get("server"),
        })

        soup = BeautifulSoup(html, "html.parser")
        hl = html.lower()

        # Title, meta
        title_tag = soup.find("title")
        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
        meta_desc = soup.find("meta", attrs={"name": "description"})
        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
        h1 = soup.find("h1")
        result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None

        # Mobile viewport
        result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))

        # Canonical + OG
        canon = soup.find("link", rel="canonical")
        result["canonical_url"] = canon.get("href") if canon else None
        og = soup.find("meta", property="og:title")
        result["og_title"] = og.get("content") if og else None

        # Count external scripts before <script> tags are stripped below,
        # otherwise script_count is always 0.
        result["script_count"] = len(soup.find_all("script", src=True))

        # Visible text
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        visible_text = soup.get_text(separator=" ", strip=True)
        words = visible_text.split()
        result["word_count"] = len(words)
        result["visible_text_snippet"] = " ".join(words[:500])

        # Lorem ipsum / placeholder detection
        vl = visible_text.lower()
        lorem_hits = [p for p in LOREM_PHRASES if p in vl]
        result["has_lorem_ipsum"] = len(lorem_hits) > 0
        result["lorem_matches"] = lorem_hits[:5]
        ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
        result["has_placeholder"] = len(ph_hits) > 0
        result["placeholder_matches"] = ph_hits[:3]

        # Images
        imgs = soup.find_all("img")
        result["image_count"] = len(imgs)
        # Heuristic only: an <img> with neither src nor data-src counts as broken.
        result["broken_images"] = sum(1 for i in imgs if not (i.get("src") or i.get("data-src")))

        # Analytics / webmaster tags
        for name, sigs in ANALYTICS.items():
            if any(s.lower() in hl for s in sigs):
                result["analytics_present"].append(name)
        for name, sigs in WEBMASTER.items():
            if any(s.lower() in hl for s in sigs):
                result["webmaster_verified"].append(name)

        # Kit Digital
        kd_signals = []
        for img in imgs:
            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
            for p in KIT_IMG_PATS:
                if p in combined:
                    kd_signals.append(f"img:{p}")
                    break
        for p in KIT_TEXT_PATS:
            if p in hl:
                kd_signals.append(f"text:{p}")
        for a in soup.find_all("a", href=True):
            href = a["href"].lower()
            if "acelerapyme" in href or "red.es" in href or "kit-digital" in href:
                kd_signals.append(f"link:{href[:50]}")
        kd_signals = list(dict.fromkeys(kd_signals))[:10]
        result["kit_digital"] = len(kd_signals) > 0
        result["kit_digital_signals"] = kd_signals

        # Contacts
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("mailto:"):
                em = href[7:].split("?")[0].strip().lower()
                if em and em not in result["emails"]:
                    result["emails"].append(em)
            elif href.startswith("tel:"):
                ph = re.sub(r"[^\d+]", "", href[4:])
                if ph and ph not in result["phones"]:
                    result["phones"].append(ph)
            elif "wa.me" in href.lower() or "api.whatsapp.com" in href.lower():
                if href not in result["whatsapp"]:
                    result["whatsapp"].append(href[:80])
            else:
                for sd in SOCIAL_DOM:
                    if sd in href.lower():
                        clean = href.split("?")[0].rstrip("/")
                        if clean not in result["social_links"]:
                            result["social_links"].append(clean)
                        break
        # Regex sweep over the first 80 kB of raw HTML catches emails outside
        # mailto links; skip matches that are really asset filenames.
        for em in EMAIL_RE.findall(html[:80000]):
            em = em.lower()
            if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
                result["emails"].append(em)
        for ph in PHONE_RE.findall(visible_text):
            ph_c = re.sub(r"[\s\-]", "", ph)
            if ph_c not in result["phones"]:
                result["phones"].append(ph_c)
        # Cap each contact list at five unique entries
        for k in ["emails", "phones", "whatsapp", "social_links"]:
            result[k] = list(dict.fromkeys(result[k]))[:5]

        # CMS
        from app.enricher import detect_cms
        result["cms"] = detect_cms(html, dict(resp.headers))

    except Exception as e:
        result["error"] = str(e)[:300]

    # ── Sitemap & robots (parallel) ───────────────────────────────────────────
    async def _check_url(url: str) -> Optional[str]:
        try:
            async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                r = await c.get(url)
                return r.text if r.status_code == 200 else None
        except Exception:
            return None

    sitemap_txt, robots_txt = await asyncio.gather(
        _check_url(f"https://{domain}/sitemap.xml"),
        _check_url(f"https://{domain}/robots.txt"),
    )
    result["has_sitemap"] = sitemap_txt is not None
    result["has_robots"] = robots_txt is not None
    if robots_txt:
        robots_lower = robots_txt.lower()
        # Substring matching on "disallow: /" also hits partial rules like
        # "Disallow: /wp-admin/", so require an exact full-site disallow line.
        result["robots_disallows_google"] = bool(
            re.search(r"^\s*disallow:\s*/\s*$", robots_lower, re.MULTILINE)
        ) and "googlebot" in robots_lower

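    # An illustrative robots.txt that would set robots_disallows_google:
    #   User-agent: Googlebot
    #   Disallow: /
    # The check stays deliberately loose: it only requires that "googlebot"
    # appears somewhere in the file alongside a full-site disallow line.
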
    # ── SSL ───────────────────────────────────────────────────────────────────
    import ssl as _ssl, socket as _socket
    try:
        def _ssl_check():
            import datetime as _dt
            ctx = _ssl.create_default_context()
            with _socket.create_connection((domain, 443), timeout=5) as s:
                with ctx.wrap_socket(s, server_hostname=domain) as ss:
                    cert = ss.getpeercert()
                    exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
                    # Positive while the certificate is still valid
                    return True, (exp - _dt.datetime.utcnow()).days

        loop = asyncio.get_running_loop()
        result["ssl_valid"], result["ssl_expiry_days"] = await loop.run_in_executor(None, _ssl_check)
    except Exception:
        pass

    return result