"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
import asyncio
import re
import time
import logging
from typing import Optional
import httpx
from bs4 import BeautifulSoup
# Module-level logger, following the stdlib convention of one logger per
# module (not used in this chunk, but available to the rest of the file).
logger = logging.getLogger(__name__)
# ── Content quality ───────────────────────────────────────────────────────────
# Fragments of classic "lorem ipsum" filler text, matched case-insensitively
# against the page's *visible* text to flag sites still shipping template
# content. Fix: "ntulla nec ante" was a typo for the standard filler phrase
# "nulla nec ante" and could never match.
LOREM_PHRASES = [
    "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
    "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
    "vestibulum commodo", "fusce nunc lacus", "consectetuer",
    "cras ornare tristique", "nulla nec ante", "risus id metus",
    "praesent placerat", "fusce pellentesque", "suscipit nibh",
    "integer vitae libero", "felis quis tortor",
]
|
# Phrases that betray an unfinished or default CMS install (e.g. WordPress's
# stock "Hello world" post / "Sample page"). Matched case-insensitively
# against visible page text; list order determines the order of reported
# matches in `placeholder_matches`.
PLACEHOLDER_PHRASES = [
    "under construction", "coming soon", "sample page",
    "this is a demo", "default post", "hello world",
    "test post", "uncategorized",
]
# ── Analytics & webmaster tags ────────────────────────────────────────────────
# Tracking-snippet fingerprints. analyze_site() lowercases the raw HTML and
# reports a vendor when ANY of its signatures occurs as a substring, so every
# entry must be specific enough not to appear in ordinary markup.
# Fix: the old "G-" signature matched virtually every page once lowercased
# ("g-" occurs in CSS classes such as "bg-dark"); it is replaced with the
# GA4 loader URL, which is what the standard gtag snippet embeds.
ANALYTICS = {
    "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "googletagmanager.com/gtag/js"],
    "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
    "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
    "hotjar": ["static.hotjar.com"],
    "clarity": ["clarity.ms/tag"],
}
# Search-engine verification meta-tag names. Their presence anywhere in the
# raw HTML suggests the owner has claimed the site in the corresponding
# webmaster console (matched as lowercase substrings by analyze_site()).
WEBMASTER = {
    "google_search_console": ['google-site-verification'],
    "bing_webmaster": ['msvalidate.01'],
    "yandex": ['yandex-verification'],
}
# Substrings searched (lowercased) in <img> src/alt/srcset attributes that
# suggest a Spanish "Kit Digital" / EU NextGenerationEU funding badge is
# displayed on the site.
KIT_IMG_PATS = [
    "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
    "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
    "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado",
]
# Phrases searched in the full lowercased HTML that indicate Kit Digital /
# EU recovery-funds mentions. NOTE: short tokens like "prtr" are substring
# matches and may occasionally hit unrelated markup.
KIT_TEXT_PATS = [
    "kit digital", "agente digitalizador", "fondos europeos",
    "next generation eu", "nextgenerationeu", "plan de recuperación",
    "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme",
]
# Generic e-mail pattern: ASCII local part, domain with at least one dot and
# a 2+ letter TLD. Intentionally loose — hits are de-duplicated and capped.
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
# Spanish phone numbers: optional "+34" prefix, first digit 6/7/8/9, nine
# digits total, with optional spaces or dashes between the 3-digit groups.
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
# Social-network domains recognised when classifying outbound <a href> links.
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
async def analyze_site(domain: str) -> dict:
    """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt.

    The dict always contains the full key set initialised below; anything that
    could not be determined keeps its default value, and the first exception
    raised during the main-page fetch/parse is recorded under ``"error"``.
    The sitemap/robots and SSL probes run even when the main fetch fails.
    """
    result = {
        "domain": domain,
        "reachable": False,
        "load_time_ms": None,
        "status_code": None,
        "final_url": None,
        "page_size_kb": None,
        "server": None,
        "cms": None,
        "ssl_valid": False,
        "ssl_expiry_days": None,
        # Content quality
        "has_lorem_ipsum": False,
        "lorem_matches": [],
        "has_placeholder": False,
        "placeholder_matches": [],
        "word_count": 0,
        "image_count": 0,
        "broken_images": 0,  # TODO(review): never computed — would need one request per <img>
        "script_count": 0,
        "has_mobile_viewport": False,
        "page_title": None,
        "meta_description": None,
        "h1_text": None,
        "visible_text_snippet": "",
        # SEO / webmaster
        "has_sitemap": False,
        "has_robots": False,
        "robots_disallows_google": False,
        "analytics_present": [],
        "webmaster_verified": [],
        "canonical_url": None,
        "og_title": None,
        # Kit Digital
        "kit_digital": False,
        "kit_digital_signals": [],
        # Contacts
        "emails": [],
        "phones": [],
        "whatsapp": [],
        "social_links": [],
        # Errors
        "error": None,
    }

    # ── Fetch main page ───────────────────────────────────────────────────────
    try:
        t0 = time.monotonic()
        # verify=False on purpose: sites with broken TLS should still be
        # analysed; certificate validity is probed separately at the bottom.
        async with httpx.AsyncClient(
            timeout=15, follow_redirects=True, verify=False,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
        ) as client:
            # Fix: fall back to plain HTTP not only on an error status but
            # also on transport-level failures (connection refused, TLS
            # handshake errors) — previously those skipped the fallback.
            try:
                resp = await client.get(f"https://{domain}")
                https_failed = resp.status_code >= 400
            except httpx.HTTPError:
                https_failed = True
            if https_failed:
                resp = await client.get(f"http://{domain}")

        load_ms = int((time.monotonic() - t0) * 1000)
        html = resp.text
        result.update({
            "reachable": resp.status_code < 400,
            "load_time_ms": load_ms,
            "status_code": resp.status_code,
            "final_url": str(resp.url),
            "page_size_kb": round(len(resp.content) / 1024, 1),
            "server": resp.headers.get("server"),
        })

        soup = BeautifulSoup(html, "html.parser")
        hl = html.lower()  # raw lowercased HTML, used for signature matching

        # Title, meta description, first <h1> (truncated for the prompt)
        title_tag = soup.find("title")
        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
        meta_desc = soup.find("meta", attrs={"name": "description"})
        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
        h1 = soup.find("h1")
        result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None

        # Mobile viewport
        result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))

        # Canonical link + OpenGraph title
        canon = soup.find("link", rel="canonical")
        result["canonical_url"] = canon.get("href") if canon else None
        og = soup.find("meta", property="og:title")
        result["og_title"] = og.get("content") if og else None

        # Images & scripts — counted BEFORE non-rendered tags are stripped.
        # Fix: the old code decomposed every <script> first, which made
        # script_count unconditionally 0.
        imgs = soup.find_all("img")
        result["image_count"] = len(imgs)
        result["script_count"] = len(soup.find_all("script", src=True))

        # Visible text: drop tags the browser never renders, then flatten.
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        visible_text = soup.get_text(separator=" ", strip=True)
        words = visible_text.split()
        result["word_count"] = len(words)
        result["visible_text_snippet"] = " ".join(words[:500])

        # Lorem ipsum / placeholder detection on visible text only
        vl = visible_text.lower()
        lorem_hits = [p for p in LOREM_PHRASES if p in vl]
        result["has_lorem_ipsum"] = bool(lorem_hits)
        result["lorem_matches"] = lorem_hits[:5]
        ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
        result["has_placeholder"] = bool(ph_hits)
        result["placeholder_matches"] = ph_hits[:3]

        # Analytics / webmaster verification tags (substring match on raw HTML)
        for name, sigs in ANALYTICS.items():
            if any(s.lower() in hl for s in sigs):
                result["analytics_present"].append(name)
        for name, sigs in WEBMASTER.items():
            if any(s.lower() in hl for s in sigs):
                result["webmaster_verified"].append(name)

        # Kit Digital signals: badge images, text mentions, outbound links
        kd_signals = []
        for img in imgs:
            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
            for p in KIT_IMG_PATS:
                if p in combined:
                    kd_signals.append(f"img:{p}")
                    break  # one signal per image is enough
        for p in KIT_TEXT_PATS:
            if p in hl:
                kd_signals.append(f"text:{p}")
        for a in soup.find_all("a", href=True):
            href = a["href"].lower()
            if "acelerapyme" in href or "red.es" in href or "kit-digital" in href:
                kd_signals.append(f"link:{href[:50]}")
        kd_signals = list(dict.fromkeys(kd_signals))[:10]  # dedupe, keep order, cap
        result["kit_digital"] = bool(kd_signals)
        result["kit_digital_signals"] = kd_signals

        # Contacts from anchors: mailto:/tel:/WhatsApp/social
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("mailto:"):
                em = href[7:].split("?")[0].strip().lower()
                if em and em not in result["emails"]:
                    result["emails"].append(em)
            elif href.startswith("tel:"):
                ph = re.sub(r"[^\d+]", "", href[4:])
                if ph and ph not in result["phones"]:
                    result["phones"].append(ph)
            elif "wa.me" in href or "api.whatsapp.com" in href:
                if href not in result["whatsapp"]:
                    result["whatsapp"].append(href[:80])
            else:
                for sd in SOCIAL_DOM:
                    if sd in href.lower():
                        clean = href.split("?")[0].rstrip("/")
                        if clean not in result["social_links"]:
                            result["social_links"].append(clean)
                        break
        # Regex sweeps: plain-text emails in the first 80 KB of raw HTML
        # (cheap cap against huge pages), phones in the visible text.
        for em in EMAIL_RE.findall(html[:80000]):
            em = em.lower()
            if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
                result["emails"].append(em)
        for ph in PHONE_RE.findall(visible_text):
            ph_c = re.sub(r"[\s\-]", "", ph)
            if ph_c not in result["phones"]:
                result["phones"].append(ph_c)
        # Dedupe (order-preserving) and cap every contact list at 5 entries
        for k in ["emails", "phones", "whatsapp", "social_links"]:
            result[k] = list(dict.fromkeys(result[k]))[:5]

        # CMS detection (project helper; imported lazily, presumably to avoid
        # an import cycle with app.enricher)
        from app.enricher import detect_cms
        result["cms"] = detect_cms(html, dict(resp.headers))

    except Exception as e:
        # Best-effort analysis: record the failure, keep whatever was filled in.
        result["error"] = str(e)[:300]

    # ── Sitemap & robots (parallel; run even if the main fetch failed) ────────
    async def _check_url(url: str) -> Optional[str]:
        """Return the body of *url* on HTTP 200, else None (errors swallowed)."""
        try:
            async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                r = await c.get(url)
                return r.text if r.status_code == 200 else None
        except Exception:
            return None

    sitemap_txt, robots_txt = await asyncio.gather(
        _check_url(f"https://{domain}/sitemap.xml"),
        _check_url(f"https://{domain}/robots.txt"),
    )
    result["has_sitemap"] = sitemap_txt is not None
    result["has_robots"] = robots_txt is not None
    if robots_txt:
        robots_lower = robots_txt.lower()
        # Coarse heuristic: both tokens anywhere in robots.txt, not a real parse.
        result["robots_disallows_google"] = (
            "disallow: /" in robots_lower and "googlebot" in robots_lower
        )

    # ── SSL certificate (blocking socket work pushed to a worker thread) ──────
    import ssl as _ssl, socket as _socket
    try:
        def _ssl_check():
            import datetime as _dt
            ctx = _ssl.create_default_context()
            with _socket.create_connection((domain, 443), timeout=5) as s:
                with ctx.wrap_socket(s, server_hostname=domain) as ss:
                    cert = ss.getpeercert()
            # notAfter is naive UTC, e.g. "Jun  1 12:00:00 2025 GMT"
            exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
            # Fix: compute (expiry - now) directly. The previous
            # (now - expiry).days * -1 over-counted by one day whenever a
            # fractional day remained, because timedelta.days floors.
            now = _dt.datetime.now(_dt.timezone.utc).replace(tzinfo=None)
            return True, (exp - now).days

        loop = asyncio.get_running_loop()  # get_event_loop() is deprecated in coroutines
        result["ssl_valid"], result["ssl_expiry_days"] = await loop.run_in_executor(None, _ssl_check)
    except Exception:
        pass  # unreachable port 443 / invalid cert simply leaves ssl_valid=False

    return result