"""Deep site analysis: content quality, SEO signals, performance, indexing hints.""" import asyncio import re import time import logging from typing import Optional import httpx from bs4 import BeautifulSoup logger = logging.getLogger(__name__) # ── Content quality ─────────────────────────────────────────────────────────── LOREM_PHRASES = [ "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien", "nulla id nibh", "aenean dignissim", "aliquam tincidunt", "vestibulum commodo", "fusce nunc lacus", "consectetuer", "cras ornare tristique", "ntulla nec ante", "risus id metus", "praesent placerat", "fusce pellentesque", "suscipit nibh", "integer vitae libero", "felis quis tortor", ] PLACEHOLDER_PHRASES = [ "under construction", "coming soon", "sample page", "this is a demo", "default post", "hello world", "test post", "uncategorized", ] # ── Analytics & webmaster tags ──────────────────────────────────────────────── ANALYTICS = { "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"], "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"], "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"], "hotjar": ["static.hotjar.com"], "clarity": ["clarity.ms/tag"], } WEBMASTER = { "google_search_console": ['google-site-verification'], "bing_webmaster": ['msvalidate.01'], "yandex": ['yandex-verification'], } KIT_IMG_PATS = [ "digitalizadores", "kit-digital", "kitdigital", "kit_digital", "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation", "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado", ] KIT_TEXT_PATS = [ "kit digital", "agente digitalizador", "fondos europeos", "next generation eu", "nextgenerationeu", "plan de recuperación", "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme", ] EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}") SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"] async def analyze_site(domain: str) -> dict: """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt.""" result = { "domain": domain, "reachable": False, "load_time_ms": None, "status_code": None, "final_url": None, "page_size_kb": None, "server": None, "cms": None, "ssl_valid": False, "ssl_expiry_days": None, # Content quality "has_lorem_ipsum": False, "lorem_matches": [], "has_placeholder": False, "placeholder_matches": [], "word_count": 0, "image_count": 0, "broken_images": 0, "script_count": 0, "has_mobile_viewport": False, "page_title": None, "meta_description": None, "h1_text": None, "visible_text_snippet": "", # SEO / webmaster "has_sitemap": False, "has_robots": False, "robots_disallows_google": False, "analytics_present": [], "webmaster_verified": [], "canonical_url": None, "og_title": None, # Kit Digital "kit_digital": False, "kit_digital_signals": [], # Contacts "emails": [], "phones": [], "whatsapp": [], "social_links": [], # Errors "error": None, } # ── Fetch main page ─────────────────────────────────────────────────────── try: t0 = time.monotonic() async with httpx.AsyncClient( timeout=15, follow_redirects=True, verify=False, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}, ) as client: resp = await client.get(f"https://{domain}") if resp.status_code >= 400: resp = await client.get(f"http://{domain}") load_ms = int((time.monotonic() - t0) * 1000) html = resp.text result.update({ "reachable": resp.status_code < 400, "load_time_ms": load_ms, "status_code": resp.status_code, "final_url": str(resp.url), "page_size_kb": round(len(resp.content) / 1024, 1), "server": resp.headers.get("server"), }) soup = BeautifulSoup(html, "html.parser") hl = html.lower() # Title, meta title_tag = soup.find("title") result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None meta_desc = soup.find("meta", attrs={"name": "description"}) result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None h1 = soup.find("h1") result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None # Mobile viewport result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"})) # Canonical + OG canon = soup.find("link", rel="canonical") result["canonical_url"] = canon.get("href") if canon else None og = soup.find("meta", property="og:title") result["og_title"] = og.get("content") if og else None # Visible text for tag in soup(["script", "style", "noscript"]): tag.decompose() visible_text = soup.get_text(separator=" ", strip=True) words = visible_text.split() result["word_count"] = len(words) result["visible_text_snippet"] = " ".join(words[:500]) # Lorem ipsum / placeholder detection vl = visible_text.lower() lorem_hits = [p for p in LOREM_PHRASES if p in vl] result["has_lorem_ipsum"] = len(lorem_hits) > 0 result["lorem_matches"] = lorem_hits[:5] ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl] result["has_placeholder"] = len(ph_hits) > 0 result["placeholder_matches"] = ph_hits[:3] # Images & scripts imgs = soup.find_all("img") result["image_count"] = len(imgs) result["script_count"] = len(soup.find_all("script", src=True)) # Analytics / webmaster tags for name, sigs in ANALYTICS.items(): if any(s.lower() in hl for s in sigs): result["analytics_present"].append(name) for name, sigs in WEBMASTER.items(): if any(s.lower() in hl for s in sigs): result["webmaster_verified"].append(name) # Kit Digital kd_signals = [] for img in imgs: combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower() for p in KIT_IMG_PATS: if p in combined: kd_signals.append(f"img:{p}") break for p in KIT_TEXT_PATS: if p in hl: kd_signals.append(f"text:{p}") for a in soup.find_all("a", href=True): href = a["href"].lower() if "acelerapyme" in href or "red.es" in href or "kit-digital" in href: kd_signals.append(f"link:{href[:50]}") kd_signals = list(dict.fromkeys(kd_signals))[:10] result["kit_digital"] = len(kd_signals) > 0 result["kit_digital_signals"] = kd_signals # Contacts for a in soup.find_all("a", href=True): href = a["href"] if href.startswith("mailto:"): em = href[7:].split("?")[0].strip().lower() if em and em not in result["emails"]: result["emails"].append(em) elif href.startswith("tel:"): ph = re.sub(r"[^\d+]", "", href[4:]) if ph and ph not in result["phones"]: result["phones"].append(ph) elif "wa.me" in href or "api.whatsapp.com" in href: if href not in result["whatsapp"]: result["whatsapp"].append(href[:80]) else: for sd in SOCIAL_DOM: if sd in href.lower(): clean = href.split("?")[0].rstrip("/") if clean not in result["social_links"]: result["social_links"].append(clean) break for em in EMAIL_RE.findall(html[:80000]): em = em.lower() if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]): result["emails"].append(em) for ph in PHONE_RE.findall(visible_text): ph_c = re.sub(r"[\s\-]", "", ph) if ph_c not in result["phones"]: result["phones"].append(ph_c) # Cap for k in ["emails", "phones", "whatsapp", "social_links"]: result[k] = list(dict.fromkeys(result[k]))[:5] # CMS from app.enricher import detect_cms result["cms"] = detect_cms(html, dict(resp.headers)) except Exception as e: result["error"] = str(e)[:300] # ── Sitemap & robots (parallel) ─────────────────────────────────────────── async def _check_url(url: str) -> Optional[str]: try: async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c: r = await c.get(url) return r.text if r.status_code == 200 else None except Exception: return None sitemap_txt, robots_txt = await asyncio.gather( _check_url(f"https://{domain}/sitemap.xml"), _check_url(f"https://{domain}/robots.txt"), ) result["has_sitemap"] = sitemap_txt is not None result["has_robots"] = robots_txt is not None if robots_txt: robots_lower = robots_txt.lower() result["robots_disallows_google"] = ( "disallow: /" in robots_lower and "googlebot" in robots_lower ) # ── SSL ─────────────────────────────────────────────────────────────────── import ssl as _ssl, socket as _socket try: def _ssl_check(): import datetime as _dt ctx = _ssl.create_default_context() with _socket.create_connection((domain, 443), timeout=5) as s: with ctx.wrap_socket(s, server_hostname=domain) as ss: cert = ss.getpeercert() exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z") return True, (_dt.datetime.utcnow() - exp).days * -1 loop = asyncio.get_event_loop() result["ssl_valid"], result["ssl_expiry_days"] = await loop.run_in_executor(None, _ssl_check) except Exception: pass return result