fix: AI worker crash-proof + GDPR/hosting/accessibility analysis

AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — no exception handler around while loop
- Added try/except around entire loop body with exc_info logging
- Added watchdog task that checks every 10 seconds and restarts dead
  workers (pattern sketched after this list)
- ensure_workers_alive() called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not closure) — avoids
  subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint: shows worker alive status, task exception,
  last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com for ASN, org, ISP, host country
- EU hosting detection (27 EU member states + EEA + adequacy countries;
  lookup sketched after this list)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda,
  Borlabs, CookieYes, Complianz, Usercentrics + text signals
- Privacy policy and GDPR text presence
- Accessibility: html lang missing, images without alt count,
  skip nav link, empty links, inputs without labels
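
For context, the hosting lookup is essentially this, condensed from `_get_hosting_info` in the diff below (the ip-api.com field list is the one the analyzer requests; the country set is abridged here):

```python
import asyncio
import socket

import httpx

# Abridged country set; site_analyzer.py carries the full EU + EEA + adequacy list.
EU_COUNTRIES = {"ES", "FR", "DE", "IT", "PT", "NL", "IE", "NO", "CH", "GB"}


async def hosting_info(domain: str) -> dict:
    """Resolve the IP off the event loop, then ask ip-api.com who hosts it."""
    loop = asyncio.get_event_loop()
    ip = await loop.run_in_executor(None, socket.gethostbyname, domain)  # blocking call, so use executor
    async with httpx.AsyncClient(timeout=6) as client:
        r = await client.get(
            f"http://ip-api.com/json/{ip}",  # free tier is HTTP-only
            params={"fields": "status,country,countryCode,regionName,org,as,isp"},
        )
    d = r.json()
    return {
        "ip": ip,
        "asn": d.get("as"),  # e.g. "AS16276 OVH SAS"
        "org": d.get("org"),
        "isp": d.get("isp"),
        "ip_country": d.get("countryCode"),
        "eu_hosted": d.get("countryCode") in EU_COUNTRIES,
    }

# asyncio.run(hosting_info("example.com"))
```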

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance,
  accessibility_issues[] (example after this list)
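
```python
# Hypothetical, trimmed Gemini response showing only the three new fields
# (illustrative values, not real model output):
assessment_excerpt = {
    "hosting_notes": "AS16276 OVH SAS, hosted in FR: EU hosted, no concerns",
    "gdpr_compliance": "Cookiebot banner present, privacy policy linked; no obvious gaps",
    "accessibility_issues": [
        "html lang attribute missing",
        "14 images without alt text",
        "3 form inputs without labels",
    ],
}
```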

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 18:01:34 +02:00
parent 5ad8259c75
commit 60c9b495ae
10 changed files with 409 additions and 205 deletions

5 binary files not shown.

View File: app/enricher.py

```diff
@@ -338,14 +338,42 @@ async def worker_loop():
 # ── AI assessment worker ──────────────────────────────────────────────────────
-async def ai_worker_loop():
+async def _assess_one(domain: str) -> None:
+    """Process a single AI assessment — safe to call concurrently."""
     from app.replicate_ai import assess_domain as gemini_assess
     from app.site_analyzer import analyze_site
+    logger.info("AI: starting analysis for %s", domain)
+    try:
+        analysis = await analyze_site(domain)
+        logger.info("AI: site analyzed %s (reachable=%s, words=%s)",
+                    domain, analysis.get("reachable"), analysis.get("word_count"))
+        assessment = await gemini_assess(analysis)
+        logger.info("AI: Gemini done %s → quality=%s",
+                    domain, assessment.get("lead_quality"))
+        await save_ai_assessment(domain, assessment, site_analysis=analysis)
+        logger.info("AI: saved %s", domain)
+    except Exception as e:
+        logger.error("AI: failed %s: %s", domain, e, exc_info=True)
+        try:
+            async with aiosqlite.connect(SQLITE_PATH) as db:
+                await db.execute(
+                    "UPDATE ai_queue SET status='failed', completed_at=datetime('now'), error=? WHERE domain=?",
+                    (str(e)[:400], domain),
+                )
+                await db.commit()
+        except Exception:
+            pass
+
+
+async def ai_worker_loop():
+    logger.info("AI worker loop starting")
     while True:
+        rows = []
+        try:
             async with aiosqlite.connect(SQLITE_PATH) as db:
                 async with db.execute(
-                    "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
+                    "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 5"
                 ) as cur:
                     rows = await cur.fetchall()
                 if rows:
@@ -354,37 +382,44 @@ async def ai_worker_loop():
                         [(r[0],) for r in rows],
                     )
                     await db.commit()
+                    logger.info("AI worker: picked up %d jobs: %s",
+                                len(rows), [r[0] for r in rows])
+        except Exception as e:
+            logger.error("AI worker DB error: %s", e, exc_info=True)
+            await asyncio.sleep(5)
+            continue
+
         if not rows:
             await asyncio.sleep(3)
             continue
-        async def assess_one(domain: str):
-            try:
-                # Always do a fresh deep scrape — no pre-enrichment required
-                analysis = await analyze_site(domain)
-                assessment = await gemini_assess(analysis)
-                await save_ai_assessment(domain, assessment, site_analysis=analysis)
-                logger.info("AI done: %s → %s", domain, assessment.get("lead_quality"))
-            except Exception as e:
-                async with aiosqlite.connect(SQLITE_PATH) as db:
-                    await db.execute(
-                        "UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
-                        (domain,),
-                    )
-                    await db.commit()
-                logger.error("AI worker error %s: %s", domain, e)
-
-        # AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
-        await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
+        # Run assessments concurrently (semaphore in replicate_ai enforces AI_CONCURRENCY)
+        results = await asyncio.gather(
+            *[_assess_one(r[0]) for r in rows],
+            return_exceptions=True,
+        )
+        for r, exc in zip(rows, results):
+            if isinstance(exc, Exception):
+                logger.error("AI task exception for %s: %s", r[0], exc, exc_info=exc)


 def start_worker():
     global _worker_task, _ai_worker_task
     if _worker_task is None or _worker_task.done():
         _worker_task = asyncio.create_task(worker_loop())
+        logger.info("Enrichment worker started")
     if _ai_worker_task is None or _ai_worker_task.done():
+        if _ai_worker_task is not None and _ai_worker_task.done():
+            exc = _ai_worker_task.exception() if not _ai_worker_task.cancelled() else None
+            if exc:
+                logger.error("AI worker died with: %s", exc, exc_info=exc)
         _ai_worker_task = asyncio.create_task(ai_worker_loop())
+        logger.info("AI worker started/restarted")
+
+
+def ensure_workers_alive():
+    """Restart workers if they've died — call periodically."""
+    start_worker()


 def pause_worker():
```

View File (FastAPI app: lifespan + API routes)

```diff
@@ -20,7 +20,7 @@ from app.db import (
     queue_domains, get_queue_status, build_duckdb_index, index_status,
     queue_ai, get_ai_queue_status, save_ai_assessment,
 )
-from app.enricher import start_worker, pause_worker, resume_worker, is_running
+from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
 from app.scorer import run_scoring

 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -61,13 +61,20 @@ async def download_parquet():
     logger.info("Parquet download complete")


+async def _watchdog():
+    """Restart workers if they die — checks every 10 seconds."""
+    while True:
+        await asyncio.sleep(10)
+        ensure_workers_alive()
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     await download_parquet()
     await init_db()
+    # Build DuckDB index in background — queries still work (slower) while building
     asyncio.create_task(build_duckdb_index())
     start_worker()
+    asyncio.create_task(_watchdog())
     logger.info("DomGod ready on port 6677")
     yield
@@ -167,9 +174,43 @@ async def ai_assess_batch(body: dict):
     if not domains_list:
         return JSONResponse({"error": "no domains provided"}, status_code=400)
     await queue_ai(domains_list)
+    ensure_workers_alive()  # ensure AI worker is alive when jobs are queued
     return {"queued": len(domains_list)}


+@app.post("/api/ai/worker/restart")
+async def ai_worker_restart():
+    ensure_workers_alive()
+    return {"status": "restarted"}
+
+
+@app.get("/api/ai/debug")
+async def ai_debug():
+    """Returns worker state + last 10 queue entries for troubleshooting."""
+    from app.enricher import _ai_worker_task
+    task_alive = _ai_worker_task is not None and not _ai_worker_task.done()
+    task_exc = None
+    if _ai_worker_task and _ai_worker_task.done() and not _ai_worker_task.cancelled():
+        try:
+            task_exc = str(_ai_worker_task.exception())
+        except Exception:
+            pass
+    async with aiosqlite.connect(SQLITE_PATH) as db:
+        db.row_factory = aiosqlite.Row
+        async with db.execute(
+            "SELECT domain, status, created_at, completed_at, error FROM ai_queue ORDER BY created_at DESC LIMIT 10"
+        ) as cur:
+            recent = [dict(r) async for r in cur]
+    return {
+        "ai_worker_alive": task_alive,
+        "ai_worker_exception": task_exc,
+        "recent_queue": recent,
+        "queue_status": await get_ai_queue_status(),
+    }
+
+
 @app.get("/api/ai/status")
 async def ai_status():
     return await get_ai_queue_status()
```
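
A quick way to exercise the two new endpoints from a script (assumes a local instance on port 6677, per the lifespan log line; the response keys match the ai_debug handler above):

```python
import httpx

BASE = "http://localhost:6677"  # port from the lifespan log line

# Poll worker health; keys match the ai_debug handler.
d = httpx.get(f"{BASE}/api/ai/debug").json()
print("worker alive:", d["ai_worker_alive"], "| last exception:", d["ai_worker_exception"])
for row in d["recent_queue"]:
    print(f'{row["domain"]:30} {row["status"]:10} err={row["error"] or "-"}')

if not d["ai_worker_alive"]:
    httpx.post(f"{BASE}/api/ai/worker/restart")
```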

View File: app/replicate_ai.py

```diff
@@ -25,7 +25,6 @@ def _sem() -> asyncio.Semaphore:

 def _build_prompt(a: dict) -> str:
-    """Build the Gemini prompt from a full site analysis dict."""
     contacts_block = []
     if a.get("emails"):      contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
     if a.get("phones"):      contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
@@ -33,80 +32,98 @@ def _build_prompt(a: dict) -> str:
     if a.get("social_links"):contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
     contacts_str = "\n".join(contacts_block) or " None found"
-    kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
-    analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
-    webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
+    kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None"
+    analytics = ", ".join(a.get("analytics_present") or []) or "none"
+    webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
     lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
-    placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
-
-    text_snippet = (a.get("visible_text_snippet") or "")[:2000]
+    ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
+    snippet = (a.get("visible_text_snippet") or "")[:2000]
+    eu_hosted = a.get("eu_hosted")
+    hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")

-    return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
+    return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.

-=== TECHNICAL SNAPSHOT ===
+=== TECHNICAL ===
 Domain: {a.get("domain")}
 Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
+Final URL: {a.get("final_url")}
 Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
-SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
-Mobile viewport: {a.get("has_mobile_viewport")}
-Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
+SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
+Mobile: viewport={a.get("has_mobile_viewport")}
+Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}

-=== SEO & INDEXING SIGNALS ===
-Page title: {a.get("page_title") or "missing"}
-H1: {a.get("h1_text") or "missing"}
-Meta description: {a.get("meta_description") or "missing"}
-Canonical URL: {a.get("canonical_url") or "not set"}
-Sitemap.xml: {a.get("has_sitemap")}
-Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
-Analytics: {analytics_str}
-Webmaster verified:{webmaster_str}
+=== HOSTING & INFRASTRUCTURE ===
+IP: {a.get("ip") or "unknown"}
+ASN: {a.get("asn") or "unknown"}
+Organisation: {a.get("org") or "unknown"}
+ISP: {a.get("isp") or "unknown"}
+Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
+EU hosted: {hosting_flag}
+
+=== SEO & INDEXING ===
+Title: {a.get("page_title") or "MISSING"}
+H1: {a.get("h1_text") or "MISSING"}
+Meta desc: {a.get("meta_description") or "MISSING"}
+Canonical: {a.get("canonical_url") or "not set"}
+Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
+Analytics: {analytics}
+Webmaster: {webmaster}
+
+=== GDPR & LEGAL COMPLIANCE ===
+Cookie tool: {a.get("cookie_tool") or "none detected"}
+Cookie notice: {a.get("has_cookie_notice")}
+Privacy policy: {a.get("has_privacy_policy")}
+GDPR text: {a.get("has_gdpr_text")}
+
+=== ACCESSIBILITY (quick scan) ===
+HTML lang attr: {a.get("html_lang") or "MISSING"}
+Images missing alt: {a.get("images_missing_alt")}
+Skip navigation link: {a.get("has_skip_nav")}
+Empty links: {a.get("empty_links")}
+Inputs without labels: {a.get("inputs_without_labels")}

 === CONTENT QUALITY ===
-Lorem ipsum found: {a.get("has_lorem_ipsum")} — matches: {lorem_str}
-Placeholder text: {a.get("has_placeholder")} — matches: {placeholder_str}
+Lorem ipsum: {a.get("has_lorem_ipsum")} — {lorem_str}
+Placeholder: {a.get("has_placeholder")} — {ph_str}

-=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
+=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
 Detected: {a.get("kit_digital")}
-Signals:
 {kd_str}

 === CONTACT CHANNELS ===
 {contacts_str}

-=== PAGE TEXT SAMPLE (first 2000 chars) ===
-{text_snippet}
+=== PAGE TEXT SAMPLE ===
+{snippet}

-=== TASK ===
-Analyse this site for IT services upsell potential. The client sells:
-web design/redesign, SEO, hosting migration, SSL renewal, security audits,
-maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
+=== INSTRUCTIONS ===
+The client sells: web redesign, SEO, hosting migration, SSL renewal,
+security audits, GDPR compliance, accessibility fixes, Google Ads,
+maintenance contracts, AI tools for SMEs.

-Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
+Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
 {{
-"summary": "2-3 sentence executive summary of the site's current state",
-"site_quality_score": <0-10 integer>,
-"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
-"performance_notes": "comment on load time, page size, mobile readiness",
-"seo_status": "brief SEO assessment — indexing signals, missing elements",
+"summary": "2-3 sentence executive summary of the site's state",
+"site_quality_score": <0-10>,
+"content_issues": ["specific issues found in page content"],
+"performance_notes": "load time, size, mobile assessment",
+"seo_status": "SEO health — what's missing or broken",
+"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
+"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
+"accessibility_issues": ["specific a11y problems found"],
 "kit_digital_confirmed": true/false,
-"kit_digital_reasoning": "1 sentence — why confirmed or not",
+"kit_digital_reasoning": "1 sentence",
 "is_local_sme": true/false,
 "lead_quality": "HOT|WARM|COLD",
-"lead_reasoning": "1-2 sentences on why",
+"lead_reasoning": "1-2 sentences",
 "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
-"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
-"all_contacts": {{
-  "emails": [],
-  "phones": [],
-  "whatsapp": [],
-  "social": []
-}},
-"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
+"best_contact_value": "actual email/phone/URL or empty string",
+"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
+"pitch_angle": "1 cold-outreach sentence in Spanish",
 "services_needed": ["service1","service2"],
-"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
-"outreach_notes": "Key context for the sales rep"
+"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
+"outreach_notes": "sales rep context"
 }}"""


 def _parse_output(raw: str) -> dict:
```
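
_parse_output's body sits outside this hunk; for reference, a tolerant parser for this kind of prompt typically looks like the following (a hypothetical sketch, not the actual implementation):

```python
import json
import re


def parse_model_json(raw: str) -> dict:
    """Tolerant JSON extraction (hypothetical sketch, not the real _parse_output)."""
    text = raw.strip()
    # Strip ```json ... ``` fences in case the model ignores the "no fences" instruction.
    text = re.sub(r"^```(?:json)?\s*|\s*```$", "", text)
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span.
        m = re.search(r"\{.*\}", text, re.DOTALL)
        if m:
            return json.loads(m.group(0))
        raise
```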

View File: app/site_analyzer.py

```diff
@@ -1,8 +1,9 @@
-"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
+"""Deep site analysis: content quality, SEO, hosting, GDPR, accessibility."""
 import asyncio
 import re
 import time
 import logging
+import socket
 from typing import Optional

 import httpx
@@ -10,25 +11,58 @@ from bs4 import BeautifulSoup

 logger = logging.getLogger(__name__)

-# ── Content quality ───────────────────────────────────────────────────────────
+# ── EU countries (hosting check) ─────────────────────────────────────────────
+EU_COUNTRIES = {
+    'AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR',
+    'HU','IE','IT','LV','LT','LU','MT','NL','PL','PT','RO','SK',
+    'SI','ES','SE',
+    'NO','IS','LI',  # EEA
+    'CH','GB','AD',  # adequacy / adjacent
+}
+
+# ── Content quality ───────────────────────────────────────────────────────────
 LOREM_PHRASES = [
     "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
     "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
     "vestibulum commodo", "fusce nunc lacus", "consectetuer",
     "cras ornare tristique", "ntulla nec ante", "risus id metus",
     "praesent placerat", "fusce pellentesque", "suscipit nibh",
-    "integer vitae libero", "felis quis tortor",
+    "integer vitae libero", "felis quis tortor", "dolor sit amet",
 ]
 PLACEHOLDER_PHRASES = [
     "under construction", "coming soon", "sample page",
-    "this is a demo", "default post", "hello world",
-    "test post", "uncategorized",
+    "this is a demo", "hello world", "test content",
+    "default post", "uncategorized", "demo content",
 ]

-# ── Analytics & webmaster tags ───────────────────────────────────────────────
+# ── Cookie / GDPR consent tools ───────────────────────────────────────────────
+COOKIE_TOOLS = {
+    "cookiebot": ["cookiebot.com", "CookieConsent", "CybotCookiebot"],
+    "onetrust": ["onetrust", "otBannerSdk"],
+    "cookiepro": ["cookiepro.com"],
+    "osano": ["osano.com"],
+    "iubenda": ["iubenda.com"],
+    "borlabs": ["borlabs-cookie"],
+    "complianz": ["complianz"],
+    "cookieyes": ["cookieyes.com", "cookie-law-info"],
+    "usercentrics": ["usercentrics.com"],
+    "quantcast": ["quantcast.com/cmp"],
+}
+COOKIE_TEXT_SIGNALS = [
+    "accept cookies", "acepta las cookies", "we use cookies", "usamos cookies",
+    "cookie policy", "política de cookies", "cookie settings", "manage cookies",
+    "aceptar todas", "rechazar cookies",
+]
+PRIVACY_SIGNALS = [
+    "privacy policy", "política de privacidad", "aviso legal",
+    "privacy notice", "data protection",
+]
+GDPR_TEXT_SIGNALS = [
+    "rgpd", "gdpr", "reglamento general de protección",
+    "lopd", "protección de datos", "responsable del tratamiento",
+]
+
+# ── Analytics / webmaster ─────────────────────────────────────────────────────
 ANALYTICS = {
     "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
     "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
@@ -36,13 +70,13 @@ ANALYTICS = {
     "hotjar": ["static.hotjar.com"],
     "clarity": ["clarity.ms/tag"],
 }
 WEBMASTER = {
-    "google_search_console": ['google-site-verification'],
-    "bing_webmaster": ['msvalidate.01'],
-    "yandex": ['yandex-verification'],
+    "google_search_console": ["google-site-verification"],
+    "bing_webmaster": ["msvalidate.01"],
+    "yandex": ["yandex-verification"],
 }

+# ── Kit Digital ───────────────────────────────────────────────────────────────
 KIT_IMG_PATS = [
     "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
     "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
@@ -56,59 +90,78 @@ KIT_TEXT_PATS = [
 EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
 PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
-SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
+SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com",
+              "twitter.com", "x.com", "tiktok.com", "youtube.com"]
+
+
+async def _get_hosting_info(domain: str) -> dict:
+    """Resolve IP, then look up ASN / org / country via ip-api.com."""
+    info = {"ip": None, "asn": None, "org": None, "isp": None,
+            "ip_country": None, "ip_region": None, "eu_hosted": None}
+    try:
+        loop = asyncio.get_event_loop()
+        ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
+        info["ip"] = ip
+        async with httpx.AsyncClient(timeout=6) as client:
+            r = await client.get(
+                f"http://ip-api.com/json/{ip}",
+                params={"fields": "status,country,countryCode,regionName,org,as,isp"},
+            )
+            if r.status_code == 200:
+                d = r.json()
+                if d.get("status") == "success":
+                    info.update({
+                        "asn": d.get("as"),
+                        "org": d.get("org"),
+                        "isp": d.get("isp"),
+                        "ip_country": d.get("countryCode"),
+                        "ip_region": d.get("regionName"),
+                        "eu_hosted": d.get("countryCode") in EU_COUNTRIES,
+                    })
+    except Exception as e:
+        logger.debug("Hosting lookup failed for %s: %s", domain, e)
+    return info
+

 async def analyze_site(domain: str) -> dict:
-    """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
     result = {
         "domain": domain,
-        "reachable": False,
-        "load_time_ms": None,
-        "status_code": None,
-        "final_url": None,
-        "page_size_kb": None,
-        "server": None,
-        "cms": None,
-        "ssl_valid": False,
-        "ssl_expiry_days": None,
+        "reachable": False, "load_time_ms": None, "status_code": None,
+        "final_url": None, "page_size_kb": None, "server": None, "cms": None,
+        # Hosting
+        "ip": None, "asn": None, "org": None, "isp": None,
+        "ip_country": None, "ip_region": None, "eu_hosted": None,
+        # SSL
+        "ssl_valid": False, "ssl_expiry_days": None,
         # Content quality
-        "has_lorem_ipsum": False,
-        "lorem_matches": [],
-        "has_placeholder": False,
-        "placeholder_matches": [],
-        "word_count": 0,
-        "image_count": 0,
-        "broken_images": 0,
-        "script_count": 0,
+        "has_lorem_ipsum": False, "lorem_matches": [],
+        "has_placeholder": False, "placeholder_matches": [],
+        "word_count": 0, "image_count": 0, "script_count": 0,
         "has_mobile_viewport": False,
-        "page_title": None,
-        "meta_description": None,
-        "h1_text": None,
+        "page_title": None, "meta_description": None, "h1_text": None,
         "visible_text_snippet": "",
-        # SEO / webmaster
-        "has_sitemap": False,
-        "has_robots": False,
-        "robots_disallows_google": False,
-        "analytics_present": [],
-        "webmaster_verified": [],
-        "canonical_url": None,
-        "og_title": None,
+        # SEO
+        "has_sitemap": False, "has_robots": False, "robots_disallows_google": False,
+        "analytics_present": [], "webmaster_verified": [],
+        "canonical_url": None, "og_title": None,
+        # GDPR / cookies
+        "cookie_tool": None, "has_cookie_notice": False,
+        "has_privacy_policy": False, "has_gdpr_text": False,
+        # Accessibility
+        "html_lang": None, "images_missing_alt": 0,
+        "has_skip_nav": False, "empty_links": 0,
+        "inputs_without_labels": 0,
         # Kit Digital
-        "kit_digital": False,
-        "kit_digital_signals": [],
+        "kit_digital": False, "kit_digital_signals": [],
         # Contacts
-        "emails": [],
-        "phones": [],
-        "whatsapp": [],
-        "social_links": [],
-        # Errors
+        "emails": [], "phones": [], "whatsapp": [], "social_links": [],
         "error": None,
     }

-    # ── Fetch main page ───────────────────────────────────────────────────────
-    try:
-        t0 = time.monotonic()
+    # ── Fetch + hosting (parallel) ────────────────────────────────────────────
+    async def _fetch():
+        t0 = time.monotonic()
+        try:
             async with httpx.AsyncClient(
                 timeout=15, follow_redirects=True, verify=False,
                 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
@@ -116,12 +169,20 @@ async def analyze_site(domain: str) -> dict:
                 resp = await client.get(f"https://{domain}")
                 if resp.status_code >= 400:
                     resp = await client.get(f"http://{domain}")
+                return resp, int((time.monotonic() - t0) * 1000)
+        except Exception:
+            return None, int((time.monotonic() - t0) * 1000)

-        load_ms = int((time.monotonic() - t0) * 1000)
+    (resp, load_ms), hosting = await asyncio.gather(_fetch(), _get_hosting_info(domain))
+    result.update(hosting)
+    result["load_time_ms"] = load_ms
+    if resp is None:
+        result["error"] = "Failed to fetch site"
+    else:
         html = resp.text
         result.update({
             "reachable": resp.status_code < 400,
-            "load_time_ms": load_ms,
             "status_code": resp.status_code,
             "final_url": str(resp.url),
             "page_size_kb": round(len(resp.content) / 1024, 1),
@@ -131,46 +192,42 @@ async def analyze_site(domain: str) -> dict:
         soup = BeautifulSoup(html, "html.parser")
         hl = html.lower()

-        # Title, meta
-        title_tag = soup.find("title")
-        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
-        meta_desc = soup.find("meta", attrs={"name": "description"})
-        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
+        # ── Basic metadata ────────────────────────────────────────────────────
+        result["html_lang"] = (soup.find("html") or {}).get("lang")
+        t = soup.find("title")
+        result["page_title"] = t.get_text(strip=True)[:200] if t else None
+        md = soup.find("meta", attrs={"name": "description"})
+        result["meta_description"] = (md.get("content") or "")[:300] if md else None
         h1 = soup.find("h1")
         result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
-        # Mobile viewport
         result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
-
-        # Canonical + OG
-        canon = soup.find("link", rel="canonical")
-        result["canonical_url"] = canon.get("href") if canon else None
+        c = soup.find("link", rel="canonical")
+        result["canonical_url"] = c.get("href") if c else None
         og = soup.find("meta", property="og:title")
         result["og_title"] = og.get("content") if og else None

-        # Visible text
+        # ── Visible text ──────────────────────────────────────────────────────
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
-        visible_text = soup.get_text(separator=" ", strip=True)
-        words = visible_text.split()
+        visible = soup.get_text(separator=" ", strip=True)
+        vl = visible.lower()
+        words = visible.split()
         result["word_count"] = len(words)
-        result["visible_text_snippet"] = " ".join(words[:500])
+        result["visible_text_snippet"] = " ".join(words[:600])

-        # Lorem ipsum / placeholder detection
-        vl = visible_text.lower()
+        # ── Content quality ───────────────────────────────────────────────────
         lorem_hits = [p for p in LOREM_PHRASES if p in vl]
         result["has_lorem_ipsum"] = len(lorem_hits) > 0
-        result["lorem_matches"] = lorem_hits[:5]
+        result["lorem_matches"] = lorem_hits[:6]
         ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
         result["has_placeholder"] = len(ph_hits) > 0
         result["placeholder_matches"] = ph_hits[:3]

-        # Images & scripts
         imgs = soup.find_all("img")
         result["image_count"] = len(imgs)
         result["script_count"] = len(soup.find_all("script", src=True))

-        # Analytics / webmaster tags
+        # ── Analytics / webmaster ─────────────────────────────────────────────
         for name, sigs in ANALYTICS.items():
             if any(s.lower() in hl for s in sigs):
                 result["analytics_present"].append(name)
@@ -178,12 +235,42 @@ async def analyze_site(domain: str) -> dict:
             if any(s.lower() in hl for s in sigs):
                 result["webmaster_verified"].append(name)

-        # Kit Digital
+        # ── GDPR / cookies ────────────────────────────────────────────────────
+        for tool, sigs in COOKIE_TOOLS.items():
+            if any(s.lower() in hl for s in sigs):
+                result["cookie_tool"] = tool
+                result["has_cookie_notice"] = True
+                break
+        if not result["has_cookie_notice"]:
+            result["has_cookie_notice"] = any(s in vl for s in COOKIE_TEXT_SIGNALS)
+        result["has_privacy_policy"] = any(s in vl for s in PRIVACY_SIGNALS) or bool(
+            soup.find("a", href=lambda h: h and ("privacidad" in h.lower() or "privacy" in h.lower()))
+        )
+        result["has_gdpr_text"] = any(s in vl for s in GDPR_TEXT_SIGNALS)
+
+        # ── Accessibility ─────────────────────────────────────────────────────
+        result["images_missing_alt"] = sum(
+            1 for img in imgs if not img.get("alt") and img.get("alt") != ""
+        )
+        result["has_skip_nav"] = bool(
+            soup.find("a", href=lambda h: h and h.lower() in ("#main", "#content", "#maincontent", "#skip"))
+        )
+        result["empty_links"] = sum(
+            1 for a in soup.find_all("a") if not a.get_text(strip=True) and not a.find("img")
+        )
+        all_inputs = soup.find_all("input", type=lambda t: t not in ("hidden", "submit", "button", None) or t is None)
+        labeled_ids = {lbl.get("for") for lbl in soup.find_all("label") if lbl.get("for")}
+        result["inputs_without_labels"] = sum(
+            1 for inp in all_inputs
+            if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby")
+        )
+
+        # ── Kit Digital ───────────────────────────────────────────────────────
         kd_signals = []
-        for img in imgs:
-            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
+        for img in soup.find_all("img"):
+            comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
             for p in KIT_IMG_PATS:
-                if p in combined:
+                if p in comb:
                     kd_signals.append(f"img:{p}")
                     break
         for p in KIT_TEXT_PATS:
@@ -197,7 +284,7 @@ async def analyze_site(domain: str) -> dict:
         result["kit_digital"] = len(kd_signals) > 0
         result["kit_digital_signals"] = kd_signals

-        # Contacts
+        # ── Contacts ──────────────────────────────────────────────────────────
         for a in soup.find_all("a", href=True):
             href = a["href"]
             if href.startswith("mailto:"):
@@ -222,23 +309,34 @@ async def analyze_site(domain: str) -> dict:
                 em = em.lower()
                 if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]):
                     result["emails"].append(em)
-        for ph in PHONE_RE.findall(visible_text):
+        for ph in PHONE_RE.findall(visible):
             ph_c = re.sub(r"[\s\-]", "", ph)
             if ph_c not in result["phones"]:
                 result["phones"].append(ph_c)
-        # Cap
         for k in ["emails", "phones", "whatsapp", "social_links"]:
             result[k] = list(dict.fromkeys(result[k]))[:5]

-        # CMS
-        from app.enricher import detect_cms
-        result["cms"] = detect_cms(html, dict(resp.headers))
-
-    except Exception as e:
-        result["error"] = str(e)[:300]
+        # ── CMS ───────────────────────────────────────────────────────────────
+        CMS_SIGS = {
+            "wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
+            "joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
+            "drupal": ["/sites/default/files/", "Drupal.settings"],
+            "wix": ["static.wixstatic.com", "X-Wix-"],
+            "squarespace": ["squarespace.com", "X-Squarespace-"],
+            "shopify": ["cdn.shopify.com", "Shopify.theme"],
+            "prestashop": ["PrestaShop", "/modules/prestashop"],
+            "magento": ["Mage.Cookies", "X-Magento-"],
+            "typo3": ["typo3temp", "TYPO3 CMS"],
+            "opencart": ["route=common/home", "OpenCart"],
+        }
+        combined_check = html[:60000] + " ".join(f"{k}:{v}" for k, v in resp.headers.items())
+        for cms, sigs in CMS_SIGS.items():
+            if any(s.lower() in combined_check.lower() for s in sigs):
+                result["cms"] = cms
+                break

     # ── Sitemap & robots (parallel) ───────────────────────────────────────────
-    async def _check_url(url: str) -> Optional[str]:
+    async def _get(url):
         try:
             async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                 r = await c.get(url)
@@ -247,24 +345,22 @@ async def analyze_site(domain: str) -> dict:
             return None

     sitemap_txt, robots_txt = await asyncio.gather(
-        _check_url(f"https://{domain}/sitemap.xml"),
-        _check_url(f"https://{domain}/robots.txt"),
+        _get(f"https://{domain}/sitemap.xml"),
+        _get(f"https://{domain}/robots.txt"),
     )
     result["has_sitemap"] = sitemap_txt is not None
     result["has_robots"] = robots_txt is not None
     if robots_txt:
-        robots_lower = robots_txt.lower()
-        result["robots_disallows_google"] = (
-            "disallow: /" in robots_lower and "googlebot" in robots_lower
-        )
+        rl = robots_txt.lower()
+        result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl

     # ── SSL ───────────────────────────────────────────────────────────────────
-    import ssl as _ssl, socket as _socket
+    import ssl as _ssl
     try:
         def _ssl_check():
             import datetime as _dt
             ctx = _ssl.create_default_context()
-            with _socket.create_connection((domain, 443), timeout=5) as s:
+            with socket.create_connection((domain, 443), timeout=5) as s:
                 with ctx.wrap_socket(s, server_hostname=domain) as ss:
                     cert = ss.getpeercert()
                     exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
```
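
A one-off smoke test of the new analyzer signals (a sketch; run from the project root so app.site_analyzer imports):

```python
import asyncio
import json

from app.site_analyzer import analyze_site

KEYS = (
    "reachable", "cms", "ip", "asn", "ip_country", "eu_hosted",
    "cookie_tool", "has_cookie_notice", "has_privacy_policy",
    "html_lang", "images_missing_alt", "inputs_without_labels",
)

async def main():
    a = await analyze_site("example.com")
    print(json.dumps({k: a[k] for k in KEYS}, indent=2, ensure_ascii=False))

asyncio.run(main())
```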

View File (dashboard HTML template, Alpine.js UI)

```diff
@@ -179,7 +179,9 @@ tr:hover td{background:rgba(255,255,255,.025)}
             <div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
             <div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
             <div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
-            <div class="mrow"><span class="mlabel">SEO status</span><span x-text="modal.ai.seo_status||'—'"></span></div>
+            <div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
+            <div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>
+            <div class="mrow"><span class="mlabel">GDPR</span><span :style="(!modal.sa?.has_cookie_notice)?'color:var(--danger)':''" x-text="modal.ai.gdpr_compliance||'—'"></span></div>

             <!-- Content issues -->
             <div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
@@ -225,6 +227,14 @@ tr:hover td{background:rgba(255,255,255,.025)}
               <div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
             </div>

+            <!-- Accessibility issues -->
+            <div x-show="(modal.ai.accessibility_issues||[]).length>0" style="margin:8px 0">
+              <div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Accessibility Issues</div>
+              <template x-for="issue in (modal.ai.accessibility_issues||[])">
+                <div style="font-size:12px;color:var(--warn);padding:2px 0"><span x-text="issue"></span></div>
+              </template>
+            </div>
+
             <div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
             <div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
@@ -431,7 +441,11 @@ tr:hover td{background:rgba(255,255,255,.025)}
           <div style="font-size:12px;color:var(--muted);margin-bottom:8px">
             Auto-assesses enriched domains via Gemini. Detects Kit Digital confirmation, extracts best contact channel, writes pitch.
           </div>
+          <div style="display:flex;gap:6px;flex-wrap:wrap">
             <button class="btn bai" @click="aiAssessAllKD()">🤖 AI Assess all Kit Digital domains</button>
+            <button class="btn bg sm" @click="restartAiWorker()">↺ Restart AI worker</button>
+            <a class="btn bg sm" href="/api/ai/debug" target="_blank" style="text-decoration:none">🔍 Debug AI queue</a>
+          </div>
         </div>

         <div style="margin-top:18px;padding-top:14px;border-top:1px solid var(--border)">
@@ -616,6 +630,7 @@ function app() {
       try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
     },
+    async restartAiWorker() { await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
     async startEnrich() { await fetch('/api/enrich/resume',{method:'POST'}); this.notify('Worker started','success'); await this.loadQueue(); },
     async pauseEnrich() { await fetch('/api/enrich/pause',{method:'POST'}); this.notify('Worker paused','success'); await this.loadQueue(); },
     async retryFailed() { await fetch('/api/enrich/retry',{method:'POST'}); this.notify('Retrying failed','success'); await this.loadQueue(); },
```