fix: AI worker crash-proof + GDPR/hosting/accessibility analysis
AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — there was no exception handler around the while loop
- Added try/except around the entire loop body, with exc_info logging
- Added a watchdog task that restarts dead workers every 10 seconds
- ensure_workers_alive() is called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not a closure) — avoids subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint shows worker alive status, the task exception, and the last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in the enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com lookup for ASN, org, ISP, and host country
- EU hosting detection (27 EU members + EEA + adequacy countries)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda, Borlabs, CookieYes, Complianz, and Usercentrics, plus text signals
- Privacy policy and GDPR text presence
- Accessibility quick scan: missing html lang, count of images without alt, skip-nav link, empty links, inputs without labels

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance, accessibility_issues[]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
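For triage, the two new endpoints compose into a one-file health check. A sketch — it assumes the app is reachable on localhost:6677 (the port logged at startup) and uses the field names the ai_debug handler below returns:

    # Sketch: triage a stuck AI queue via the endpoints added in this commit.
    # Assumes the server runs on localhost:6677, as logged at startup.
    import httpx

    BASE = "http://localhost:6677"

    dbg = httpx.get(f"{BASE}/api/ai/debug").json()
    print("worker alive:", dbg["ai_worker_alive"])
    print("last exception:", dbg["ai_worker_exception"])
    for row in dbg["recent_queue"]:
        print(row["domain"], row["status"], row["error"])
    if not dbg["ai_worker_alive"]:
        httpx.post(f"{BASE}/api/ai/worker/restart")  # same path as the UI button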
BIN  app/__pycache__/__init__.cpython-311.pyc (new binary file, not shown)
BIN  app/__pycache__/db.cpython-311.pyc (new binary file, not shown)
BIN  app/__pycache__/enricher.cpython-311.pyc (new binary file, not shown)
BIN  app/__pycache__/replicate_ai.cpython-311.pyc (new binary file, not shown)
BIN  app/__pycache__/site_analyzer.cpython-311.pyc (new binary file, not shown)
app/enricher.py
@@ -338,14 +338,42 @@ async def worker_loop():

 # ── AI assessment worker ──────────────────────────────────────────────────────
-async def ai_worker_loop():
+async def _assess_one(domain: str) -> None:
+    """Process a single AI assessment — safe to call concurrently."""
+    from app.replicate_ai import assess_domain as gemini_assess
+    from app.site_analyzer import analyze_site
+
+    logger.info("AI: starting analysis for %s", domain)
+    try:
+        analysis = await analyze_site(domain)
+        logger.info("AI: site analyzed %s (reachable=%s, words=%s)",
+                    domain, analysis.get("reachable"), analysis.get("word_count"))
+        assessment = await gemini_assess(analysis)
+        logger.info("AI: Gemini done %s → quality=%s",
+                    domain, assessment.get("lead_quality"))
+        await save_ai_assessment(domain, assessment, site_analysis=analysis)
+        logger.info("AI: saved %s", domain)
+    except Exception as e:
+        logger.error("AI: failed %s — %s", domain, e, exc_info=True)
+        try:
+            async with aiosqlite.connect(SQLITE_PATH) as db:
+                await db.execute(
+                    "UPDATE ai_queue SET status='failed', completed_at=datetime('now'), error=? WHERE domain=?",
+                    (str(e)[:400], domain),
+                )
+                await db.commit()
+        except Exception:
+            pass
+
+
+async def ai_worker_loop():
+    logger.info("AI worker loop starting")
     while True:
         rows = []
-        async with aiosqlite.connect(SQLITE_PATH) as db:
-            async with db.execute(
-                "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
-            ) as cur:
-                rows = await cur.fetchall()
-            if rows:
+        try:
+            async with aiosqlite.connect(SQLITE_PATH) as db:
+                async with db.execute(
+                    "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 5"
+                ) as cur:
+                    rows = await cur.fetchall()
+                if rows:
@@ -354,37 +382,44 @@ async def ai_worker_loop():
                         [(r[0],) for r in rows],
                     )
                     await db.commit()
+                    logger.info("AI worker: picked up %d jobs: %s",
+                                len(rows), [r[0] for r in rows])
+        except Exception as e:
+            logger.error("AI worker DB error: %s", e, exc_info=True)
+            await asyncio.sleep(5)
+            continue

         if not rows:
             await asyncio.sleep(3)
             continue

-        async def assess_one(domain: str):
-            try:
-                # Always do a fresh deep scrape — no pre-enrichment required
-                analysis = await analyze_site(domain)
-                assessment = await gemini_assess(analysis)
-                await save_ai_assessment(domain, assessment, site_analysis=analysis)
-                logger.info("AI done: %s → %s", domain, assessment.get("lead_quality"))
-            except Exception as e:
-                async with aiosqlite.connect(SQLITE_PATH) as db:
-                    await db.execute(
-                        "UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
-                        (domain,),
-                    )
-                    await db.commit()
-                logger.error("AI worker error %s: %s", domain, e)
-
-        # AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
-        await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
+        # Run assessments concurrently (semaphore in replicate_ai enforces AI_CONCURRENCY)
+        results = await asyncio.gather(
+            *[_assess_one(r[0]) for r in rows],
+            return_exceptions=True,
+        )
+        for r, exc in zip(rows, results):
+            if isinstance(exc, Exception):
+                logger.error("AI task exception for %s: %s", r[0], exc, exc_info=exc)


 def start_worker():
     global _worker_task, _ai_worker_task
     if _worker_task is None or _worker_task.done():
         _worker_task = asyncio.create_task(worker_loop())
         logger.info("Enrichment worker started")
     if _ai_worker_task is None or _ai_worker_task.done():
+        if _ai_worker_task is not None and _ai_worker_task.done():
+            exc = _ai_worker_task.exception() if not _ai_worker_task.cancelled() else None
+            if exc:
+                logger.error("AI worker died with: %s", exc, exc_info=exc)
         _ai_worker_task = asyncio.create_task(ai_worker_loop())
         logger.info("AI worker started/restarted")
+
+
+def ensure_workers_alive():
+    """Restart workers if they've died — call periodically."""
+    start_worker()


 def pause_worker():
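Distilled, the restart logic above is an idempotent supervisor: creating the task and restarting it are the same operation, and retrieving the exception from a dead task is what surfaces the otherwise-silent failure. A minimal self-contained sketch (illustrative names, not the app's API):

    # Sketch of the supervise-and-restart pattern used by start_worker()/_watchdog().
    import asyncio, logging

    _task: asyncio.Task | None = None

    async def worker():
        await asyncio.sleep(1)
        raise RuntimeError("simulated crash")  # unhandled → the task dies silently

    def ensure_alive() -> None:
        """Idempotent: no-op while the task runs, restart once it has died."""
        global _task
        if _task is not None and _task.done() and not _task.cancelled():
            exc = _task.exception()  # retrieve it, or asyncio warns at shutdown
            if exc:
                logging.error("worker died: %r", exc)
        if _task is None or _task.done():
            _task = asyncio.create_task(worker())

    async def main():
        for _ in range(3):            # stand-in for the 10-second watchdog loop
            ensure_alive()
            await asyncio.sleep(2)

    asyncio.run(main())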
app/main.py (45 lines changed)
@@ -20,7 +20,7 @@ from app.db import (
     queue_domains, get_queue_status, build_duckdb_index, index_status,
     queue_ai, get_ai_queue_status, save_ai_assessment,
 )
-from app.enricher import start_worker, pause_worker, resume_worker, is_running
+from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
 from app.scorer import run_scoring

 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -61,13 +61,20 @@ async def download_parquet():
     logger.info("Parquet download complete")


+async def _watchdog():
+    """Every 10 seconds, restart the workers if they have died."""
+    while True:
+        await asyncio.sleep(10)
+        ensure_workers_alive()
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     await download_parquet()
     await init_db()
     # Build DuckDB index in background — queries still work (slower) while building
     asyncio.create_task(build_duckdb_index())
     start_worker()
+    asyncio.create_task(_watchdog())
     logger.info("DomGod ready on port 6677")
     yield
@@ -167,9 +174,43 @@ async def ai_assess_batch(body: dict):
     if not domains_list:
         return JSONResponse({"error": "no domains provided"}, status_code=400)
     await queue_ai(domains_list)
+    ensure_workers_alive()  # ensure AI worker is alive when jobs are queued
     return {"queued": len(domains_list)}


+@app.post("/api/ai/worker/restart")
+async def ai_worker_restart():
+    ensure_workers_alive()
+    return {"status": "restarted"}
+
+
+@app.get("/api/ai/debug")
+async def ai_debug():
+    """Returns worker state + last 10 queue entries for troubleshooting."""
+    from app.enricher import _ai_worker_task
+    task_alive = _ai_worker_task is not None and not _ai_worker_task.done()
+    task_exc = None
+    if _ai_worker_task and _ai_worker_task.done() and not _ai_worker_task.cancelled():
+        try:
+            task_exc = str(_ai_worker_task.exception())
+        except Exception:
+            pass
+
+    async with aiosqlite.connect(SQLITE_PATH) as db:
+        db.row_factory = aiosqlite.Row
+        async with db.execute(
+            "SELECT domain, status, created_at, completed_at, error FROM ai_queue ORDER BY created_at DESC LIMIT 10"
+        ) as cur:
+            recent = [dict(r) async for r in cur]
+
+    return {
+        "ai_worker_alive": task_alive,
+        "ai_worker_exception": task_exc,
+        "recent_queue": recent,
+        "queue_status": await get_ai_queue_status(),
+    }
+
+
 @app.get("/api/ai/status")
 async def ai_status():
     return await get_ai_queue_status()
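For reference, the columns ai_debug selects imply roughly this ai_queue shape — a hypothetical reconstruction; the authoritative DDL lives in app/db.py, which this commit does not touch:

    # Hypothetical sketch of ai_queue as implied by the queries in this diff;
    # the real schema is created in app.db.init_db().
    AI_QUEUE_DDL = """
    CREATE TABLE IF NOT EXISTS ai_queue (
        domain       TEXT,
        status       TEXT,   -- values seen in this diff: 'pending', 'failed'
        created_at   TEXT,
        completed_at TEXT,
        error        TEXT
    )"""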
app/replicate_ai.py
@@ -25,88 +25,105 @@ def _sem() -> asyncio.Semaphore:


 def _build_prompt(a: dict) -> str:
     """Build the Gemini prompt from a full site analysis dict."""
     contacts_block = []
     if a.get("emails"): contacts_block.append(f"  Emails: {', '.join(a['emails'][:3])}")
     if a.get("phones"): contacts_block.append(f"  Phones: {', '.join(a['phones'][:3])}")
     if a.get("whatsapp"): contacts_block.append(f"  WhatsApp: {', '.join(a['whatsapp'][:2])}")
     if a.get("social_links"): contacts_block.append(f"  Social: {', '.join(a['social_links'][:4])}")
     contacts_str = "\n".join(contacts_block) or "  None found"

-    kd_str = "\n".join(f"  - {s}" for s in (a.get("kit_digital_signals") or [])) or "  None detected"
-    analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
-    webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
+    kd_str = "\n".join(f"  - {s}" for s in (a.get("kit_digital_signals") or [])) or "  None"
+    analytics = ", ".join(a.get("analytics_present") or []) or "none"
+    webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
     lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
-    placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
-    text_snippet = (a.get("visible_text_snippet") or "")[:2000]
+    ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
+    snippet = (a.get("visible_text_snippet") or "")[:2000]
+    eu_hosted = a.get("eu_hosted")
+    hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")

-    return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
+    return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.

-=== TECHNICAL SNAPSHOT ===
+=== TECHNICAL ===
 Domain: {a.get("domain")}
 Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
 Final URL: {a.get("final_url")}
 Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
-SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
-Mobile viewport: {a.get("has_mobile_viewport")}
-Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
+SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
+Mobile: viewport={a.get("has_mobile_viewport")}
+Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}

-=== SEO & INDEXING SIGNALS ===
-Page title: {a.get("page_title") or "missing"}
-H1: {a.get("h1_text") or "missing"}
-Meta description: {a.get("meta_description") or "missing"}
-Canonical URL: {a.get("canonical_url") or "not set"}
-Sitemap.xml: {a.get("has_sitemap")}
-Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
-Analytics: {analytics_str}
-Webmaster verified: {webmaster_str}
+=== HOSTING & INFRASTRUCTURE ===
+IP: {a.get("ip") or "unknown"}
+ASN: {a.get("asn") or "unknown"}
+Organisation: {a.get("org") or "unknown"}
+ISP: {a.get("isp") or "unknown"}
+Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
+EU hosted: {hosting_flag}
+
+=== SEO & INDEXING ===
+Title: {a.get("page_title") or "MISSING"}
+H1: {a.get("h1_text") or "MISSING"}
+Meta desc: {a.get("meta_description") or "MISSING"}
+Canonical: {a.get("canonical_url") or "not set"}
+Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
+Analytics: {analytics}
+Webmaster: {webmaster}
+
+=== GDPR & LEGAL COMPLIANCE ===
+Cookie tool: {a.get("cookie_tool") or "none detected"}
+Cookie notice: {a.get("has_cookie_notice")}
+Privacy policy: {a.get("has_privacy_policy")}
+GDPR text: {a.get("has_gdpr_text")}
+
+=== ACCESSIBILITY (quick scan) ===
+HTML lang attr: {a.get("html_lang") or "MISSING"}
+Images missing alt: {a.get("images_missing_alt")}
+Skip navigation link: {a.get("has_skip_nav")}
+Empty links: {a.get("empty_links")}
+Inputs without labels: {a.get("inputs_without_labels")}

 === CONTENT QUALITY ===
-Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
-Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
+Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str}
+Placeholder: {a.get("has_placeholder")} → {ph_str}

-=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
+=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
 Detected: {a.get("kit_digital")}
 Signals:
 {kd_str}

 === CONTACT CHANNELS ===
 {contacts_str}

-=== PAGE TEXT SAMPLE (first 2000 chars) ===
-{text_snippet}
+=== PAGE TEXT SAMPLE ===
+{snippet}

-=== TASK ===
-Analyse this site for IT services upsell potential. The client sells:
-web design/redesign, SEO, hosting migration, SSL renewal, security audits,
-maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
+=== INSTRUCTIONS ===
+The client sells: web redesign, SEO, hosting migration, SSL renewal,
+security audits, GDPR compliance, accessibility fixes, Google Ads,
+maintenance contracts, AI tools for SMEs.

-Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
+Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
 {{
-  "summary": "2-3 sentence executive summary of the site's current state",
-  "site_quality_score": <0-10 integer>,
-  "content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
-  "performance_notes": "comment on load time, page size, mobile readiness",
-  "seo_status": "brief SEO assessment — indexing signals, missing elements",
+  "summary": "2-3 sentence executive summary of the site's state",
+  "site_quality_score": <0-10>,
+  "content_issues": ["specific issues found in page content"],
+  "performance_notes": "load time, size, mobile assessment",
+  "seo_status": "SEO health — what's missing or broken",
+  "hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
+  "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
+  "accessibility_issues": ["specific a11y problems found"],
   "kit_digital_confirmed": true/false,
-  "kit_digital_reasoning": "1 sentence — why confirmed or not",
+  "kit_digital_reasoning": "1 sentence",
   "is_local_sme": true/false,
   "lead_quality": "HOT|WARM|COLD",
-  "lead_reasoning": "1-2 sentences on why",
+  "lead_reasoning": "1-2 sentences",
   "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
-  "best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
-  "all_contacts": {{
-    "emails": [],
-    "phones": [],
-    "whatsapp": [],
-    "social": []
-  }},
-  "pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
-  "services_needed": ["service1", "service2"],
-  "urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
-  "outreach_notes": "Key context for the sales rep"
-}}"""
+  "best_contact_value": "actual email/phone/URL or empty string",
+  "all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
+  "pitch_angle": "1 cold-outreach sentence in Spanish",
+  "services_needed": ["service1","service2"],
+  "urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
+  "outreach_notes": "sales rep context"
+}}"""


 def _parse_output(raw: str) -> dict:
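_parse_output's body is outside this hunk; a defensive parser for the "JSON only" contract above typically looks like this (a sketch, not the committed implementation):

    # Sketch only — _parse_output's actual body is not part of this diff.
    import json, re

    def parse_model_json(raw: str) -> dict:
        raw = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip())  # stray fences
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end == -1:
            raise ValueError("no JSON object in model output")
        return json.loads(raw[start:end + 1])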
app/site_analyzer.py
@@ -1,8 +1,9 @@
-"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
+"""Deep site analysis: content quality, SEO, hosting, GDPR, accessibility."""
 import asyncio
 import re
 import time
 import logging
+import socket
 from typing import Optional

 import httpx
@@ -10,25 +11,58 @@ from bs4 import BeautifulSoup

 logger = logging.getLogger(__name__)

-# ── Content quality ───────────────────────────────────────────────────────────
+# ── EU countries (hosting check) ─────────────────────────────────────────────
+EU_COUNTRIES = {
+    'AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR',
+    'HU','IE','IT','LV','LT','LU','MT','NL','PL','PT','RO','SK',
+    'SI','ES','SE',
+    'NO','IS','LI',  # EEA
+    'CH','GB','AD',  # adequacy / adjacent
+}
+
+# ── Content quality ───────────────────────────────────────────────────────────
 LOREM_PHRASES = [
     "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
     "nulla id nibh", "aenean dignissim", "aliquam tincidunt",
     "vestibulum commodo", "fusce nunc lacus", "consectetuer",
     "cras ornare tristique", "ntulla nec ante", "risus id metus",
     "praesent placerat", "fusce pellentesque", "suscipit nibh",
-    "integer vitae libero", "felis quis tortor",
+    "integer vitae libero", "felis quis tortor", "dolor sit amet",
 ]

 PLACEHOLDER_PHRASES = [
     "under construction", "coming soon", "sample page",
-    "this is a demo", "default post", "hello world",
-    "test post", "uncategorized",
+    "this is a demo", "hello world", "test content",
+    "default post", "uncategorized", "demo content",
 ]

-# ── Analytics & webmaster tags ────────────────────────────────────────────────
+# ── Cookie / GDPR consent tools ───────────────────────────────────────────────
+COOKIE_TOOLS = {
+    "cookiebot": ["cookiebot.com", "CookieConsent", "CybotCookiebot"],
+    "onetrust": ["onetrust", "otBannerSdk"],
+    "cookiepro": ["cookiepro.com"],
+    "osano": ["osano.com"],
+    "iubenda": ["iubenda.com"],
+    "borlabs": ["borlabs-cookie"],
+    "complianz": ["complianz"],
+    "cookieyes": ["cookieyes.com", "cookie-law-info"],
+    "usercentrics": ["usercentrics.com"],
+    "quantcast": ["quantcast.com/cmp"],
+}
+COOKIE_TEXT_SIGNALS = [
+    "accept cookies", "acepta las cookies", "we use cookies", "usamos cookies",
+    "cookie policy", "política de cookies", "cookie settings", "manage cookies",
+    "aceptar todas", "rechazar cookies",
+]
+PRIVACY_SIGNALS = [
+    "privacy policy", "política de privacidad", "aviso legal",
+    "privacy notice", "data protection",
+]
+GDPR_TEXT_SIGNALS = [
+    "rgpd", "gdpr", "reglamento general de protección",
+    "lopd", "protección de datos", "responsable del tratamiento",
+]
+
+# ── Analytics / webmaster ─────────────────────────────────────────────────────
 ANALYTICS = {
     "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
     "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
@@ -36,13 +70,13 @@ ANALYTICS = {
     "hotjar": ["static.hotjar.com"],
     "clarity": ["clarity.ms/tag"],
 }

 WEBMASTER = {
-    "google_search_console": ['google-site-verification'],
-    "bing_webmaster": ['msvalidate.01'],
-    "yandex": ['yandex-verification'],
+    "google_search_console": ["google-site-verification"],
+    "bing_webmaster": ["msvalidate.01"],
+    "yandex": ["yandex-verification"],
 }

 # ── Kit Digital ───────────────────────────────────────────────────────────────
 KIT_IMG_PATS = [
     "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
     "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
@@ -56,59 +90,78 @@ KIT_TEXT_PATS = [

 EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
 PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
-SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
+SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com",
+              "twitter.com", "x.com", "tiktok.com", "youtube.com"]


+async def _get_hosting_info(domain: str) -> dict:
+    """Resolve IP, then look up ASN / org / country via ip-api.com."""
+    info = {"ip": None, "asn": None, "org": None, "isp": None,
+            "ip_country": None, "ip_region": None, "eu_hosted": None}
+    try:
+        loop = asyncio.get_event_loop()
+        ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
+        info["ip"] = ip
+        async with httpx.AsyncClient(timeout=6) as client:
+            r = await client.get(
+                f"http://ip-api.com/json/{ip}",
+                params={"fields": "status,country,countryCode,regionName,org,as,isp"},
+            )
+            if r.status_code == 200:
+                d = r.json()
+                if d.get("status") == "success":
+                    info.update({
+                        "asn": d.get("as"),
+                        "org": d.get("org"),
+                        "isp": d.get("isp"),
+                        "ip_country": d.get("countryCode"),
+                        "ip_region": d.get("regionName"),
+                        "eu_hosted": d.get("countryCode") in EU_COUNTRIES,
+                    })
+    except Exception as e:
+        logger.debug("Hosting lookup failed for %s: %s", domain, e)
+    return info
+
+
 async def analyze_site(domain: str) -> dict:
     """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
     result = {
         "domain": domain,
-        "reachable": False,
-        "load_time_ms": None,
-        "status_code": None,
-        "final_url": None,
-        "page_size_kb": None,
-        "server": None,
-        "cms": None,
-        "ssl_valid": False,
-        "ssl_expiry_days": None,
+        "reachable": False, "load_time_ms": None, "status_code": None,
+        "final_url": None, "page_size_kb": None, "server": None, "cms": None,
+        # Hosting
+        "ip": None, "asn": None, "org": None, "isp": None,
+        "ip_country": None, "ip_region": None, "eu_hosted": None,
+        # SSL
+        "ssl_valid": False, "ssl_expiry_days": None,
         # Content quality
-        "has_lorem_ipsum": False,
-        "lorem_matches": [],
-        "has_placeholder": False,
-        "placeholder_matches": [],
-        "word_count": 0,
-        "image_count": 0,
-        "broken_images": 0,
-        "script_count": 0,
+        "has_lorem_ipsum": False, "lorem_matches": [],
+        "has_placeholder": False, "placeholder_matches": [],
+        "word_count": 0, "image_count": 0, "script_count": 0,
         "has_mobile_viewport": False,
-        "page_title": None,
-        "meta_description": None,
-        "h1_text": None,
+        "page_title": None, "meta_description": None, "h1_text": None,
         "visible_text_snippet": "",
-        # SEO / webmaster
-        "has_sitemap": False,
-        "has_robots": False,
-        "robots_disallows_google": False,
-        "analytics_present": [],
-        "webmaster_verified": [],
-        "canonical_url": None,
-        "og_title": None,
+        # SEO
+        "has_sitemap": False, "has_robots": False, "robots_disallows_google": False,
+        "analytics_present": [], "webmaster_verified": [],
+        "canonical_url": None, "og_title": None,
+        # GDPR / cookies
+        "cookie_tool": None, "has_cookie_notice": False,
+        "has_privacy_policy": False, "has_gdpr_text": False,
+        # Accessibility
+        "html_lang": None, "images_missing_alt": 0,
+        "has_skip_nav": False, "empty_links": 0,
+        "inputs_without_labels": 0,
         # Kit Digital
-        "kit_digital": False,
-        "kit_digital_signals": [],
+        "kit_digital": False, "kit_digital_signals": [],
         # Contacts
-        "emails": [],
-        "phones": [],
-        "whatsapp": [],
-        "social_links": [],
-        # Errors
+        "emails": [], "phones": [], "whatsapp": [], "social_links": [],
         "error": None,
     }

-    # ── Fetch main page ───────────────────────────────────────────────────────
-    try:
+    # ── Fetch + hosting (parallel) ────────────────────────────────────────────
+    async def _fetch():
         t0 = time.monotonic()
+        try:
             async with httpx.AsyncClient(
                 timeout=15, follow_redirects=True, verify=False,
                 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
@@ -116,12 +169,20 @@ async def analyze_site(domain: str) -> dict:
             resp = await client.get(f"https://{domain}")
             if resp.status_code >= 400:
                 resp = await client.get(f"http://{domain}")
+            return resp, int((time.monotonic() - t0) * 1000)
+        except Exception as e:
+            return None, int((time.monotonic() - t0) * 1000)

-        load_ms = int((time.monotonic() - t0) * 1000)
+    (resp, load_ms), hosting = await asyncio.gather(_fetch(), _get_hosting_info(domain))
+    result.update(hosting)
+    result["load_time_ms"] = load_ms
+
+    if resp is None:
+        result["error"] = "Failed to fetch site"
+    else:
         html = resp.text
         result.update({
             "reachable": resp.status_code < 400,
-            "load_time_ms": load_ms,
             "status_code": resp.status_code,
             "final_url": str(resp.url),
             "page_size_kb": round(len(resp.content) / 1024, 1),
@@ -131,46 +192,42 @@ async def analyze_site(domain: str) -> dict:
         soup = BeautifulSoup(html, "html.parser")
         hl = html.lower()

-        # Title, meta
-        title_tag = soup.find("title")
-        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
-        meta_desc = soup.find("meta", attrs={"name": "description"})
-        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
+        # ── Basic metadata ────────────────────────────────────────────────────
+        result["html_lang"] = (soup.find("html") or {}).get("lang")
+        t = soup.find("title")
+        result["page_title"] = t.get_text(strip=True)[:200] if t else None
+        md = soup.find("meta", attrs={"name": "description"})
+        result["meta_description"] = (md.get("content") or "")[:300] if md else None
         h1 = soup.find("h1")
         result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
-
-        # Mobile viewport
         result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
-
-        # Canonical + OG
-        canon = soup.find("link", rel="canonical")
-        result["canonical_url"] = canon.get("href") if canon else None
+        c = soup.find("link", rel="canonical")
+        result["canonical_url"] = c.get("href") if c else None
         og = soup.find("meta", property="og:title")
         result["og_title"] = og.get("content") if og else None

-        # Visible text
+        # ── Visible text ──────────────────────────────────────────────────────
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
-        visible_text = soup.get_text(separator=" ", strip=True)
-        words = visible_text.split()
+        visible = soup.get_text(separator=" ", strip=True)
+        vl = visible.lower()
+        words = visible.split()
         result["word_count"] = len(words)
-        result["visible_text_snippet"] = " ".join(words[:500])
+        result["visible_text_snippet"] = " ".join(words[:600])

-        # Lorem ipsum / placeholder detection
-        vl = visible_text.lower()
+        # ── Content quality ───────────────────────────────────────────────────
         lorem_hits = [p for p in LOREM_PHRASES if p in vl]
         result["has_lorem_ipsum"] = len(lorem_hits) > 0
-        result["lorem_matches"] = lorem_hits[:5]
+        result["lorem_matches"] = lorem_hits[:6]
         ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
         result["has_placeholder"] = len(ph_hits) > 0
         result["placeholder_matches"] = ph_hits[:3]

-        # Images & scripts
         imgs = soup.find_all("img")
         result["image_count"] = len(imgs)
         result["script_count"] = len(soup.find_all("script", src=True))

-        # Analytics / webmaster tags
+        # ── Analytics / webmaster ─────────────────────────────────────────────
         for name, sigs in ANALYTICS.items():
             if any(s.lower() in hl for s in sigs):
                 result["analytics_present"].append(name)
@@ -178,12 +235,42 @@ async def analyze_site(domain: str) -> dict:
             if any(s.lower() in hl for s in sigs):
                 result["webmaster_verified"].append(name)

-        # Kit Digital
+        # ── GDPR / cookies ────────────────────────────────────────────────────
+        for tool, sigs in COOKIE_TOOLS.items():
+            if any(s.lower() in hl for s in sigs):
+                result["cookie_tool"] = tool
+                result["has_cookie_notice"] = True
+                break
+        if not result["has_cookie_notice"]:
+            result["has_cookie_notice"] = any(s in vl for s in COOKIE_TEXT_SIGNALS)
+        result["has_privacy_policy"] = any(s in vl for s in PRIVACY_SIGNALS) or bool(
+            soup.find("a", href=lambda h: h and ("privacidad" in h.lower() or "privacy" in h.lower()))
+        )
+        result["has_gdpr_text"] = any(s in vl for s in GDPR_TEXT_SIGNALS)
+
+        # ── Accessibility ─────────────────────────────────────────────────────
+        result["images_missing_alt"] = sum(
+            1 for img in imgs if not img.get("alt") and img.get("alt") != ""
+        )
+        result["has_skip_nav"] = bool(
+            soup.find("a", href=lambda h: h and h.lower() in ("#main", "#content", "#maincontent", "#skip"))
+        )
+        result["empty_links"] = sum(
+            1 for a in soup.find_all("a") if not a.get_text(strip=True) and not a.find("img")
+        )
+        all_inputs = soup.find_all("input", type=lambda t: t not in ("hidden", "submit", "button", None) or t is None)
+        labeled_ids = {lbl.get("for") for lbl in soup.find_all("label") if lbl.get("for")}
+        result["inputs_without_labels"] = sum(
+            1 for inp in all_inputs
+            if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby")
+        )
+
+        # ── Kit Digital ───────────────────────────────────────────────────────
         kd_signals = []
-        for img in imgs:
-            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
+        for img in soup.find_all("img"):
+            comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
             for p in KIT_IMG_PATS:
-                if p in combined:
+                if p in comb:
                     kd_signals.append(f"img:{p}")
                     break
         for p in KIT_TEXT_PATS:
@@ -197,7 +284,7 @@ async def analyze_site(domain: str) -> dict:
         result["kit_digital"] = len(kd_signals) > 0
         result["kit_digital_signals"] = kd_signals

-        # Contacts
+        # ── Contacts ──────────────────────────────────────────────────────────
         for a in soup.find_all("a", href=True):
             href = a["href"]
             if href.startswith("mailto:"):
@@ -220,25 +307,36 @@ async def analyze_site(domain: str) -> dict:
                     break
         for em in EMAIL_RE.findall(html[:80000]):
             em = em.lower()
-            if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
+            if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]):
                 result["emails"].append(em)
-        for ph in PHONE_RE.findall(visible_text):
+        for ph in PHONE_RE.findall(visible):
             ph_c = re.sub(r"[\s\-]", "", ph)
             if ph_c not in result["phones"]:
                 result["phones"].append(ph_c)
         # Cap
         for k in ["emails", "phones", "whatsapp", "social_links"]:
             result[k] = list(dict.fromkeys(result[k]))[:5]

-        # CMS
-        from app.enricher import detect_cms
-        result["cms"] = detect_cms(html, dict(resp.headers))
-
-    except Exception as e:
-        result["error"] = str(e)[:300]
+        # ── CMS ───────────────────────────────────────────────────────────────
+        CMS_SIGS = {
+            "wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
+            "joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
+            "drupal": ["/sites/default/files/", "Drupal.settings"],
+            "wix": ["static.wixstatic.com", "X-Wix-"],
+            "squarespace": ["squarespace.com", "X-Squarespace-"],
+            "shopify": ["cdn.shopify.com", "Shopify.theme"],
+            "prestashop": ["PrestaShop", "/modules/prestashop"],
+            "magento": ["Mage.Cookies", "X-Magento-"],
+            "typo3": ["typo3temp", "TYPO3 CMS"],
+            "opencart": ["route=common/home", "OpenCart"],
+        }
+        combined_check = html[:60000] + " ".join(f"{k}:{v}" for k, v in resp.headers.items())
+        for cms, sigs in CMS_SIGS.items():
+            if any(s.lower() in combined_check.lower() for s in sigs):
+                result["cms"] = cms
+                break

     # ── Sitemap & robots (parallel) ───────────────────────────────────────────
-    async def _check_url(url: str) -> Optional[str]:
+    async def _get(url):
         try:
             async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
                 r = await c.get(url)
@@ -247,24 +345,22 @@ async def analyze_site(domain: str) -> dict:
                 return None

     sitemap_txt, robots_txt = await asyncio.gather(
-        _check_url(f"https://{domain}/sitemap.xml"),
-        _check_url(f"https://{domain}/robots.txt"),
+        _get(f"https://{domain}/sitemap.xml"),
+        _get(f"https://{domain}/robots.txt"),
     )
     result["has_sitemap"] = sitemap_txt is not None
     result["has_robots"] = robots_txt is not None
     if robots_txt:
-        robots_lower = robots_txt.lower()
-        result["robots_disallows_google"] = (
-            "disallow: /" in robots_lower and "googlebot" in robots_lower
-        )
+        rl = robots_txt.lower()
+        result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl

     # ── SSL ───────────────────────────────────────────────────────────────────
-    import ssl as _ssl, socket as _socket
+    import ssl as _ssl
     try:
         def _ssl_check():
             import datetime as _dt
             ctx = _ssl.create_default_context()
-            with _socket.create_connection((domain, 443), timeout=5) as s:
+            with socket.create_connection((domain, 443), timeout=5) as s:
                 with ctx.wrap_socket(s, server_hostname=domain) as ss:
                     cert = ss.getpeercert()
                     exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
@@ -179,7 +179,9 @@ tr:hover td{background:rgba(255,255,255,.025)}
             <div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
             <div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
             <div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
-            <div class="mrow"><span class="mlabel">SEO status</span><span x-text="modal.ai.seo_status||'—'"></span></div>
+            <div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
+            <div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>
+            <div class="mrow"><span class="mlabel">GDPR</span><span :style="(!modal.sa?.has_cookie_notice)?'color:var(--danger)':''" x-text="modal.ai.gdpr_compliance||'—'"></span></div>

             <!-- Content issues -->
             <div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
@@ -225,6 +227,14 @@ tr:hover td{background:rgba(255,255,255,.025)}
               <div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
             </div>

+            <!-- Accessibility issues -->
+            <div x-show="(modal.ai.accessibility_issues||[]).length>0" style="margin:8px 0">
+              <div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Accessibility Issues</div>
+              <template x-for="issue in (modal.ai.accessibility_issues||[])">
+                <div style="font-size:12px;color:var(--warn);padding:2px 0">♿ <span x-text="issue"></span></div>
+              </template>
+            </div>
+
             <div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
             <div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
@@ -431,7 +441,11 @@ tr:hover td{background:rgba(255,255,255,.025)}
         <div style="font-size:12px;color:var(--muted);margin-bottom:8px">
           Auto-assesses enriched domains via Gemini. Detects Kit Digital confirmation, extracts best contact channel, writes pitch.
         </div>
+        <div style="display:flex;gap:6px;flex-wrap:wrap">
           <button class="btn bai" @click="aiAssessAllKD()">🤖 AI Assess all Kit Digital domains</button>
+          <button class="btn bg sm" @click="restartAiWorker()">↺ Restart AI worker</button>
+          <a class="btn bg sm" href="/api/ai/debug" target="_blank" style="text-decoration:none">🔍 Debug AI queue</a>
+        </div>
       </div>

       <div style="margin-top:18px;padding-top:14px;border-top:1px solid var(--border)">
@@ -616,6 +630,7 @@ function app() {
       try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
     },

+    async restartAiWorker() { const r=await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
     async startEnrich() { await fetch('/api/enrich/resume',{method:'POST'}); this.notify('Worker started','success'); await this.loadQueue(); },
     async pauseEnrich() { await fetch('/api/enrich/pause',{method:'POST'}); this.notify('Worker paused','success'); await this.loadQueue(); },
     async retryFailed() { await fetch('/api/enrich/retry',{method:'POST'}); this.notify('Retrying failed','success'); await this.loadQueue(); },