fix: AI worker crash-proof + GDPR/hosting/accessibility analysis

AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — no exception handler around while loop
- Added try/except around entire loop body with exc_info logging
- Added watchdog task that checks every 10 seconds and restarts dead workers
- ensure_workers_alive() called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not closure) — avoids
  subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint: shows worker alive status, task exception,
  last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com for ASN, org, ISP, host country
- EU hosting detection (27 EU member states + EEA + countries with a
  GDPR adequacy decision)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda,
  Borlabs, CookieYes, Complianz, Usercentrics + text signals
- Privacy policy and GDPR text presence
- Accessibility: missing html lang attribute, count of images without
  alt text, skip-nav link, empty links, inputs without labels

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance,
  accessibility_issues[]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 18:01:34 +02:00
parent 5ad8259c75
commit 60c9b495ae
10 changed files with 409 additions and 205 deletions

View File

@@ -25,88 +25,105 @@ def _sem() -> asyncio.Semaphore:
def _build_prompt(a: dict) -> str:
"""Build the Gemini prompt from a full site analysis dict."""
contacts_block = []
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
if a.get("social_links"):contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
contacts_str = "\n".join(contacts_block) or " None found"
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None"
analytics = ", ".join(a.get("analytics_present") or []) or "none"
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
snippet = (a.get("visible_text_snippet") or "")[:2000]
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
eu_hosted = a.get("eu_hosted")
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.
=== TECHNICAL SNAPSHOT ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Final URL: {a.get("final_url")}
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
Mobile viewport: {a.get("has_mobile_viewport")}
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
=== TECHNICAL ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
Mobile: viewport={a.get("has_mobile_viewport")}
Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
=== SEO & INDEXING SIGNALS ===
Page title: {a.get("page_title") or "missing"}
H1: {a.get("h1_text") or "missing"}
Meta description: {a.get("meta_description") or "missing"}
Canonical URL: {a.get("canonical_url") or "not set"}
Sitemap.xml: {a.get("has_sitemap")}
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
Analytics: {analytics_str}
Webmaster verified:{webmaster_str}
=== HOSTING & INFRASTRUCTURE ===
IP: {a.get("ip") or "unknown"}
ASN: {a.get("asn") or "unknown"}
Organisation: {a.get("org") or "unknown"}
ISP: {a.get("isp") or "unknown"}
Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
EU hosted: {hosting_flag}
=== SEO & INDEXING ===
Title: {a.get("page_title") or "MISSING"}
H1: {a.get("h1_text") or "MISSING"}
Meta desc: {a.get("meta_description") or "MISSING"}
Canonical: {a.get("canonical_url") or "not set"}
Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
Analytics: {analytics}
Webmaster: {webmaster}
=== GDPR & LEGAL COMPLIANCE ===
Cookie tool: {a.get("cookie_tool") or "none detected"}
Cookie notice: {a.get("has_cookie_notice")}
Privacy policy: {a.get("has_privacy_policy")}
GDPR text: {a.get("has_gdpr_text")}
=== ACCESSIBILITY (quick scan) ===
HTML lang attr: {a.get("html_lang") or "MISSING"}
Images missing alt: {a.get("images_missing_alt")}
Skip navigation link: {a.get("has_skip_nav")}
Empty links: {a.get("empty_links")}
Inputs without labels: {a.get("inputs_without_labels")}
=== CONTENT QUALITY ===
Lorem ipsum found: {a.get("has_lorem_ipsum")} matches: {lorem_str}
Placeholder text: {a.get("has_placeholder")}matches: {placeholder_str}
Lorem ipsum: {a.get("has_lorem_ipsum")}{lorem_str}
Placeholder: {a.get("has_placeholder")}{ph_str}
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
Detected: {a.get("kit_digital")}
Signals:
=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
Detected: {a.get("kit_digital")}
{kd_str}
=== CONTACT CHANNELS ===
{contacts_str}
=== PAGE TEXT SAMPLE (first 2000 chars) ===
{text_snippet}
=== PAGE TEXT SAMPLE ===
{snippet}
=== TASK ===
Analyse this site for IT services upsell potential. The client sells:
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
=== INSTRUCTIONS ===
The client sells: web redesign, SEO, hosting migration, SSL renewal,
security audits, GDPR compliance, accessibility fixes, Google Ads,
maintenance contracts, AI tools for SMEs.
Respond ONLY with valid JSON no markdown, no text outside the JSON object:
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
{{
"summary": "2-3 sentence executive summary of the site's current state",
"site_quality_score": <0-10 integer>,
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
"performance_notes": "comment on load time, page size, mobile readiness",
"seo_status": "brief SEO assessment — indexing signals, missing elements",
"summary": "2-3 sentence executive summary of the site's state",
"site_quality_score": <0-10>,
"content_issues": ["specific issues found in page content"],
"performance_notes": "load time, size, mobile assessment",
"seo_status": "SEO health — what's missing or broken",
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
"accessibility_issues": ["specific a11y problems found"],
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence — why confirmed or not",
"kit_digital_reasoning": "1 sentence",
"is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences on why",
"lead_reasoning": "1-2 sentences",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
"all_contacts": {{
"emails": [],
"phones": [],
"whatsapp": [],
"social": []
}},
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
"services_needed": ["service1", "service2"],
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
"outreach_notes": "Key context for the sales rep"
}}"""
"best_contact_value": "actual email/phone/URL or empty string",
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
"pitch_angle": "1 cold-outreach sentence in Spanish",
"services_needed": ["service1","service2"],
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
"outreach_notes": "sales rep context"
}}
def _parse_output(raw: str) -> dict: