fix: AI worker crash-proof + GDPR/hosting/accessibility analysis
AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — no exception handler around while loop
- Added try/except around entire loop body with exc_info logging
- Added watchdog task that restarts dead workers every 10 seconds
- ensure_workers_alive() called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not closure) — avoids subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint: shows worker alive status, task exception, last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com for ASN, org, ISP, host country
- EU hosting detection (27 EU + EEA + adequacy countries)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda, Borlabs, CookieYes, Complianz, Usercentrics + text signals
- Privacy policy and GDPR text presence
- Accessibility: html lang missing, images without alt count, skip nav link, empty links, inputs without labels

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance, accessibility_issues[]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -25,88 +25,105 @@ def _sem() -> asyncio.Semaphore:
|
||||
|
||||
|
||||
def _build_prompt(a: dict) -> str:
|
||||
"""Build the Gemini prompt from a full site analysis dict."""
|
||||
contacts_block = []
|
||||
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
||||
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
||||
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
|
||||
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
|
||||
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
||||
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
||||
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
|
||||
if a.get("social_links"):contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
|
||||
contacts_str = "\n".join(contacts_block) or " None found"
|
||||
|
||||
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
|
||||
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
|
||||
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
|
||||
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
||||
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
||||
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None"
|
||||
analytics = ", ".join(a.get("analytics_present") or []) or "none"
|
||||
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
|
||||
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
||||
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
||||
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||
|
||||
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||
eu_hosted = a.get("eu_hosted")
|
||||
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
|
||||
|
||||
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
|
||||
return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.
|
||||
|
||||
=== TECHNICAL SNAPSHOT ===
|
||||
Domain: {a.get("domain")}
|
||||
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
|
||||
Final URL: {a.get("final_url")}
|
||||
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
|
||||
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
|
||||
Mobile viewport: {a.get("has_mobile_viewport")}
|
||||
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
|
||||
=== TECHNICAL ===
|
||||
Domain: {a.get("domain")}
|
||||
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
|
||||
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
|
||||
SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
|
||||
Mobile: viewport={a.get("has_mobile_viewport")}
|
||||
Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
|
||||
|
||||
=== SEO & INDEXING SIGNALS ===
|
||||
Page title: {a.get("page_title") or "missing"}
|
||||
H1: {a.get("h1_text") or "missing"}
|
||||
Meta description: {a.get("meta_description") or "missing"}
|
||||
Canonical URL: {a.get("canonical_url") or "not set"}
|
||||
Sitemap.xml: {a.get("has_sitemap")}
|
||||
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
|
||||
Analytics: {analytics_str}
|
||||
Webmaster verified:{webmaster_str}
|
||||
=== HOSTING & INFRASTRUCTURE ===
|
||||
IP: {a.get("ip") or "unknown"}
|
||||
ASN: {a.get("asn") or "unknown"}
|
||||
Organisation: {a.get("org") or "unknown"}
|
||||
ISP: {a.get("isp") or "unknown"}
|
||||
Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
|
||||
EU hosted: {hosting_flag}
|
||||
|
||||
=== SEO & INDEXING ===
|
||||
Title: {a.get("page_title") or "MISSING"}
|
||||
H1: {a.get("h1_text") or "MISSING"}
|
||||
Meta desc: {a.get("meta_description") or "MISSING"}
|
||||
Canonical: {a.get("canonical_url") or "not set"}
|
||||
Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
|
||||
Analytics: {analytics}
|
||||
Webmaster: {webmaster}
|
||||
|
||||
=== GDPR & LEGAL COMPLIANCE ===
|
||||
Cookie tool: {a.get("cookie_tool") or "none detected"}
|
||||
Cookie notice: {a.get("has_cookie_notice")}
|
||||
Privacy policy: {a.get("has_privacy_policy")}
|
||||
GDPR text: {a.get("has_gdpr_text")}
|
||||
|
||||
=== ACCESSIBILITY (quick scan) ===
|
||||
HTML lang attr: {a.get("html_lang") or "MISSING"}
|
||||
Images missing alt: {a.get("images_missing_alt")}
|
||||
Skip navigation link: {a.get("has_skip_nav")}
|
||||
Empty links: {a.get("empty_links")}
|
||||
Inputs without labels: {a.get("inputs_without_labels")}
|
||||
|
||||
=== CONTENT QUALITY ===
|
||||
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
|
||||
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
|
||||
Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str}
|
||||
Placeholder: {a.get("has_placeholder")} → {ph_str}
|
||||
|
||||
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
|
||||
Detected: {a.get("kit_digital")}
|
||||
Signals:
|
||||
=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
|
||||
Detected: {a.get("kit_digital")}
|
||||
{kd_str}
|
||||
|
||||
=== CONTACT CHANNELS ===
|
||||
{contacts_str}
|
||||
|
||||
=== PAGE TEXT SAMPLE (first 2000 chars) ===
|
||||
{text_snippet}
|
||||
=== PAGE TEXT SAMPLE ===
|
||||
{snippet}
|
||||
|
||||
=== TASK ===
|
||||
Analyse this site for IT services upsell potential. The client sells:
|
||||
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
|
||||
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
|
||||
=== INSTRUCTIONS ===
|
||||
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
||||
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
||||
maintenance contracts, AI tools for SMEs.
|
||||
|
||||
Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
|
||||
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||
{{
|
||||
"summary": "2-3 sentence executive summary of the site's current state",
|
||||
"site_quality_score": <0-10 integer>,
|
||||
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
|
||||
"performance_notes": "comment on load time, page size, mobile readiness",
|
||||
"seo_status": "brief SEO assessment — indexing signals, missing elements",
|
||||
"summary": "2-3 sentence executive summary of the site's state",
|
||||
"site_quality_score": <0-10>,
|
||||
"content_issues": ["specific issues found in page content"],
|
||||
"performance_notes": "load time, size, mobile assessment",
|
||||
"seo_status": "SEO health — what's missing or broken",
|
||||
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
||||
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
||||
"accessibility_issues": ["specific a11y problems found"],
|
||||
"kit_digital_confirmed": true/false,
|
||||
"kit_digital_reasoning": "1 sentence — why confirmed or not",
|
||||
"kit_digital_reasoning": "1 sentence",
|
||||
"is_local_sme": true/false,
|
||||
"lead_quality": "HOT|WARM|COLD",
|
||||
"lead_reasoning": "1-2 sentences on why",
|
||||
"lead_reasoning": "1-2 sentences",
|
||||
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
||||
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
|
||||
"all_contacts": {{
|
||||
"emails": [],
|
||||
"phones": [],
|
||||
"whatsapp": [],
|
||||
"social": []
|
||||
}},
|
||||
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
|
||||
"services_needed": ["service1", "service2"],
|
||||
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
|
||||
"outreach_notes": "Key context for the sales rep"
|
||||
}}"""
|
||||
"best_contact_value": "actual email/phone/URL or empty string",
|
||||
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
|
||||
"pitch_angle": "1 cold-outreach sentence in Spanish",
|
||||
"services_needed": ["service1","service2"],
|
||||
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
|
||||
"outreach_notes": "sales rep context"
|
||||
}}
|
||||
|
||||
|
||||
def _parse_output(raw: str) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user