Files
DomGod/app/replicate_ai.py
Malin f33dabbb7d fix: close missing triple-quote on _build_prompt f-string (SyntaxError)
The f-string in _build_prompt was never closed: the closing """ after the
final }} of the JSON template was missing. Python consumed the entire
rest of the file as f-string content, then tried to evaluate the braces of
the \{[\s\S]+\} regex in _parse_output as an f-string expression, giving
"unexpected character after line continuation character".

Also bundles the earlier timeout fixes (SSL handshake, DNS, analyze_site
90s cap, _assess_one 180s cap, worker reset of stale running jobs).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 21:03:56 +02:00

192 lines
7.5 KiB
Python

"""Replicate / Gemini integration — deep site assessment."""
import asyncio
import json
import logging
import os
import re
from typing import Optional
import httpx
logger = logging.getLogger(__name__)

# SECURITY: the API token is read from the environment only. An earlier
# revision shipped a real-looking token as the fallback default; a credential
# that has been committed to source control must be treated as leaked and
# rotated. When the variable is unset the token is the empty string, so the
# Authorization header built in assess_domain carries an empty bearer value.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")

# Predictions endpoint that assess_domain POSTs to.
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"

# Size of the semaphore returned by _sem(); override with AI_CONCURRENCY.
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))

# Shared semaphore, created on first use by _sem().
_ai_sem: Optional[asyncio.Semaphore] = None
def _sem() -> asyncio.Semaphore:
    """Return the module-wide AI semaphore, creating it on the first call.

    The semaphore is sized by ``AI_CONCURRENCY`` and cached in the
    module-level ``_ai_sem`` so every caller shares the same instance.
    """
    global _ai_sem
    if _ai_sem is not None:
        return _ai_sem
    _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return _ai_sem
def _build_prompt(a: dict) -> str:
    """Render the site-analysis dict ``a`` into the LLM assessment prompt.

    Every field is read with ``a.get(...)``, so missing keys do not raise;
    they render as ``None`` or as the placeholder shown on each line
    ("unknown", "MISSING", "none", ...).

    Args:
        a: Site analysis mapping. List-valued entries used here are
            ``emails``, ``phones``, ``whatsapp``, ``social_links``,
            ``kit_digital_signals``, ``analytics_present``,
            ``webmaster_verified``, ``lorem_matches`` and
            ``placeholder_matches``; their items are joined as strings.

    Returns:
        The complete prompt, ending with the JSON schema the model is asked
        to fill in.
    """
    # (analysis key, label shown in the prompt, max entries listed)
    contact_specs = (
        ("emails", "Emails", 3),
        ("phones", "Phones", 3),
        ("whatsapp", "WhatsApp", 2),
        ("social_links", "Social", 4),
    )
    contacts_block = [
        f" {label}: {', '.join(a[key][:limit])}"
        for key, label, limit in contact_specs
        if a.get(key)
    ]
    contacts_str = "\n".join(contacts_block) or " None found"

    kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None"
    analytics = ", ".join(a.get("analytics_present") or []) or "none"
    webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
    lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
    ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
    # Cap the page text sample so the prompt stays bounded.
    snippet = (a.get("visible_text_snippet") or "")[:2000]

    # Tri-state: truthy -> EU, exactly False -> Non-EU, anything else -> unknown.
    eu_hosted = a.get("eu_hosted")
    hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")

    # The flag and the match list on the "Lorem ipsum" / "Placeholder" lines
    # are separated explicitly; previously they were concatenated with no
    # delimiter (e.g. "Truelorem ipsum" or "Falsenone").
    return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.
=== TECHNICAL ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
Mobile: viewport={a.get("has_mobile_viewport")}
Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
=== HOSTING & INFRASTRUCTURE ===
IP: {a.get("ip") or "unknown"}
ASN: {a.get("asn") or "unknown"}
Organisation: {a.get("org") or "unknown"}
ISP: {a.get("isp") or "unknown"}
Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
EU hosted: {hosting_flag}
=== SEO & INDEXING ===
Title: {a.get("page_title") or "MISSING"}
H1: {a.get("h1_text") or "MISSING"}
Meta desc: {a.get("meta_description") or "MISSING"}
Canonical: {a.get("canonical_url") or "not set"}
Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
Analytics: {analytics}
Webmaster: {webmaster}
=== GDPR & LEGAL COMPLIANCE ===
Cookie tool: {a.get("cookie_tool") or "none detected"}
Cookie notice: {a.get("has_cookie_notice")}
Privacy policy: {a.get("has_privacy_policy")}
GDPR text: {a.get("has_gdpr_text")}
=== ACCESSIBILITY (quick scan) ===
HTML lang attr: {a.get("html_lang") or "MISSING"}
Images missing alt: {a.get("images_missing_alt")}
Skip navigation link: {a.get("has_skip_nav")}
Empty links: {a.get("empty_links")}
Inputs without labels: {a.get("inputs_without_labels")}
=== CONTENT QUALITY ===
Lorem ipsum: {a.get("has_lorem_ipsum")} (matches: {lorem_str})
Placeholder: {a.get("has_placeholder")} (matches: {ph_str})
=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
Detected: {a.get("kit_digital")}
{kd_str}
=== CONTACT CHANNELS ===
{contacts_str}
=== PAGE TEXT SAMPLE ===
{snippet}
=== INSTRUCTIONS ===
The client sells: web redesign, SEO, hosting migration, SSL renewal,
security audits, GDPR compliance, accessibility fixes, Google Ads,
maintenance contracts, AI tools for SMEs.
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
{{
"summary": "2-3 sentence executive summary of the site's state",
"site_quality_score": <0-10>,
"content_issues": ["specific issues found in page content"],
"performance_notes": "load time, size, mobile assessment",
"seo_status": "SEO health — what's missing or broken",
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
"accessibility_issues": ["specific a11y problems found"],
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence",
"is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "actual email/phone/URL or empty string",
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
"pitch_angle": "1 cold-outreach sentence in Spanish",
"services_needed": ["service1","service2"],
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
"outreach_notes": "sales rep context"
}}"""
def _parse_output(raw: str) -> dict:
    """Extract the JSON object from the model's raw text output.

    Markdown code fences are stripped first. The text is then scanned for
    the first ``{`` from which a complete JSON object can be decoded; any
    prose before or after that object (including stray braces) is ignored.

    Args:
        raw: Model output. ``None`` or a non-string value is tolerated and
            treated as unparseable text rather than raising.

    Returns:
        The decoded JSON object, or — when no object can be decoded — a
        fallback dict marked with ``"parse_error": True`` whose ``summary``
        holds the first 400 characters of the raw output.
    """
    if raw is None:
        raw = ""
    elif not isinstance(raw, str):
        raw = str(raw)

    text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()

    # raw_decode parses one JSON value starting at the given index and stops
    # at its end, so trailing commentary does not break parsing the way a
    # greedy first-"{"-to-last-"}" match would.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = text.find("{", start + 1)

    logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
    return {
        "summary": raw[:400],
        "lead_quality": "COLD",
        "best_contact_channel": "unknown",
        "best_contact_value": "",
        "parse_error": True,
    }
def _failed_assessment(message: str) -> dict:
    """Build the conservative fallback result returned when the AI call fails."""
    return {
        "error": message[:300],
        "lead_quality": "COLD",
        "best_contact_channel": "unknown",
        "best_contact_value": "",
    }


async def assess_domain(analysis: dict) -> dict:
    """Call Gemini with the full site analysis. Returns parsed assessment.

    The request runs under the shared semaphore from ``_sem()`` so that at
    most ``AI_CONCURRENCY`` calls are in flight at once.

    Args:
        analysis: Site analysis dict; it is rendered into the prompt by
            ``_build_prompt`` and its ``domain`` entry is used in log lines.

    Returns:
        The dict produced by ``_parse_output`` on success. On any HTTP,
        transport or decoding failure — or when the response carries no
        usable output — a fallback dict with an ``error`` message and
        ``lead_quality`` set to ``"COLD"``. Exceptions derived from
        ``Exception`` raised by the request are logged and not propagated.
    """
    domain = analysis.get("domain")
    async with _sem():
        payload = {
            "input": {
                "prompt": _build_prompt(analysis),
                "images": [],
                "videos": [],
                "top_p": 0.9,
                "temperature": 0.2,
                "thinking_level": "low",
                "max_output_tokens": 2048,
            }
        }
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(
                    REPLICATE_MODEL,
                    headers={
                        "Authorization": f"Bearer {REPLICATE_TOKEN}",
                        "Content-Type": "application/json",
                        # Ask the API to hold the connection until the
                        # prediction finishes instead of returning at once.
                        "Prefer": "wait",
                    },
                    json=payload,
                )
                resp.raise_for_status()
                data = resp.json()

            output = data.get("output")
            if isinstance(output, list):
                # Output may arrive as a list of text chunks.
                output = "".join(str(part) for part in output if part is not None)

            if not isinstance(output, str):
                # NOTE(review): Replicate's synchronous wait appears to be
                # time-limited; a prediction that is still running (or has
                # failed) comes back with a null output. Polling is not
                # implemented here — confirm whether callers need it.
                detail = data.get("error") or (
                    f"no output returned (status={data.get('status')})"
                )
                logger.error("Replicate error %s: %s", domain, detail)
                return _failed_assessment(str(detail))

            result = _parse_output(output)
            logger.info(
                "AI %s -> %s (quality %s)",
                domain,
                result.get("lead_quality"),
                result.get("site_quality_score"),
            )
            return result
        except Exception as e:
            # Boundary handler: one failed domain must not abort the batch.
            logger.error("Replicate error %s: %s", domain, e)
            return _failed_assessment(str(e))