"""Replicate / Gemini integration — deep site assessment."""
import asyncio
import json
import logging
import os
import re
from typing import Optional

import httpx

logger = logging.getLogger(__name__)

# SECURITY: the API token must come from the environment only. A real-looking
# token used to be embedded here as the default value; it has been removed and
# should be revoked, since it remains visible in version-control history.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"

# Clamp to >= 1: Semaphore(0) would block every call forever and a negative
# value makes asyncio.Semaphore raise ValueError.
AI_CONCURRENCY = max(1, int(os.getenv("AI_CONCURRENCY", "3")))

# Created lazily so that importing this module never touches an event loop.
_ai_sem: Optional[asyncio.Semaphore] = None

# Markdown code-fence markers the model sometimes wraps around its JSON.
_FENCE_RE = re.compile(r"```(?:json)?")
# Greedy span from the first "{" to the last "}" in the cleaned output.
_JSON_OBJECT_RE = re.compile(r"\{[\s\S]+\}")


def _sem() -> asyncio.Semaphore:
    """Return the module-wide semaphore that caps concurrent AI requests."""
    global _ai_sem
    if _ai_sem is None:
        _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return _ai_sem


def _failure(message: str) -> dict:
    """Build the error-shaped result that ``assess_domain`` returns on failure."""
    return {
        "error": message[:300],
        "lead_quality": "COLD",
        "best_contact_channel": "unknown",
        "best_contact_value": "",
    }


def _build_prompt(a: dict) -> str:
    """Build the Gemini prompt from a full site analysis dict.

    Missing or empty keys in ``a`` are rendered as "none" / "missing" /
    "None found" placeholders (or as the literal ``None`` for scalar fields),
    so a partial analysis still yields a well-formed prompt.
    """
    contacts_block = []
    if a.get("emails"):
        contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
    if a.get("phones"):
        contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
    if a.get("whatsapp"):
        contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
    if a.get("social_links"):
        contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
    contacts_str = "\n".join(contacts_block) or " None found"

    kd_str = (
        "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or []))
        or " None detected"
    )
    analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
    webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
    lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
    placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
    # Cap the page text so the prompt size stays bounded.
    text_snippet = (a.get("visible_text_snippet") or "")[:2000]

    return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.

=== TECHNICAL SNAPSHOT ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Final URL: {a.get("final_url")}
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
Mobile viewport: {a.get("has_mobile_viewport")}
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}

=== SEO & INDEXING SIGNALS ===
Page title: {a.get("page_title") or "missing"}
H1: {a.get("h1_text") or "missing"}
Meta description: {a.get("meta_description") or "missing"}
Canonical URL: {a.get("canonical_url") or "not set"}
Sitemap.xml: {a.get("has_sitemap")}
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
Analytics: {analytics_str}
Webmaster verified:{webmaster_str}

=== CONTENT QUALITY ===
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}

=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
Detected: {a.get("kit_digital")}
Signals:
{kd_str}

=== CONTACT CHANNELS ===
{contacts_str}

=== PAGE TEXT SAMPLE (first 2000 chars) ===
{text_snippet}

=== TASK ===
Analyse this site for IT services upsell potential. The client sells: web design/redesign, SEO, hosting migration, SSL renewal, security audits, maintenance contracts, Google Ads, and AI-assisted tools for SMEs.

Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
{{
  "summary": "2-3 sentence executive summary of the site's current state",
  "site_quality_score": <0-10 integer>,
  "content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
  "performance_notes": "comment on load time, page size, mobile readiness",
  "seo_status": "brief SEO assessment — indexing signals, missing elements",
  "kit_digital_confirmed": true/false,
  "kit_digital_reasoning": "1 sentence — why confirmed or not",
  "is_local_sme": true/false,
  "lead_quality": "HOT|WARM|COLD",
  "lead_reasoning": "1-2 sentences on why",
  "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
  "best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
  "all_contacts": {{
    "emails": [],
    "phones": [],
    "whatsapp": [],
    "social": []
  }},
  "pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
  "services_needed": ["service1", "service2"],
  "urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
  "outreach_notes": "Key context for the sales rep"
}}"""


def _parse_output(raw: str) -> dict:
    """Extract the JSON object from the model's raw text output.

    Markdown code fences are stripped first. The span from the first ``{`` to
    the last ``}`` is then parsed; if that fails (typically because the model
    appended text containing a brace), the first complete JSON object starting
    at the first ``{`` is decoded instead.

    Returns:
        The parsed object, or a fallback dict flagged with
        ``"parse_error": True`` that carries the first 400 characters of the
        raw output as ``summary``.
    """
    if not isinstance(raw, str):
        # Tolerate None / non-string payloads instead of raising from re.sub.
        raw = "" if raw is None else str(raw)

    text = _FENCE_RE.sub("", raw).strip().rstrip("`").strip()
    match = _JSON_OBJECT_RE.search(text)
    if match:
        candidate = match.group(0)
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            pass
        try:
            # raw_decode stops at the end of the first complete JSON value and
            # ignores whatever follows it.
            parsed, _end = json.JSONDecoder().raw_decode(candidate)
            return parsed
        except json.JSONDecodeError:
            pass

    logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
    return {
        "summary": raw[:400],
        "lead_quality": "COLD",
        "best_contact_channel": "unknown",
        "best_contact_value": "",
        "parse_error": True,
    }


async def assess_domain(analysis: dict) -> dict:
    """Call Gemini (via Replicate) with the full site analysis.

    Args:
        analysis: Site analysis dict; see ``_build_prompt`` for the keys read.

    Returns:
        The parsed assessment dict. Configuration, transport, HTTP and
        missing-output failures do not raise: they are logged and reported as
        a dict with an ``"error"`` key and ``"lead_quality": "COLD"``.
    """
    domain = analysis.get("domain")

    if not REPLICATE_TOKEN:
        # Fail fast without a network round-trip when no credential is set.
        message = "REPLICATE_API_TOKEN is not set"
        logger.error("Replicate error %s: %s", domain, message)
        return _failure(message)

    payload = {
        "input": {
            "prompt": _build_prompt(analysis),
            "images": [],
            "videos": [],
            "top_p": 0.9,
            "temperature": 0.2,
            "thinking_level": "low",
            "max_output_tokens": 2048,
        }
    }
    headers = {
        "Authorization": f"Bearer {REPLICATE_TOKEN}",
        "Content-Type": "application/json",
        # Ask Replicate to hold the request open until the prediction finishes.
        "Prefer": "wait",
    }

    # The semaphore only needs to guard the network call itself.
    async with _sem():
        try:
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(REPLICATE_MODEL, headers=headers, json=payload)
                resp.raise_for_status()
                data = resp.json()
        except httpx.HTTPStatusError as e:
            # Include the status code and a body excerpt; Replicate explains
            # auth and validation failures in the response body.
            message = f"HTTP {e.response.status_code}: {e.response.text[:200]}"
            logger.error("Replicate error %s: %s", domain, message)
            return _failure(message)
        except Exception as e:
            # Boundary handler: callers expect a result dict, never an
            # exception. str(e) is often empty for httpx timeouts, so always
            # include the exception type to keep "error" truthy.
            message = f"{type(e).__name__}: {e}"
            logger.error("Replicate error %s: %s", domain, message)
            return _failure(message)

    output = data.get("output") if isinstance(data, dict) else None
    if output is None:
        # The prediction did not finish within the wait window, or it failed.
        status = data.get("status") if isinstance(data, dict) else None
        api_error = data.get("error") if isinstance(data, dict) else None
        message = f"Replicate returned no output (status={status!r}, error={api_error!r})"
        logger.error("Replicate error %s: %s", domain, message)
        return _failure(message)

    if isinstance(output, list):
        # Streaming-style models return the text as a list of chunks.
        output = "".join(str(part) for part in output)

    result = _parse_output(output)
    logger.info(
        "AI %s → %s (quality %s)",
        domain,
        result.get("lead_quality"),
        result.get("site_quality_score"),
    )
    return result