# app/replicate_ai.py

"""Replicate / Gemini integration — deep site assessment."""
import asyncio
import json
import logging
import os
import re
from typing import Optional

import httpx

logger = logging.getLogger(__name__)

# Replicate API token. It is read from the environment only: credentials must
# never be committed to source control. An empty string means "not configured".
# NOTE(review): any token that was previously embedded in this file should be
# treated as leaked and revoked.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")
# Predictions endpoint of the model used for site assessments.
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
# Maximum number of simultaneous AI calls. Clamped to at least 1 because a
# semaphore created with 0 permits would block every caller forever.
AI_CONCURRENCY = max(1, int(os.getenv("AI_CONCURRENCY", "3")))

# Shared semaphore; stays None until _sem() creates it on first use.
_ai_sem: Optional[asyncio.Semaphore] = None


def _sem() -> asyncio.Semaphore:
    """Return the module-wide AI semaphore, building it on the first call.

    The semaphore is sized by ``AI_CONCURRENCY`` and cached in ``_ai_sem`` so
    every later call hands back the same object.
    """
    global _ai_sem
    limiter = _ai_sem
    if limiter is not None:
        return limiter
    limiter = _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return limiter


def _build_prompt(a: dict) -> str:
    """Render the Gemini assessment prompt for one site-analysis mapping.

    Every field of ``a`` is read with ``.get``, so a missing key renders as
    ``None`` or as the textual fallback written next to it in the template.
    List-valued fields are comma-joined; contact lists are truncated to a few
    entries and the page text sample is cut at 2000 characters.
    """

    def joined_or_none(key: str) -> str:
        # Comma-separated values stored under ``key``; "none" when absent/empty.
        return ", ".join(a.get(key) or []) or "none"

    # (analysis key, label including its alignment padding, max entries shown)
    contact_specs = (
        ("emails", "Emails:    ", 3),
        ("phones", "Phones:    ", 3),
        ("whatsapp", "WhatsApp:  ", 2),
        ("social_links", "Social:    ", 4),
    )
    contact_lines = [
        f"  {label}{', '.join(a[key][:limit])}"
        for key, label, limit in contact_specs
        if a.get(key)
    ]
    contacts_text = "\n".join(contact_lines) if contact_lines else "  None found"

    signal_bullets = [f"  - {signal}" for signal in (a.get("kit_digital_signals") or [])]
    kit_signals = "\n".join(signal_bullets) if signal_bullets else "  None detected"

    analytics = joined_or_none("analytics_present")
    webmaster = joined_or_none("webmaster_verified")
    lorem_hits = joined_or_none("lorem_matches")
    placeholder_hits = joined_or_none("placeholder_matches")

    page_sample = a.get("visible_text_snippet") or ""
    page_sample = page_sample[:2000]

    prompt = f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.

=== TECHNICAL SNAPSHOT ===
Domain:            {a.get("domain")}
Reachable:         {a.get("reachable")}  |  Status: {a.get("status_code")}  |  Load time: {a.get("load_time_ms")} ms
Final URL:         {a.get("final_url")}
Page size:         {a.get("page_size_kb")} KB  |  Server: {a.get("server")}  |  CMS: {a.get("cms") or "unknown"}
SSL valid:         {a.get("ssl_valid")}  |  SSL expires in: {a.get("ssl_expiry_days")} days
Mobile viewport:   {a.get("has_mobile_viewport")}
Word count:        {a.get("word_count")}  |  Images: {a.get("image_count")}  |  Scripts: {a.get("script_count")}

=== SEO & INDEXING SIGNALS ===
Page title:        {a.get("page_title") or "missing"}
H1:                {a.get("h1_text") or "missing"}
Meta description:  {a.get("meta_description") or "missing"}
Canonical URL:     {a.get("canonical_url") or "not set"}
Sitemap.xml:       {a.get("has_sitemap")}
Robots.txt:        {a.get("has_robots")}  |  Blocks Googlebot: {a.get("robots_disallows_google")}
Analytics:         {analytics}
Webmaster verified:{webmaster}

=== CONTENT QUALITY ===
Lorem ipsum found: {a.get("has_lorem_ipsum")}  →  matches: {lorem_hits}
Placeholder text:  {a.get("has_placeholder")}  →  matches: {placeholder_hits}

=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
Detected:          {a.get("kit_digital")}
Signals:
{kit_signals}

=== CONTACT CHANNELS ===
{contacts_text}

=== PAGE TEXT SAMPLE (first 2000 chars) ===
{page_sample}

=== TASK ===
Analyse this site for IT services upsell potential. The client sells:
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.

Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
{{
  "summary": "2-3 sentence executive summary of the site's current state",
  "site_quality_score": <0-10 integer>,
  "content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
  "performance_notes": "comment on load time, page size, mobile readiness",
  "seo_status": "brief SEO assessment — indexing signals, missing elements",
  "kit_digital_confirmed": true/false,
  "kit_digital_reasoning": "1 sentence — why confirmed or not",
  "is_local_sme": true/false,
  "lead_quality": "HOT|WARM|COLD",
  "lead_reasoning": "1-2 sentences on why",
  "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
  "best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
  "all_contacts": {{
    "emails": [],
    "phones": [],
    "whatsapp": [],
    "social": []
  }},
  "pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
  "services_needed": ["service1", "service2"],
  "urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
  "outreach_notes": "Key context for the sales rep"
}}"""
    return prompt


def _parse_output(raw: str) -> dict:
    """Extract the JSON assessment object from the model's raw text output.

    Decoding starts at the first ``{`` and stops at the end of that JSON
    object, so Markdown code fences, leading chatter and trailing notes
    (even ones containing braces) are ignored. An empty object ``{}`` is
    accepted too.

    Args:
        raw: Text returned by the model. ``None`` or a non-string value is
            tolerated and handled as unparseable output.

    Returns:
        The decoded JSON object, or a fallback dict flagged with
        ``"parse_error": True`` whose ``summary`` holds the first 400
        characters of the raw text.
    """
    if isinstance(raw, str):
        text = raw
    else:
        # The API can hand back a null output; never crash on it.
        text = "" if raw is None else str(raw)

    start = text.find("{")
    if start != -1:
        try:
            # raw_decode parses exactly one JSON value and ignores what follows,
            # unlike a greedy "first { to last }" match.
            parsed, _ = json.JSONDecoder().raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed

    logger.warning("Could not parse Gemini JSON output, raw: %s", text[:300])
    return {
        "summary": text[:400],
        "lead_quality": "COLD",
        "best_contact_channel": "unknown",
        "best_contact_value": "",
        "parse_error": True,
    }


# Prediction states in which Replicate is still working on the request.
_PENDING_STATUSES = frozenset({"starting", "processing"})


async def _await_prediction(
    client: "httpx.AsyncClient",
    prediction: dict,
    headers: dict,
    *,
    poll_interval: float = 2.0,
    max_wait: float = 120.0,
) -> dict:
    """Poll a still-running prediction until it settles or ``max_wait`` elapses."""
    poll_url = (prediction.get("urls") or {}).get("get")
    loop = asyncio.get_running_loop()
    deadline = loop.time() + max_wait
    while (
        poll_url
        and prediction.get("status") in _PENDING_STATUSES
        and loop.time() < deadline
    ):
        await asyncio.sleep(poll_interval)
        resp = await client.get(poll_url, headers=headers)
        resp.raise_for_status()
        prediction = resp.json()
    return prediction


async def assess_domain(analysis: dict) -> dict:
    """Call Gemini with the full site analysis and return the parsed assessment.

    The call is throttled by the shared semaphore from ``_sem()``. The request
    is sent with ``Prefer: wait``; if the response still reports a pending
    prediction, its ``urls.get`` endpoint is polled until it settles.

    Args:
        analysis: Site-analysis mapping that is rendered into the prompt.

    Returns:
        The assessment dict produced by ``_parse_output``. On any transport,
        HTTP or prediction failure, a fallback dict with an ``error`` message
        (at most 300 characters) and ``lead_quality`` set to ``"COLD"`` is
        returned instead of raising.
    """
    async with _sem():
        payload = {
            "input": {
                "prompt": _build_prompt(analysis),
                "images":  [],
                "videos":  [],
                "top_p":   0.9,
                "temperature": 0.2,
                "thinking_level": "low",
                "max_output_tokens": 2048,
            }
        }
        try:
            if not REPLICATE_TOKEN:
                # Fail fast with a clear reason instead of an opaque 401.
                raise RuntimeError("REPLICATE_API_TOKEN is not set")

            auth = {"Authorization": f"Bearer {REPLICATE_TOKEN}"}
            async with httpx.AsyncClient(timeout=120) as client:
                resp = await client.post(
                    REPLICATE_MODEL,
                    headers={
                        **auth,
                        "Content-Type":  "application/json",
                        "Prefer":        "wait",
                    },
                    json=payload,
                )
                resp.raise_for_status()
                # The synchronous wait can end before the model has finished.
                data = await _await_prediction(client, resp.json(), auth)

            status = data.get("status")
            if data.get("error") or status in ("failed", "canceled"):
                raise RuntimeError(f"prediction {status}: {data.get('error')}")

            output = data.get("output")
            if output is None:
                raise RuntimeError(f"prediction returned no output (status={status})")
            if isinstance(output, list):
                # Output may arrive as a list of text chunks; stitch them together.
                output = "".join(output)

            result = _parse_output(output)
            logger.info("AI %s → %s (quality %s)",
                        analysis.get("domain"), result.get("lead_quality"), result.get("site_quality_score"))
            return result

        except Exception as e:
            # Deliberate best-effort boundary: a failed assessment must not
            # abort the caller, so it is reported as a COLD lead with an error.
            logger.error("Replicate error %s: %s", analysis.get("domain"), e)
            return {
                "error":               str(e)[:300],
                "lead_quality":        "COLD",
                "best_contact_channel": "unknown",
                "best_contact_value":  "",
            }