feat: Gemini AI assessment, Kit Digital detection, contact extraction
Kit Digital detection (enricher.py):
- Scans img src/alt/srcset attributes for digitalizadores, kit-digital, fondos-europeos, etc.
- Scans page text for Kit Digital, Agente Digitalizador, Next Generation EU, PRTR
- Scans links for acelerapyme.es, red.es, kit-digital refs
- +20 score bonus for Kit Digital confirmed sites (proven IT buyers)
Contact extraction (enricher.py):
- Pulls mailto/tel/wa.me links from HTML
- Extracts email addresses via regex, phone numbers (ES format)
- Detects social media links (FB, IG, LinkedIn, Twitter, TikTok)
- Stored as JSON in contact_info column
Gemini via Replicate (replicate_ai.py):
- Assesses lead quality (HOT/WARM/COLD), Kit Digital confirmation
- Identifies best contact channel + actual value (email/phone/WA)
- Writes Spanish cold-call/email pitch angle
- Lists services likely needed + outreach notes
- 3 concurrent requests, 90s timeout, JSON output parsing
DB: migration adds the columns kit_digital, kit_digital_signals, contact_info,
ai_assessment, ai_lead_quality, ai_pitch, ai_contact_channel and ai_contact_value,
plus a new ai_queue table
UI: Kit Digital 🏅 badge, AI quality pill (clickable modal with full
assessment), contact chips (email/phone/WA/social), AI Assess button,
Kit Digital only filter, AI queue status in enrichment tab
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
142
app/replicate_ai.py
Normal file
142
app/replicate_ai.py
Normal file
@@ -0,0 +1,142 @@
|
||||
"""Replicate / Gemini integration for domain lead assessment."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)

# Replicate API token. It must come from the environment: a secret must never
# be committed as a fallback default. Any token that was previously hard-coded
# here should be treated as compromised and rotated.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")

# Prediction endpoint that assessment requests are POSTed to.
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"

# Maximum number of assessments in flight at once. Clamped to at least 1:
# a value of 0 would build a semaphore that never admits a caller, and a
# negative value makes asyncio.Semaphore raise ValueError.
AI_CONCURRENCY = max(1, int(os.getenv("AI_CONCURRENCY", "3")))

# Lazily created shared semaphore; stays None until first requested.
_ai_sem: Optional[asyncio.Semaphore] = None
|
||||
|
||||
|
||||
def _sem() -> asyncio.Semaphore:
    """Return the module-wide AI semaphore, creating it on first use.

    The semaphore is sized by ``AI_CONCURRENCY`` and cached in the
    module-level ``_ai_sem`` so every caller shares the same instance.
    """
    global _ai_sem
    semaphore = _ai_sem
    if semaphore is None:
        # First call: build the semaphore and remember it for later callers.
        semaphore = _ai_sem = asyncio.Semaphore(AI_CONCURRENCY)
    return semaphore
|
||||
|
||||
|
||||
def _build_prompt(row: dict) -> str:
    """Build the lead-assessment prompt for one domain row.

    Args:
        row: Mapping of enrichment fields. The keys read here are ``domain``,
            ``page_title``, ``cms``, ``server``, ``ip_country``, ``ssl_valid``,
            ``ssl_expiry_days``, ``has_mx``, ``is_live``,
            ``kit_digital_signals`` (JSON list text) and ``contact_info``
            (JSON object text). Missing keys fall back to placeholder text.

    Returns:
        The full prompt text, ending with the JSON response template.
    """
    # --- Kit Digital signals -------------------------------------------------
    kit_signals = row.get("kit_digital_signals") or "[]"
    try:
        sigs = json.loads(kit_signals)
    except (TypeError, ValueError):
        sigs = None
    if isinstance(sigs, list):
        kit_block = "\n".join(f" - {s}" for s in sigs) if sigs else " None detected"
    else:
        # Undecodable, or valid JSON that is not a list (iterating a string
        # would emit one bullet per character): show the stored value as-is.
        kit_block = f" {kit_signals}"

    # --- Contact channels ----------------------------------------------------
    contact_raw = row.get("contact_info") or "{}"
    try:
        contacts = json.loads(contact_raw)
    except (TypeError, ValueError):
        contacts = {}
    if not isinstance(contacts, dict):
        # e.g. a JSON list or scalar: there are no named channels to read.
        contacts = {}

    # (key in contact_info, label in the prompt, max entries shown)
    channels = (
        ("emails", "Emails", 3),
        ("phones", "Phones", 3),
        ("whatsapp", "WhatsApp", 2),
        ("social", "Social", 4),
    )
    contact_block = []
    for key, label, limit in channels:
        values = contacts.get(key)
        if isinstance(values, str):
            # A single bare value; wrap it so it is not sliced per character.
            values = [values]
        if not values or not isinstance(values, (list, tuple)):
            # Empty, or a shape that cannot be sliced safely: leave it out
            # rather than fail the whole prompt.
            continue
        shown = ", ".join(str(v) for v in values[:limit])
        contact_block.append(f" {label}: {shown}")
    contact_str = "\n".join(contact_block) if contact_block else " None found"

    # 0 is a meaningful value (certificate expires today), so only a missing
    # or empty value is replaced by the "?" placeholder.
    ssl_days = row.get("ssl_expiry_days")
    if ssl_days is None or ssl_days == "":
        ssl_days = "?"

    return f"""You are a sales intelligence analyst evaluating Spanish SME websites for IT services upsell.

DOMAIN DATA:
- Domain: {row.get("domain")}
- Page title: {row.get("page_title") or "N/A"}
- CMS: {row.get("cms") or "unknown"}
- Server: {row.get("server") or "unknown"}
- Country: {row.get("ip_country") or "unknown"}
- SSL valid: {row.get("ssl_valid")}, expires in {ssl_days} days
- Has email (MX): {bool(row.get("has_mx"))}
- Is live: {bool(row.get("is_live"))}
- Kit Digital signals found on page:
{kit_block}
- Contact channels found on page:
{contact_str}

Kit Digital is a Spanish government program (up to €12k grants for SME digitalization). Sites that received it MUST display EU/digitalizadores logos. These businesses have proven they invest in IT services and may need follow-up: new website, SEO, hosting migration, security, maintenance contracts.

Assess this lead and respond ONLY with valid JSON (no markdown, no explanation outside the JSON):
{{
"is_local_sme": true/false,
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence explaining why or why not",
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences on why this is a good/bad lead for IT services sales",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual email/phone/URL to use, or empty string",
"pitch_angle": "One concrete opening sentence for a cold email or call in Spanish",
"services_likely_needed": ["service1", "service2"],
"outreach_notes": "Any useful context for the sales rep (language, business type, urgency)"
}}"""
|
||||
|
||||
|
||||
def _parse_output(raw: str) -> dict:
    """Extract the JSON assessment object from the model's text output.

    Markdown code fences are stripped, then the text from the first ``{`` to
    the last ``}`` is decoded. If that span is not valid JSON (for example
    because trailing commentary contains a brace), a second attempt decodes
    just the first complete JSON value starting at the first ``{``.

    Args:
        raw: Text returned by the model. ``None`` and other non-string
            values are tolerated and treated as unparseable text.

    Returns:
        The decoded JSON object, or a fallback dict with ``parse_error`` set
        to ``True``, a COLD lead quality, and the first 500 characters of the
        input under ``raw``.
    """
    if not isinstance(raw, str):
        # A null/absent output must not crash the regex calls below.
        raw = "" if raw is None else str(raw)

    text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
    m = re.search(r"\{[\s\S]+\}", text)
    if m:
        parsed = None
        try:
            parsed = json.loads(m.group(0))
        except json.JSONDecodeError:
            # The greedy span swallowed trailing non-JSON text; decode only
            # the first complete value that starts at the opening brace.
            try:
                parsed, _ = json.JSONDecoder().raw_decode(text, m.start())
            except json.JSONDecodeError:
                parsed = None
        if isinstance(parsed, dict):
            return parsed

    return {
        "raw": raw[:500],
        "lead_quality": "COLD",
        "best_contact_channel": "unknown",
        "best_contact_value": "",
        "parse_error": True,
    }
|
||||
|
||||
|
||||
async def assess_domain(row: dict) -> dict:
    """Call Gemini via Replicate to assess a domain.

    Concurrency is limited by the shared semaphore returned by ``_sem()``.
    This function never raises for request, API or parsing problems: every
    failure is logged and reported through the returned dict.

    Args:
        row: Domain enrichment row; passed to ``_build_prompt`` and used for
            its ``domain`` value in log messages.

    Returns:
        The parsed assessment dict on success. On failure, a dict with an
        ``error`` message (truncated to 300 characters), ``lead_quality`` of
        ``"COLD"``, ``best_contact_channel`` of ``"unknown"`` and an empty
        ``best_contact_value``.
    """
    domain = row.get("domain")

    def _failure(reason: object) -> dict:
        # Single place that defines the shape of the error result.
        return {"error": str(reason)[:300], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": ""}

    if not REPLICATE_TOKEN:
        # Without a token the request can only be rejected; skip the call.
        logger.error("Replicate error %s: %s", domain, "REPLICATE_API_TOKEN is not set")
        return _failure("REPLICATE_API_TOKEN is not set")

    async with _sem():
        try:
            # Built inside the try so a prompt-building failure is reported
            # through the error dict instead of escaping to the caller.
            payload = {
                "input": {
                    "prompt": _build_prompt(row),
                    "images": [],
                    "videos": [],
                    "top_p": 0.9,
                    "temperature": 0.2,
                    "thinking_level": "low",
                    "max_output_tokens": 1024,
                }
            }
            async with httpx.AsyncClient(timeout=90) as client:
                resp = await client.post(
                    REPLICATE_MODEL,
                    headers={
                        "Authorization": f"Bearer {REPLICATE_TOKEN}",
                        "Content-Type": "application/json",
                        # Ask the API to hold the request until the prediction is done.
                        "Prefer": "wait",
                    },
                    json=payload,
                )
                resp.raise_for_status()
                data = resp.json()

            output = data.get("output")
            if output is None:
                # NOTE(review): presumably the prediction failed or had not
                # finished when the response came back; confirm the status
                # values against the Replicate API docs.
                reason = (
                    f"no output from prediction "
                    f"(status={data.get('status')!r}, error={data.get('error')!r})"
                )
                logger.error("Replicate error %s: %s", domain, reason)
                return _failure(reason)
            if isinstance(output, list):
                # Output may arrive as a list of chunks; stitch them together.
                output = "".join(str(part) for part in output)

            result = _parse_output(output)
            logger.info("AI %s → %s / contact: %s",
                        domain, result.get("lead_quality"), result.get("best_contact_channel"))
            return result

        except Exception as e:
            # Deliberate catch-all boundary: one bad domain must not abort
            # the caller's batch, so the failure is logged and returned.
            logger.error("Replicate error %s: %s", domain, e)
            return _failure(e)
|
||||
Reference in New Issue
Block a user