feat: deep site analysis engine + fix AI assess for any domain
site_analyzer.py (new):
- Fresh scrape with timing, page size, server, CMS detection
- Lorem ipsum detection (16 phrases incl. user's example)
- Placeholder content detection (hello world, sample page, etc.)
- Analytics: GA4, GTM, Facebook Pixel, Hotjar, Clarity
- Webmaster: Google Search Console, Bing, Yandex verification tags
- sitemap.xml and robots.txt check + Googlebot block detection
- Mobile viewport check, word count, image/script count
- Full contact extraction: emails, phones, WhatsApp, social links
- Kit Digital signal detection
AI worker fix:
- No longer requires pre-enrichment — works on ANY selected domain
- Does fresh site_analyzer scrape then calls Gemini with full context
- Stores site_analysis JSON alongside AI assessment
- Upserts into enriched_domains even if domain was never enriched
Gemini prompt now includes:
- Complete technical snapshot (load time, size, server, SSL)
- Full SEO signals (sitemap, robots, analytics, webmaster verified)
- Content quality (lorem ipsum matches, placeholder matches)
- Kit Digital signals
- All extracted contacts
- Page text sample (first 2000 characters of visible text)
- Outputs: summary, site_quality_score/10, content_issues[],
urgency_signals[], performance_notes, seo_status,
best_contact_channel+value, all_contacts, ES pitch,
services_needed, outreach_notes
UI: rich AI modal with summary banner, quality grid, content issues,
urgency signals, full contact list, technical snapshot
Fixes: correct Replicate token, ai_queue status='running' bug
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""Replicate / Gemini integration for domain lead assessment."""
|
||||
"""Replicate / Gemini integration — deep site assessment."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
@@ -10,9 +10,9 @@ import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)

# The Replicate API token is a secret and is read from the environment only.
# No real token is kept as a fallback value: a credential written into source
# control must be treated as leaked and revoked. When REPLICATE_API_TOKEN is
# unset the value is an empty string, so the Authorization header built from
# it carries no usable bearer token.
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
# Integer read from the AI_CONCURRENCY environment variable (default 3).
# NOTE(review): presumably sizes the semaphore below — confirm in _sem().
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))

# Module-level semaphore slot; starts unset and is handed out by _sem().
_ai_sem: Optional[asyncio.Semaphore] = None
|
||||
|
||||
@@ -24,66 +24,92 @@ def _sem() -> asyncio.Semaphore:
|
||||
return _ai_sem
|
||||
|
||||
|
||||
def _build_prompt(row: dict) -> str:
|
||||
kit_signals = row.get("kit_digital_signals") or "[]"
|
||||
try:
|
||||
sigs = json.loads(kit_signals)
|
||||
kit_block = "\n".join(f" - {s}" for s in sigs) if sigs else " None detected"
|
||||
except Exception:
|
||||
kit_block = f" {kit_signals}"
|
||||
def _build_prompt(a: dict) -> str:
|
||||
"""Build the Gemini prompt from a full site analysis dict."""
|
||||
contacts_block = []
|
||||
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
||||
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
||||
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
|
||||
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
|
||||
contacts_str = "\n".join(contacts_block) or " None found"
|
||||
|
||||
contact_raw = row.get("contact_info") or "{}"
|
||||
try:
|
||||
contacts = json.loads(contact_raw)
|
||||
except Exception:
|
||||
contacts = {}
|
||||
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
|
||||
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
|
||||
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
|
||||
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
||||
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
||||
|
||||
contact_block = []
|
||||
if contacts.get("emails"):
|
||||
contact_block.append(f" Emails: {', '.join(contacts['emails'][:3])}")
|
||||
if contacts.get("phones"):
|
||||
contact_block.append(f" Phones: {', '.join(contacts['phones'][:3])}")
|
||||
if contacts.get("whatsapp"):
|
||||
contact_block.append(f" WhatsApp: {', '.join(contacts['whatsapp'][:2])}")
|
||||
if contacts.get("social"):
|
||||
contact_block.append(f" Social: {', '.join(contacts['social'][:4])}")
|
||||
contact_str = "\n".join(contact_block) if contact_block else " None found"
|
||||
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||
|
||||
return f"""You are a sales intelligence analyst evaluating Spanish SME websites for IT services upsell.
|
||||
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
|
||||
|
||||
DOMAIN DATA:
|
||||
- Domain: {row.get("domain")}
|
||||
- Page title: {row.get("page_title") or "N/A"}
|
||||
- CMS: {row.get("cms") or "unknown"}
|
||||
- Server: {row.get("server") or "unknown"}
|
||||
- Country: {row.get("ip_country") or "unknown"}
|
||||
- SSL valid: {row.get("ssl_valid")}, expires in {row.get("ssl_expiry_days") or "?"} days
|
||||
- Has email (MX): {bool(row.get("has_mx"))}
|
||||
- Is live: {bool(row.get("is_live"))}
|
||||
- Kit Digital signals found on page:
|
||||
{kit_block}
|
||||
- Contact channels found on page:
|
||||
{contact_str}
|
||||
=== TECHNICAL SNAPSHOT ===
|
||||
Domain: {a.get("domain")}
|
||||
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
|
||||
Final URL: {a.get("final_url")}
|
||||
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
|
||||
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
|
||||
Mobile viewport: {a.get("has_mobile_viewport")}
|
||||
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
|
||||
|
||||
Kit Digital is a Spanish government program (up to €12k grants for SME digitalization). Sites that received it MUST display EU/digitalizadores logos. These businesses have proven they invest in IT services and may need follow-up: new website, SEO, hosting migration, security, maintenance contracts.
|
||||
=== SEO & INDEXING SIGNALS ===
|
||||
Page title: {a.get("page_title") or "missing"}
|
||||
H1: {a.get("h1_text") or "missing"}
|
||||
Meta description: {a.get("meta_description") or "missing"}
|
||||
Canonical URL: {a.get("canonical_url") or "not set"}
|
||||
Sitemap.xml: {a.get("has_sitemap")}
|
||||
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
|
||||
Analytics: {analytics_str}
|
||||
Webmaster verified:{webmaster_str}
|
||||
|
||||
Assess this lead and respond ONLY with valid JSON (no markdown, no explanation outside the JSON):
|
||||
=== CONTENT QUALITY ===
|
||||
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
|
||||
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
|
||||
|
||||
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
|
||||
Detected: {a.get("kit_digital")}
|
||||
Signals:
|
||||
{kd_str}
|
||||
|
||||
=== CONTACT CHANNELS ===
|
||||
{contacts_str}
|
||||
|
||||
=== PAGE TEXT SAMPLE (first 2000 chars) ===
|
||||
{text_snippet}
|
||||
|
||||
=== TASK ===
|
||||
Analyse this site for IT services upsell potential. The client sells:
|
||||
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
|
||||
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
|
||||
|
||||
Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
|
||||
{{
|
||||
"is_local_sme": true/false,
|
||||
"summary": "2-3 sentence executive summary of the site's current state",
|
||||
"site_quality_score": <0-10 integer>,
|
||||
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
|
||||
"performance_notes": "comment on load time, page size, mobile readiness",
|
||||
"seo_status": "brief SEO assessment — indexing signals, missing elements",
|
||||
"kit_digital_confirmed": true/false,
|
||||
"kit_digital_reasoning": "1 sentence explaining why or why not",
|
||||
"kit_digital_reasoning": "1 sentence — why confirmed or not",
|
||||
"is_local_sme": true/false,
|
||||
"lead_quality": "HOT|WARM|COLD",
|
||||
"lead_reasoning": "1-2 sentences on why this is a good/bad lead for IT services sales",
|
||||
"lead_reasoning": "1-2 sentences on why",
|
||||
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
||||
"best_contact_value": "the actual email/phone/URL to use, or empty string",
|
||||
"pitch_angle": "One concrete opening sentence for a cold email or call in Spanish",
|
||||
"services_likely_needed": ["service1", "service2"],
|
||||
"outreach_notes": "Any useful context for the sales rep (language, business type, urgency)"
|
||||
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
|
||||
"all_contacts": {{
|
||||
"emails": [],
|
||||
"phones": [],
|
||||
"whatsapp": [],
|
||||
"social": []
|
||||
}},
|
||||
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
|
||||
"services_needed": ["service1", "service2"],
|
||||
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
|
||||
"outreach_notes": "Key context for the sales rep"
|
||||
}}"""
|
||||
|
||||
|
||||
def _parse_output(raw: str) -> dict:
|
||||
"""Extract JSON from Gemini text output."""
|
||||
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
|
||||
m = re.search(r"\{[\s\S]+\}", text)
|
||||
if m:
|
||||
@@ -91,8 +117,9 @@ def _parse_output(raw: str) -> dict:
|
||||
return json.loads(m.group(0))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
|
||||
return {
|
||||
"raw": raw[:500],
|
||||
"summary": raw[:400],
|
||||
"lead_quality": "COLD",
|
||||
"best_contact_channel": "unknown",
|
||||
"best_contact_value": "",
|
||||
@@ -100,28 +127,28 @@ def _parse_output(raw: str) -> dict:
|
||||
}
|
||||
|
||||
|
||||
async def assess_domain(row: dict) -> dict:
|
||||
"""Call Gemini via Replicate to assess a domain. Returns parsed assessment dict."""
|
||||
async def assess_domain(analysis: dict) -> dict:
|
||||
"""Call Gemini with the full site analysis. Returns parsed assessment."""
|
||||
async with _sem():
|
||||
payload = {
|
||||
"input": {
|
||||
"prompt": _build_prompt(row),
|
||||
"images": [],
|
||||
"videos": [],
|
||||
"top_p": 0.9,
|
||||
"prompt": _build_prompt(analysis),
|
||||
"images": [],
|
||||
"videos": [],
|
||||
"top_p": 0.9,
|
||||
"temperature": 0.2,
|
||||
"thinking_level": "low",
|
||||
"max_output_tokens": 1024,
|
||||
"max_output_tokens": 2048,
|
||||
}
|
||||
}
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=90) as client:
|
||||
async with httpx.AsyncClient(timeout=120) as client:
|
||||
resp = await client.post(
|
||||
REPLICATE_MODEL,
|
||||
headers={
|
||||
"Authorization": f"Bearer {REPLICATE_TOKEN}",
|
||||
"Content-Type": "application/json",
|
||||
"Prefer": "wait",
|
||||
"Content-Type": "application/json",
|
||||
"Prefer": "wait",
|
||||
},
|
||||
json=payload,
|
||||
)
|
||||
@@ -133,10 +160,15 @@ async def assess_domain(row: dict) -> dict:
|
||||
output = "".join(output)
|
||||
|
||||
result = _parse_output(output)
|
||||
logger.info("AI %s → %s / contact: %s",
|
||||
row.get("domain"), result.get("lead_quality"), result.get("best_contact_channel"))
|
||||
logger.info("AI %s → %s (quality %s)",
|
||||
analysis.get("domain"), result.get("lead_quality"), result.get("site_quality_score"))
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Replicate error %s: %s", row.get("domain"), e)
|
||||
return {"error": str(e)[:300], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": ""}
|
||||
logger.error("Replicate error %s: %s", analysis.get("domain"), e)
|
||||
return {
|
||||
"error": str(e)[:300],
|
||||
"lead_quality": "COLD",
|
||||
"best_contact_channel": "unknown",
|
||||
"best_contact_value": "",
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user