feat: deep site analysis engine + fix AI assess for any domain

site_analyzer.py (new):
- Fresh scrape with timing, page size, server, CMS detection
- Lorem ipsum detection (16 phrases, including the user-reported sample phrase)
- Placeholder content detection (hello world, sample page, etc.)
- Analytics: GA4, GTM, Facebook Pixel, Hotjar, Clarity
- Webmaster: Google Search Console, Bing, Yandex verification tags
- sitemap.xml and robots.txt check + Googlebot block detection
- Mobile viewport check, word count, image/script count
- Full contact extraction: emails, phones, WhatsApp, social links
- Kit Digital signal detection

AI worker fix:
- No longer requires pre-enrichment — works on ANY selected domain
- Does fresh site_analyzer scrape then calls Gemini with full context
- Stores site_analysis JSON alongside AI assessment
- Upserts into enriched_domains even if domain was never enriched

Gemini prompt now includes:
- Complete technical snapshot (load time, size, server, SSL)
- Full SEO signals (sitemap, robots, analytics, webmaster verified)
- Content quality (lorem ipsum matches, placeholder matches)
- Kit Digital signals
- All extracted contacts
- 500-word page text sample
- Outputs: summary, site_quality_score/10, content_issues[],
  urgency_signals[], performance_notes, seo_status,
  best_contact_channel+value, all_contacts, ES pitch,
  services_needed, outreach_notes

UI: rich AI modal with summary banner, quality grid, content issues,
    urgency signals, full contact list, technical snapshot

Fixes: use the correct Replicate API token; fix bug where ai_queue rows were left stuck in status='running'

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-13 17:46:01 +02:00
parent faca4b6e1a
commit 5ad8259c75
7 changed files with 530 additions and 111 deletions

View File

@@ -1,4 +1,4 @@
"""Replicate / Gemini integration for domain lead assessment."""
"""Replicate / Gemini integration — deep site assessment."""
import asyncio
import json
import logging
@@ -10,9 +10,9 @@ import httpx
logger = logging.getLogger(__name__)
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_6kV2NWMQyPVB9JILHJprrXJJh4vWazA22Osyj")
REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_7I7Feai78f9PzMOs20y5GVFKiLkgUWP463vZO") # override via env
REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions"
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))
AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3"))
_ai_sem: Optional[asyncio.Semaphore] = None
@@ -24,66 +24,92 @@ def _sem() -> asyncio.Semaphore:
return _ai_sem
def _build_prompt(row: dict) -> str:
kit_signals = row.get("kit_digital_signals") or "[]"
try:
sigs = json.loads(kit_signals)
kit_block = "\n".join(f" - {s}" for s in sigs) if sigs else " None detected"
except Exception:
kit_block = f" {kit_signals}"
def _build_prompt(a: dict) -> str:
"""Build the Gemini prompt from a full site analysis dict."""
contacts_block = []
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
contacts_str = "\n".join(contacts_block) or " None found"
contact_raw = row.get("contact_info") or "{}"
try:
contacts = json.loads(contact_raw)
except Exception:
contacts = {}
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
contact_block = []
if contacts.get("emails"):
contact_block.append(f" Emails: {', '.join(contacts['emails'][:3])}")
if contacts.get("phones"):
contact_block.append(f" Phones: {', '.join(contacts['phones'][:3])}")
if contacts.get("whatsapp"):
contact_block.append(f" WhatsApp: {', '.join(contacts['whatsapp'][:2])}")
if contacts.get("social"):
contact_block.append(f" Social: {', '.join(contacts['social'][:4])}")
contact_str = "\n".join(contact_block) if contact_block else " None found"
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
return f"""You are a sales intelligence analyst evaluating Spanish SME websites for IT services upsell.
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
DOMAIN DATA:
- Domain: {row.get("domain")}
- Page title: {row.get("page_title") or "N/A"}
- CMS: {row.get("cms") or "unknown"}
- Server: {row.get("server") or "unknown"}
- Country: {row.get("ip_country") or "unknown"}
- SSL valid: {row.get("ssl_valid")}, expires in {row.get("ssl_expiry_days") or "?"} days
- Has email (MX): {bool(row.get("has_mx"))}
- Is live: {bool(row.get("is_live"))}
- Kit Digital signals found on page:
{kit_block}
- Contact channels found on page:
{contact_str}
=== TECHNICAL SNAPSHOT ===
Domain: {a.get("domain")}
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
Final URL: {a.get("final_url")}
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
Mobile viewport: {a.get("has_mobile_viewport")}
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
Kit Digital is a Spanish government program (up to €12k grants for SME digitalization). Sites that received it MUST display EU/digitalizadores logos. These businesses have proven they invest in IT services and may need follow-up: new website, SEO, hosting migration, security, maintenance contracts.
=== SEO & INDEXING SIGNALS ===
Page title: {a.get("page_title") or "missing"}
H1: {a.get("h1_text") or "missing"}
Meta description: {a.get("meta_description") or "missing"}
Canonical URL: {a.get("canonical_url") or "not set"}
Sitemap.xml: {a.get("has_sitemap")}
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
Analytics: {analytics_str}
Webmaster verified:{webmaster_str}
Assess this lead and respond ONLY with valid JSON (no markdown, no explanation outside the JSON):
=== CONTENT QUALITY ===
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
Detected: {a.get("kit_digital")}
Signals:
{kd_str}
=== CONTACT CHANNELS ===
{contacts_str}
=== PAGE TEXT SAMPLE (first 2000 chars) ===
{text_snippet}
=== TASK ===
Analyse this site for IT services upsell potential. The client sells:
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
{{
"is_local_sme": true/false,
"summary": "2-3 sentence executive summary of the site's current state",
"site_quality_score": <0-10 integer>,
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
"performance_notes": "comment on load time, page size, mobile readiness",
"seo_status": "brief SEO assessment — indexing signals, missing elements",
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence explaining why or why not",
"kit_digital_reasoning": "1 sentence — why confirmed or not",
"is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences on why this is a good/bad lead for IT services sales",
"lead_reasoning": "1-2 sentences on why",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "the actual email/phone/URL to use, or empty string",
"pitch_angle": "One concrete opening sentence for a cold email or call in Spanish",
"services_likely_needed": ["service1", "service2"],
"outreach_notes": "Any useful context for the sales rep (language, business type, urgency)"
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
"all_contacts": {{
"emails": [],
"phones": [],
"whatsapp": [],
"social": []
}},
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
"services_needed": ["service1", "service2"],
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
"outreach_notes": "Key context for the sales rep"
}}"""
def _parse_output(raw: str) -> dict:
"""Extract JSON from Gemini text output."""
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
m = re.search(r"\{[\s\S]+\}", text)
if m:
@@ -91,8 +117,9 @@ def _parse_output(raw: str) -> dict:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
return {
"raw": raw[:500],
"summary": raw[:400],
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
@@ -100,28 +127,28 @@ def _parse_output(raw: str) -> dict:
}
async def assess_domain(row: dict) -> dict:
"""Call Gemini via Replicate to assess a domain. Returns parsed assessment dict."""
async def assess_domain(analysis: dict) -> dict:
"""Call Gemini with the full site analysis. Returns parsed assessment."""
async with _sem():
payload = {
"input": {
"prompt": _build_prompt(row),
"images": [],
"videos": [],
"top_p": 0.9,
"prompt": _build_prompt(analysis),
"images": [],
"videos": [],
"top_p": 0.9,
"temperature": 0.2,
"thinking_level": "low",
"max_output_tokens": 1024,
"max_output_tokens": 2048,
}
}
try:
async with httpx.AsyncClient(timeout=90) as client:
async with httpx.AsyncClient(timeout=120) as client:
resp = await client.post(
REPLICATE_MODEL,
headers={
"Authorization": f"Bearer {REPLICATE_TOKEN}",
"Content-Type": "application/json",
"Prefer": "wait",
"Content-Type": "application/json",
"Prefer": "wait",
},
json=payload,
)
@@ -133,10 +160,15 @@ async def assess_domain(row: dict) -> dict:
output = "".join(output)
result = _parse_output(output)
logger.info("AI %s%s / contact: %s",
row.get("domain"), result.get("lead_quality"), result.get("best_contact_channel"))
logger.info("AI %s%s (quality %s)",
analysis.get("domain"), result.get("lead_quality"), result.get("site_quality_score"))
return result
except Exception as e:
logger.error("Replicate error %s: %s", row.get("domain"), e)
return {"error": str(e)[:300], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": ""}
logger.error("Replicate error %s: %s", analysis.get("domain"), e)
return {
"error": str(e)[:300],
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
}