fix: truncated JSON, missing pitch for placeholder sites, token limit
- max_output_tokens 2048→4096 (main truncation fix)
- page snippet 2000→800 chars, search results capped at 600 chars
- JSON schema reordered: lead_quality/pitch_angle/services_needed first,
so the most important fields survive even if the output is truncated
- RULES block in prompt: placeholder site = HOT or WARM lead, pitch_angle is MANDATORY,
services_needed must have ≥2 items, keep string values ≤15 words and arrays ≤4 items to avoid truncation
- _parse_output: truncated JSON repair — strips a trailing incomplete key-value pair,
then closes any open [] and {} brackets before retrying json.loads
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -67,7 +67,7 @@ def _build_prompt(a: dict, search_results: str = "") -> str:
|
||||
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
|
||||
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
||||
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
||||
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||
snippet = (a.get("visible_text_snippet") or "")[:800]
|
||||
social_str = ", ".join(a.get("social_links") or []) or "none detected"
|
||||
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
|
||||
copyright_yr = a.get("copyright_year") or "not found"
|
||||
@@ -138,47 +138,54 @@ Profiles found on site: {social_str}
|
||||
=== PAGE TEXT SAMPLE ===
|
||||
{snippet}
|
||||
|
||||
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
|
||||
{search_results if search_results else "No search results available."}
|
||||
=== WEB SEARCH RESULTS (use to find contacts, verify business identity) ===
|
||||
{(search_results or "No results.")[:600]}
|
||||
|
||||
=== INSTRUCTIONS ===
|
||||
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
||||
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
||||
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
|
||||
social media management (Instagram, Facebook, LinkedIn, TikTok).
|
||||
maintenance contracts, AI tools for SMEs, GMB setup, social media management.
|
||||
|
||||
IMPORTANT — use the WEB SEARCH RESULTS above to:
|
||||
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
|
||||
2. Identify the business owner name if available.
|
||||
3. Populate best_contact_value with a real phone/email you found.
|
||||
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
|
||||
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
|
||||
RULES — you MUST follow all of these:
|
||||
1. A placeholder / minimal / blank site (few words, no images, no CMS) is one of
|
||||
the BEST leads — they need a complete website build + all digital services.
|
||||
Score it lead_quality=HOT or WARM and write an enthusiastic pitch.
|
||||
2. pitch_angle is MANDATORY. Never leave it empty. Write 1 punchy Spanish sentence
|
||||
tailored to the business type. Even "Hola, su web necesita una renovación
|
||||
completa — podemos tenerla lista en 2 semanas." is better than nothing.
|
||||
3. services_needed must list at LEAST 2 services. For a blank/placeholder site
|
||||
always include "diseño web" and "posicionamiento SEO".
|
||||
4. Use the WEB SEARCH RESULTS to find the real phone/email — put the best one
|
||||
in best_contact_value.
|
||||
5. Use copyright_year + Last-Modified to estimate site_last_updated.
|
||||
6. Keep every string value SHORT (≤ 15 words). Arrays: max 4 items.
|
||||
This keeps the JSON small and avoids truncation.
|
||||
|
||||
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||
{{
|
||||
"summary": "2-3 sentence executive summary of the site's state",
|
||||
"lead_quality": "HOT|WARM|COLD",
|
||||
"lead_reasoning": "1-2 sentences why",
|
||||
"pitch_angle": "1 punchy cold-outreach sentence in Spanish — NEVER empty",
|
||||
"services_needed": ["service1","service2"],
|
||||
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
||||
"best_contact_value": "real email/phone from page or search results",
|
||||
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
|
||||
"summary": "2-3 sentence executive summary",
|
||||
"site_quality_score": <0-10>,
|
||||
"content_issues": ["specific issues found in page content"],
|
||||
"performance_notes": "load time, size, mobile assessment",
|
||||
"seo_status": "SEO health — what's missing or broken",
|
||||
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
||||
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
||||
"accessibility_issues": ["specific a11y problems found"],
|
||||
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
|
||||
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
|
||||
"cms_detected": "wordpress|wix|custom|unknown",
|
||||
"site_last_updated": "year or estimate",
|
||||
"kit_digital_confirmed": true/false,
|
||||
"kit_digital_reasoning": "1 sentence",
|
||||
"has_gmb": true/false,
|
||||
"has_social_media": true/false,
|
||||
"kit_digital_reasoning": "1 sentence",
|
||||
"is_local_sme": true/false,
|
||||
"lead_quality": "HOT|WARM|COLD",
|
||||
"lead_reasoning": "1-2 sentences",
|
||||
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
||||
"best_contact_value": "actual email/phone/URL or empty string",
|
||||
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
|
||||
"pitch_angle": "1 cold-outreach sentence in Spanish",
|
||||
"services_needed": ["service1","service2"],
|
||||
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
|
||||
"urgency_signals": ["issue1","issue2"],
|
||||
"content_issues": ["issue1"],
|
||||
"accessibility_issues": ["issue1"],
|
||||
"performance_notes": "brief",
|
||||
"seo_status": "brief",
|
||||
"hosting_notes": "brief",
|
||||
"gdpr_compliance": "brief",
|
||||
"outreach_notes": "sales rep context"
|
||||
}}"""
|
||||
|
||||
@@ -187,13 +194,25 @@ def _parse_output(raw: str) -> dict:
|
||||
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
|
||||
m = re.search(r"\{[\s\S]+\}", text)
|
||||
if m:
|
||||
candidate = m.group(0)
|
||||
try:
|
||||
return json.loads(m.group(0))
|
||||
return json.loads(candidate)
|
||||
except json.JSONDecodeError:
|
||||
# Truncated JSON: close any open arrays/objects and retry
|
||||
fixed = candidate
|
||||
# Count unclosed brackets
|
||||
depth_obj = fixed.count("{") - fixed.count("}")
|
||||
depth_arr = fixed.count("[") - fixed.count("]")
|
||||
# Strip trailing incomplete key-value (e.g. `,"foo": "bar` with no closing `"`)
|
||||
fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', fixed)
|
||||
fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj)
|
||||
try:
|
||||
return json.loads(fixed)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
|
||||
return {
|
||||
"summary": raw[:400],
|
||||
"summary": raw[:400] if raw.strip() else "AI assessment failed — no output.",
|
||||
"lead_quality": "COLD",
|
||||
"best_contact_channel": "unknown",
|
||||
"best_contact_value": "",
|
||||
@@ -220,7 +239,7 @@ async def assess_domain(analysis: dict) -> dict:
|
||||
"top_p": 0.9,
|
||||
"temperature": 0.2,
|
||||
"thinking_level": "low",
|
||||
"max_output_tokens": 2048,
|
||||
"max_output_tokens": 4096,
|
||||
}
|
||||
}
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user