fix: truncated JSON, missing pitch for placeholder sites, token limit
- max_output_tokens 2048→4096 (main truncation fix)
- page snippet 2000→800 chars, search results capped at 600 chars
- JSON schema reordered: lead_quality/pitch_angle/services_needed come first,
so the most important fields survive even if the output is truncated
- RULES block in prompt: placeholder site = HOT or WARM lead, pitch_angle is MANDATORY,
services_needed must have ≥2 items, keep string values ≤15 words to avoid truncation
- _parse_output: truncated JSON repair — strips a trailing incomplete key-value pair,
then appends the missing ] and } closers before retrying json.loads
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -67,7 +67,7 @@ def _build_prompt(a: dict, search_results: str = "") -> str:
|
|||||||
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
|
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
|
||||||
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
||||||
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
||||||
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
snippet = (a.get("visible_text_snippet") or "")[:800]
|
||||||
social_str = ", ".join(a.get("social_links") or []) or "none detected"
|
social_str = ", ".join(a.get("social_links") or []) or "none detected"
|
||||||
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
|
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
|
||||||
copyright_yr = a.get("copyright_year") or "not found"
|
copyright_yr = a.get("copyright_year") or "not found"
|
||||||
@@ -138,47 +138,54 @@ Profiles found on site: {social_str}
|
|||||||
=== PAGE TEXT SAMPLE ===
|
=== PAGE TEXT SAMPLE ===
|
||||||
{snippet}
|
{snippet}
|
||||||
|
|
||||||
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
|
=== WEB SEARCH RESULTS (use to find contacts, verify business identity) ===
|
||||||
{search_results if search_results else "No search results available."}
|
{(search_results or "No results.")[:600]}
|
||||||
|
|
||||||
=== INSTRUCTIONS ===
|
=== INSTRUCTIONS ===
|
||||||
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
||||||
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
||||||
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
|
maintenance contracts, AI tools for SMEs, GMB setup, social media management.
|
||||||
social media management (Instagram, Facebook, LinkedIn, TikTok).
|
|
||||||
|
|
||||||
IMPORTANT — use the WEB SEARCH RESULTS above to:
|
RULES — you MUST follow all of these:
|
||||||
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
|
1. A placeholder / minimal / blank site (few words, no images, no CMS) is one of
|
||||||
2. Identify the business owner name if available.
|
the BEST leads — they need a complete website build + all digital services.
|
||||||
3. Populate best_contact_value with a real phone/email you found.
|
Score it lead_quality=HOT or WARM and write an enthusiastic pitch.
|
||||||
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
|
2. pitch_angle is MANDATORY. Never leave it empty. Write 1 punchy Spanish sentence
|
||||||
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
|
tailored to the business type. Even "Hola, su web necesita una renovación
|
||||||
|
completa — podemos tenerla lista en 2 semanas." is better than nothing.
|
||||||
|
3. services_needed must list at LEAST 2 services. For a blank/placeholder site
|
||||||
|
always include "diseño web" and "posicionamiento SEO".
|
||||||
|
4. Use the WEB SEARCH RESULTS to find the real phone/email — put the best one
|
||||||
|
in best_contact_value.
|
||||||
|
5. Use copyright_year + Last-Modified to estimate site_last_updated.
|
||||||
|
6. Keep every string value SHORT (≤ 15 words). Arrays: max 4 items.
|
||||||
|
This keeps the JSON small and avoids truncation.
|
||||||
|
|
||||||
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||||
{{
|
{{
|
||||||
"summary": "2-3 sentence executive summary of the site's state",
|
"lead_quality": "HOT|WARM|COLD",
|
||||||
|
"lead_reasoning": "1-2 sentences why",
|
||||||
|
"pitch_angle": "1 punchy cold-outreach sentence in Spanish — NEVER empty",
|
||||||
|
"services_needed": ["service1","service2"],
|
||||||
|
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
||||||
|
"best_contact_value": "real email/phone from page or search results",
|
||||||
|
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
|
||||||
|
"summary": "2-3 sentence executive summary",
|
||||||
"site_quality_score": <0-10>,
|
"site_quality_score": <0-10>,
|
||||||
"content_issues": ["specific issues found in page content"],
|
"cms_detected": "wordpress|wix|custom|unknown",
|
||||||
"performance_notes": "load time, size, mobile assessment",
|
"site_last_updated": "year or estimate",
|
||||||
"seo_status": "SEO health — what's missing or broken",
|
|
||||||
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
|
||||||
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
|
||||||
"accessibility_issues": ["specific a11y problems found"],
|
|
||||||
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
|
|
||||||
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
|
|
||||||
"kit_digital_confirmed": true/false,
|
"kit_digital_confirmed": true/false,
|
||||||
|
"kit_digital_reasoning": "1 sentence",
|
||||||
"has_gmb": true/false,
|
"has_gmb": true/false,
|
||||||
"has_social_media": true/false,
|
"has_social_media": true/false,
|
||||||
"kit_digital_reasoning": "1 sentence",
|
|
||||||
"is_local_sme": true/false,
|
"is_local_sme": true/false,
|
||||||
"lead_quality": "HOT|WARM|COLD",
|
"urgency_signals": ["issue1","issue2"],
|
||||||
"lead_reasoning": "1-2 sentences",
|
"content_issues": ["issue1"],
|
||||||
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
"accessibility_issues": ["issue1"],
|
||||||
"best_contact_value": "actual email/phone/URL or empty string",
|
"performance_notes": "brief",
|
||||||
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
|
"seo_status": "brief",
|
||||||
"pitch_angle": "1 cold-outreach sentence in Spanish",
|
"hosting_notes": "brief",
|
||||||
"services_needed": ["service1","service2"],
|
"gdpr_compliance": "brief",
|
||||||
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
|
|
||||||
"outreach_notes": "sales rep context"
|
"outreach_notes": "sales rep context"
|
||||||
}}"""
|
}}"""
|
||||||
|
|
||||||
@@ -187,13 +194,25 @@ def _parse_output(raw: str) -> dict:
|
|||||||
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
|
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
|
||||||
m = re.search(r"\{[\s\S]+\}", text)
|
m = re.search(r"\{[\s\S]+\}", text)
|
||||||
if m:
|
if m:
|
||||||
|
candidate = m.group(0)
|
||||||
try:
|
try:
|
||||||
return json.loads(m.group(0))
|
return json.loads(candidate)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# Truncated JSON: close any open arrays/objects and retry
|
||||||
|
fixed = candidate
|
||||||
|
# Count unclosed brackets
|
||||||
|
depth_obj = fixed.count("{") - fixed.count("}")
|
||||||
|
depth_arr = fixed.count("[") - fixed.count("]")
|
||||||
|
# Strip trailing incomplete key-value (e.g. `,"foo": "bar` with no closing `"`)
|
||||||
|
fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', fixed)
|
||||||
|
fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj)
|
||||||
|
try:
|
||||||
|
return json.loads(fixed)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
|
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
|
||||||
return {
|
return {
|
||||||
"summary": raw[:400],
|
"summary": raw[:400] if raw.strip() else "AI assessment failed — no output.",
|
||||||
"lead_quality": "COLD",
|
"lead_quality": "COLD",
|
||||||
"best_contact_channel": "unknown",
|
"best_contact_channel": "unknown",
|
||||||
"best_contact_value": "",
|
"best_contact_value": "",
|
||||||
@@ -220,7 +239,7 @@ async def assess_domain(analysis: dict) -> dict:
|
|||||||
"top_p": 0.9,
|
"top_p": 0.9,
|
||||||
"temperature": 0.2,
|
"temperature": 0.2,
|
||||||
"thinking_level": "low",
|
"thinking_level": "low",
|
||||||
"max_output_tokens": 2048,
|
"max_output_tokens": 4096,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user