fix: truncated JSON, missing pitch for placeholder sites, token limit

- max_output_tokens 2048→4096 (main truncation fix)
- page snippet 2000→800 chars, search results capped at 600 chars
- JSON schema reordered: lead_quality/pitch_angle/services_needed first,
  so most important fields survive even if output is truncated
- RULES block in prompt: placeholder = HOT or WARM lead, pitch_angle is MANDATORY,
  services_needed must have ≥2 items, keep string values ≤15 words and arrays
  ≤4 items to avoid truncation
- _parse_output: truncated JSON repair — strips a trailing incomplete key-value
  pair, then closes any unbalanced [] and {} brackets before retrying json.loads

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 08:32:49 +02:00
parent d62e4e986e
commit 6cea07f0f4

View File

@@ -67,7 +67,7 @@ def _build_prompt(a: dict, search_results: str = "") -> str:
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none" webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none" lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none" ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
snippet = (a.get("visible_text_snippet") or "")[:2000] snippet = (a.get("visible_text_snippet") or "")[:800]
social_str = ", ".join(a.get("social_links") or []) or "none detected" social_str = ", ".join(a.get("social_links") or []) or "none detected"
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected" gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
copyright_yr = a.get("copyright_year") or "not found" copyright_yr = a.get("copyright_year") or "not found"
@@ -138,47 +138,54 @@ Profiles found on site: {social_str}
=== PAGE TEXT SAMPLE === === PAGE TEXT SAMPLE ===
{snippet} {snippet}
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) === === WEB SEARCH RESULTS (use to find contacts, verify business identity) ===
{search_results if search_results else "No search results available."} {(search_results or "No results.")[:600]}
=== INSTRUCTIONS === === INSTRUCTIONS ===
The client sells: web redesign, SEO, hosting migration, SSL renewal, The client sells: web redesign, SEO, hosting migration, SSL renewal,
security audits, GDPR compliance, accessibility fixes, Google Ads, security audits, GDPR compliance, accessibility fixes, Google Ads,
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation, maintenance contracts, AI tools for SMEs, GMB setup, social media management.
social media management (Instagram, Facebook, LinkedIn, TikTok).
IMPORTANT — use the WEB SEARCH RESULTS above to: RULES — you MUST follow all of these:
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage. 1. A placeholder / minimal / blank site (few words, no images, no CMS) is one of
2. Identify the business owner name if available. the BEST leads — they need a complete website build + all digital services.
3. Populate best_contact_value with a real phone/email you found. Score it lead_quality=HOT or WARM and write an enthusiastic pitch.
4. Use the copyright year and Last-Modified date to estimate when the site was last updated. 2. pitch_angle is MANDATORY. Never leave it empty. Write 1 punchy Spanish sentence
5. Determine the actual CMS from code signals and visible text (not just the heuristic). tailored to the business type. Even "Hola, su web necesita una renovación
completa — podemos tenerla lista en 2 semanas." is better than nothing.
3. services_needed must list at LEAST 2 services. For a blank/placeholder site
always include "diseño web" and "posicionamiento SEO".
4. Use the WEB SEARCH RESULTS to find the real phone/email — put the best one
in best_contact_value.
5. Use copyright_year + Last-Modified to estimate site_last_updated.
6. Keep every string value SHORT (≤ 15 words). Arrays: max 4 items.
This keeps the JSON small and avoids truncation.
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON: Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
{{ {{
"summary": "2-3 sentence executive summary of the site's state", "lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences why",
"pitch_angle": "1 punchy cold-outreach sentence in Spanish — NEVER empty",
"services_needed": ["service1","service2"],
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "real email/phone from page or search results",
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
"summary": "2-3 sentence executive summary",
"site_quality_score": <0-10>, "site_quality_score": <0-10>,
"content_issues": ["specific issues found in page content"], "cms_detected": "wordpress|wix|custom|unknown",
"performance_notes": "load time, size, mobile assessment", "site_last_updated": "year or estimate",
"seo_status": "SEO health — what's missing or broken",
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
"accessibility_issues": ["specific a11y problems found"],
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
"kit_digital_confirmed": true/false, "kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence",
"has_gmb": true/false, "has_gmb": true/false,
"has_social_media": true/false, "has_social_media": true/false,
"kit_digital_reasoning": "1 sentence",
"is_local_sme": true/false, "is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD", "urgency_signals": ["issue1","issue2"],
"lead_reasoning": "1-2 sentences", "content_issues": ["issue1"],
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown", "accessibility_issues": ["issue1"],
"best_contact_value": "actual email/phone/URL or empty string", "performance_notes": "brief",
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}}, "seo_status": "brief",
"pitch_angle": "1 cold-outreach sentence in Spanish", "hosting_notes": "brief",
"services_needed": ["service1","service2"], "gdpr_compliance": "brief",
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
"outreach_notes": "sales rep context" "outreach_notes": "sales rep context"
}}""" }}"""
@@ -187,13 +194,25 @@ def _parse_output(raw: str) -> dict:
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip() text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
m = re.search(r"\{[\s\S]+\}", text) m = re.search(r"\{[\s\S]+\}", text)
if m: if m:
candidate = m.group(0)
try: try:
return json.loads(m.group(0)) return json.loads(candidate)
except json.JSONDecodeError:
# Truncated JSON: close any open arrays/objects and retry
fixed = candidate
# Count unclosed brackets
depth_obj = fixed.count("{") - fixed.count("}")
depth_arr = fixed.count("[") - fixed.count("]")
# Strip trailing incomplete key-value (e.g. `,"foo": "bar` with no closing `"`)
fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', fixed)
fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj)
try:
return json.loads(fixed)
except json.JSONDecodeError: except json.JSONDecodeError:
pass pass
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300]) logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
return { return {
"summary": raw[:400], "summary": raw[:400] if raw.strip() else "AI assessment failed — no output.",
"lead_quality": "COLD", "lead_quality": "COLD",
"best_contact_channel": "unknown", "best_contact_channel": "unknown",
"best_contact_value": "", "best_contact_value": "",
@@ -220,7 +239,7 @@ async def assess_domain(analysis: dict) -> dict:
"top_p": 0.9, "top_p": 0.9,
"temperature": 0.2, "temperature": 0.2,
"thinking_level": "low", "thinking_level": "low",
"max_output_tokens": 2048, "max_output_tokens": 4096,
} }
} }
try: try: