fix: truncated JSON, missing pitch for placeholder sites, token limit

- max_output_tokens 2048→4096 (main truncation fix)
- page snippet 2000→800 chars, search results capped at 600 chars
- JSON schema reordered: lead_quality/pitch_angle/services_needed first,
  so most important fields survive even if output is truncated
- RULES block in prompt: placeholder site = HOT or WARM lead, pitch_angle is MANDATORY,
  services_needed must have ≥2 items, keep string values ≤15 words and arrays ≤4 items
  to avoid truncation
- _parse_output: truncated JSON repair — strips the trailing incomplete key-value,
  then appends the missing ] and } closers before retrying json.loads

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 08:32:49 +02:00
parent d62e4e986e
commit 6cea07f0f4

View File

@@ -67,7 +67,7 @@ def _build_prompt(a: dict, search_results: str = "") -> str:
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
snippet = (a.get("visible_text_snippet") or "")[:2000]
snippet = (a.get("visible_text_snippet") or "")[:800]
social_str = ", ".join(a.get("social_links") or []) or "none detected"
gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected"
copyright_yr = a.get("copyright_year") or "not found"
@@ -138,47 +138,54 @@ Profiles found on site: {social_str}
=== PAGE TEXT SAMPLE ===
{snippet}
=== WEB SEARCH RESULTS (use these to find contact info, verify business details) ===
{search_results if search_results else "No search results available."}
=== WEB SEARCH RESULTS (use to find contacts, verify business identity) ===
{(search_results or "No results.")[:600]}
=== INSTRUCTIONS ===
The client sells: web redesign, SEO, hosting migration, SSL renewal,
security audits, GDPR compliance, accessibility fixes, Google Ads,
maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation,
social media management (Instagram, Facebook, LinkedIn, TikTok).
maintenance contracts, AI tools for SMEs, GMB setup, social media management.
IMPORTANT — use the WEB SEARCH RESULTS above to:
1. Find any phone numbers, emails, or WhatsApp not visible on the homepage.
2. Identify the business owner name if available.
3. Populate best_contact_value with a real phone/email you found.
4. Use the copyright year and Last-Modified date to estimate when the site was last updated.
5. Determine the actual CMS from code signals and visible text (not just the heuristic).
RULES — you MUST follow all of these:
1. A placeholder / minimal / blank site (few words, no images, no CMS) is one of
the BEST leads — they need a complete website build + all digital services.
Score it lead_quality=HOT or WARM and write an enthusiastic pitch.
2. pitch_angle is MANDATORY. Never leave it empty. Write 1 punchy Spanish sentence
tailored to the business type. Even "Hola, su web necesita una renovación
completa — podemos tenerla lista en 2 semanas." is better than nothing.
3. services_needed must list at LEAST 2 services. For a blank/placeholder site
always include "diseño web" and "posicionamiento SEO".
4. Use the WEB SEARCH RESULTS to find the real phone/email — put the best one
in best_contact_value.
5. Use copyright_year + Last-Modified to estimate site_last_updated.
6. Keep every string value SHORT (≤ 15 words). Arrays: max 4 items.
This keeps the JSON small and avoids truncation.
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
{{
"summary": "2-3 sentence executive summary of the site's state",
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences why",
"pitch_angle": "1 punchy cold-outreach sentence in Spanish — NEVER empty",
"services_needed": ["service1","service2"],
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "real email/phone from page or search results",
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
"summary": "2-3 sentence executive summary",
"site_quality_score": <0-10>,
"content_issues": ["specific issues found in page content"],
"performance_notes": "load time, size, mobile assessment",
"seo_status": "SEO health — what's missing or broken",
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
"accessibility_issues": ["specific a11y problems found"],
"cms_detected": "wordpress|wix|squarespace|custom|unknown",
"site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'",
"cms_detected": "wordpress|wix|custom|unknown",
"site_last_updated": "year or estimate",
"kit_digital_confirmed": true/false,
"kit_digital_reasoning": "1 sentence",
"has_gmb": true/false,
"has_social_media": true/false,
"kit_digital_reasoning": "1 sentence",
"is_local_sme": true/false,
"lead_quality": "HOT|WARM|COLD",
"lead_reasoning": "1-2 sentences",
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
"best_contact_value": "actual email/phone/URL or empty string",
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
"pitch_angle": "1 cold-outreach sentence in Spanish",
"services_needed": ["service1","service2"],
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
"urgency_signals": ["issue1","issue2"],
"content_issues": ["issue1"],
"accessibility_issues": ["issue1"],
"performance_notes": "brief",
"seo_status": "brief",
"hosting_notes": "brief",
"gdpr_compliance": "brief",
"outreach_notes": "sales rep context"
}}"""
@@ -187,13 +194,25 @@ def _parse_output(raw: str) -> dict:
text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip()
m = re.search(r"\{[\s\S]+\}", text)
if m:
candidate = m.group(0)
try:
return json.loads(m.group(0))
return json.loads(candidate)
except json.JSONDecodeError:
# Truncated JSON: close any open arrays/objects and retry
fixed = candidate
# Count unclosed brackets
depth_obj = fixed.count("{") - fixed.count("}")
depth_arr = fixed.count("[") - fixed.count("]")
# Strip trailing incomplete key-value (e.g. `,"foo": "bar` with no closing `"`)
fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', fixed)
fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj)
try:
return json.loads(fixed)
except json.JSONDecodeError:
pass
logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300])
return {
"summary": raw[:400],
"summary": raw[:400] if raw.strip() else "AI assessment failed — no output.",
"lead_quality": "COLD",
"best_contact_channel": "unknown",
"best_contact_value": "",
@@ -220,7 +239,7 @@ async def assess_domain(analysis: dict) -> dict:
"top_p": 0.9,
"temperature": 0.2,
"thinking_level": "low",
"max_output_tokens": 2048,
"max_output_tokens": 4096,
}
}
try: