diff --git a/app/replicate_ai.py b/app/replicate_ai.py index caa4eef..310c33b 100644 --- a/app/replicate_ai.py +++ b/app/replicate_ai.py @@ -67,7 +67,7 @@ def _build_prompt(a: dict, search_results: str = "") -> str: webmaster = ", ".join(a.get("webmaster_verified") or []) or "none" lorem_str = ", ".join(a.get("lorem_matches") or []) or "none" ph_str = ", ".join(a.get("placeholder_matches") or []) or "none" - snippet = (a.get("visible_text_snippet") or "")[:2000] + snippet = (a.get("visible_text_snippet") or "")[:800] social_str = ", ".join(a.get("social_links") or []) or "none detected" gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected" copyright_yr = a.get("copyright_year") or "not found" @@ -138,47 +138,54 @@ Profiles found on site: {social_str} === PAGE TEXT SAMPLE === {snippet} -=== WEB SEARCH RESULTS (use these to find contact info, verify business details) === -{search_results if search_results else "No search results available."} +=== WEB SEARCH RESULTS (use to find contacts, verify business identity) === +{(search_results or "No results.")[:600]} === INSTRUCTIONS === The client sells: web redesign, SEO, hosting migration, SSL renewal, security audits, GDPR compliance, accessibility fixes, Google Ads, -maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation, -social media management (Instagram, Facebook, LinkedIn, TikTok). +maintenance contracts, AI tools for SMEs, GMB setup, social media management. -IMPORTANT — use the WEB SEARCH RESULTS above to: -1. Find any phone numbers, emails, or WhatsApp not visible on the homepage. -2. Identify the business owner name if available. -3. Populate best_contact_value with a real phone/email you found. -4. Use the copyright year and Last-Modified date to estimate when the site was last updated. -5. Determine the actual CMS from code signals and visible text (not just the heuristic). +RULES — you MUST follow all of these: +1. A placeholder / minimal / blank site (few words, no images, no CMS) is one of + the BEST leads — they need a complete website build + all digital services. + Score it lead_quality=HOT or WARM and write an enthusiastic pitch. +2. pitch_angle is MANDATORY. Never leave it empty. Write 1 punchy Spanish sentence + tailored to the business type. Even "Hola, su web necesita una renovación + completa — podemos tenerla lista en 2 semanas." is better than nothing. +3. services_needed must list at LEAST 2 services. For a blank/placeholder site + always include "diseño web" and "posicionamiento SEO". +4. Use the WEB SEARCH RESULTS to find the real phone/email — put the best one + in best_contact_value. +5. Use copyright_year + Last-Modified to estimate site_last_updated. +6. Keep every string value SHORT (≤ 15 words). Arrays: max 4 items. + This keeps the JSON small and avoids truncation. Respond ONLY with valid JSON, no markdown fences, no text outside the JSON: {{ - "summary": "2-3 sentence executive summary of the site's state", + "lead_quality": "HOT|WARM|COLD", + "lead_reasoning": "1-2 sentences why", + "pitch_angle": "1 punchy cold-outreach sentence in Spanish — NEVER empty", + "services_needed": ["service1","service2"], + "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown", + "best_contact_value": "real email/phone from page or search results", + "all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}}, + "summary": "2-3 sentence executive summary", "site_quality_score": <0-10>, - "content_issues": ["specific issues found in page content"], - "performance_notes": "load time, size, mobile assessment", - "seo_status": "SEO health — what's missing or broken", - "hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns", - "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps", - "accessibility_issues": ["specific a11y problems found"], - "cms_detected": "wordpress|wix|squarespace|custom|unknown", - "site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'", + "cms_detected": "wordpress|wix|custom|unknown", + "site_last_updated": "year or estimate", "kit_digital_confirmed": true/false, + "kit_digital_reasoning": "1 sentence", "has_gmb": true/false, "has_social_media": true/false, - "kit_digital_reasoning": "1 sentence", "is_local_sme": true/false, - "lead_quality": "HOT|WARM|COLD", - "lead_reasoning": "1-2 sentences", - "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown", - "best_contact_value": "actual email/phone/URL or empty string", - "all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}}, - "pitch_angle": "1 cold-outreach sentence in Spanish", - "services_needed": ["service1","service2"], - "urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"], + "urgency_signals": ["issue1","issue2"], + "content_issues": ["issue1"], + "accessibility_issues": ["issue1"], + "performance_notes": "brief", + "seo_status": "brief", + "hosting_notes": "brief", + "gdpr_compliance": "brief", "outreach_notes": "sales rep context" }}""" @@ -187,13 +194,25 @@ def _parse_output(raw: str) -> dict: text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip() m = re.search(r"\{[\s\S]+\}", text) if m: + candidate = m.group(0) try: - return json.loads(m.group(0)) + return json.loads(candidate) except json.JSONDecodeError: - pass + # Truncated JSON: close any open arrays/objects and retry + fixed = candidate + # Count unclosed brackets + depth_obj = fixed.count("{") - fixed.count("}") + depth_arr = fixed.count("[") - fixed.count("]") + # Strip trailing incomplete key-value (e.g. `,"foo": "bar` with no closing `"`) + fixed = re.sub(r',\s*"[^"]*"?\s*:\s*[^,\}\]]*$', '', fixed) + fixed += "]" * max(0, depth_arr) + "}" * max(0, depth_obj) + try: + return json.loads(fixed) + except json.JSONDecodeError: + pass logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300]) return { - "summary": raw[:400], + "summary": raw[:400] if raw.strip() else "AI assessment failed — no output.", "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": "", @@ -220,7 +239,7 @@ async def assess_domain(analysis: dict) -> dict: "top_p": 0.9, "temperature": 0.2, "thinking_level": "low", - "max_output_tokens": 2048, + "max_output_tokens": 4096, } } try: