diff --git a/app/replicate_ai.py b/app/replicate_ai.py index 5428f98..caa4eef 100644 --- a/app/replicate_ai.py +++ b/app/replicate_ai.py @@ -7,6 +7,7 @@ import re from typing import Optional import httpx +from bs4 import BeautifulSoup logger = logging.getLogger(__name__) @@ -24,7 +25,36 @@ def _sem() -> asyncio.Semaphore: return _ai_sem -def _build_prompt(a: dict) -> str: +async def _ddg_search(query: str) -> str: + """DuckDuckGo HTML search — returns top snippet text, empty string on failure.""" + try: + async with httpx.AsyncClient( + timeout=10, follow_redirects=True, + headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"}, + ) as client: + r = await client.get( + "https://html.duckduckgo.com/html/", + params={"q": query, "kl": "es-es"}, + ) + if r.status_code != 200: + return "" + soup = BeautifulSoup(r.text, "html.parser") + parts = [] + for res in soup.select(".result")[:4]: + title = res.select_one(".result__a") + snip = res.select_one(".result__snippet") + url = res.select_one(".result__url") + if snip: + t = title.get_text(strip=True) if title else "" + u = url.get_text(strip=True) if url else "" + parts.append(f"[{u}] {t} — {snip.get_text(strip=True)}") + return "\n".join(parts) + except Exception as e: + logger.debug("DDG search failed: %s", e) + return "" + + +def _build_prompt(a: dict, search_results: str = "") -> str: contacts_block = [] if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}") if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}") @@ -40,6 +70,8 @@ def _build_prompt(a: dict) -> str: snippet = (a.get("visible_text_snippet") or "")[:2000] social_str = ", ".join(a.get("social_links") or []) or "none detected" gmb_str = f"✅ Found — {a.get('gmb_url','')}" if a.get("has_gmb") else "❌ Not detected" + copyright_yr = a.get("copyright_year") or "not found" + last_mod = a.get("last_modified") or "not found" eu_hosted = a.get("eu_hosted") hosting_flag = "✅ EU" if eu_hosted else ("❌ 
Non-EU" if eu_hosted is False else "unknown") @@ -84,9 +116,11 @@ Skip navigation link: {a.get("has_skip_nav")} Empty links: {a.get("empty_links")} Inputs without labels: {a.get("inputs_without_labels")} -=== CONTENT QUALITY === +=== CONTENT QUALITY & FRESHNESS === Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str} Placeholder: {a.get("has_placeholder")} → {ph_str} +Copyright year: {copyright_yr} +Last-Modified: {last_mod} === KIT DIGITAL (Spanish gov €12k SME grants — ONLY confirm if you see explicit Kit Digital / agente digitalizador branding) === Heuristic detected: {a.get("kit_digital")} @@ -104,12 +138,22 @@ Profiles found on site: {social_str} === PAGE TEXT SAMPLE === {snippet} +=== WEB SEARCH RESULTS (use these to find contact info, verify business details) === +{search_results if search_results else "No search results available."} + === INSTRUCTIONS === The client sells: web redesign, SEO, hosting migration, SSL renewal, security audits, GDPR compliance, accessibility fixes, Google Ads, maintenance contracts, AI tools for SMEs, Google My Business setup/optimisation, social media management (Instagram, Facebook, LinkedIn, TikTok). +IMPORTANT — use the WEB SEARCH RESULTS above to: +1. Find any phone numbers, emails, or WhatsApp not visible on the homepage. +2. Identify the business owner name if available. +3. Populate best_contact_value with a real phone/email you found. +4. Use the copyright year and Last-Modified date to estimate when the site was last updated. +5. Determine the actual CMS from code signals and visible text (not just the heuristic). 
+ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON: {{ "summary": "2-3 sentence executive summary of the site's state", @@ -120,6 +164,8 @@ Respond ONLY with valid JSON, no markdown fences, no text outside the JSON: "hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns", "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps", "accessibility_issues": ["specific a11y problems found"], + "cms_detected": "wordpress|wix|squarespace|custom|unknown", + "site_last_updated": "year or estimate, e.g. '2019' or 'likely 2021'", "kit_digital_confirmed": true/false, "has_gmb": true/false, "has_social_media": true/false, @@ -158,9 +204,17 @@ def _parse_output(raw: str) -> dict: async def assess_domain(analysis: dict) -> dict: """Call Gemini with the full site analysis. Returns parsed assessment.""" async with _sem(): + # Build search query from domain / page title for contact lookup + domain = analysis.get("domain", "") + title = analysis.get("page_title") or "" + biz_name = title.split("|")[0].split("-")[0].strip() or domain + search_query = f'"{biz_name}" {domain} contacto telefono email' + search_results = await _ddg_search(search_query) + logger.info("DDG search for %s → %d chars", domain, len(search_results)) + payload = { "input": { - "prompt": _build_prompt(analysis), + "prompt": _build_prompt(analysis, search_results), "images": [], "videos": [], "top_p": 0.9, diff --git a/app/site_analyzer.py b/app/site_analyzer.py index f8a4f45..74f9c52 100644 --- a/app/site_analyzer.py +++ b/app/site_analyzer.py @@ -169,6 +169,8 @@ async def _analyze_site_inner(domain: str) -> dict: "has_gmb": False, "gmb_url": None, # Contacts "emails": [], "phones": [], "whatsapp": [], "social_links": [], + # Age / freshness + "copyright_year": None, "last_modified": None, "error": None, } @@ -364,6 +366,23 @@ async def _analyze_site_inner(domain: str) -> dict: result["cms"] = cms break + # ── Last-Modified / copyright year 
──────────────────────────────────── + lm = (resp.headers.get("last-modified") or + (soup.find("meta", attrs={"name": "last-modified"}) or {}).get("content") or + (soup.find("meta", property="article:modified_time") or {}).get("content")) + if lm: + result["last_modified"] = str(lm)[:30] + + footer_el = (soup.find("footer") or + soup.find(id=re.compile(r"footer", re.I)) or + soup.find(class_=re.compile(r"footer", re.I))) + search_text = footer_el.get_text() if footer_el else visible[-600:] + cp = re.search(r"(?:©|©|copyright)\s*[\d\-–]*\s*(20\d{2})", search_text, re.I) + if not cp: + cp = re.search(r"(20\d{2})\s*[-–]\s*20\d{2}|(?:©|copyright)\D{0,10}(20\d{2})", search_text, re.I) + if cp: + result["copyright_year"] = cp.group(1) or cp.group(2) + # ── Sitemap & robots (parallel) ─────────────────────────────────────────── async def _get(url): try: @@ -373,9 +392,17 @@ async def _analyze_site_inner(domain: str) -> dict: except Exception: return None - sitemap_txt, robots_txt = await asyncio.gather( + async def _get_contact_page(): + for path in ("/contacto", "/contact", "/contactanos", "/sobre-nosotros"): + txt = await _get(f"https://{domain}{path}") + if txt: + return txt + return None + + sitemap_txt, robots_txt, contact_html = await asyncio.gather( _get(f"https://{domain}/sitemap.xml"), _get(f"https://{domain}/robots.txt"), + _get_contact_page(), ) result["has_sitemap"] = sitemap_txt is not None result["has_robots"] = robots_txt is not None @@ -383,6 +410,37 @@ async def _analyze_site_inner(domain: str) -> dict: rl = robots_txt.lower() result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl + # Merge contacts from /contacto page + if contact_html: + try: + csoup = BeautifulSoup(contact_html, "html.parser") + for a in csoup.find_all("a", href=True): + href = a["href"] + if href.startswith("mailto:"): + em = href[7:].split("?")[0].strip().lower() + if em and em not in result["emails"]: + result["emails"].append(em) + elif 
href.startswith("tel:"): + ph = re.sub(r"[^\d+]", "", href[4:]) + if ph and ph not in result["phones"]: + result["phones"].append(ph) + elif "wa.me" in href or "api.whatsapp.com" in href: + if href not in result["whatsapp"]: + result["whatsapp"].append(href[:80]) + ctext = csoup.get_text() + for em in EMAIL_RE.findall(contact_html[:60000]): + em = em.lower() + if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js"]): + result["emails"].append(em) + for ph in PHONE_RE.findall(ctext): + ph_c = re.sub(r"[\s\-]", "", ph) + if ph_c not in result["phones"]: + result["phones"].append(ph_c) + for k in ["emails", "phones", "whatsapp"]: + result[k] = list(dict.fromkeys(result[k]))[:5] + except Exception: + pass + # ── SSL ───────────────────────── import ssl as _ssl try: diff --git a/app/static/index.html b/app/static/index.html index 459c6da..a899e48 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -187,6 +188,8 @@ tr:hover td{background:rgba(255,255,255,.025)}