diff --git a/app/db.py b/app/db.py index 432db74..0b012b3 100644 --- a/app/db.py +++ b/app/db.py @@ -35,7 +35,8 @@ CREATE TABLE IF NOT EXISTS enriched_domains ( ai_pitch TEXT, ai_contact_channel TEXT, ai_contact_value TEXT, - ai_assessed_at TEXT + ai_assessed_at TEXT, + site_analysis TEXT ); CREATE TABLE IF NOT EXISTS job_queue ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -71,6 +72,7 @@ _MIGRATIONS = [ "ALTER TABLE enriched_domains ADD COLUMN ai_contact_channel TEXT", "ALTER TABLE enriched_domains ADD COLUMN ai_contact_value TEXT", "ALTER TABLE enriched_domains ADD COLUMN ai_assessed_at TEXT", + "ALTER TABLE enriched_domains ADD COLUMN site_analysis TEXT", "CREATE TABLE IF NOT EXISTS ai_queue (domain TEXT PRIMARY KEY, status TEXT DEFAULT 'pending', created_at TEXT DEFAULT (datetime('now')), completed_at TEXT, error TEXT)", ] @@ -352,13 +354,19 @@ async def get_ai_queue_status(): } -async def save_ai_assessment(domain: str, assessment: dict): +async def save_ai_assessment(domain: str, assessment: dict, site_analysis: dict = None): import json as _json async with aiosqlite.connect(SQLITE_PATH) as db: + # Upsert into enriched_domains (domain may not exist yet if assessed before full enrichment) + await db.execute( + """INSERT INTO enriched_domains (domain) VALUES (?) ON CONFLICT(domain) DO NOTHING""", + (domain,), + ) await db.execute( """UPDATE enriched_domains SET ai_assessment=?, ai_lead_quality=?, ai_pitch=?, - ai_contact_channel=?, ai_contact_value=?, ai_assessed_at=datetime('now') + ai_contact_channel=?, ai_contact_value=?, ai_assessed_at=datetime('now'), + site_analysis=? 
WHERE domain=?""", ( _json.dumps(assessment), @@ -366,9 +374,29 @@ async def save_ai_assessment(domain: str, assessment: dict): assessment.get("pitch_angle"), assessment.get("best_contact_channel"), assessment.get("best_contact_value"), + _json.dumps(site_analysis) if site_analysis else None, domain, ), ) + # Also update contact_info + kit_digital from site_analysis if available + if site_analysis: + contacts = { + "emails": site_analysis.get("emails", []), + "phones": site_analysis.get("phones", []), + "whatsapp": site_analysis.get("whatsapp", []), + "social": site_analysis.get("social_links", []), + } + await db.execute( + """UPDATE enriched_domains SET + kit_digital=?, kit_digital_signals=?, contact_info=? + WHERE domain=?""", + ( + int(site_analysis.get("kit_digital", False)), + _json.dumps(site_analysis.get("kit_digital_signals", [])), + _json.dumps(contacts), + domain, + ), + ) await db.execute( "UPDATE ai_queue SET status='done', completed_at=datetime('now') WHERE domain=?", (domain,), diff --git a/app/enricher.py b/app/enricher.py index 3ac91bb..cf3d40e 100644 --- a/app/enricher.py +++ b/app/enricher.py @@ -13,7 +13,7 @@ import dns.resolver import aiosqlite from bs4 import BeautifulSoup -from app.db import SQLITE_PATH, queue_ai, save_ai_assessment, get_ai_queue_status +from app.db import SQLITE_PATH, queue_ai, save_ai_assessment from app.scorer import score logger = logging.getLogger(__name__) @@ -340,17 +340,17 @@ async def worker_loop(): async def ai_worker_loop(): from app.replicate_ai import assess_domain as gemini_assess + from app.site_analyzer import analyze_site while True: async with aiosqlite.connect(SQLITE_PATH) as db: async with db.execute( - "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 20" + "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10" ) as cur: rows = await cur.fetchall() - # Mark as running if rows: await db.executemany( - "UPDATE ai_queue SET status='running', created_at=created_at WHERE domain=?", + "UPDATE 
ai_queue SET status='running' WHERE domain=?", [(r[0],) for r in rows], ) await db.commit() @@ -361,16 +361,11 @@ async def ai_worker_loop(): async def assess_one(domain: str): try: - async with aiosqlite.connect(SQLITE_PATH) as db: - db.row_factory = aiosqlite.Row - async with db.execute( - "SELECT * FROM enriched_domains WHERE domain=?", (domain,) - ) as cur: - row = await cur.fetchone() - if not row: - return - assessment = await gemini_assess(dict(row)) - await save_ai_assessment(domain, assessment) + # Always do a fresh deep scrape — no pre-enrichment required + analysis = await analyze_site(domain) + assessment = await gemini_assess(analysis) + await save_ai_assessment(domain, assessment, site_analysis=analysis) + logger.info("AI done: %s → %s", domain, assessment.get("lead_quality")) except Exception as e: async with aiosqlite.connect(SQLITE_PATH) as db: await db.execute( @@ -380,6 +375,7 @@ async def ai_worker_loop(): await db.commit() logger.error("AI worker error %s: %s", domain, e) + # AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore) await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True) diff --git a/app/main.py b/app/main.py index 3f611b2..35b87f5 100644 --- a/app/main.py +++ b/app/main.py @@ -177,22 +177,16 @@ async def ai_status(): @app.post("/api/ai/assess/single") async def ai_assess_single(body: dict): - """Immediate (blocking) AI assessment of a single domain.""" + """Immediate (blocking) AI assessment — does fresh scrape, no pre-enrichment needed.""" domain = body.get("domain") if not domain: return JSONResponse({"error": "no domain"}, status_code=400) + from app.site_analyzer import analyze_site from app.replicate_ai import assess_domain as gemini_assess - async with aiosqlite.connect(SQLITE_PATH) as db: - db.row_factory = aiosqlite.Row - async with db.execute( - "SELECT * FROM enriched_domains WHERE domain=?", (domain,) - ) as cur: - row = await cur.fetchone() - if 
not row: - return JSONResponse({"error": "domain not yet enriched"}, status_code=404) - assessment = await gemini_assess(dict(row)) - await save_ai_assessment(domain, assessment) - return assessment + analysis = await analyze_site(domain) + assessment = await gemini_assess(analysis) + await save_ai_assessment(domain, assessment, site_analysis=analysis) + return {**assessment, "site_analysis": analysis} @app.get("/api/export") diff --git a/app/replicate_ai.py b/app/replicate_ai.py index a680d0e..1e30eb1 100644 --- a/app/replicate_ai.py +++ b/app/replicate_ai.py @@ -1,4 +1,4 @@ -"""Replicate / Gemini integration for domain lead assessment.""" +"""Replicate / Gemini integration — deep site assessment.""" import asyncio import json import logging @@ -10,9 +10,9 @@ import httpx logger = logging.getLogger(__name__) -REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "r8_6kV2NWMQyPVB9JILHJprrXJJh4vWazA22Osyj") +REPLICATE_TOKEN = os.getenv("REPLICATE_API_TOKEN", "")  # set via env — never commit API tokens REPLICATE_MODEL = "https://api.replicate.com/v1/models/google/gemini-3-pro/predictions" -AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3")) +AI_CONCURRENCY = int(os.getenv("AI_CONCURRENCY", "3")) _ai_sem: Optional[asyncio.Semaphore] = None @@ -24,66 +24,92 @@ def _sem() -> asyncio.Semaphore: return _ai_sem -def _build_prompt(row: dict) -> str: - kit_signals = row.get("kit_digital_signals") or "[]" - try: - sigs = json.loads(kit_signals) - kit_block = "\n".join(f" - {s}" for s in sigs) if sigs else " None detected" - except Exception: - kit_block = f" {kit_signals}" +def _build_prompt(a: dict) -> str: + """Build the Gemini prompt from a full site analysis dict.""" + contacts_block = [] + if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}") + if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}") + if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}") + if 
a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}") + contacts_str = "\n".join(contacts_block) or " None found" - contact_raw = row.get("contact_info") or "{}" - try: - contacts = json.loads(contact_raw) - except Exception: - contacts = {} + kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected" + analytics_str = ", ".join(a.get("analytics_present") or []) or "none" + webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none" + lorem_str = ", ".join(a.get("lorem_matches") or []) or "none" + placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none" - contact_block = [] - if contacts.get("emails"): - contact_block.append(f" Emails: {', '.join(contacts['emails'][:3])}") - if contacts.get("phones"): - contact_block.append(f" Phones: {', '.join(contacts['phones'][:3])}") - if contacts.get("whatsapp"): - contact_block.append(f" WhatsApp: {', '.join(contacts['whatsapp'][:2])}") - if contacts.get("social"): - contact_block.append(f" Social: {', '.join(contacts['social'][:4])}") - contact_str = "\n".join(contact_block) if contact_block else " None found" + text_snippet = (a.get("visible_text_snippet") or "")[:2000] - return f"""You are a sales intelligence analyst evaluating Spanish SME websites for IT services upsell. + return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website. 
-DOMAIN DATA: -- Domain: {row.get("domain")} -- Page title: {row.get("page_title") or "N/A"} -- CMS: {row.get("cms") or "unknown"} -- Server: {row.get("server") or "unknown"} -- Country: {row.get("ip_country") or "unknown"} -- SSL valid: {row.get("ssl_valid")}, expires in {row.get("ssl_expiry_days") or "?"} days -- Has email (MX): {bool(row.get("has_mx"))} -- Is live: {bool(row.get("is_live"))} -- Kit Digital signals found on page: -{kit_block} -- Contact channels found on page: -{contact_str} +=== TECHNICAL SNAPSHOT === +Domain: {a.get("domain")} +Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms +Final URL: {a.get("final_url")} +Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"} +SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days +Mobile viewport: {a.get("has_mobile_viewport")} +Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")} -Kit Digital is a Spanish government program (up to €12k grants for SME digitalization). Sites that received it MUST display EU/digitalizadores logos. These businesses have proven they invest in IT services and may need follow-up: new website, SEO, hosting migration, security, maintenance contracts. 
+=== SEO & INDEXING SIGNALS === +Page title: {a.get("page_title") or "missing"} +H1: {a.get("h1_text") or "missing"} +Meta description: {a.get("meta_description") or "missing"} +Canonical URL: {a.get("canonical_url") or "not set"} +Sitemap.xml: {a.get("has_sitemap")} +Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")} +Analytics: {analytics_str} +Webmaster verified:{webmaster_str} -Assess this lead and respond ONLY with valid JSON (no markdown, no explanation outside the JSON): +=== CONTENT QUALITY === +Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str} +Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str} + +=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) === +Detected: {a.get("kit_digital")} +Signals: +{kd_str} + +=== CONTACT CHANNELS === +{contacts_str} + +=== PAGE TEXT SAMPLE (first 2000 chars) === +{text_snippet} + +=== TASK === +Analyse this site for IT services upsell potential. The client sells: +web design/redesign, SEO, hosting migration, SSL renewal, security audits, +maintenance contracts, Google Ads, and AI-assisted tools for SMEs. 
+ +Respond ONLY with valid JSON — no markdown, no text outside the JSON object: {{ - "is_local_sme": true/false, + "summary": "2-3 sentence executive summary of the site's current state", + "site_quality_score": <0-10 integer>, + "content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."], + "performance_notes": "comment on load time, page size, mobile readiness", + "seo_status": "brief SEO assessment — indexing signals, missing elements", "kit_digital_confirmed": true/false, - "kit_digital_reasoning": "1 sentence explaining why or why not", + "kit_digital_reasoning": "1 sentence — why confirmed or not", + "is_local_sme": true/false, "lead_quality": "HOT|WARM|COLD", - "lead_reasoning": "1-2 sentences on why this is a good/bad lead for IT services sales", + "lead_reasoning": "1-2 sentences on why", "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown", - "best_contact_value": "the actual email/phone/URL to use, or empty string", - "pitch_angle": "One concrete opening sentence for a cold email or call in Spanish", - "services_likely_needed": ["service1", "service2"], - "outreach_notes": "Any useful context for the sales rep (language, business type, urgency)" + "best_contact_value": "the actual value to use (email address, phone number, URL) or empty string", + "all_contacts": {{ + "emails": [], + "phones": [], + "whatsapp": [], + "social": [] + }}, + "pitch_angle": "One concrete opening sentence in Spanish for cold outreach", + "services_needed": ["service1", "service2"], + "urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"], + "outreach_notes": "Key context for the sales rep" }}""" def _parse_output(raw: str) -> dict: - """Extract JSON from Gemini text output.""" text = re.sub(r"```(?:json)?", "", raw).strip().rstrip("`").strip() m = re.search(r"\{[\s\S]+\}", text) if m: @@ -91,8 +117,9 @@ def _parse_output(raw: str) -> dict: 
return json.loads(m.group(0)) except json.JSONDecodeError: pass + logger.warning("Could not parse Gemini JSON output, raw: %s", raw[:300]) return { - "raw": raw[:500], + "summary": raw[:400], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": "", @@ -100,28 +127,28 @@ def _parse_output(raw: str) -> dict: } -async def assess_domain(row: dict) -> dict: - """Call Gemini via Replicate to assess a domain. Returns parsed assessment dict.""" +async def assess_domain(analysis: dict) -> dict: + """Call Gemini with the full site analysis. Returns parsed assessment.""" async with _sem(): payload = { "input": { - "prompt": _build_prompt(row), - "images": [], - "videos": [], - "top_p": 0.9, + "prompt": _build_prompt(analysis), + "images": [], + "videos": [], + "top_p": 0.9, "temperature": 0.2, "thinking_level": "low", - "max_output_tokens": 1024, + "max_output_tokens": 2048, } } try: - async with httpx.AsyncClient(timeout=90) as client: + async with httpx.AsyncClient(timeout=120) as client: resp = await client.post( REPLICATE_MODEL, headers={ "Authorization": f"Bearer {REPLICATE_TOKEN}", - "Content-Type": "application/json", - "Prefer": "wait", + "Content-Type": "application/json", + "Prefer": "wait", }, json=payload, ) @@ -133,10 +160,15 @@ async def assess_domain(row: dict) -> dict: output = "".join(output) result = _parse_output(output) - logger.info("AI %s → %s / contact: %s", - row.get("domain"), result.get("lead_quality"), result.get("best_contact_channel")) + logger.info("AI %s → %s (quality %s)", + analysis.get("domain"), result.get("lead_quality"), result.get("site_quality_score")) return result except Exception as e: - logger.error("Replicate error %s: %s", row.get("domain"), e) - return {"error": str(e)[:300], "lead_quality": "COLD", "best_contact_channel": "unknown", "best_contact_value": ""} + logger.error("Replicate error %s: %s", analysis.get("domain"), e) + return { + "error": str(e)[:300], + "lead_quality": "COLD", + 
"best_contact_channel": "unknown", + "best_contact_value": "", + } diff --git a/app/site_analyzer.py b/app/site_analyzer.py new file mode 100644 index 0000000..fb01a55 --- /dev/null +++ b/app/site_analyzer.py @@ -0,0 +1,277 @@ +"""Deep site analysis: content quality, SEO signals, performance, indexing hints.""" +import asyncio +import re +import time +import logging +from typing import Optional + +import httpx +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) + +# ── Content quality ─────────────────────────────────────────────────────────── + +LOREM_PHRASES = [ + "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien", + "nulla id nibh", "aenean dignissim", "aliquam tincidunt", + "vestibulum commodo", "fusce nunc lacus", "consectetuer", + "cras ornare tristique", "nulla nec ante", "risus id metus", + "praesent placerat", "fusce pellentesque", "suscipit nibh", + "integer vitae libero", "felis quis tortor", +] + +PLACEHOLDER_PHRASES = [ + "under construction", "coming soon", "sample page", + "this is a demo", "default post", "hello world", + "test post", "uncategorized", +] + +# ── Analytics & webmaster tags ──────────────────────────────────────────────── + +ANALYTICS = { + "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "googletagmanager.com/gtag/js?id=G-"], + "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"], + "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"], + "hotjar": ["static.hotjar.com"], + "clarity": ["clarity.ms/tag"], +} + +WEBMASTER = { + "google_search_console": ['google-site-verification'], + "bing_webmaster": ['msvalidate.01'], + "yandex": ['yandex-verification'], +} + +KIT_IMG_PATS = [ + "digitalizadores", "kit-digital", "kitdigital", "kit_digital", + "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation", + "prtr", "plan-recuperacion", "acelerapyme", "cofinanciado", +] +KIT_TEXT_PATS = [ + "kit digital", "agente digitalizador", "fondos europeos", + "next generation eu", 
"nextgenerationeu", "plan de recuperación", + "prtr", "financiado por la unión europea", "red.es/kit-digital", "acelerapyme", +] + +EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}") +PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}") +SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"] + + +async def analyze_site(domain: str) -> dict: + """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt.""" + result = { + "domain": domain, + "reachable": False, + "load_time_ms": None, + "status_code": None, + "final_url": None, + "page_size_kb": None, + "server": None, + "cms": None, + "ssl_valid": False, + "ssl_expiry_days": None, + # Content quality + "has_lorem_ipsum": False, + "lorem_matches": [], + "has_placeholder": False, + "placeholder_matches": [], + "word_count": 0, + "image_count": 0, + "broken_images": 0, + "script_count": 0, + "has_mobile_viewport": False, + "page_title": None, + "meta_description": None, + "h1_text": None, + "visible_text_snippet": "", + # SEO / webmaster + "has_sitemap": False, + "has_robots": False, + "robots_disallows_google": False, + "analytics_present": [], + "webmaster_verified": [], + "canonical_url": None, + "og_title": None, + # Kit Digital + "kit_digital": False, + "kit_digital_signals": [], + # Contacts + "emails": [], + "phones": [], + "whatsapp": [], + "social_links": [], + # Errors + "error": None, + } + + # ── Fetch main page ─────────────────────────────────────────────────────── + try: + t0 = time.monotonic() + async with httpx.AsyncClient( + timeout=15, follow_redirects=True, verify=False, + headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}, + ) as client: + resp = await client.get(f"https://{domain}") + if resp.status_code >= 400: + resp = await client.get(f"http://{domain}") + + load_ms = int((time.monotonic() - t0) * 
1000) + html = resp.text + result.update({ + "reachable": resp.status_code < 400, + "load_time_ms": load_ms, + "status_code": resp.status_code, + "final_url": str(resp.url), + "page_size_kb": round(len(resp.content) / 1024, 1), + "server": resp.headers.get("server"), + }) + + soup = BeautifulSoup(html, "html.parser") + hl = html.lower() + + # Title, meta + title_tag = soup.find("title") + result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None + meta_desc = soup.find("meta", attrs={"name": "description"}) + result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None + h1 = soup.find("h1") + result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None + + # Mobile viewport + result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"})) + + # Canonical + OG + canon = soup.find("link", rel="canonical") + result["canonical_url"] = canon.get("href") if canon else None + og = soup.find("meta", property="og:title") + result["og_title"] = og.get("content") if og else None + + # Visible text + for tag in soup(["script", "style", "noscript"]): + tag.decompose() + visible_text = soup.get_text(separator=" ", strip=True) + words = visible_text.split() + result["word_count"] = len(words) + result["visible_text_snippet"] = " ".join(words[:500]) + + # Lorem ipsum / placeholder detection + vl = visible_text.lower() + lorem_hits = [p for p in LOREM_PHRASES if p in vl] + result["has_lorem_ipsum"] = len(lorem_hits) > 0 + result["lorem_matches"] = lorem_hits[:5] + ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl] + result["has_placeholder"] = len(ph_hits) > 0 + result["placeholder_matches"] = ph_hits[:3] + + # Images & scripts + imgs = soup.find_all("img") + result["image_count"] = len(imgs) + result["script_count"] = len(soup.find_all("script", src=True)) + + # Analytics / webmaster tags + for name, sigs in ANALYTICS.items(): + if any(s.lower() in hl for s in sigs): + 
result["analytics_present"].append(name) + for name, sigs in WEBMASTER.items(): + if any(s.lower() in hl for s in sigs): + result["webmaster_verified"].append(name) + + # Kit Digital + kd_signals = [] + for img in imgs: + combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower() + for p in KIT_IMG_PATS: + if p in combined: + kd_signals.append(f"img:{p}") + break + for p in KIT_TEXT_PATS: + if p in hl: + kd_signals.append(f"text:{p}") + for a in soup.find_all("a", href=True): + href = a["href"].lower() + if "acelerapyme" in href or "red.es" in href or "kit-digital" in href: + kd_signals.append(f"link:{href[:50]}") + kd_signals = list(dict.fromkeys(kd_signals))[:10] + result["kit_digital"] = len(kd_signals) > 0 + result["kit_digital_signals"] = kd_signals + + # Contacts + for a in soup.find_all("a", href=True): + href = a["href"] + if href.startswith("mailto:"): + em = href[7:].split("?")[0].strip().lower() + if em and em not in result["emails"]: + result["emails"].append(em) + elif href.startswith("tel:"): + ph = re.sub(r"[^\d+]", "", href[4:]) + if ph and ph not in result["phones"]: + result["phones"].append(ph) + elif "wa.me" in href or "api.whatsapp.com" in href: + if href not in result["whatsapp"]: + result["whatsapp"].append(href[:80]) + else: + for sd in SOCIAL_DOM: + if sd in href.lower(): + clean = href.split("?")[0].rstrip("/") + if clean not in result["social_links"]: + result["social_links"].append(clean) + break + for em in EMAIL_RE.findall(html[:80000]): + em = em.lower() + if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]): + result["emails"].append(em) + for ph in PHONE_RE.findall(visible_text): + ph_c = re.sub(r"[\s\-]", "", ph) + if ph_c not in result["phones"]: + result["phones"].append(ph_c) + # Cap + for k in ["emails", "phones", "whatsapp", "social_links"]: + result[k] = list(dict.fromkeys(result[k]))[:5] + + # CMS + from app.enricher import detect_cms 
+ result["cms"] = detect_cms(html, dict(resp.headers)) + + except Exception as e: + result["error"] = str(e)[:300] + + # ── Sitemap & robots (parallel) ─────────────────────────────────────────── + async def _check_url(url: str) -> Optional[str]: + try: + async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c: + r = await c.get(url) + return r.text if r.status_code == 200 else None + except Exception: + return None + + sitemap_txt, robots_txt = await asyncio.gather( + _check_url(f"https://{domain}/sitemap.xml"), + _check_url(f"https://{domain}/robots.txt"), + ) + result["has_sitemap"] = sitemap_txt is not None + result["has_robots"] = robots_txt is not None + if robots_txt: + robots_lower = robots_txt.lower() + result["robots_disallows_google"] = ( + "disallow: /" in robots_lower and "googlebot" in robots_lower + ) + + # ── SSL ─────────────────────────────────────────────────────────────────── + import ssl as _ssl, socket as _socket + try: + def _ssl_check(): + import datetime as _dt + ctx = _ssl.create_default_context() + with _socket.create_connection((domain, 443), timeout=5) as s: + with ctx.wrap_socket(s, server_hostname=domain) as ss: + cert = ss.getpeercert() + exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z") + return True, (exp - _dt.datetime.utcnow()).days + loop = asyncio.get_running_loop() + result["ssl_valid"], result["ssl_expiry_days"] = await loop.run_in_executor(None, _ssl_check) + except Exception: + pass + + return result diff --git a/app/static/index.html b/app/static/index.html index 64b2e6c..4179254 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -136,11 +136,10 @@ tr:hover td{background:rgba(255,255,255,.025)} /* AI detail modal */ .modal-bg{position:fixed;inset:0;background:#000a;z-index:300;display:flex;align-items:center;justify-content:center} -.modal{background:var(--surface);border:1px solid 
var(--border);border-radius:var(--r);padding:20px;max-width:500px;width:90%;max-height:80vh;overflow-y:auto} -.modal h2{font-size:16px;font-weight:800;margin-bottom:12px} -.modal .row{display:flex;gap:8px;margin-bottom:8px;font-size:13px} -.modal .label{color:var(--muted);min-width:110px;font-size:12px} -.modal .val{color:var(--text)} +.modal{background:var(--surface);border:1px solid var(--border);border-radius:var(--r);padding:18px;max-width:560px;width:95%;max-height:88vh;overflow-y:auto} +.modal h2{font-size:15px;font-weight:800} +.mrow{display:flex;gap:8px;margin-bottom:6px;font-size:12px;line-height:1.4} +.mlabel{color:var(--muted);min-width:90px;font-size:11px;padding-top:1px;flex-shrink:0} @media(max-width:700px){.pipeline{grid-template-columns:1fr}.sg{grid-template-columns:1fr 1fr}} @@ -153,15 +152,99 @@ tr:hover td{background:rgba(255,255,255,.025)}