diff --git a/app/enricher.py b/app/enricher.py
index cf3d40e..21e33f0 100644
--- a/app/enricher.py
+++ b/app/enricher.py
@@ -338,53 +338,88 @@ async def worker_loop():
 # ── AI assessment worker ──────────────────────────────────────────────────────
-async def ai_worker_loop():
+async def _assess_one(domain: str) -> None:
+    """Process a single AI assessment — safe to call concurrently."""
     from app.replicate_ai import assess_domain as gemini_assess
     from app.site_analyzer import analyze_site
 
-    while True:
-        async with aiosqlite.connect(SQLITE_PATH) as db:
-            async with db.execute(
-                "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
-            ) as cur:
-                rows = await cur.fetchall()
-            if rows:
-                await db.executemany(
-                    "UPDATE ai_queue SET status='running' WHERE domain=?",
-                    [(r[0],) for r in rows],
+    logger.info("AI: starting analysis for %s", domain)
+    try:
+        analysis = await analyze_site(domain)
+        logger.info("AI: site analyzed %s (reachable=%s, words=%s)",
+                    domain, analysis.get("reachable"), analysis.get("word_count"))
+        assessment = await gemini_assess(analysis)
+        logger.info("AI: Gemini done %s → quality=%s",
+                    domain, assessment.get("lead_quality"))
+        await save_ai_assessment(domain, assessment, site_analysis=analysis)
+        logger.info("AI: saved %s", domain)
+    except Exception as e:
+        logger.error("AI: failed %s — %s", domain, e, exc_info=True)
+        try:
+            async with aiosqlite.connect(SQLITE_PATH) as db:
+                await db.execute(
+                    "UPDATE ai_queue SET status='failed', completed_at=datetime('now'), error=? WHERE domain=?",
+                    (str(e)[:400], domain),
                 )
                 await db.commit()
+        except Exception:
+            pass
+
+
+async def ai_worker_loop():
+    logger.info("AI worker loop starting")
+    while True:
+        rows = []
+        try:
+            async with aiosqlite.connect(SQLITE_PATH) as db:
+                async with db.execute(
+                    "SELECT domain FROM ai_queue WHERE status='pending' LIMIT 5"
+                ) as cur:
+                    rows = await cur.fetchall()
+                if rows:
+                    await db.executemany(
+                        "UPDATE ai_queue SET status='running' WHERE domain=?",
+                        [(r[0],) for r in rows],
+                    )
+                    await db.commit()
+                    logger.info("AI worker: picked up %d jobs: %s",
+                                len(rows), [r[0] for r in rows])
+        except Exception as e:
+            logger.error("AI worker DB error: %s", e, exc_info=True)
+            await asyncio.sleep(5)
+            continue
 
         if not rows:
             await asyncio.sleep(3)
             continue
 
-        async def assess_one(domain: str):
-            try:
-                # Always do a fresh deep scrape — no pre-enrichment required
-                analysis = await analyze_site(domain)
-                assessment = await gemini_assess(analysis)
-                await save_ai_assessment(domain, assessment, site_analysis=analysis)
-                logger.info("AI done: %s → %s", domain, assessment.get("lead_quality"))
-            except Exception as e:
-                async with aiosqlite.connect(SQLITE_PATH) as db:
-                    await db.execute(
-                        "UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
-                        (domain,),
-                    )
-                    await db.commit()
-                logger.error("AI worker error %s: %s", domain, e)
-
-        # AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
-        await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
+        # Run assessments concurrently (semaphore in replicate_ai enforces AI_CONCURRENCY)
+        results = await asyncio.gather(
+            *[_assess_one(r[0]) for r in rows],
+            return_exceptions=True,
+        )
+        for r, exc in zip(rows, results):
+            if isinstance(exc, Exception):
+                logger.error("AI task exception for %s: %s", r[0], exc, exc_info=exc)
 
 
 def start_worker():
     global _worker_task, _ai_worker_task
     if _worker_task is None or _worker_task.done():
         _worker_task = asyncio.create_task(worker_loop())
+        logger.info("Enrichment worker started")
     if _ai_worker_task is None or _ai_worker_task.done():
+        if _ai_worker_task is not None and _ai_worker_task.done():
+            exc = _ai_worker_task.exception() if not _ai_worker_task.cancelled() else None
+            if exc:
+                logger.error("AI worker died with: %s", exc, exc_info=exc)
         _ai_worker_task = asyncio.create_task(ai_worker_loop())
+        logger.info("AI worker started/restarted")
+
+
+def ensure_workers_alive():
+    """Restart workers if they've died — call periodically."""
+    start_worker()
 
 
 def pause_worker():
diff --git a/app/main.py b/app/main.py
index 35b87f5..1f13132 100644
--- a/app/main.py
+++ b/app/main.py
@@ -20,7 +20,7 @@ from app.db import (
     queue_domains, get_queue_status, build_duckdb_index, index_status,
     queue_ai, get_ai_queue_status, save_ai_assessment,
 )
-from app.enricher import start_worker, pause_worker, resume_worker, is_running
+from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
 from app.scorer import run_scoring
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
@@ -61,13 +61,20 @@ async def download_parquet():
     logger.info("Parquet download complete")
 
 
+async def _watchdog():
+    """Every 10 seconds, restart the workers if they have died."""
+    while True:
+        await asyncio.sleep(10)
+        ensure_workers_alive()
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     await download_parquet()
     await init_db()
-    # Build DuckDB index in background — queries still work (slower) while building
     asyncio.create_task(build_duckdb_index())
     start_worker()
+    asyncio.create_task(_watchdog())
     logger.info("DomGod ready on port 6677")
     yield
@@ -167,9 +174,43 @@ async def ai_assess_batch(body: dict):
     if not domains_list:
         return JSONResponse({"error": "no domains provided"}, status_code=400)
     await queue_ai(domains_list)
+    ensure_workers_alive()  # ensure the AI worker is alive when jobs are queued
     return {"queued": len(domains_list)}
 
 
+@app.post("/api/ai/worker/restart")
+async def ai_worker_restart():
+    ensure_workers_alive()
+    return {"status": "restarted"}
+
+
+@app.get("/api/ai/debug")
+async def ai_debug():
+    """Return worker state plus the last 10 queue entries for troubleshooting."""
+    from app.enricher import _ai_worker_task
+    task_alive = _ai_worker_task is not None and not _ai_worker_task.done()
+    task_exc = None
+    if _ai_worker_task and _ai_worker_task.done() and not _ai_worker_task.cancelled():
+        try:
+            task_exc = str(_ai_worker_task.exception())
+        except Exception:
+            pass
+
+    async with aiosqlite.connect(SQLITE_PATH) as db:
+        db.row_factory = aiosqlite.Row
+        async with db.execute(
+            "SELECT domain, status, created_at, completed_at, error FROM ai_queue ORDER BY created_at DESC LIMIT 10"
+        ) as cur:
+            recent = [dict(r) async for r in cur]
+
+    return {
+        "ai_worker_alive": task_alive,
+        "ai_worker_exception": task_exc,
+        "recent_queue": recent,
+        "queue_status": await get_ai_queue_status(),
+    }
+
+
 @app.get("/api/ai/status")
 async def ai_status():
     return await get_ai_queue_status()
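The debug endpoint makes worker health observable from the outside. A quick smoke test against a running instance (assumes the default port 6677 from the startup log; httpx is already a project dependency):

    import httpx

    state = httpx.get("http://localhost:6677/api/ai/debug").json()
    print(state["ai_worker_alive"], state["ai_worker_exception"])
    if not state["ai_worker_alive"]:
        httpx.post("http://localhost:6677/api/ai/worker/restart")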
else ("❌ Non-EU" if eu_hosted is False else "unknown") - return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website. + return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell. -=== TECHNICAL SNAPSHOT === -Domain: {a.get("domain")} -Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms -Final URL: {a.get("final_url")} -Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"} -SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days -Mobile viewport: {a.get("has_mobile_viewport")} -Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")} +=== TECHNICAL === +Domain: {a.get("domain")} +Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms +Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"} +SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days +Mobile: viewport={a.get("has_mobile_viewport")} +Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")} -=== SEO & INDEXING SIGNALS === -Page title: {a.get("page_title") or "missing"} -H1: {a.get("h1_text") or "missing"} -Meta description: {a.get("meta_description") or "missing"} -Canonical URL: {a.get("canonical_url") or "not set"} -Sitemap.xml: {a.get("has_sitemap")} -Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")} -Analytics: {analytics_str} -Webmaster verified:{webmaster_str} +=== HOSTING & INFRASTRUCTURE === +IP: {a.get("ip") or "unknown"} +ASN: {a.get("asn") or "unknown"} +Organisation: {a.get("org") or "unknown"} +ISP: {a.get("isp") or "unknown"} +Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""} +EU hosted: {hosting_flag} + +=== SEO & INDEXING === +Title: {a.get("page_title") or "MISSING"} +H1: {a.get("h1_text") or "MISSING"} +Meta desc: {a.get("meta_description") or "MISSING"} +Canonical: {a.get("canonical_url") or "not set"} +Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")} +Analytics: {analytics} +Webmaster: {webmaster} + +=== GDPR & LEGAL COMPLIANCE === +Cookie tool: {a.get("cookie_tool") or "none detected"} +Cookie notice: {a.get("has_cookie_notice")} +Privacy policy: {a.get("has_privacy_policy")} +GDPR text: {a.get("has_gdpr_text")} + +=== ACCESSIBILITY (quick scan) === +HTML lang attr: {a.get("html_lang") or "MISSING"} +Images missing alt: {a.get("images_missing_alt")} +Skip navigation link: {a.get("has_skip_nav")} +Empty links: {a.get("empty_links")} +Inputs without labels: {a.get("inputs_without_labels")} === CONTENT QUALITY === -Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str} -Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str} +Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str} +Placeholder: {a.get("has_placeholder")} → {ph_str} -=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) === -Detected: {a.get("kit_digital")} -Signals: +=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) === +Detected: {a.get("kit_digital")} {kd_str} === CONTACT CHANNELS === {contacts_str} -=== PAGE TEXT SAMPLE (first 2000 chars) === -{text_snippet} +=== PAGE TEXT SAMPLE === +{snippet} -=== TASK === 
-Analyse this site for IT services upsell potential. The client sells:
-web design/redesign, SEO, hosting migration, SSL renewal, security audits,
-maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
+=== INSTRUCTIONS ===
+The client sells: web redesign, SEO, hosting migration, SSL renewal,
+security audits, GDPR compliance, accessibility fixes, Google Ads,
+maintenance contracts, AI tools for SMEs.
 
-Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
+Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
 {{
-  "summary": "2-3 sentence executive summary of the site's current state",
-  "site_quality_score": <0-10 integer>,
-  "content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
-  "performance_notes": "comment on load time, page size, mobile readiness",
-  "seo_status": "brief SEO assessment — indexing signals, missing elements",
+  "summary": "2-3 sentence executive summary of the site's state",
+  "site_quality_score": <0-10>,
+  "content_issues": ["specific issues found in page content"],
+  "performance_notes": "load time, size, mobile assessment",
+  "seo_status": "SEO health — what's missing or broken",
+  "hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
+  "gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
+  "accessibility_issues": ["specific a11y problems found"],
   "kit_digital_confirmed": true/false,
-  "kit_digital_reasoning": "1 sentence — why confirmed or not",
+  "kit_digital_reasoning": "1 sentence",
   "is_local_sme": true/false,
   "lead_quality": "HOT|WARM|COLD",
-  "lead_reasoning": "1-2 sentences on why",
+  "lead_reasoning": "1-2 sentences",
   "best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
-  "best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
-  "all_contacts": {{
-    "emails": [],
-    "phones": [],
-    "whatsapp": [],
-    "social": []
-  }},
-  "pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
-  "services_needed": ["service1", "service2"],
-  "urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
-  "outreach_notes": "Key context for the sales rep"
-}}"""
+  "best_contact_value": "actual email/phone/URL or empty string",
+  "all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
+  "pitch_angle": "1 cold-outreach sentence in Spanish",
+  "services_needed": ["service1","service2"],
+  "urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
+  "outreach_notes": "sales rep context"
+}}"""
 
 
 def _parse_output(raw: str) -> dict:
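The prompt demands bare JSON, but models still occasionally wrap output in markdown fences. _parse_output's body is outside this hunk; a typical defensive parser, shown as a sketch rather than the actual implementation, strips fences and isolates the outermost object before json.loads:

    import json, re

    def parse_model_json(raw: str) -> dict:
        cleaned = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw.strip())
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end == -1:
            raise ValueError("no JSON object in model output")
        return json.loads(cleaned[start:end + 1])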
diff --git a/app/site_analyzer.py b/app/site_analyzer.py
index fb01a55..df57552 100644
--- a/app/site_analyzer.py
+++ b/app/site_analyzer.py
@@ -1,8 +1,9 @@
-"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
+"""Deep site analysis: content quality, SEO, hosting, GDPR, accessibility."""
 import asyncio
 import re
 import time
 import logging
+import socket
 from typing import Optional
 
 import httpx
@@ -10,39 +11,72 @@ from bs4 import BeautifulSoup
 
 logger = logging.getLogger(__name__)
 
-# ── Content quality ───────────────────────────────────────────────────────────
+# ── EU countries (hosting check) ─────────────────────────────────────────────
+EU_COUNTRIES = {
+    'AT','BE','BG','HR','CY','CZ','DK','EE','FI','FR','DE','GR',
+    'HU','IE','IT','LV','LT','LU','MT','NL','PL','PT','RO','SK',
+    'SI','ES','SE',
+    'NO','IS','LI',  # EEA
+    'CH','GB','AD',  # adequacy / adjacent
+}
+
+# ── Content quality ───────────────────────────────────────────────────────────
 LOREM_PHRASES = [
     "lorem ipsum", "sed ut perspiciatis", "nunc sem sapien", "nulla id nibh",
     "aenean dignissim", "aliquam tincidunt", "vestibulum commodo", "fusce nunc lacus",
     "consectetuer", "cras ornare tristique", "ntulla nec ante", "risus id metus",
     "praesent placerat", "fusce pellentesque", "suscipit nibh",
-    "integer vitae libero", "felis quis tortor",
+    "integer vitae libero", "felis quis tortor", "dolor sit amet",
 ]
-
 PLACEHOLDER_PHRASES = [
     "under construction", "coming soon", "sample page",
-    "this is a demo", "default post", "hello world",
-    "test post", "uncategorized",
+    "this is a demo", "hello world", "test content",
+    "default post", "uncategorized", "demo content",
 ]
 
-# ── Analytics & webmaster tags ────────────────────────────────────────────────
+# ── Cookie / GDPR consent tools ───────────────────────────────────────────────
+COOKIE_TOOLS = {
+    "cookiebot": ["cookiebot.com", "CookieConsent", "CybotCookiebot"],
+    "onetrust": ["onetrust", "otBannerSdk"],
+    "cookiepro": ["cookiepro.com"],
+    "osano": ["osano.com"],
+    "iubenda": ["iubenda.com"],
+    "borlabs": ["borlabs-cookie"],
+    "complianz": ["complianz"],
+    "cookieyes": ["cookieyes.com", "cookie-law-info"],
+    "usercentrics": ["usercentrics.com"],
+    "quantcast": ["quantcast.com/cmp"],
+}
+COOKIE_TEXT_SIGNALS = [
+    "accept cookies", "acepta las cookies", "we use cookies", "usamos cookies",
+    "cookie policy", "política de cookies", "cookie settings", "manage cookies",
+    "aceptar todas", "rechazar cookies",
+]
+PRIVACY_SIGNALS = [
+    "privacy policy", "política de privacidad", "aviso legal",
+    "privacy notice", "data protection",
+]
+GDPR_TEXT_SIGNALS = [
+    "rgpd", "gdpr", "reglamento general de protección",
+    "lopd", "protección de datos", "responsable del tratamiento",
+]
+
+# ── Analytics / webmaster ─────────────────────────────────────────────────────
 ANALYTICS = {
-    "google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
-    "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
-    "facebook_pixel": ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
-    "hotjar": ["static.hotjar.com"],
-    "clarity": ["clarity.ms/tag"],
+    "google_analytics":   ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
+    "google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
+    "facebook_pixel":     ["fbq('init'", "connect.facebook.net/en_US/fbevents"],
+    "hotjar":             ["static.hotjar.com"],
+    "clarity":            ["clarity.ms/tag"],
 }
-
 WEBMASTER = {
-    "google_search_console": ['google-site-verification'],
-    "bing_webmaster": ['msvalidate.01'],
-    "yandex": ['yandex-verification'],
+    "google_search_console": ["google-site-verification"],
+    "bing_webmaster": ["msvalidate.01"],
+    "yandex": ["yandex-verification"],
 }
 
+# ── Kit Digital ───────────────────────────────────────────────────────────────
 KIT_IMG_PATS = [
     "digitalizadores", "kit-digital", "kitdigital", "kit_digital",
     "fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
@@ -56,72 +90,99 @@ KIT_TEXT_PATS = [
 
 EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
 PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
-SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
+SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com",
+              "twitter.com", "x.com", "tiktok.com", "youtube.com"]
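PHONE_RE targets Spanish numbering (optional +34 prefix, a first digit of 6-9, then two groups of three digits); a quick sanity check of what it captures:

    import re

    PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
    print(PHONE_RE.findall("Llámanos al +34 612 345 678 o al 910 123 456"))
    # ['+34 612 345 678', '910 123 456']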
"linkedin.com", + "twitter.com", "x.com", "tiktok.com", "youtube.com"] + + +async def _get_hosting_info(domain: str) -> dict: + """Resolve IP, then look up ASN / org / country via ip-api.com.""" + info = {"ip": None, "asn": None, "org": None, "isp": None, + "ip_country": None, "ip_region": None, "eu_hosted": None} + try: + loop = asyncio.get_event_loop() + ip = await loop.run_in_executor(None, socket.gethostbyname, domain) + info["ip"] = ip + async with httpx.AsyncClient(timeout=6) as client: + r = await client.get( + f"http://ip-api.com/json/{ip}", + params={"fields": "status,country,countryCode,regionName,org,as,isp"}, + ) + if r.status_code == 200: + d = r.json() + if d.get("status") == "success": + info.update({ + "asn": d.get("as"), + "org": d.get("org"), + "isp": d.get("isp"), + "ip_country": d.get("countryCode"), + "ip_region": d.get("regionName"), + "eu_hosted": d.get("countryCode") in EU_COUNTRIES, + }) + except Exception as e: + logger.debug("Hosting lookup failed for %s: %s", domain, e) + return info async def analyze_site(domain: str) -> dict: - """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt.""" result = { "domain": domain, - "reachable": False, - "load_time_ms": None, - "status_code": None, - "final_url": None, - "page_size_kb": None, - "server": None, - "cms": None, - "ssl_valid": False, - "ssl_expiry_days": None, + "reachable": False, "load_time_ms": None, "status_code": None, + "final_url": None, "page_size_kb": None, "server": None, "cms": None, + # Hosting + "ip": None, "asn": None, "org": None, "isp": None, + "ip_country": None, "ip_region": None, "eu_hosted": None, + # SSL + "ssl_valid": False, "ssl_expiry_days": None, # Content quality - "has_lorem_ipsum": False, - "lorem_matches": [], - "has_placeholder": False, - "placeholder_matches": [], - "word_count": 0, - "image_count": 0, - "broken_images": 0, - "script_count": 0, + "has_lorem_ipsum": False, "lorem_matches": [], + "has_placeholder": False, "placeholder_matches": [], + "word_count": 0, "image_count": 0, "script_count": 0, "has_mobile_viewport": False, - "page_title": None, - "meta_description": None, - "h1_text": None, + "page_title": None, "meta_description": None, "h1_text": None, "visible_text_snippet": "", - # SEO / webmaster - "has_sitemap": False, - "has_robots": False, - "robots_disallows_google": False, - "analytics_present": [], - "webmaster_verified": [], - "canonical_url": None, - "og_title": None, + # SEO + "has_sitemap": False, "has_robots": False, "robots_disallows_google": False, + "analytics_present": [], "webmaster_verified": [], + "canonical_url": None, "og_title": None, + # GDPR / cookies + "cookie_tool": None, "has_cookie_notice": False, + "has_privacy_policy": False, "has_gdpr_text": False, + # Accessibility + "html_lang": None, "images_missing_alt": 0, + "has_skip_nav": False, "empty_links": 0, + "inputs_without_labels": 0, # Kit Digital - "kit_digital": False, - "kit_digital_signals": [], + "kit_digital": False, "kit_digital_signals": [], # Contacts - "emails": [], - "phones": [], - "whatsapp": [], - "social_links": [], - # Errors + "emails": [], "phones": [], "whatsapp": [], "social_links": [], "error": None, } - # ── Fetch main page ─────────────────────────────────────────────────────── - try: + # ── Fetch + hosting (parallel) ──────────────────────────────────────────── + async def _fetch(): t0 = time.monotonic() - async with httpx.AsyncClient( - timeout=15, follow_redirects=True, verify=False, - headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; 
 
 
 async def analyze_site(domain: str) -> dict:
-    """Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
     result = {
         "domain": domain,
-        "reachable": False,
-        "load_time_ms": None,
-        "status_code": None,
-        "final_url": None,
-        "page_size_kb": None,
-        "server": None,
-        "cms": None,
-        "ssl_valid": False,
-        "ssl_expiry_days": None,
+        "reachable": False, "load_time_ms": None, "status_code": None,
+        "final_url": None, "page_size_kb": None, "server": None, "cms": None,
+        # Hosting
+        "ip": None, "asn": None, "org": None, "isp": None,
+        "ip_country": None, "ip_region": None, "eu_hosted": None,
+        # SSL
+        "ssl_valid": False, "ssl_expiry_days": None,
         # Content quality
-        "has_lorem_ipsum": False,
-        "lorem_matches": [],
-        "has_placeholder": False,
-        "placeholder_matches": [],
-        "word_count": 0,
-        "image_count": 0,
-        "broken_images": 0,
-        "script_count": 0,
+        "has_lorem_ipsum": False, "lorem_matches": [],
+        "has_placeholder": False, "placeholder_matches": [],
+        "word_count": 0, "image_count": 0, "script_count": 0,
         "has_mobile_viewport": False,
-        "page_title": None,
-        "meta_description": None,
-        "h1_text": None,
+        "page_title": None, "meta_description": None, "h1_text": None,
         "visible_text_snippet": "",
-        # SEO / webmaster
-        "has_sitemap": False,
-        "has_robots": False,
-        "robots_disallows_google": False,
-        "analytics_present": [],
-        "webmaster_verified": [],
-        "canonical_url": None,
-        "og_title": None,
+        # SEO
+        "has_sitemap": False, "has_robots": False, "robots_disallows_google": False,
+        "analytics_present": [], "webmaster_verified": [],
+        "canonical_url": None, "og_title": None,
+        # GDPR / cookies
+        "cookie_tool": None, "has_cookie_notice": False,
+        "has_privacy_policy": False, "has_gdpr_text": False,
+        # Accessibility
+        "html_lang": None, "images_missing_alt": 0,
+        "has_skip_nav": False, "empty_links": 0,
+        "inputs_without_labels": 0,
         # Kit Digital
-        "kit_digital": False,
-        "kit_digital_signals": [],
+        "kit_digital": False, "kit_digital_signals": [],
         # Contacts
-        "emails": [],
-        "phones": [],
-        "whatsapp": [],
-        "social_links": [],
-        # Errors
+        "emails": [], "phones": [], "whatsapp": [], "social_links": [],
         "error": None,
     }
 
-    # ── Fetch main page ───────────────────────────────────────────────────────
-    try:
+    # ── Fetch + hosting (parallel) ────────────────────────────────────────────
+    async def _fetch():
         t0 = time.monotonic()
-        async with httpx.AsyncClient(
-            timeout=15, follow_redirects=True, verify=False,
-            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
-        ) as client:
-            resp = await client.get(f"https://{domain}")
-            if resp.status_code >= 400:
-                resp = await client.get(f"http://{domain}")
+        try:
+            async with httpx.AsyncClient(
+                timeout=15, follow_redirects=True, verify=False,
+                headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
+            ) as client:
+                resp = await client.get(f"https://{domain}")
+                if resp.status_code >= 400:
+                    resp = await client.get(f"http://{domain}")
+                return resp, int((time.monotonic() - t0) * 1000), None
+        except Exception as e:
+            return None, int((time.monotonic() - t0) * 1000), str(e)[:300]
 
-        load_ms = int((time.monotonic() - t0) * 1000)
+    (resp, load_ms, fetch_err), hosting = await asyncio.gather(_fetch(), _get_hosting_info(domain))
+    result.update(hosting)
+    result["load_time_ms"] = load_ms
+
+    if resp is None:
+        result["error"] = fetch_err or "Failed to fetch site"
+    else:
         html = resp.text
         result.update({
             "reachable": resp.status_code < 400,
-            "load_time_ms": load_ms,
             "status_code": resp.status_code,
             "final_url": str(resp.url),
             "page_size_kb": round(len(resp.content) / 1024, 1),
@@ -131,46 +192,42 @@ async def analyze_site(domain: str) -> dict:
         soup = BeautifulSoup(html, "html.parser")
         hl = html.lower()
 
-        # Title, meta
-        title_tag = soup.find("title")
-        result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
-        meta_desc = soup.find("meta", attrs={"name": "description"})
-        result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
+        # ── Basic metadata ────────────────────────────────────────────────────
+        result["html_lang"] = (soup.find("html") or {}).get("lang")
+        t = soup.find("title")
+        result["page_title"] = t.get_text(strip=True)[:200] if t else None
+        md = soup.find("meta", attrs={"name": "description"})
+        result["meta_description"] = (md.get("content") or "")[:300] if md else None
         h1 = soup.find("h1")
         result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
-
-        # Mobile viewport
         result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
-
-        # Canonical + OG
-        canon = soup.find("link", rel="canonical")
-        result["canonical_url"] = canon.get("href") if canon else None
+        c = soup.find("link", rel="canonical")
+        result["canonical_url"] = c.get("href") if c else None
         og = soup.find("meta", property="og:title")
         result["og_title"] = og.get("content") if og else None
 
-        # Visible text
+        # ── Visible text ──────────────────────────────────────────────────────
         for tag in soup(["script", "style", "noscript"]):
             tag.decompose()
-        visible_text = soup.get_text(separator=" ", strip=True)
-        words = visible_text.split()
+        visible = soup.get_text(separator=" ", strip=True)
+        vl = visible.lower()
+        words = visible.split()
         result["word_count"] = len(words)
-        result["visible_text_snippet"] = " ".join(words[:500])
+        result["visible_text_snippet"] = " ".join(words[:600])
 
-        # Lorem ipsum / placeholder detection
-        vl = visible_text.lower()
+        # ── Content quality ───────────────────────────────────────────────────
         lorem_hits = [p for p in LOREM_PHRASES if p in vl]
-        result["has_lorem_ipsum"] = len(lorem_hits) > 0
-        result["lorem_matches"] = lorem_hits[:5]
+        result["has_lorem_ipsum"] = len(lorem_hits) > 0
+        result["lorem_matches"]   = lorem_hits[:6]
         ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
-        result["has_placeholder"] = len(ph_hits) > 0
+        result["has_placeholder"] = len(ph_hits) > 0
         result["placeholder_matches"] = ph_hits[:3]
 
-        # Images & scripts
         imgs = soup.find_all("img")
-        result["image_count"] = len(imgs)
+        result["image_count"]  = len(imgs)
         result["script_count"] = len(soup.find_all("script", src=True))
 
-        # Analytics / webmaster tags
+        # ── Analytics / webmaster ─────────────────────────────────────────────
         for name, sigs in ANALYTICS.items():
             if any(s.lower() in hl for s in sigs):
                 result["analytics_present"].append(name)
@@ -178,12 +235,42 @@ async def analyze_site(domain: str) -> dict:
             if any(s.lower() in hl for s in sigs):
                 result["webmaster_verified"].append(name)
 
-        # Kit Digital
+        # ── GDPR / cookies ────────────────────────────────────────────────────
+        for tool, sigs in COOKIE_TOOLS.items():
+            if any(s.lower() in hl for s in sigs):
+                result["cookie_tool"] = tool
+                result["has_cookie_notice"] = True
+                break
+        if not result["has_cookie_notice"]:
+            result["has_cookie_notice"] = any(s in vl for s in COOKIE_TEXT_SIGNALS)
+        result["has_privacy_policy"] = any(s in vl for s in PRIVACY_SIGNALS) or bool(
+            soup.find("a", href=lambda h: h and ("privacidad" in h.lower() or "privacy" in h.lower()))
+        )
+        result["has_gdpr_text"] = any(s in vl for s in GDPR_TEXT_SIGNALS)
+
+        # ── Accessibility ─────────────────────────────────────────────────────
+        result["images_missing_alt"] = sum(1 for img in imgs if img.get("alt") is None)
+        result["has_skip_nav"] = bool(
+            soup.find("a", href=lambda h: h and h.lower() in ("#main", "#content", "#maincontent", "#skip"))
+        )
+        result["empty_links"] = sum(
+            1 for a in soup.find_all("a") if not a.get_text(strip=True) and not a.find("img")
+        )
+        all_inputs = soup.find_all("input", type=lambda t: t not in ("hidden", "submit", "button"))
+        labeled_ids = {lbl.get("for") for lbl in soup.find_all("label") if lbl.get("for")}
+        result["inputs_without_labels"] = sum(
+            1 for inp in all_inputs
+            if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby")
+        )
+
+        # ── Kit Digital ───────────────────────────────────────────────────────
         kd_signals = []
         for img in imgs:
-            combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
+            comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
             for p in KIT_IMG_PATS:
-                if p in combined:
+                if p in comb:
                     kd_signals.append(f"img:{p}")
                     break
         for p in KIT_TEXT_PATS:
@@ -194,10 +281,10 @@ async def analyze_site(domain: str) -> dict:
             if "acelerapyme" in href or "red.es" in href or "kit-digital" in href:
                 kd_signals.append(f"link:{href[:50]}")
         kd_signals = list(dict.fromkeys(kd_signals))[:10]
-        result["kit_digital"] = len(kd_signals) > 0
+        result["kit_digital"]         = len(kd_signals) > 0
         result["kit_digital_signals"] = kd_signals
 
-        # Contacts
+        # ── Contacts ──────────────────────────────────────────────────────────
        for a in soup.find_all("a", href=True):
             href = a["href"]
             if href.startswith("mailto:"):
@@ -220,25 +307,36 @@ async def analyze_site(domain: str) -> dict:
                 break
         for em in EMAIL_RE.findall(html[:80000]):
             em = em.lower()
-            if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
+            if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]):
                 result["emails"].append(em)
-        for ph in PHONE_RE.findall(visible_text):
+        for ph in PHONE_RE.findall(visible):
             ph_c = re.sub(r"[\s\-]", "", ph)
             if ph_c not in result["phones"]:
                 result["phones"].append(ph_c)
 
-        # Cap
in ["emails", "phones", "whatsapp", "social_links"]: result[k] = list(dict.fromkeys(result[k]))[:5] - # CMS - from app.enricher import detect_cms - result["cms"] = detect_cms(html, dict(resp.headers)) - - except Exception as e: - result["error"] = str(e)[:300] + # ── CMS ─────────────────────────────────────────────────────────────── + CMS_SIGS = { + "wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'], + "joomla": ["/components/com_", "Joomla!", 'content="Joomla'], + "drupal": ["/sites/default/files/", "Drupal.settings"], + "wix": ["static.wixstatic.com", "X-Wix-"], + "squarespace": ["squarespace.com", "X-Squarespace-"], + "shopify": ["cdn.shopify.com", "Shopify.theme"], + "prestashop": ["PrestaShop", "/modules/prestashop"], + "magento": ["Mage.Cookies", "X-Magento-"], + "typo3": ["typo3temp", "TYPO3 CMS"], + "opencart": ["route=common/home", "OpenCart"], + } + combined_check = html[:60000] + " ".join(f"{k}:{v}" for k, v in resp.headers.items()) + for cms, sigs in CMS_SIGS.items(): + if any(s.lower() in combined_check.lower() for s in sigs): + result["cms"] = cms + break # ── Sitemap & robots (parallel) ─────────────────────────────────────────── - async def _check_url(url: str) -> Optional[str]: + async def _get(url): try: async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c: r = await c.get(url) @@ -247,24 +345,22 @@ async def analyze_site(domain: str) -> dict: return None sitemap_txt, robots_txt = await asyncio.gather( - _check_url(f"https://{domain}/sitemap.xml"), - _check_url(f"https://{domain}/robots.txt"), + _get(f"https://{domain}/sitemap.xml"), + _get(f"https://{domain}/robots.txt"), ) result["has_sitemap"] = sitemap_txt is not None - result["has_robots"] = robots_txt is not None + result["has_robots"] = robots_txt is not None if robots_txt: - robots_lower = robots_txt.lower() - result["robots_disallows_google"] = ( - "disallow: /" in robots_lower and "googlebot" in robots_lower - ) + rl = robots_txt.lower() + result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl # ── SSL ─────────────────────────────────────────────────────────────────── - import ssl as _ssl, socket as _socket + import ssl as _ssl try: def _ssl_check(): import datetime as _dt ctx = _ssl.create_default_context() - with _socket.create_connection((domain, 443), timeout=5) as s: + with socket.create_connection((domain, 443), timeout=5) as s: with ctx.wrap_socket(s, server_hostname=domain) as ss: cert = ss.getpeercert() exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z") diff --git a/app/static/index.html b/app/static/index.html index 4179254..389460e 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -179,7 +179,9 @@ tr:hover td{background:rgba(255,255,255,.025)}
diff --git a/app/static/index.html b/app/static/index.html
index 4179254..389460e 100644
--- a/app/static/index.html
+++ b/app/static/index.html
@@ -179,7 +179,9 @@ tr:hover td{background:rgba(255,255,255,.025)}
 [markup lost in extraction: detail-panel rows "Reasoning", "KD notes", "Performance"; the "SEO status" row is renamed "SEO", and new "Hosting" and "GDPR" rows are added]
@@ -225,6 +227,14 @@ tr:hover td{background:rgba(255,255,255,.025)}
 [markup lost in extraction: a new "Accessibility Issues" block is added ahead of the existing "Services" and "Notes" rows]
@@ -431,7 +441,11 @@ tr:hover td{background:rgba(255,255,255,.025)}
 Auto-assesses enriched domains via Gemini. Detects Kit Digital confirmation, extracts the best contact channel, and writes the pitch.
 [markup lost in extraction: a "🔍 Debug AI queue" control is added alongside the existing AI panel buttons]
@@ -616,6 +630,7 @@ function app() {
       try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
     },
+    async restartAiWorker() { await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
     async startEnrich() { await fetch('/api/enrich/resume',{method:'POST'}); this.notify('Worker started','success'); await this.loadQueue(); },
     async pauseEnrich() { await fetch('/api/enrich/pause',{method:'POST'}); this.notify('Worker paused','success'); await this.loadQueue(); },
     async retryFailed() { await fetch('/api/enrich/retry',{method:'POST'}); this.notify('Retrying failed','success'); await this.loadQueue(); },