diff --git a/app/site_analyzer.py b/app/site_analyzer.py
index 60e53d2..f3c0c4b 100644
--- a/app/site_analyzer.py
+++ b/app/site_analyzer.py
@@ -9,6 +9,55 @@ from typing import Optional
 import httpx
 from bs4 import BeautifulSoup
 
+# ── Cloudflare challenge detection ───────────────────────────────────────────
+# Lower-case phrases treated as signs of a challenge page on short responses.
+_CF_TITLES = {"un momento", "checking your browser", "just a moment",
+              "please wait", "verifying you are human", "espere mientras"}
+
+
+def _is_cf_challenge(html: str) -> bool:
+    """Return True if the page looks like a Cloudflare JS challenge.
+
+    Pages shorter than 20,000 characters match on any ``_CF_TITLES`` phrase;
+    pages of any size match on the explicit challenge markers.
+    """
+    hl = html.lower()
+    if len(html) < 20_000 and any(t in hl for t in _CF_TITLES):
+        return True
+    return "cf-browser-verification" in hl or "cf_chl_opt" in hl
+
+
+async def _playwright_fetch(domain: str) -> Optional[str]:
+    """Fetch ``https://{domain}`` via headless Chromium.
+
+    Returns the rendered HTML, or None on any failure (Playwright missing,
+    launch error, navigation timeout) so the caller can keep its own response.
+    """
+    try:
+        from playwright.async_api import async_playwright  # type: ignore
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(
+                headless=True,
+                args=["--no-sandbox", "--disable-setuid-sandbox"],
+            )
+            try:
+                ctx = await browser.new_context(
+                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                    locale="es-ES",
+                )
+                page = await ctx.new_page()
+                await page.goto(f"https://{domain}", timeout=25_000)
+                # Let the CF challenge JS execute & redirect.
+                await page.wait_for_timeout(3_000)
+                return await page.content()
+            finally:
+                # Close the browser even when navigation fails or times out.
+                await browser.close()
+    except Exception as exc:
+        # Best-effort fallback: report why it failed instead of hiding it.
+        logger.warning("Playwright fetch failed for %s: %s", domain, exc)
+        return None
+
 logger = logging.getLogger(__name__)
 
 # ── EU countries (hosting check) ─────────────────────────────────────────────
@@ -185,6 +234,11 @@ async def _analyze_site_inner(domain: str) -> dict:
                 resp = await client.get(f"https://{domain}")
                 if resp.status_code >= 400:
                     resp = await client.get(f"http://{domain}")
+                # Cloudflare JS challenge — retry with headless browser
+                if resp.status_code == 200 and _is_cf_challenge(resp.text):
+                    html_pw = await _playwright_fetch(domain)
+                    if html_pw and not _is_cf_challenge(html_pw):
+                        return ("playwright", html_pw), int((time.monotonic() - t0) * 1000)
                 return resp, int((time.monotonic() - t0) * 1000)
         except Exception as e:
             return None, int((time.monotonic() - t0) * 1000)
@@ -196,14 +250,25 @@ async def _analyze_site_inner(domain: str) -> dict:
     if resp is None:
         result["error"] = "Failed to fetch site"
     else:
-        html = resp.text
-        result.update({
-            "reachable": resp.status_code < 400,
-            "status_code": resp.status_code,
-            "final_url": str(resp.url),
-            "page_size_kb": round(len(resp.content) / 1024, 1),
-            "server": resp.headers.get("server"),
-        })
+        # Handle playwright fallback tuple ("playwright", html_string)
+        if isinstance(resp, tuple) and resp[0] == "playwright":
+            html = resp[1]
+            result.update({
+                "reachable": True,
+                "status_code": 200,
+                "final_url": f"https://{domain}/",
+                "page_size_kb": round(len(html.encode()) / 1024, 1),
+                "server": "cloudflare",
+            })
+        else:
+            html = resp.text
+            result.update({
+                "reachable": resp.status_code < 400,
+                "status_code": resp.status_code,
+                "final_url": str(resp.url),
+                "page_size_kb": round(len(resp.content) / 1024, 1),
+                "server": resp.headers.get("server"),
+            })
 
         soup = BeautifulSoup(html, "html.parser")
         hl = html.lower()