feat: Cloudflare JS challenge bypass via playwright fallback

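When httpx receives a Cloudflare challenge page (detected by a known interstitial phrase such as "just a moment" in a page under 20,000 characters, or by Cloudflare challenge markers in the HTML), site_analyzer retries the fetch with headless Chromium via Playwright, waits 3 s for the challenge to resolve, then proceeds with normal extraction. If the browser fetch fails or still returns a challenge page, the original httpx response is used.

Tested on productospeluqueriabellezaaura.com.es — extraction now returns the real title, email, phone, and Instagram/Facebook/TikTok links that were previously blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>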
2026-05-13 10:49:45 +02:00
parent dfd47743e3
commit 3a7ef19746
1 changed file with 58 additions and 8 deletions
--- a/app/site_analyzer.py
+++ b/app/site_analyzer.py
@@ -9,6 +9,40 @@ from typing import Optional
 import httpx
 from bs4 import BeautifulSoup
 # ── Cloudflare challenge detection ───────────────────────────────────────────
 _CF_TITLES = {"un momento", "checking your browser", "just a moment",
              "please wait", "verifying you are human", "espere mientras"}
 def _is_cf_challenge(html: str) -> bool:
    """Return True if the page looks like a Cloudflare JS challenge."""
    hl = html.lower()
    if len(html) < 20_000 and any(t in hl for t in _CF_TITLES):
        return True
    return "cf-browser-verification" in hl or "cf_chl_opt" in html
 async def _playwright_fetch(domain: str) -> Optional[str]:
    """Fetch ``https://{domain}`` with headless Chromium via Playwright.

    Used as a fallback when a plain HTTP fetch returned a Cloudflare JS
    challenge page: a real browser runs the challenge script, and the HTML
    is read after a fixed 3-second wait.

    Args:
        domain: Host name to fetch, without scheme (``https://`` is prepended).

    Returns:
        The page HTML after the wait, or ``None`` if anything fails
        (Playwright not installed, browser launch error, navigation
        timeout, ...). Failures are logged as warnings, never raised.
    """
    try:
        # Imported here, inside the try, so a missing playwright package
        # results in None instead of an import error.
        from playwright.async_api import async_playwright  # type: ignore
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-setuid-sandbox"],
            )
            try:
                ctx = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    locale="es-ES",
                )
                page = await ctx.new_page()
                await page.goto(f"https://{domain}", timeout=25_000)
                await asyncio.sleep(3)   # let the CF challenge JS execute & redirect
                return await page.content()
            finally:
                # Close the browser on every path (navigation timeout,
                # cancellation, ...), not only after a successful fetch.
                await browser.close()
    except Exception as exc:
        # Best-effort fallback: report the reason, then signal failure with
        # None so the caller can keep its original response.
        logging.getLogger(__name__).warning(
            "Playwright fetch failed for %s: %s", domain, exc
        )
        return None
 # Module-level logger, named after this module.
 logger = logging.getLogger(__name__)
 # ── EU countries (hosting check) ─────────────────────────────────────────────
@@ -185,6 +219,11 @@ async def _analyze_site_inner(domain: str) -> dict:
                resp = await client.get(f"https://{domain}")
                if resp.status_code >= 400:
                    resp = await client.get(f"http://{domain}")
            # Cloudflare JS challenge — retry with headless browser
            if resp.status_code == 200 and _is_cf_challenge(resp.text):
                html_pw = await _playwright_fetch(domain)
                if html_pw and not _is_cf_challenge(html_pw):
                    return ("playwright", html_pw), int((time.monotonic() - t0) * 1000)
            return resp, int((time.monotonic() - t0) * 1000)
        except Exception as e:
            return None, int((time.monotonic() - t0) * 1000)
@@ -196,14 +235,25 @@ async def _analyze_site_inner(domain: str) -> dict:
    if resp is None:
        result["error"] = "Failed to fetch site"
    else:
-        html = resp.text
+        # Handle playwright fallback tuple ("playwright", html_string)
-        result.update({
+        if isinstance(resp, tuple) and resp[0] == "playwright":
-            "reachable": resp.status_code < 400,
+            html = resp[1]
-            "status_code": resp.status_code,
+            result.update({
-            "final_url": str(resp.url),
+                "reachable": True,
-            "page_size_kb": round(len(resp.content) / 1024, 1),
+                "status_code": 200,
-            "server": resp.headers.get("server"),
+                "final_url": f"https://{domain}/",
-        })
+                "page_size_kb": round(len(html.encode()) / 1024, 1),
                "server": "cloudflare",
            })
        else:
            html = resp.text
            result.update({
                "reachable": resp.status_code < 400,
                "status_code": resp.status_code,
                "final_url": str(resp.url),
                "page_size_kb": round(len(resp.content) / 1024, 1),
                "server": resp.headers.get("server"),
            })
        soup = BeautifulSoup(html, "html.parser")
        hl = html.lower()