feat: Cloudflare JS challenge bypass via playwright fallback

When httpx receives a Cloudflare challenge page (detected by a known interstitial phrase on a small page, or by a Cloudflare challenge marker in the HTML), site_analyzer retries the fetch with headless Chromium via playwright, waits 3 s for the challenge to resolve, and then proceeds with normal extraction. Tested on productospeluqueriabellezaaura.com.es — it now extracts the real title, email, phone, and Instagram/Facebook/TikTok links that were previously blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 10:49:45 +02:00
parent dfd47743e3
commit 3a7ef19746
1 changed file with 58 additions and 8 deletions
--- a/app/site_analyzer.py
+++ b/app/site_analyzer.py
@@ -9,6 +9,40 @@ from typing import Optional
 import httpx
 from bs4 import BeautifulSoup

+# ── Cloudflare challenge detection ───────────────────────────────────────────
+_CF_TITLES = {"un momento", "checking your browser", "just a moment",
+              "please wait", "verifying you are human", "espere mientras"}  # lowercase phrases; matched as substrings of the lowercased page
+
+def _is_cf_challenge(html: str) -> bool:
+    """Return True if *html* looks like a Cloudflare JS challenge: a page under 20,000 characters containing a known interstitial phrase, or any page containing a challenge marker."""
+    hl = html.lower()
+    if len(html) < 20_000 and any(t in hl for t in _CF_TITLES):  # phrases are searched anywhere in the page, not only in <title>; the size cap limits false positives
+        return True
+    return "cf-browser-verification" in hl or "cf_chl_opt" in html  # first marker is case-insensitive, second is checked against the raw (case-sensitive) html
+
+
+async def _playwright_fetch(domain: str) -> Optional[str]:
+    """Load https://{domain} in headless Chromium and return the rendered HTML, or None on any error (including playwright not being installed)."""
+    try:
+        from playwright.async_api import async_playwright  # type: ignore
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(
+                headless=True,
+                args=["--no-sandbox", "--disable-setuid-sandbox"],  # sandbox disabled — presumably for running as root in a container; TODO confirm
+            )
+            ctx = await browser.new_context(
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                locale="es-ES",
+            )
+            page = await ctx.new_page()
+            await page.goto(f"https://{domain}", timeout=25_000)  # always https; timeout is in milliseconds (25 s)
+            await asyncio.sleep(3)   # fixed wait to let the CF challenge JS execute & redirect; NOTE(review): relies on an asyncio import not visible in this hunk
+            html = await page.content()
+            await browser.close()  # NOTE(review): skipped if goto/content raises; cleanup then presumably falls to the async_playwright() context exit — confirm
+            return html
+    except Exception:  # best-effort fallback: every failure is swallowed without logging and reported as None
+        return None
+
 logger = logging.getLogger(__name__)

 # ── EU countries (hosting check) ─────────────────────────────────────────────
@@ -185,6 +219,11 @@ async def _analyze_site_inner(domain: str) -> dict:
                resp = await client.get(f"https://{domain}")
                if resp.status_code >= 400:
                    resp = await client.get(f"http://{domain}")
+            # Cloudflare JS challenge — retry with headless browser
+            if resp.status_code == 200 and _is_cf_challenge(resp.text):
+                html_pw = await _playwright_fetch(domain)
+                if html_pw and not _is_cf_challenge(html_pw):
+                    return ("playwright", html_pw), int((time.monotonic() - t0) * 1000)
            return resp, int((time.monotonic() - t0) * 1000)
        except Exception as e:
            return None, int((time.monotonic() - t0) * 1000)
@@ -196,14 +235,25 @@ async def _analyze_site_inner(domain: str) -> dict:
    if resp is None:
        result["error"] = "Failed to fetch site"
    else:
-        html = resp.text
-        result.update({
-            "reachable": resp.status_code < 400,
-            "status_code": resp.status_code,
-            "final_url": str(resp.url),
-            "page_size_kb": round(len(resp.content) / 1024, 1),
-            "server": resp.headers.get("server"),
-        })
+        # Handle playwright fallback tuple ("playwright", html_string)
+        if isinstance(resp, tuple) and resp[0] == "playwright":
+            html = resp[1]
+            result.update({
+                "reachable": True,
+                "status_code": 200,
+                "final_url": f"https://{domain}/",
+                "page_size_kb": round(len(html.encode()) / 1024, 1),
+                "server": "cloudflare",
+            })
+        else:
+            html = resp.text
+            result.update({
+                "reachable": resp.status_code < 400,
+                "status_code": resp.status_code,
+                "final_url": str(resp.url),
+                "page_size_kb": round(len(resp.content) / 1024, 1),
+                "server": resp.headers.get("server"),
+            })

        soup = BeautifulSoup(html, "html.parser")
        hl = html.lower()