feat: Cloudflare JS challenge bypass via playwright fallback

When httpx gets a Cloudflare (CF) challenge page (detected by a known
challenge phrase in a page under 20,000 characters, or by a Cloudflare
challenge marker anywhere in the HTML), site_analyzer retries the fetch with
headless Chromium via playwright, waits 3s for the challenge to resolve, then
proceeds with normal extraction.
Tested on productospeluqueriabellezaaura.com.es — now extracts real title,
email, phone, and Instagram/Facebook/TikTok links that were previously blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-13 10:49:45 +02:00
parent dfd47743e3
commit 3a7ef19746

View File

@@ -9,6 +9,40 @@ from typing import Optional
import httpx
from bs4 import BeautifulSoup
# ── Cloudflare challenge detection ───────────────────────────────────────────
_CF_TITLES = {"un momento", "checking your browser", "just a moment",
"please wait", "verifying you are human", "espere mientras"}
def _is_cf_challenge(html: str) -> bool:
"""Return True if the page looks like a Cloudflare JS challenge."""
hl = html.lower()
if len(html) < 20_000 and any(t in hl for t in _CF_TITLES):
return True
return "cf-browser-verification" in hl or "cf_chl_opt" in html
async def _playwright_fetch(domain: str) -> Optional[str]:
"""Fetch via headless Chromium, bypassing Cloudflare JS challenges."""
try:
from playwright.async_api import async_playwright # type: ignore
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=True,
args=["--no-sandbox", "--disable-setuid-sandbox"],
)
ctx = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
locale="es-ES",
)
page = await ctx.new_page()
await page.goto(f"https://{domain}", timeout=25_000)
await asyncio.sleep(3) # let the CF challenge JS execute & redirect
html = await page.content()
await browser.close()
return html
except Exception:
return None
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
# ── EU countries (hosting check) ─────────────────────────────────────────────
@@ -185,6 +219,11 @@ async def _analyze_site_inner(domain: str) -> dict:
resp = await client.get(f"https://{domain}")
if resp.status_code >= 400:
resp = await client.get(f"http://{domain}")
# Cloudflare JS challenge — retry with headless browser
if resp.status_code == 200 and _is_cf_challenge(resp.text):
html_pw = await _playwright_fetch(domain)
if html_pw and not _is_cf_challenge(html_pw):
return ("playwright", html_pw), int((time.monotonic() - t0) * 1000)
return resp, int((time.monotonic() - t0) * 1000)
except Exception as e:
return None, int((time.monotonic() - t0) * 1000)
@@ -196,14 +235,25 @@ async def _analyze_site_inner(domain: str) -> dict:
if resp is None:
result["error"] = "Failed to fetch site"
else:
html = resp.text
result.update({
"reachable": resp.status_code < 400,
"status_code": resp.status_code,
"final_url": str(resp.url),
"page_size_kb": round(len(resp.content) / 1024, 1),
"server": resp.headers.get("server"),
})
# Handle playwright fallback tuple ("playwright", html_string)
if isinstance(resp, tuple) and resp[0] == "playwright":
html = resp[1]
result.update({
"reachable": True,
"status_code": 200,
"final_url": f"https://{domain}/",
"page_size_kb": round(len(html.encode()) / 1024, 1),
"server": "cloudflare",
})
else:
html = resp.text
result.update({
"reachable": resp.status_code < 400,
"status_code": resp.status_code,
"final_url": str(resp.url),
"page_size_kb": round(len(resp.content) / 1024, 1),
"server": resp.headers.get("server"),
})
soup = BeautifulSoup(html, "html.parser")
hl = html.lower()