feat: Cloudflare JS challenge bypass via playwright fallback
When httpx gets a CF challenge page (detected by known challenge phrases on a small page, or by Cloudflare challenge markers in the HTML), site_analyzer retries the fetch with headless Chromium via playwright, waits 3s for the challenge to resolve, then proceeds with normal extraction. Tested on productospeluqueriabellezaaura.com.es — now extracts real title, email, phone, and Instagram/Facebook/TikTok links that were previously blocked. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,40 @@ from typing import Optional
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ── Cloudflare challenge detection ───────────────────────────────────────────
|
||||
_CF_TITLES = {"un momento", "checking your browser", "just a moment",
|
||||
"please wait", "verifying you are human", "espere mientras"}
|
||||
|
||||
def _is_cf_challenge(html: str) -> bool:
|
||||
"""Return True if the page looks like a Cloudflare JS challenge."""
|
||||
hl = html.lower()
|
||||
if len(html) < 20_000 and any(t in hl for t in _CF_TITLES):
|
||||
return True
|
||||
return "cf-browser-verification" in hl or "cf_chl_opt" in html
|
||||
|
||||
|
||||
async def _playwright_fetch(domain: str) -> Optional[str]:
    """Fetch ``https://{domain}`` with headless Chromium and return its HTML.

    Used as a fallback when a plain HTTP fetch only returns a Cloudflare
    JS challenge: a real browser executes the challenge script, after which
    the rendered document is read back.

    This is best-effort. Any failure (playwright not installed, browser
    launch error, navigation timeout, ...) is logged and reported as ``None``
    so the caller can fall back to the original response.

    Args:
        domain: Bare host name, without scheme.

    Returns:
        The page's HTML after a fixed 3-second settle delay, or ``None`` if
        the fetch failed for any reason.
    """
    try:
        # Imported lazily so the module still loads when playwright is absent.
        from playwright.async_api import async_playwright  # type: ignore

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                # NOTE(review): sandbox is disabled, presumably so Chromium can
                # run as root inside a container — confirm this is intended.
                args=["--no-sandbox", "--disable-setuid-sandbox"],
            )
            try:
                ctx = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    locale="es-ES",
                )
                page = await ctx.new_page()
                await page.goto(f"https://{domain}", timeout=25_000)
                await asyncio.sleep(3)  # let the CF challenge JS execute & redirect
                return await page.content()
            finally:
                # Close the browser on failure paths too (timeout, navigation
                # error), not only after a successful fetch.
                await browser.close()
    except Exception as exc:
        # Deliberate catch-all: the fallback must never break site analysis,
        # but the reason is logged rather than silently discarded.
        logger.warning("Playwright fallback failed for %s: %s", domain, exc)
        return None
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── EU countries (hosting check) ─────────────────────────────────────────────
|
||||
@@ -185,6 +219,11 @@ async def _analyze_site_inner(domain: str) -> dict:
|
||||
resp = await client.get(f"https://{domain}")
|
||||
if resp.status_code >= 400:
|
||||
resp = await client.get(f"http://{domain}")
|
||||
# Cloudflare JS challenge — retry with headless browser
|
||||
if resp.status_code == 200 and _is_cf_challenge(resp.text):
|
||||
html_pw = await _playwright_fetch(domain)
|
||||
if html_pw and not _is_cf_challenge(html_pw):
|
||||
return ("playwright", html_pw), int((time.monotonic() - t0) * 1000)
|
||||
return resp, int((time.monotonic() - t0) * 1000)
|
||||
except Exception as e:
|
||||
return None, int((time.monotonic() - t0) * 1000)
|
||||
@@ -196,14 +235,25 @@ async def _analyze_site_inner(domain: str) -> dict:
|
||||
if resp is None:
|
||||
result["error"] = "Failed to fetch site"
|
||||
else:
|
||||
html = resp.text
|
||||
result.update({
|
||||
"reachable": resp.status_code < 400,
|
||||
"status_code": resp.status_code,
|
||||
"final_url": str(resp.url),
|
||||
"page_size_kb": round(len(resp.content) / 1024, 1),
|
||||
"server": resp.headers.get("server"),
|
||||
})
|
||||
# Handle playwright fallback tuple ("playwright", html_string)
|
||||
if isinstance(resp, tuple) and resp[0] == "playwright":
|
||||
html = resp[1]
|
||||
result.update({
|
||||
"reachable": True,
|
||||
"status_code": 200,
|
||||
"final_url": f"https://{domain}/",
|
||||
"page_size_kb": round(len(html.encode()) / 1024, 1),
|
||||
"server": "cloudflare",
|
||||
})
|
||||
else:
|
||||
html = resp.text
|
||||
result.update({
|
||||
"reachable": resp.status_code < 400,
|
||||
"status_code": resp.status_code,
|
||||
"final_url": str(resp.url),
|
||||
"page_size_kb": round(len(resp.content) / 1024, 1),
|
||||
"server": resp.headers.get("server"),
|
||||
})
|
||||
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
hl = html.lower()
|
||||
|
||||
Reference in New Issue
Block a user