feat: Cloudflare JS challenge bypass via playwright fallback
When httpx receives a Cloudflare (CF) challenge page (detected by a known challenge phrase on a small page, or by Cloudflare challenge markers in the HTML), site_analyzer retries the fetch with headless Chromium via Playwright, waits 3 s for the challenge to resolve, then proceeds with normal extraction. Tested on productospeluqueriabellezaaura.com.es — it now extracts the real title, email, phone, and Instagram/Facebook/TikTok links that were previously blocked.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,40 @@ from typing import Optional
|
|||||||
import httpx
|
import httpx
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# ── Cloudflare challenge detection ───────────────────────────────────────────
|
||||||
|
# Lowercase phrases treated as signs of a Cloudflare challenge interstitial.
# They are matched as substrings of the lowercased HTML, so they must stay
# lowercase themselves.
_CF_TITLES = {
    "un momento",
    "checking your browser",
    "just a moment",
    "please wait",
    "verifying you are human",
    "espere mientras",
}


def _is_cf_challenge(html: str) -> bool:
    """Return True if the page looks like a Cloudflare JS challenge.

    A page is considered a challenge when either:

    * it is small (under 20,000 characters) and contains one of the
      phrases in ``_CF_TITLES``; or
    * it contains one of the markers ``cf-browser-verification`` or
      ``cf_chl_opt``, regardless of size.

    All matching is case-insensitive.

    Args:
        html: Raw HTML of the fetched page. ``None`` or an empty string is
            accepted and treated as "not a challenge".

    Returns:
        True when the page matches the heuristics above, False otherwise.
    """
    # An absent/empty body cannot be a challenge page; bail out before
    # calling .lower() so a None from a failed fetch does not raise.
    if not html:
        return False

    hl = html.lower()

    # The phrase check is restricted to small pages because these phrases
    # are generic and could legitimately appear somewhere in a large page.
    if len(html) < 20_000 and any(t in hl for t in _CF_TITLES):
        return True

    # Both markers are checked against the lowercased copy so that the
    # match is case-insensitive, consistent with the phrase check above.
    return "cf-browser-verification" in hl or "cf_chl_opt" in hl
|
||||||
|
|
||||||
|
|
||||||
|
async def _playwright_fetch(domain: str) -> Optional[str]:
    """Fetch via headless Chromium, bypassing Cloudflare JS challenges.

    Launches headless Chromium through playwright, loads
    ``https://{domain}``, waits three seconds so any challenge script can
    run, and returns the resulting page HTML.

    This is a best-effort helper: it never raises ``Exception`` subclasses.
    Any failure (playwright not installed, browser launch failure,
    navigation timeout, ...) is logged at debug level and reported as
    ``None``.

    Args:
        domain: Host name to fetch, without scheme (e.g. ``"example.com"``).

    Returns:
        The page HTML as a string, or ``None`` if the fetch failed.
    """
    try:
        # Imported lazily so a missing playwright install surfaces here as
        # an ImportError (handled below) instead of at module import time.
        from playwright.async_api import async_playwright  # type: ignore

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=["--no-sandbox", "--disable-setuid-sandbox"],
            )
            try:
                ctx = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                    locale="es-ES",
                )
                page = await ctx.new_page()
                await page.goto(f"https://{domain}", timeout=25_000)
                await asyncio.sleep(3)  # let the CF challenge JS execute & redirect
                return await page.content()
            finally:
                # Close the browser even when navigation or content
                # retrieval fails, so a timeout does not skip the cleanup.
                await browser.close()
    except Exception:
        # Deliberately best-effort: the caller falls back to the original
        # response when this returns None.  Log the cause so failures are
        # diagnosable instead of silently disappearing.
        logging.getLogger(__name__).debug(
            "Playwright fetch failed for %s", domain, exc_info=True
        )
        return None
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ── EU countries (hosting check) ─────────────────────────────────────────────
|
# ── EU countries (hosting check) ─────────────────────────────────────────────
|
||||||
@@ -185,6 +219,11 @@ async def _analyze_site_inner(domain: str) -> dict:
|
|||||||
resp = await client.get(f"https://{domain}")
|
resp = await client.get(f"https://{domain}")
|
||||||
if resp.status_code >= 400:
|
if resp.status_code >= 400:
|
||||||
resp = await client.get(f"http://{domain}")
|
resp = await client.get(f"http://{domain}")
|
||||||
|
# Cloudflare JS challenge — retry with headless browser
|
||||||
|
if resp.status_code == 200 and _is_cf_challenge(resp.text):
|
||||||
|
html_pw = await _playwright_fetch(domain)
|
||||||
|
if html_pw and not _is_cf_challenge(html_pw):
|
||||||
|
return ("playwright", html_pw), int((time.monotonic() - t0) * 1000)
|
||||||
return resp, int((time.monotonic() - t0) * 1000)
|
return resp, int((time.monotonic() - t0) * 1000)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return None, int((time.monotonic() - t0) * 1000)
|
return None, int((time.monotonic() - t0) * 1000)
|
||||||
@@ -196,14 +235,25 @@ async def _analyze_site_inner(domain: str) -> dict:
|
|||||||
if resp is None:
|
if resp is None:
|
||||||
result["error"] = "Failed to fetch site"
|
result["error"] = "Failed to fetch site"
|
||||||
else:
|
else:
|
||||||
html = resp.text
|
# Handle playwright fallback tuple ("playwright", html_string)
|
||||||
result.update({
|
if isinstance(resp, tuple) and resp[0] == "playwright":
|
||||||
"reachable": resp.status_code < 400,
|
html = resp[1]
|
||||||
"status_code": resp.status_code,
|
result.update({
|
||||||
"final_url": str(resp.url),
|
"reachable": True,
|
||||||
"page_size_kb": round(len(resp.content) / 1024, 1),
|
"status_code": 200,
|
||||||
"server": resp.headers.get("server"),
|
"final_url": f"https://{domain}/",
|
||||||
})
|
"page_size_kb": round(len(html.encode()) / 1024, 1),
|
||||||
|
"server": "cloudflare",
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
html = resp.text
|
||||||
|
result.update({
|
||||||
|
"reachable": resp.status_code < 400,
|
||||||
|
"status_code": resp.status_code,
|
||||||
|
"final_url": str(resp.url),
|
||||||
|
"page_size_kb": round(len(resp.content) / 1024, 1),
|
||||||
|
"server": resp.headers.get("server"),
|
||||||
|
})
|
||||||
|
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
hl = html.lower()
|
hl = html.lower()
|
||||||
|
|||||||
Reference in New Issue
Block a user