fix: rotate UA + treat any HTTP response as live (not just 200/203)

- Rotate across 7 real browser UAs to avoid bot detection
- Any 2xx/3xx/4xx/5xx response = server is UP = live (only no-response = dead)
- Parking signals still checked on 200/203 body content
- Previously, 403/404 responses incorrectly marked live servers as dead

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-20 18:36:32 +02:00
parent 8a4ec88d73
commit 6657e6ea1f

View File

@@ -7,6 +7,7 @@ ip + load_time_ms to enriched_domains.
import asyncio import asyncio
import logging import logging
import os import os
import random
import socket import socket
import time import time
from typing import Optional from typing import Optional
@@ -36,15 +37,28 @@ PARKING_REDIRECT_HOSTS = {
"uniregistry.com", "sedoparking.com", "uniregistry.com", "sedoparking.com",
} }
_UA = ( # Any HTTP response code means the server is UP — only connection failures
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) " # and timeouts are truly "dead". 4xx/5xx still means a live web server.
"AppleWebKit/537.36 (KHTML, like Gecko) " _LIVE_CODES = set(range(200, 600))
"Chrome/122.0.0.0 Safari/537.36"
) _UAS = [
_HEADERS = { "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"User-Agent": _UA, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
} "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
]
def _headers() -> dict:
    """Build browser-like request headers with a randomly chosen User-Agent.

    A new dict is created on every call, so each call makes its own
    ``random.choice`` pick from ``_UAS``.

    Returns:
        A header mapping with User-Agent, Accept, Accept-Language,
        Accept-Encoding and Connection entries.
    """
    return {
        "User-Agent": random.choice(_UAS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        # Advertise only encodings the HTTP client can always decode.
        # Per the httpx docs, Brotli ("br") is decoded only when an optional
        # brotli package is installed; advertising it without that package
        # would leave response bodies compressed and make substring checks
        # on the response text miss silently.
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
_val_task: Optional[asyncio.Task] = None _val_task: Optional[asyncio.Task] = None
_val_stats: dict = { _val_stats: dict = {
@@ -98,7 +112,7 @@ async def _check_domain(domain: str) -> dict:
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=timeouts[scheme], timeout=timeouts[scheme],
follow_redirects=True, follow_redirects=True,
headers=_HEADERS, headers=_headers(),
verify=False, verify=False,
max_redirects=5, max_redirects=5,
) as client: ) as client:
@@ -112,6 +126,7 @@ async def _check_domain(domain: str) -> dict:
final_url = str(resp.url) final_url = str(resp.url)
final_host = urlparse(final_url).netloc.lower().lstrip("www.") final_host = urlparse(final_url).netloc.lower().lstrip("www.")
# Redirected to a completely different domain
if not _same_domain(domain, final_url): if not _same_domain(domain, final_url):
for ph in PARKING_REDIRECT_HOSTS: for ph in PARKING_REDIRECT_HOSTS:
if ph in final_host: if ph in final_host:
@@ -120,9 +135,10 @@ async def _check_domain(domain: str) -> dict:
result["prescreen_status"] = "redirect" result["prescreen_status"] = "redirect"
return result return result
if resp.status_code not in (200, 203): # Any response from the server = the domain is live.
return result # dead # 4xx/5xx still means a working web server — only no-response = dead.
# Only check parking signals on 200/203 responses (the status codes whose body is inspected below).
if resp.status_code in (200, 203):
html_lc = resp.text[:20_000].lower() html_lc = resp.text[:20_000].lower()
for sig in PARKING_BODY_SIGNALS: for sig in PARKING_BODY_SIGNALS:
if sig in html_lc: if sig in html_lc:
@@ -133,12 +149,10 @@ async def _check_domain(domain: str) -> dict:
return result return result
except Exception as e: except Exception as e:
# Any failure on http → always try https next # Any failure on http → try https. Any failure on https → dead.
# Any failure on https → give up, leave as dead
logger.debug("Validator %s (%s): %s%s", domain, scheme, type(e).__name__, e) logger.debug("Validator %s (%s): %s%s", domain, scheme, type(e).__name__, e)
if scheme == "https": if scheme == "https":
break break
# fall through to https
result["load_time_ms"] = int((time.monotonic() - t0) * 1000) result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
return result return result