fix: rotate UA + treat any HTTP response as live (not just 200/203)
- Rotate across 7 real browser UAs to avoid bot detection
- Any 2xx/3xx/4xx/5xx response = server is UP = live (only no-response = dead)
- Parking signals are still checked on 200/203 body content
- Previously, 403/404 responses incorrectly marked live servers as dead

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ ip + load_time_ms to enriched_domains.
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import socket
|
||||
import time
|
||||
from typing import Optional
|
||||
@@ -36,14 +37,27 @@ PARKING_REDIRECT_HOSTS = {
|
||||
"uniregistry.com", "sedoparking.com",
|
||||
}
|
||||
|
||||
_UA = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/122.0.0.0 Safari/537.36"
|
||||
)
|
||||
_HEADERS = {
|
||||
"User-Agent": _UA,
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
# Any HTTP response code means the server is UP — only connection failures
|
||||
# and timeouts are truly "dead". 4xx/5xx still means a live web server.
|
||||
_LIVE_CODES = set(range(200, 600))
|
||||
|
||||
# Pool of desktop browser User-Agent strings (Chrome, Safari, Firefox and Edge
# on Windows, macOS and Linux). One entry is drawn at random for every header
# set, so successive checks do not all present the same client signature.
_UAS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
]


def _headers() -> dict:
    """Return a fresh, browser-like request header dict.

    A new dict is built on every call, with ``User-Agent`` chosen at random
    from ``_UAS``; callers may mutate the result freely.

    ``Accept-Encoding`` is deliberately *not* set here. The HTTP client
    advertises the encodings it is actually able to decode; hard-coding
    ``br`` invites Brotli-compressed bodies that cannot be decoded when the
    optional Brotli package is missing, which would leave the response text
    unreadable for any body-content inspection done by the caller.
    """
    return {
        "User-Agent": random.choice(_UAS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }
|
||||
|
||||
_val_task: Optional[asyncio.Task] = None
|
||||
@@ -98,7 +112,7 @@ async def _check_domain(domain: str) -> dict:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=timeouts[scheme],
|
||||
follow_redirects=True,
|
||||
headers=_HEADERS,
|
||||
headers=_headers(),
|
||||
verify=False,
|
||||
max_redirects=5,
|
||||
) as client:
|
||||
@@ -112,6 +126,7 @@ async def _check_domain(domain: str) -> dict:
|
||||
final_url = str(resp.url)
|
||||
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
|
||||
|
||||
# Redirected to a completely different domain
|
||||
if not _same_domain(domain, final_url):
|
||||
for ph in PARKING_REDIRECT_HOSTS:
|
||||
if ph in final_host:
|
||||
@@ -120,9 +135,10 @@ async def _check_domain(domain: str) -> dict:
|
||||
result["prescreen_status"] = "redirect"
|
||||
return result
|
||||
|
||||
if resp.status_code not in (200, 203):
|
||||
return result # dead
|
||||
|
||||
# Any response from the server = the domain is live.
|
||||
# 4xx/5xx still means a working web server — only no-response = dead.
|
||||
# Parking signals are only checked on 200/203 responses, whose bodies are scanned below.
|
||||
if resp.status_code in (200, 203):
|
||||
html_lc = resp.text[:20_000].lower()
|
||||
for sig in PARKING_BODY_SIGNALS:
|
||||
if sig in html_lc:
|
||||
@@ -133,12 +149,10 @@ async def _check_domain(domain: str) -> dict:
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
# Any failure on http → always try https next
|
||||
# Any failure on https → give up, leave as dead
|
||||
# Any failure on http → try https. Any failure on https → dead.
|
||||
logger.debug("Validator %s (%s): %s — %s", domain, scheme, type(e).__name__, e)
|
||||
if scheme == "https":
|
||||
break
|
||||
# fall through to https
|
||||
|
||||
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user