fix: rotate UA + treat any HTTP response as live (not just 200/203)
- Rotate across 7 real browser UAs to avoid bot detection
- Any 2xx/3xx/4xx/5xx response = server is UP = live (only no-response = dead)
- Parking signals still checked on 200/203 body content
- Previous 403/404 responses were incorrectly marking live servers as dead

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,6 +7,7 @@ ip + load_time_ms to enriched_domains.
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import socket
|
import socket
|
||||||
import time
|
import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -36,15 +37,28 @@ PARKING_REDIRECT_HOSTS = {
|
|||||||
"uniregistry.com", "sedoparking.com",
|
"uniregistry.com", "sedoparking.com",
|
||||||
}
|
}
|
||||||
|
|
||||||
_UA = (
|
# Any HTTP response code means the server is UP — only connection failures
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
# and timeouts are truly "dead". 4xx/5xx still means a live web server.
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
_LIVE_CODES = set(range(200, 600))
|
||||||
"Chrome/122.0.0.0 Safari/537.36"
|
|
||||||
)
|
_UAS = [
|
||||||
_HEADERS = {
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||||
"User-Agent": _UA,
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
||||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||||
}
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
def _headers() -> dict:
|
||||||
|
return {
|
||||||
|
"User-Agent": random.choice(_UAS),
|
||||||
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
||||||
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
|
"Connection": "keep-alive",
|
||||||
|
}
|
||||||
|
|
||||||
_val_task: Optional[asyncio.Task] = None
|
_val_task: Optional[asyncio.Task] = None
|
||||||
_val_stats: dict = {
|
_val_stats: dict = {
|
||||||
@@ -98,7 +112,7 @@ async def _check_domain(domain: str) -> dict:
|
|||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
timeout=timeouts[scheme],
|
timeout=timeouts[scheme],
|
||||||
follow_redirects=True,
|
follow_redirects=True,
|
||||||
headers=_HEADERS,
|
headers=_headers(),
|
||||||
verify=False,
|
verify=False,
|
||||||
max_redirects=5,
|
max_redirects=5,
|
||||||
) as client:
|
) as client:
|
||||||
@@ -112,6 +126,7 @@ async def _check_domain(domain: str) -> dict:
|
|||||||
final_url = str(resp.url)
|
final_url = str(resp.url)
|
||||||
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
|
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
|
||||||
|
|
||||||
|
# Redirected to a completely different domain
|
||||||
if not _same_domain(domain, final_url):
|
if not _same_domain(domain, final_url):
|
||||||
for ph in PARKING_REDIRECT_HOSTS:
|
for ph in PARKING_REDIRECT_HOSTS:
|
||||||
if ph in final_host:
|
if ph in final_host:
|
||||||
@@ -120,9 +135,10 @@ async def _check_domain(domain: str) -> dict:
|
|||||||
result["prescreen_status"] = "redirect"
|
result["prescreen_status"] = "redirect"
|
||||||
return result
|
return result
|
||||||
|
|
||||||
if resp.status_code not in (200, 203):
|
# Any response from the server = the domain is live.
|
||||||
return result # dead
|
# 4xx/5xx still means a working web server — only no-response = dead.
|
||||||
|
# Only check parking signals on 200/203 responses (2xx bodies are readable).
|
||||||
|
if resp.status_code in (200, 203):
|
||||||
html_lc = resp.text[:20_000].lower()
|
html_lc = resp.text[:20_000].lower()
|
||||||
for sig in PARKING_BODY_SIGNALS:
|
for sig in PARKING_BODY_SIGNALS:
|
||||||
if sig in html_lc:
|
if sig in html_lc:
|
||||||
@@ -133,12 +149,10 @@ async def _check_domain(domain: str) -> dict:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Any failure on http → always try https next
|
# Any failure on http → try https. Any failure on https → dead.
|
||||||
# Any failure on https → give up, leave as dead
|
|
||||||
logger.debug("Validator %s (%s): %s — %s", domain, scheme, type(e).__name__, e)
|
logger.debug("Validator %s (%s): %s — %s", domain, scheme, type(e).__name__, e)
|
||||||
if scheme == "https":
|
if scheme == "https":
|
||||||
break
|
break
|
||||||
# fall through to https
|
|
||||||
|
|
||||||
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
||||||
return result
|
return result
|
||||||
|
|||||||
Reference in New Issue
Block a user