From 6657e6ea1fa331a4dba3708d3ff176d17e4a6937 Mon Sep 17 00:00:00 2001 From: Malin Date: Mon, 20 Apr 2026 18:36:32 +0200 Subject: [PATCH] fix: rotate UA + treat any HTTP response as live (not just 200/203) - Rotate across 7 real browser UAs to avoid bot detection - Any 2xx/3xx/4xx/5xx response = server is UP = live (only no-response = dead) - Parking signals still checked on 200/203 body content - Previous 403/404 responses were incorrectly marking live servers as dead Co-Authored-By: Claude Sonnet 4.6 --- app/validator.py | 56 ++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/app/validator.py b/app/validator.py index 38660ab..d5a88a0 100644 --- a/app/validator.py +++ b/app/validator.py @@ -7,6 +7,7 @@ ip + load_time_ms to enriched_domains. import asyncio import logging import os +import random import socket import time from typing import Optional @@ -36,15 +37,28 @@ PARKING_REDIRECT_HOSTS = { "uniregistry.com", "sedoparking.com", } -_UA = ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/122.0.0.0 Safari/537.36" -) -_HEADERS = { - "User-Agent": _UA, - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", -} +# Any HTTP response code means the server is UP — only connection failures +# and timeouts are truly "dead". 4xx/5xx still means a live web server. 
+_LIVE_CODES = set(range(200, 600)) + +_UAS = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36 Edg/124.0.0.0", +] + +def _headers() -> dict: + return { + "User-Agent": random.choice(_UAS), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "Connection": "keep-alive", + } _val_task: Optional[asyncio.Task] = None _val_stats: dict = { @@ -98,7 +112,7 @@ async def _check_domain(domain: str) -> dict: async with httpx.AsyncClient( timeout=timeouts[scheme], follow_redirects=True, - headers=_HEADERS, + headers=_headers(), verify=False, max_redirects=5, ) as client: @@ -112,6 +126,7 @@ async def _check_domain(domain: str) -> dict: final_url = str(resp.url) final_host = urlparse(final_url).netloc.lower().lstrip("www.") + # Redirected to a completely different domain if not _same_domain(domain, final_url): for ph in PARKING_REDIRECT_HOSTS: if ph in final_host: @@ -120,25 +135,24 @@ async def _check_domain(domain: str) -> dict: result["prescreen_status"] = "redirect" return result - if resp.status_code not in (200, 203): - return result # dead - - html_lc = resp.text[:20_000].lower() - for sig in PARKING_BODY_SIGNALS: - if sig in 
html_lc: - result["prescreen_status"] = "parked" - return result + # Any response from the server = the domain is live. + # 4xx/5xx still means a working web server — only no-response = dead. + # Only check parking signals on 200/203 responses (2xx bodies are readable). + if resp.status_code in (200, 203): + html_lc = resp.text[:20_000].lower() + for sig in PARKING_BODY_SIGNALS: + if sig in html_lc: + result["prescreen_status"] = "parked" + return result result["prescreen_status"] = "live" return result except Exception as e: - # Any failure on http → always try https next - # Any failure on https → give up, leave as dead + # Any failure on http → try https. Any failure on https → dead. logger.debug("Validator %s (%s): %s — %s", domain, scheme, type(e).__name__, e) if scheme == "https": break - # fall through to https result["load_time_ms"] = int((time.monotonic() - t0) * 1000) return result