diff --git a/app/validator.py b/app/validator.py index f400a4f..190dfde 100644 --- a/app/validator.py +++ b/app/validator.py @@ -85,51 +85,59 @@ async def _check_domain(domain: str) -> dict: "load_time_ms": None, } t0 = time.monotonic() - try: - async with httpx.AsyncClient( - timeout=httpx.Timeout(connect=5, read=8, write=5, pool=10), - follow_redirects=True, - headers=_HEADERS, - verify=False, - max_redirects=5, - ) as client: - resp = await client.get(f"http://{domain}") - result["load_time_ms"] = int((time.monotonic() - t0) * 1000) - result["status_code"] = resp.status_code - result["server"] = (resp.headers.get("server") or "")[:100] + # Try http first (follows http→https redirects automatically). + # Fall back to https directly if port 80 is closed/refused — many modern + # servers only listen on 443 and would be wrongly marked dead otherwise. + for scheme in ("http", "https"): + try: + async with httpx.AsyncClient( + timeout=httpx.Timeout(connect=7, read=12, write=5, pool=15), + follow_redirects=True, + headers=_HEADERS, + verify=False, + max_redirects=5, + ) as client: + resp = await client.get(f"{scheme}://{domain}") - # Resolve IP for live-looking domains - result["ip"] = await _resolve_ip(domain) + result["load_time_ms"] = int((time.monotonic() - t0) * 1000) + result["status_code"] = resp.status_code + result["server"] = (resp.headers.get("server") or "")[:100] + result["ip"] = await _resolve_ip(domain) - final_url = str(resp.url) - final_host = urlparse(final_url).netloc.lower().lstrip("www.") + final_url = str(resp.url) + final_host = urlparse(final_url).netloc.lower().lstrip("www.") - # Redirected to a different root domain? - if not _same_domain(domain, final_url): - for ph in PARKING_REDIRECT_HOSTS: - if ph in final_host: - result["prescreen_status"] = "parked" - return result - result["prescreen_status"] = "redirect" - return result - - if resp.status_code not in (200, 203): - return result # dead - - html_lc = resp.text[:20_000].lower() - for sig in PARKING_BODY_SIGNALS: - if sig in html_lc: - result["prescreen_status"] = "parked" + if not _same_domain(domain, final_url): + for ph in PARKING_REDIRECT_HOSTS: + if ph in final_host: + result["prescreen_status"] = "parked" + return result + result["prescreen_status"] = "redirect" return result - result["prescreen_status"] = "live" - return result + if resp.status_code not in (200, 203): + return result # dead - except Exception as e: - logger.debug("Validator %s: %s", domain, e) - result["load_time_ms"] = int((time.monotonic() - t0) * 1000) - return result + html_lc = resp.text[:20_000].lower() + for sig in PARKING_BODY_SIGNALS: + if sig in html_lc: + result["prescreen_status"] = "parked" + return result + + result["prescreen_status"] = "live" + return result + + except httpx.ConnectError: + # Port closed / connection refused — try the other scheme + logger.debug("Validator %s: ConnectError on %s, trying next scheme", domain, scheme) + continue + except Exception as e: + logger.debug("Validator %s (%s): %s", domain, scheme, e) + break # timeout or other error — don't retry + + result["load_time_ms"] = int((time.monotonic() - t0) * 1000) + return result def _get_domains_batch(offset: int, limit: int, tld: Optional[str]) -> list[str]: