fix: try https fallback when http port 80 is closed (fixes HTTPS-only domains marked as dead)

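Many modern servers refuse plain-HTTP connections entirely. The validator
only tried http://, so HTTPS-only sites were wrongly marked dead. It now
falls back to https:// when the http:// attempt raises httpx.ConnectError;
timeouts and other errors are not retried. Timeouts were also raised
slightly (connect 5→7 s, read 8→12 s, pool 10→15 s; write unchanged at 5 s).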

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 20:11:00 +02:00
parent 3f042196d3
commit ae2fad0152

View File

@@ -85,51 +85,59 @@ async def _check_domain(domain: str) -> dict:
"load_time_ms": None,
}
t0 = time.monotonic()
try:
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=5, read=8, write=5, pool=10),
follow_redirects=True,
headers=_HEADERS,
verify=False,
max_redirects=5,
) as client:
resp = await client.get(f"http://{domain}")
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
result["status_code"] = resp.status_code
result["server"] = (resp.headers.get("server") or "")[:100]
# Try http first (follows http→https redirects automatically).
# Fall back to https directly if port 80 is closed/refused — many modern
# servers only listen on 443 and would be wrongly marked dead otherwise.
for scheme in ("http", "https"):
try:
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=7, read=12, write=5, pool=15),
follow_redirects=True,
headers=_HEADERS,
verify=False,
max_redirects=5,
) as client:
resp = await client.get(f"{scheme}://{domain}")
# Resolve IP for live-looking domains
result["ip"] = await _resolve_ip(domain)
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
result["status_code"] = resp.status_code
result["server"] = (resp.headers.get("server") or "")[:100]
result["ip"] = await _resolve_ip(domain)
final_url = str(resp.url)
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
final_url = str(resp.url)
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
# Redirected to a different root domain?
if not _same_domain(domain, final_url):
for ph in PARKING_REDIRECT_HOSTS:
if ph in final_host:
result["prescreen_status"] = "parked"
return result
result["prescreen_status"] = "redirect"
return result
if resp.status_code not in (200, 203):
return result # dead
html_lc = resp.text[:20_000].lower()
for sig in PARKING_BODY_SIGNALS:
if sig in html_lc:
result["prescreen_status"] = "parked"
if not _same_domain(domain, final_url):
for ph in PARKING_REDIRECT_HOSTS:
if ph in final_host:
result["prescreen_status"] = "parked"
return result
result["prescreen_status"] = "redirect"
return result
result["prescreen_status"] = "live"
return result
if resp.status_code not in (200, 203):
return result # dead
except Exception as e:
logger.debug("Validator %s: %s", domain, e)
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
return result
html_lc = resp.text[:20_000].lower()
for sig in PARKING_BODY_SIGNALS:
if sig in html_lc:
result["prescreen_status"] = "parked"
return result
result["prescreen_status"] = "live"
return result
except httpx.ConnectError:
# Port closed / connection refused — try the other scheme
logger.debug("Validator %s: ConnectError on %s, trying next scheme", domain, scheme)
continue
except Exception as e:
logger.debug("Validator %s (%s): %s", domain, scheme, e)
break # timeout or other error — don't retry
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
return result
def _get_domains_batch(offset: int, limit: int, tld: Optional[str]) -> list[str]: