fix: try https fallback when http port 80 is closed (fixes HTTPS-only domains marked as dead)
Many modern servers refuse plain-HTTP connections entirely. The validator previously tried only http://, so HTTPS-only sites were wrongly marked dead. It now falls back to https:// when the http:// attempt raises ConnectError; timeouts and other errors are not retried. Timeouts were also increased slightly (connect 5→7 s, read 8→12 s, pool 10→15 s). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -85,51 +85,59 @@ async def _check_domain(domain: str) -> dict:
|
||||
"load_time_ms": None,
|
||||
}
|
||||
t0 = time.monotonic()
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=httpx.Timeout(connect=5, read=8, write=5, pool=10),
|
||||
follow_redirects=True,
|
||||
headers=_HEADERS,
|
||||
verify=False,
|
||||
max_redirects=5,
|
||||
) as client:
|
||||
resp = await client.get(f"http://{domain}")
|
||||
|
||||
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
||||
result["status_code"] = resp.status_code
|
||||
result["server"] = (resp.headers.get("server") or "")[:100]
|
||||
# Try http first (follows http→https redirects automatically).
|
||||
# Fall back to https directly if port 80 is closed/refused — many modern
|
||||
# servers only listen on 443 and would be wrongly marked dead otherwise.
|
||||
for scheme in ("http", "https"):
|
||||
try:
|
||||
async with httpx.AsyncClient(
|
||||
timeout=httpx.Timeout(connect=7, read=12, write=5, pool=15),
|
||||
follow_redirects=True,
|
||||
headers=_HEADERS,
|
||||
verify=False,
|
||||
max_redirects=5,
|
||||
) as client:
|
||||
resp = await client.get(f"{scheme}://{domain}")
|
||||
|
||||
# Resolve IP for live-looking domains
|
||||
result["ip"] = await _resolve_ip(domain)
|
||||
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
||||
result["status_code"] = resp.status_code
|
||||
result["server"] = (resp.headers.get("server") or "")[:100]
|
||||
result["ip"] = await _resolve_ip(domain)
|
||||
|
||||
final_url = str(resp.url)
|
||||
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
|
||||
final_url = str(resp.url)
|
||||
final_host = urlparse(final_url).netloc.lower().lstrip("www.")
|
||||
|
||||
# Redirected to a different root domain?
|
||||
if not _same_domain(domain, final_url):
|
||||
for ph in PARKING_REDIRECT_HOSTS:
|
||||
if ph in final_host:
|
||||
result["prescreen_status"] = "parked"
|
||||
return result
|
||||
result["prescreen_status"] = "redirect"
|
||||
return result
|
||||
|
||||
if resp.status_code not in (200, 203):
|
||||
return result # dead
|
||||
|
||||
html_lc = resp.text[:20_000].lower()
|
||||
for sig in PARKING_BODY_SIGNALS:
|
||||
if sig in html_lc:
|
||||
result["prescreen_status"] = "parked"
|
||||
if not _same_domain(domain, final_url):
|
||||
for ph in PARKING_REDIRECT_HOSTS:
|
||||
if ph in final_host:
|
||||
result["prescreen_status"] = "parked"
|
||||
return result
|
||||
result["prescreen_status"] = "redirect"
|
||||
return result
|
||||
|
||||
result["prescreen_status"] = "live"
|
||||
return result
|
||||
if resp.status_code not in (200, 203):
|
||||
return result # dead
|
||||
|
||||
except Exception as e:
|
||||
logger.debug("Validator %s: %s", domain, e)
|
||||
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
||||
return result
|
||||
html_lc = resp.text[:20_000].lower()
|
||||
for sig in PARKING_BODY_SIGNALS:
|
||||
if sig in html_lc:
|
||||
result["prescreen_status"] = "parked"
|
||||
return result
|
||||
|
||||
result["prescreen_status"] = "live"
|
||||
return result
|
||||
|
||||
except httpx.ConnectError:
|
||||
# Port closed / connection refused — try the other scheme
|
||||
logger.debug("Validator %s: ConnectError on %s, trying next scheme", domain, scheme)
|
||||
continue
|
||||
except Exception as e:
|
||||
logger.debug("Validator %s (%s): %s", domain, scheme, e)
|
||||
break # timeout or other error — don't retry
|
||||
|
||||
result["load_time_ms"] = int((time.monotonic() - t0) * 1000)
|
||||
return result
|
||||
|
||||
|
||||
def _get_domains_batch(offset: int, limit: int, tld: Optional[str]) -> list[str]:
|
||||
|
||||
Reference in New Issue
Block a user