271 lines
8.7 KiB
Python
271 lines
8.7 KiB
Python
|
|
import asyncio
|
||
|
|
import os
|
||
|
|
import ssl
|
||
|
|
import socket
|
||
|
|
import datetime
|
||
|
|
import logging
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
import httpx
|
||
|
|
import dns.resolver
|
||
|
|
import aiosqlite
|
||
|
|
from bs4 import BeautifulSoup
|
||
|
|
|
||
|
|
from app.db import SQLITE_PATH
|
||
|
|
from app.scorer import score
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)

# Maximum number of domains processed at the same time by the worker loop.
# Overridable through the CONCURRENCY_LIMIT environment variable.
CONCURRENCY_LIMIT = int(os.environ.get("CONCURRENCY_LIMIT", "50"))

# ip-api.com free tier: 45 requests per minute, i.e. one request every ~1.33 s.
# Lookups are serialised through a dedicated semaphore that is created lazily
# by get_ip_semaphore().
IP_API_RATE = 45  # requests per minute
IP_API_SEMAPHORE: Optional[asyncio.Semaphore] = None

# Background worker state: the task running worker_loop() and the pause flag.
_worker_task: Optional[asyncio.Task] = None
_paused = False
|
||
|
|
|
||
|
|
|
||
|
|
def get_ip_semaphore():
    """Return the module-wide ip-api.com semaphore, creating it on first use.

    The semaphore has a single slot, so holders are serialised. It is stored
    in the ``IP_API_SEMAPHORE`` global and the same object is returned on
    every subsequent call.
    """
    global IP_API_SEMAPHORE
    semaphore = IP_API_SEMAPHORE
    if semaphore is None:
        # First caller creates the shared instance and publishes it.
        semaphore = IP_API_SEMAPHORE = asyncio.Semaphore(1)
    return semaphore
|
||
|
|
|
||
|
|
|
||
|
|
# Substrings whose presence in the page HTML or in the response headers
# identifies a CMS. Matching is case-insensitive and the first CMS (in
# insertion order) with any matching signature wins.
# NOTE(review): short needles such as "/modules/" or "typo3" can also appear on
# unrelated sites — confirm the false-positive rate is acceptable.
CMS_SIGNATURES = {
    "wordpress": ["/wp-content/", "/wp-includes/", 'name="generator" content="WordPress'],
    "joomla": ["/components/com_", "Joomla!", 'name="generator" content="Joomla'],
    "drupal": ["/sites/default/files/", "Drupal.settings", 'name="generator" content="Drupal'],
    "wix": ["wix.com", "X-Wix-"],
    "squarespace": ["squarespace.com", "X-Squarespace-"],
    "shopify": ["cdn.shopify.com", "Shopify.theme"],
    "prestashop": ["PrestaShop", "/modules/"],
    "magento": ["Mage.Cookies", "X-Magento-"],
    "typo3": ["typo3", "TYPO3 CMS"],
    "opencart": ["route=common/home", "OpenCart"],
}


def detect_cms(html: str, headers: dict) -> Optional[str]:
    """Guess the CMS behind a page from its HTML and response headers.

    Args:
        html: Page body; only the first 50,000 characters are inspected.
        headers: Response headers, rendered as ``name:value`` pairs for matching.

    Returns:
        The first key of ``CMS_SIGNATURES`` with a signature that occurs
        (case-insensitively) in the HTML prefix or the headers, or ``None``
        when nothing matches.
    """
    header_text = " ".join(f"{k}:{v}" for k, v in headers.items())
    # The newline keeps a signature from matching across the HTML/header
    # boundary; the haystack is lower-cased once rather than once per signature.
    haystack = f"{html[:50000]}\n{header_text}".lower()
    for cms, sigs in CMS_SIGNATURES.items():
        if any(sig.lower() in haystack for sig in sigs):
            return cms
    return None
|
||
|
|
|
||
|
|
|
||
|
|
async def check_ssl(domain: str) -> tuple[bool, Optional[int]]:
    """Check whether ``domain`` serves a verifiable TLS certificate on port 443.

    The blocking socket work runs in the event loop's default executor.

    Args:
        domain: Host name to connect to; also used for SNI and host-name
            verification.

    Returns:
        ``(True, days)`` when the handshake succeeds under the default
        verification context, where ``days`` is the number of whole days until
        the certificate's ``notAfter`` date (negative if already past).
        ``(False, None)`` on any failure: resolution, connection, handshake,
        verification, or an unparsable expiry date.
    """

    def _probe() -> tuple[bool, Optional[int]]:
        # Built inside the worker thread: loading the CA store is blocking I/O.
        ctx = ssl.create_default_context()
        with socket.create_connection((domain, 443), timeout=5) as sock:
            with ctx.wrap_socket(sock, server_hostname=domain) as tls_sock:
                cert = tls_sock.getpeercert()
        # cert_time_to_seconds parses the certificate date format in the C
        # locale, unlike strptime("%b ..."), which depends on the process locale.
        expiry_ts = ssl.cert_time_to_seconds(cert.get("notAfter", ""))
        expiry = datetime.datetime.fromtimestamp(expiry_ts, tz=datetime.timezone.utc)
        now = datetime.datetime.now(datetime.timezone.utc)
        return True, (expiry - now).days

    try:
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, _probe)
    except Exception:
        # Best-effort probe: every failure is reported as "no valid certificate".
        return False, None
|
||
|
|
|
||
|
|
|
||
|
|
async def check_mx(domain: str) -> bool:
    """Report whether ``domain`` has at least one MX record.

    The blocking resolver call (5 second lifetime) runs in the event loop's
    default executor. Any failure is reported as ``False``.
    """

    def _has_mx_records() -> bool:
        records = dns.resolver.resolve(domain, "MX", lifetime=5)
        return len(records) > 0

    try:
        # Errors raised inside the executor surface here through the future.
        return await asyncio.get_running_loop().run_in_executor(None, _has_mx_records)
    except Exception:
        return False
|
||
|
|
|
||
|
|
|
||
|
|
# Event-loop time (loop.time()) at which the last ip-api.com request slot was taken.
_ip_last_call = 0.0
# Placeholder for a lazily created lock; always None at import time.
_ip_lock: Optional[asyncio.Lock] = None


async def get_ip_country(ip: str) -> Optional[str]:
    """Look up the country code of ``ip`` through ip-api.com.

    Calls are spaced at least ``60 / IP_API_RATE`` seconds apart: each caller
    takes the shared semaphore, sleeps until the next slot is due, records the
    slot time and only then issues its request.

    Args:
        ip: Address to look up; interpolated into the request URL as-is.

    Returns:
        The ``countryCode`` field of the JSON response, or ``None`` when the
        response is not HTTP 200, the field is absent, or the request fails.
    """
    global _ip_last_call
    min_interval = 60 / IP_API_RATE  # seconds between consecutive requests
    loop = asyncio.get_running_loop()

    async with get_ip_semaphore():
        wait = min_interval - (loop.time() - _ip_last_call)
        if wait > 0:
            await asyncio.sleep(wait)
        _ip_last_call = loop.time()

    try:
        async with httpx.AsyncClient(timeout=5) as client:
            resp = await client.get(f"http://ip-api.com/json/{ip}?fields=countryCode")
        if resp.status_code == 200:
            return resp.json().get("countryCode")
        logger.debug("ip-api.com returned HTTP %s for %s", resp.status_code, ip)
    except Exception:
        # Best-effort lookup: record the failure instead of hiding it.
        logger.debug("ip-api.com lookup failed for %s", ip, exc_info=True)
    return None
|
||
|
|
|
||
|
|
|
||
|
|
async def enrich_domain(domain: str) -> dict:
    """Collect HTTP, TLS, DNS and geo facts about ``domain`` and score them.

    Steps, in order:
      1. GET ``http://<domain>`` (redirects followed) to fill ``is_live``,
         ``status_code``, ``server``, ``page_title`` and ``cms``.
      2. If the GET succeeded, resolve the domain's IP and look up its country.
      3. Check the TLS certificate (``check_ssl``) and MX records (``check_mx``).
      4. Store ``score(result)`` under ``"score"``.

    A failure in step 1 or 2 never raises: step 1 failures are recorded in
    ``result["error"]`` (truncated to 500 characters), step 2 failures leave
    ``ip_country`` as ``None``.

    Args:
        domain: Bare host name, without scheme.

    Returns:
        The result dict with every key initialised below plus ``"score"``.
    """
    # Same naive-UTC ISO format as utcnow().isoformat(), without the
    # deprecated utcnow() call.
    enriched_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    result = {
        "domain": domain,
        "is_live": False,
        "status_code": None,
        "ssl_valid": False,
        "ssl_expiry_days": None,
        "cms": None,
        "has_mx": False,
        "ip_country": None,
        "page_title": None,
        "server": None,
        "enriched_at": enriched_at.isoformat(),
        "error": None,
    }

    try:
        async with httpx.AsyncClient(
            timeout=10,
            follow_redirects=True,
            # Certificate problems must not block the liveness probe; the
            # certificate is validated separately by check_ssl() below.
            verify=False,
            headers={"User-Agent": "Mozilla/5.0 (compatible; DomGod/1.0)"},
        ) as client:
            resp = await client.get(f"http://{domain}")
            result["is_live"] = resp.status_code in (200, 301, 302, 303, 307, 308)
            result["status_code"] = resp.status_code
            result["server"] = resp.headers.get("server")

            html = resp.text
            soup = BeautifulSoup(html, "html.parser")
            title_tag = soup.find("title")
            result["page_title"] = title_tag.get_text(strip=True)[:500] if title_tag else None
            result["cms"] = detect_cms(html, dict(resp.headers))

        # Resolve IP for country lookup. Done after the HTTP client is closed
        # so its connection is not held while waiting for a rate-limited slot.
        try:
            loop = asyncio.get_running_loop()
            ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
            result["ip_country"] = await get_ip_country(ip)
        except Exception:
            logger.debug("IP/country lookup failed for %s", domain, exc_info=True)

    except Exception as e:
        # Some exceptions (e.g. timeouts) carry an empty message; fall back to
        # the class name so a failure is never stored as an empty string.
        result["error"] = (str(e) or type(e).__name__)[:500]

    # SSL check (independent of HTTP)
    ssl_valid, ssl_days = await check_ssl(domain)
    result["ssl_valid"] = ssl_valid
    result["ssl_expiry_days"] = ssl_days

    # MX check
    result["has_mx"] = await check_mx(domain)

    # Score
    result["score"] = score(result)

    return result
|
||
|
|
|
||
|
|
|
||
|
|
async def save_enriched(data: dict):
    """Upsert one enrichment result into ``enriched_domains`` and ``scores``.

    Both statements run on a single connection and are committed together.
    ``data`` must contain every column named in ``enriched_columns``.
    """
    enriched_columns = (
        "domain", "is_live", "status_code", "ssl_valid", "ssl_expiry_days", "cms",
        "has_mx", "ip_country", "page_title", "server", "enriched_at", "error", "score",
    )
    upsert_enriched = """INSERT INTO enriched_domains
               (domain, is_live, status_code, ssl_valid, ssl_expiry_days, cms,
                has_mx, ip_country, page_title, server, enriched_at, error, score)
               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)
               ON CONFLICT(domain) DO UPDATE SET
                 is_live=excluded.is_live, status_code=excluded.status_code,
                 ssl_valid=excluded.ssl_valid, ssl_expiry_days=excluded.ssl_expiry_days,
                 cms=excluded.cms, has_mx=excluded.has_mx, ip_country=excluded.ip_country,
                 page_title=excluded.page_title, server=excluded.server,
                 enriched_at=excluded.enriched_at, error=excluded.error, score=excluded.score"""
    upsert_score = """INSERT INTO scores (domain, score) VALUES (?,?)
               ON CONFLICT(domain) DO UPDATE SET score=excluded.score, scored_at=datetime('now')"""

    async with aiosqlite.connect(SQLITE_PATH) as db:
        # Parameters are bound positionally in the column order listed above.
        row = tuple(data[column] for column in enriched_columns)
        await db.execute(upsert_enriched, row)
        await db.execute(upsert_score, (data["domain"], data["score"]))
        await db.commit()
|
||
|
|
|
||
|
|
|
||
|
|
async def mark_job(domain: str, status: str, error: str = None):
    """Record a status transition for ``domain`` in ``job_queue``.

    ``"running"`` stamps ``started_at``; ``"done"`` and ``"failed"`` stamp
    ``completed_at`` and store ``error``. Any other status leaves the table
    unchanged (the connection is still opened and committed).
    """
    if status == "running":
        statement = "UPDATE job_queue SET status=?, started_at=datetime('now') WHERE domain=?"
        params = (status, domain)
    elif status in ("done", "failed"):
        statement = "UPDATE job_queue SET status=?, completed_at=datetime('now'), error=? WHERE domain=?"
        params = (status, error, domain)
    else:
        statement, params = None, ()

    async with aiosqlite.connect(SQLITE_PATH) as db:
        if statement is not None:
            await db.execute(statement, params)
        await db.commit()
|
||
|
|
|
||
|
|
|
||
|
|
async def worker_loop():
    """Poll ``job_queue`` forever and enrich pending domains in batches.

    Each iteration fetches up to 100 pending domains, processes them with at
    most ``CONCURRENCY_LIMIT`` running at once, and waits for the whole batch
    before polling again. While the pause flag is set, nothing is fetched.

    A failure while polling the database is logged and retried after a short
    delay instead of terminating the loop. Exceptions that escape a single
    job are logged rather than dropped.
    """
    sem = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async def process(domain: str):
        """Run one job: mark running, enrich, save, then mark done or failed."""
        async with sem:
            await mark_job(domain, "running")
            try:
                data = await enrich_domain(domain)
                await save_enriched(data)
                await mark_job(domain, "done")
            except Exception as e:
                # Fall back to the class name when the exception has no message.
                await mark_job(domain, "failed", (str(e) or type(e).__name__)[:500])

    while True:
        if _paused:
            await asyncio.sleep(1)
            continue

        try:
            async with aiosqlite.connect(SQLITE_PATH) as db:
                async with db.execute(
                    "SELECT domain FROM job_queue WHERE status='pending' LIMIT 100"
                ) as cur:
                    rows = await cur.fetchall()
        except Exception:
            # A transient database error must not end the worker task.
            logger.exception("Polling job_queue failed; retrying")
            await asyncio.sleep(2)
            continue

        if not rows:
            await asyncio.sleep(2)
            continue

        tasks = [asyncio.create_task(process(r[0])) for r in rows]
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)
        for row, outcome in zip(rows, outcomes):
            if isinstance(outcome, BaseException):
                # Raised outside process()'s own handler, e.g. by mark_job().
                logger.error("Job for %s raised %r", row[0], outcome)
|
||
|
|
|
||
|
|
|
||
|
|
def start_worker():
    """Start the background worker task unless one is already running.

    Creates a task for ``worker_loop()`` when no task exists or the previous
    one has finished, and clears the pause flag for the new task. Must be
    called while an event loop is running, because ``asyncio.create_task``
    requires one.
    """
    # `_paused` must be declared global: without it the assignment below only
    # bound a function-local name and the module flag was never cleared.
    global _worker_task, _paused
    if _worker_task is None or _worker_task.done():
        _worker_task = asyncio.create_task(worker_loop())
        # NOTE(review): the flag is cleared only when a new task is created;
        # confirm whether a start request should also un-pause a live worker.
        _paused = False
|
||
|
|
|
||
|
|
|
||
|
|
def pause_worker():
    """Set the module-level pause flag.

    ``worker_loop`` checks the flag at the top of each polling iteration, so
    no new batch is fetched while it is set.
    """
    global _paused
    _paused = True
|
||
|
|
|
||
|
|
|
||
|
|
def resume_worker():
    """Clear the pause flag, then make sure a worker task exists.

    The flag is reset first; ``start_worker()`` is then called to create a
    task if none is currently running.
    """
    global _paused
    _paused = False
    start_worker()
|
||
|
|
|
||
|
|
|
||
|
|
def is_running() -> bool:
    """Return True when a worker task exists, has not finished, and is not paused."""
    task = _worker_task
    if task is None or task.done():
        return False
    return not _paused
|