fix: AI worker crash-proof + GDPR/hosting/accessibility analysis
AI worker fixes (root cause of "nothing reaches Replicate"):
- Worker task died silently — there was no exception handler around the while loop
- Added try/except around entire loop body with exc_info logging
- Added watchdog task that checks every 10 seconds and restarts dead workers
- ensure_workers_alive() is called on every /api/ai/assess/batch POST
- _assess_one() is now a top-level function (not a closure) — avoids subtle scoping bugs with async inner functions in while loops
- /api/ai/debug endpoint: shows worker alive status, task exception, last 10 queue entries — browse to /api/ai/debug to diagnose
- /api/ai/worker/restart endpoint + UI button
- "Restart AI worker" button + "Debug AI queue" link in the enrichment tab

site_analyzer.py — new signals:
- IP resolution + ip-api.com for ASN, org, ISP, host country
- EU hosting detection (27 EU + EEA + adequacy countries)
- GDPR: detects Cookiebot, OneTrust, CookiePro, Osano, Iubenda, Borlabs, CookieYes, Complianz, Usercentrics + text signals
- Privacy policy and GDPR text presence
- Accessibility: html lang missing, images without alt count, skip nav link, empty links, inputs without labels

Gemini prompt additions:
- Hosting section: IP, ASN, org/ISP, EU vs non-EU flag
- GDPR section: cookie tool, notice, privacy policy
- Accessibility section: all quick-scan results
- New output fields: hosting_notes, gdpr_compliance, accessibility_issues[]

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
BIN
app/__pycache__/__init__.cpython-311.pyc
Normal file
BIN
app/__pycache__/__init__.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/__pycache__/db.cpython-311.pyc
Normal file
BIN
app/__pycache__/db.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/__pycache__/enricher.cpython-311.pyc
Normal file
BIN
app/__pycache__/enricher.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/__pycache__/replicate_ai.cpython-311.pyc
Normal file
BIN
app/__pycache__/replicate_ai.cpython-311.pyc
Normal file
Binary file not shown.
BIN
app/__pycache__/site_analyzer.cpython-311.pyc
Normal file
BIN
app/__pycache__/site_analyzer.cpython-311.pyc
Normal file
Binary file not shown.
@@ -338,14 +338,42 @@ async def worker_loop():
|
|||||||
|
|
||||||
# ── AI assessment worker ──────────────────────────────────────────────────────
|
# ── AI assessment worker ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def ai_worker_loop():
|
async def _assess_one(domain: str) -> None:
    """Run one AI assessment for *domain* and persist the outcome.

    Analyses the site, hands the analysis to the Gemini assessor and stores
    the result with ``save_ai_assessment``.  If any step raises, the failure
    is logged with a traceback and the domain's ``ai_queue`` row is set to
    ``status='failed'`` with a truncated error message.

    No ``Exception`` escapes this coroutine, so several calls can be run
    side by side without one failure affecting the others.
    """
    # Kept function-local as in the original code.
    # NOTE(review): presumably this avoids an import cycle — confirm.
    from app.replicate_ai import assess_domain as gemini_assess
    from app.site_analyzer import analyze_site

    logger.info("AI: starting analysis for %s", domain)
    try:
        analysis = await analyze_site(domain)
        logger.info("AI: site analyzed %s (reachable=%s, words=%s)",
                    domain, analysis.get("reachable"), analysis.get("word_count"))
        assessment = await gemini_assess(analysis)
        logger.info("AI: Gemini done %s → quality=%s",
                    domain, assessment.get("lead_quality"))
        await save_ai_assessment(domain, assessment, site_analysis=analysis)
        logger.info("AI: saved %s", domain)
    except Exception as e:
        logger.error("AI: failed %s — %s", domain, e, exc_info=True)
        # Include the exception type: many exceptions (timeouts, for
        # example) have an empty message, which would leave the error
        # column blank and the failure undiagnosable.
        error_text = f"{type(e).__name__}: {e}"[:400]
        try:
            async with aiosqlite.connect(SQLITE_PATH) as db:
                await db.execute(
                    "UPDATE ai_queue SET status='failed', completed_at=datetime('now'), error=? WHERE domain=?",
                    (error_text, domain),
                )
                await db.commit()
        except Exception:
            # Still best-effort (never re-raised), but no longer silent:
            # if this write fails the row keeps its previous status, and
            # the log is the only trace of it.
            logger.exception("AI: could not mark %s as failed in ai_queue", domain)
|
||||||
|
|
||||||
|
|
||||||
|
async def ai_worker_loop():
|
||||||
|
logger.info("AI worker loop starting")
|
||||||
while True:
|
while True:
|
||||||
|
rows = []
|
||||||
|
try:
|
||||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
async with aiosqlite.connect(SQLITE_PATH) as db:
|
||||||
async with db.execute(
|
async with db.execute(
|
||||||
"SELECT domain FROM ai_queue WHERE status='pending' LIMIT 10"
|
"SELECT domain FROM ai_queue WHERE status='pending' LIMIT 5"
|
||||||
) as cur:
|
) as cur:
|
||||||
rows = await cur.fetchall()
|
rows = await cur.fetchall()
|
||||||
if rows:
|
if rows:
|
||||||
@@ -354,37 +382,44 @@ async def ai_worker_loop():
|
|||||||
[(r[0],) for r in rows],
|
[(r[0],) for r in rows],
|
||||||
)
|
)
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
logger.info("AI worker: picked up %d jobs: %s",
|
||||||
|
len(rows), [r[0] for r in rows])
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("AI worker DB error: %s", e, exc_info=True)
|
||||||
|
await asyncio.sleep(5)
|
||||||
|
continue
|
||||||
|
|
||||||
if not rows:
|
if not rows:
|
||||||
await asyncio.sleep(3)
|
await asyncio.sleep(3)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
async def assess_one(domain: str):
|
# Run assessments concurrently (semaphore in replicate_ai enforces AI_CONCURRENCY)
|
||||||
try:
|
results = await asyncio.gather(
|
||||||
# Always do a fresh deep scrape — no pre-enrichment required
|
*[_assess_one(r[0]) for r in rows],
|
||||||
analysis = await analyze_site(domain)
|
return_exceptions=True,
|
||||||
assessment = await gemini_assess(analysis)
|
|
||||||
await save_ai_assessment(domain, assessment, site_analysis=analysis)
|
|
||||||
logger.info("AI done: %s → %s", domain, assessment.get("lead_quality"))
|
|
||||||
except Exception as e:
|
|
||||||
async with aiosqlite.connect(SQLITE_PATH) as db:
|
|
||||||
await db.execute(
|
|
||||||
"UPDATE ai_queue SET status='failed', completed_at=datetime('now') WHERE domain=?",
|
|
||||||
(domain,),
|
|
||||||
)
|
)
|
||||||
await db.commit()
|
for r, exc in zip(rows, results):
|
||||||
logger.error("AI worker error %s: %s", domain, e)
|
if isinstance(exc, Exception):
|
||||||
|
logger.error("AI task exception for %s: %s", r[0], exc, exc_info=exc)
|
||||||
# AI_CONCURRENCY concurrent assessments (already enforced by replicate_ai semaphore)
|
|
||||||
await asyncio.gather(*[asyncio.create_task(assess_one(r[0])) for r in rows], return_exceptions=True)
|
|
||||||
|
|
||||||
|
|
||||||
def _log_worker_death(name, task):
    """Log the exception that ended *task*, if it finished with one.

    Does nothing when *task* is ``None``, still running, or was cancelled.
    """
    if task is None or not task.done() or task.cancelled():
        return
    exc = task.exception()
    if exc:
        logger.error("%s died with: %s", name, exc, exc_info=exc)


def start_worker():
    """Start the enrichment and AI worker tasks unless they are running.

    A worker task is (re)created when it has never been started or when
    its previous task has finished.  If the previous task ended with an
    exception, that exception is logged before the replacement is created.
    Both workers are treated the same way.

    Must be called while an event loop is running, because it uses
    ``asyncio.create_task``.

    NOTE(review): a cancelled task also counts as "done" and is therefore
    recreated here.  If ``pause_worker()`` pauses by cancelling the task,
    periodic callers of this function would undo the pause — confirm.
    """
    global _worker_task, _ai_worker_task

    if _worker_task is None or _worker_task.done():
        _log_worker_death("Enrichment worker", _worker_task)
        _worker_task = asyncio.create_task(worker_loop())
        logger.info("Enrichment worker started")

    if _ai_worker_task is None or _ai_worker_task.done():
        _log_worker_death("AI worker", _ai_worker_task)
        _ai_worker_task = asyncio.create_task(ai_worker_loop())
        logger.info("AI worker started/restarted")
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_workers_alive():
    """Make sure the background worker tasks are running.

    Delegates to ``start_worker()``; intended to be called periodically.
    """
    start_worker()
|
||||||
|
|
||||||
|
|
||||||
def pause_worker():
|
def pause_worker():
|
||||||
|
|||||||
45
app/main.py
45
app/main.py
@@ -20,7 +20,7 @@ from app.db import (
|
|||||||
queue_domains, get_queue_status, build_duckdb_index, index_status,
|
queue_domains, get_queue_status, build_duckdb_index, index_status,
|
||||||
queue_ai, get_ai_queue_status, save_ai_assessment,
|
queue_ai, get_ai_queue_status, save_ai_assessment,
|
||||||
)
|
)
|
||||||
from app.enricher import start_worker, pause_worker, resume_worker, is_running
|
from app.enricher import start_worker, pause_worker, resume_worker, is_running, ensure_workers_alive
|
||||||
from app.scorer import run_scoring
|
from app.scorer import run_scoring
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||||
@@ -61,13 +61,20 @@ async def download_parquet():
|
|||||||
logger.info("Parquet download complete")
|
logger.info("Parquet download complete")
|
||||||
|
|
||||||
|
|
||||||
|
async def _watchdog(interval_s: float = 10):
    """Periodically check the background workers and restart dead ones.

    Sleeps ``interval_s`` seconds (10 by default), calls
    ``ensure_workers_alive()``, and repeats forever.  The loop ends only
    when the task running it is cancelled.
    """
    while True:
        await asyncio.sleep(interval_s)
        try:
            ensure_workers_alive()
        except Exception:
            # A failed restart attempt must not end the watchdog itself,
            # otherwise nothing is left to revive the workers.
            logger.exception("Watchdog: failed to check/restart workers")
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
await download_parquet()
|
await download_parquet()
|
||||||
await init_db()
|
await init_db()
|
||||||
# Build DuckDB index in background — queries still work (slower) while building
|
|
||||||
asyncio.create_task(build_duckdb_index())
|
asyncio.create_task(build_duckdb_index())
|
||||||
start_worker()
|
start_worker()
|
||||||
|
asyncio.create_task(_watchdog())
|
||||||
logger.info("DomGod ready on port 6677")
|
logger.info("DomGod ready on port 6677")
|
||||||
yield
|
yield
|
||||||
|
|
||||||
@@ -167,9 +174,43 @@ async def ai_assess_batch(body: dict):
|
|||||||
if not domains_list:
|
if not domains_list:
|
||||||
return JSONResponse({"error": "no domains provided"}, status_code=400)
|
return JSONResponse({"error": "no domains provided"}, status_code=400)
|
||||||
await queue_ai(domains_list)
|
await queue_ai(domains_list)
|
||||||
|
ensure_workers_alive() # ensure AI worker is alive when jobs are queued
|
||||||
return {"queued": len(domains_list)}
|
return {"queued": len(domains_list)}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/ai/worker/restart")
async def ai_worker_restart():
    """Ask the worker layer to bring its background tasks back up.

    Calls ``ensure_workers_alive()`` and always answers
    ``{"status": "restarted"}``.

    NOTE(review): ``ensure_workers_alive()`` appears to recreate only
    missing or finished tasks, so a worker that is alive but stuck would
    not be restarted by this endpoint — confirm.
    """
    ensure_workers_alive()
    return {"status": "restarted"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/ai/debug")
async def ai_debug():
    """Return AI worker state and the 10 newest queue rows for troubleshooting.

    Response keys:
      * ``ai_worker_alive`` — True when the worker task exists and has not finished.
      * ``ai_worker_exception`` — text of the exception that ended the task,
        or ``None`` when the task is running, was cancelled, or finished
        without an exception.
      * ``recent_queue`` — the 10 most recently created ``ai_queue`` rows.
      * ``queue_status`` — the result of ``get_ai_queue_status()``.
    """
    # Imported at call time so each request sees the current task object
    # rather than a reference captured when this module was first loaded.
    from app.enricher import _ai_worker_task as task

    task_alive = task is not None and not task.done()

    task_exc = None
    if task is not None and task.done() and not task.cancelled():
        # exception() cannot raise here: the task is done and not cancelled.
        exc = task.exception()
        if exc is not None:
            # Previously str(None) leaked out as the string "None" for a
            # task that had finished cleanly.  Fall back to repr() when
            # the exception carries no message.
            task_exc = str(exc) or repr(exc)

    async with aiosqlite.connect(SQLITE_PATH) as db:
        db.row_factory = aiosqlite.Row
        async with db.execute(
            "SELECT domain, status, created_at, completed_at, error FROM ai_queue ORDER BY created_at DESC LIMIT 10"
        ) as cur:
            recent = [dict(r) async for r in cur]

    return {
        "ai_worker_alive": task_alive,
        "ai_worker_exception": task_exc,
        "recent_queue": recent,
        "queue_status": await get_ai_queue_status(),
    }
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/ai/status")
|
@app.get("/api/ai/status")
|
||||||
async def ai_status():
|
async def ai_status():
|
||||||
return await get_ai_queue_status()
|
return await get_ai_queue_status()
|
||||||
|
|||||||
@@ -25,88 +25,105 @@ def _sem() -> asyncio.Semaphore:
|
|||||||
|
|
||||||
|
|
||||||
def _build_prompt(a: dict) -> str:
|
def _build_prompt(a: dict) -> str:
|
||||||
"""Build the Gemini prompt from a full site analysis dict."""
|
|
||||||
contacts_block = []
|
contacts_block = []
|
||||||
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
if a.get("emails"): contacts_block.append(f" Emails: {', '.join(a['emails'][:3])}")
|
||||||
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
if a.get("phones"): contacts_block.append(f" Phones: {', '.join(a['phones'][:3])}")
|
||||||
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
|
if a.get("whatsapp"): contacts_block.append(f" WhatsApp: {', '.join(a['whatsapp'][:2])}")
|
||||||
if a.get("social_links"): contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
|
if a.get("social_links"):contacts_block.append(f" Social: {', '.join(a['social_links'][:4])}")
|
||||||
contacts_str = "\n".join(contacts_block) or " None found"
|
contacts_str = "\n".join(contacts_block) or " None found"
|
||||||
|
|
||||||
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None detected"
|
kd_str = "\n".join(f" - {s}" for s in (a.get("kit_digital_signals") or [])) or " None"
|
||||||
analytics_str = ", ".join(a.get("analytics_present") or []) or "none"
|
analytics = ", ".join(a.get("analytics_present") or []) or "none"
|
||||||
webmaster_str = ", ".join(a.get("webmaster_verified") or []) or "none"
|
webmaster = ", ".join(a.get("webmaster_verified") or []) or "none"
|
||||||
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
lorem_str = ", ".join(a.get("lorem_matches") or []) or "none"
|
||||||
placeholder_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
ph_str = ", ".join(a.get("placeholder_matches") or []) or "none"
|
||||||
|
snippet = (a.get("visible_text_snippet") or "")[:2000]
|
||||||
|
|
||||||
text_snippet = (a.get("visible_text_snippet") or "")[:2000]
|
eu_hosted = a.get("eu_hosted")
|
||||||
|
hosting_flag = "✅ EU" if eu_hosted else ("❌ Non-EU" if eu_hosted is False else "unknown")
|
||||||
|
|
||||||
return f"""You are a senior web consultant and IT sales analyst reviewing a Spanish SME website.
|
return f"""You are a senior web consultant and IT sales analyst evaluating a Spanish/European SME website for IT services upsell.
|
||||||
|
|
||||||
=== TECHNICAL SNAPSHOT ===
|
=== TECHNICAL ===
|
||||||
Domain: {a.get("domain")}
|
Domain: {a.get("domain")}
|
||||||
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
|
Reachable: {a.get("reachable")} | Status: {a.get("status_code")} | Load time: {a.get("load_time_ms")} ms
|
||||||
Final URL: {a.get("final_url")}
|
|
||||||
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
|
Page size: {a.get("page_size_kb")} KB | Server: {a.get("server")} | CMS: {a.get("cms") or "unknown"}
|
||||||
SSL valid: {a.get("ssl_valid")} | SSL expires in: {a.get("ssl_expiry_days")} days
|
SSL: valid={a.get("ssl_valid")} expires_in={a.get("ssl_expiry_days")} days
|
||||||
Mobile viewport: {a.get("has_mobile_viewport")}
|
Mobile: viewport={a.get("has_mobile_viewport")}
|
||||||
Word count: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
|
Words: {a.get("word_count")} | Images: {a.get("image_count")} | Scripts: {a.get("script_count")}
|
||||||
|
|
||||||
=== SEO & INDEXING SIGNALS ===
|
=== HOSTING & INFRASTRUCTURE ===
|
||||||
Page title: {a.get("page_title") or "missing"}
|
IP: {a.get("ip") or "unknown"}
|
||||||
H1: {a.get("h1_text") or "missing"}
|
ASN: {a.get("asn") or "unknown"}
|
||||||
Meta description: {a.get("meta_description") or "missing"}
|
Organisation: {a.get("org") or "unknown"}
|
||||||
Canonical URL: {a.get("canonical_url") or "not set"}
|
ISP: {a.get("isp") or "unknown"}
|
||||||
Sitemap.xml: {a.get("has_sitemap")}
|
Host country: {a.get("ip_country") or "unknown"} / {a.get("ip_region") or ""}
|
||||||
Robots.txt: {a.get("has_robots")} | Blocks Googlebot: {a.get("robots_disallows_google")}
|
EU hosted: {hosting_flag}
|
||||||
Analytics: {analytics_str}
|
|
||||||
Webmaster verified:{webmaster_str}
|
=== SEO & INDEXING ===
|
||||||
|
Title: {a.get("page_title") or "MISSING"}
|
||||||
|
H1: {a.get("h1_text") or "MISSING"}
|
||||||
|
Meta desc: {a.get("meta_description") or "MISSING"}
|
||||||
|
Canonical: {a.get("canonical_url") or "not set"}
|
||||||
|
Sitemap: {a.get("has_sitemap")} | Robots: {a.get("has_robots")} | Blocks Google: {a.get("robots_disallows_google")}
|
||||||
|
Analytics: {analytics}
|
||||||
|
Webmaster: {webmaster}
|
||||||
|
|
||||||
|
=== GDPR & LEGAL COMPLIANCE ===
|
||||||
|
Cookie tool: {a.get("cookie_tool") or "none detected"}
|
||||||
|
Cookie notice: {a.get("has_cookie_notice")}
|
||||||
|
Privacy policy: {a.get("has_privacy_policy")}
|
||||||
|
GDPR text: {a.get("has_gdpr_text")}
|
||||||
|
|
||||||
|
=== ACCESSIBILITY (quick scan) ===
|
||||||
|
HTML lang attr: {a.get("html_lang") or "MISSING"}
|
||||||
|
Images missing alt: {a.get("images_missing_alt")}
|
||||||
|
Skip navigation link: {a.get("has_skip_nav")}
|
||||||
|
Empty links: {a.get("empty_links")}
|
||||||
|
Inputs without labels: {a.get("inputs_without_labels")}
|
||||||
|
|
||||||
=== CONTENT QUALITY ===
|
=== CONTENT QUALITY ===
|
||||||
Lorem ipsum found: {a.get("has_lorem_ipsum")} → matches: {lorem_str}
|
Lorem ipsum: {a.get("has_lorem_ipsum")} → {lorem_str}
|
||||||
Placeholder text: {a.get("has_placeholder")} → matches: {placeholder_str}
|
Placeholder: {a.get("has_placeholder")} → {ph_str}
|
||||||
|
|
||||||
=== KIT DIGITAL (Spanish gov digitalization grant — sites must display EU logos) ===
|
=== KIT DIGITAL (Spanish gov €12k SME grants — sites must show EU logos) ===
|
||||||
Detected: {a.get("kit_digital")}
|
Detected: {a.get("kit_digital")}
|
||||||
Signals:
|
|
||||||
{kd_str}
|
{kd_str}
|
||||||
|
|
||||||
=== CONTACT CHANNELS ===
|
=== CONTACT CHANNELS ===
|
||||||
{contacts_str}
|
{contacts_str}
|
||||||
|
|
||||||
=== PAGE TEXT SAMPLE (first 2000 chars) ===
|
=== PAGE TEXT SAMPLE ===
|
||||||
{text_snippet}
|
{snippet}
|
||||||
|
|
||||||
=== TASK ===
|
=== INSTRUCTIONS ===
|
||||||
Analyse this site for IT services upsell potential. The client sells:
|
The client sells: web redesign, SEO, hosting migration, SSL renewal,
|
||||||
web design/redesign, SEO, hosting migration, SSL renewal, security audits,
|
security audits, GDPR compliance, accessibility fixes, Google Ads,
|
||||||
maintenance contracts, Google Ads, and AI-assisted tools for SMEs.
|
maintenance contracts, AI tools for SMEs.
|
||||||
|
|
||||||
Respond ONLY with valid JSON — no markdown, no text outside the JSON object:
|
Respond ONLY with valid JSON, no markdown fences, no text outside the JSON:
|
||||||
{{
|
{{
|
||||||
"summary": "2-3 sentence executive summary of the site's current state",
|
"summary": "2-3 sentence executive summary of the site's state",
|
||||||
"site_quality_score": <0-10 integer>,
|
"site_quality_score": <0-10>,
|
||||||
"content_issues": ["list of specific content problems found — lorem ipsum, broken sections, placeholder text, etc."],
|
"content_issues": ["specific issues found in page content"],
|
||||||
"performance_notes": "comment on load time, page size, mobile readiness",
|
"performance_notes": "load time, size, mobile assessment",
|
||||||
"seo_status": "brief SEO assessment — indexing signals, missing elements",
|
"seo_status": "SEO health — what's missing or broken",
|
||||||
|
"hosting_notes": "ASN/ISP name, EU vs non-EU, any concerns",
|
||||||
|
"gdpr_compliance": "cookie banner status, privacy policy, GDPR gaps",
|
||||||
|
"accessibility_issues": ["specific a11y problems found"],
|
||||||
"kit_digital_confirmed": true/false,
|
"kit_digital_confirmed": true/false,
|
||||||
"kit_digital_reasoning": "1 sentence — why confirmed or not",
|
"kit_digital_reasoning": "1 sentence",
|
||||||
"is_local_sme": true/false,
|
"is_local_sme": true/false,
|
||||||
"lead_quality": "HOT|WARM|COLD",
|
"lead_quality": "HOT|WARM|COLD",
|
||||||
"lead_reasoning": "1-2 sentences on why",
|
"lead_reasoning": "1-2 sentences",
|
||||||
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
"best_contact_channel": "email|phone|whatsapp|social|web_form|unknown",
|
||||||
"best_contact_value": "the actual value to use (email address, phone number, URL) or empty string",
|
"best_contact_value": "actual email/phone/URL or empty string",
|
||||||
"all_contacts": {{
|
"all_contacts": {{"emails":[],"phones":[],"whatsapp":[],"social":[]}},
|
||||||
"emails": [],
|
"pitch_angle": "1 cold-outreach sentence in Spanish",
|
||||||
"phones": [],
|
"services_needed": ["service1","service2"],
|
||||||
"whatsapp": [],
|
"urgency_signals": ["specific urgent issues like 'SSL expires in 12 days', 'lorem ipsum on homepage', 'no cookie banner'"],
|
||||||
"social": []
|
"outreach_notes": "sales rep context"
|
||||||
}},
|
}}
|
||||||
"pitch_angle": "One concrete opening sentence in Spanish for cold outreach",
|
|
||||||
"services_needed": ["service1", "service2"],
|
|
||||||
"urgency_signals": ["list of specific urgent issues — expiring SSL, lorem ipsum live, no GA, blocked robots etc"],
|
|
||||||
"outreach_notes": "Key context for the sales rep"
|
|
||||||
}}"""
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_output(raw: str) -> dict:
|
def _parse_output(raw: str) -> dict:
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
"""Deep site analysis: content quality, SEO signals, performance, indexing hints."""
|
"""Deep site analysis: content quality, SEO, hosting, GDPR, accessibility."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import logging
|
import logging
|
||||||
|
import socket
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
@@ -10,25 +11,58 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ── Content quality ───────────────────────────────────────────────────────────
|
# ── EU countries (hosting check) ─────────────────────────────────────────────
|
||||||
|
EU_COUNTRIES = {
    # Two-letter country codes that the hosting check counts as "EU hosted".
    # Despite the name, the set is wider than the EU itself.
    #
    # EU member states (27)
    "AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "ES",
    "FI", "FR", "GR", "HR", "HU", "IE", "IT", "LT", "LU",
    "LV", "MT", "NL", "PL", "PT", "RO", "SE", "SI", "SK",
    # EEA members outside the EU
    "IS", "LI", "NO",
    # Adequacy / adjacent
    "AD", "CH", "GB",
}
|
||||||
|
|
||||||
|
# ── Content quality ───────────────────────────────────────────────────────────
|
||||||
LOREM_PHRASES = [
|
LOREM_PHRASES = [
|
||||||
"lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
|
"lorem ipsum", "sed ut perspiciatis", "nunc sem sapien",
|
||||||
"nulla id nibh", "aenean dignissim", "aliquam tincidunt",
|
"nulla id nibh", "aenean dignissim", "aliquam tincidunt",
|
||||||
"vestibulum commodo", "fusce nunc lacus", "consectetuer",
|
"vestibulum commodo", "fusce nunc lacus", "consectetuer",
|
||||||
"cras ornare tristique", "ntulla nec ante", "risus id metus",
|
"cras ornare tristique", "ntulla nec ante", "risus id metus",
|
||||||
"praesent placerat", "fusce pellentesque", "suscipit nibh",
|
"praesent placerat", "fusce pellentesque", "suscipit nibh",
|
||||||
"integer vitae libero", "felis quis tortor",
|
"integer vitae libero", "felis quis tortor", "dolor sit amet",
|
||||||
]
|
]
|
||||||
|
|
||||||
PLACEHOLDER_PHRASES = [
|
PLACEHOLDER_PHRASES = [
|
||||||
"under construction", "coming soon", "sample page",
|
"under construction", "coming soon", "sample page",
|
||||||
"this is a demo", "default post", "hello world",
|
"this is a demo", "hello world", "test content",
|
||||||
"test post", "uncategorized",
|
"default post", "uncategorized", "demo content",
|
||||||
]
|
]
|
||||||
|
|
||||||
# ── Analytics & webmaster tags ────────────────────────────────────────────────
|
# ── Cookie / GDPR consent tools ───────────────────────────────────────────────
|
||||||
|
# Consent-management tools, keyed by a short tool name; each value lists the
# substrings that identify that tool.
# NOTE(review): some needles are mixed-case ("CookieConsent", "otBannerSdk");
# if they are matched against lowercased HTML they can never hit — confirm
# against the matching code.
COOKIE_TOOLS = {
    "cookiebot": [
        "cookiebot.com",
        "CookieConsent",
        "CybotCookiebot",
    ],
    "onetrust": [
        "onetrust",
        "otBannerSdk",
    ],
    "cookiepro": ["cookiepro.com"],
    "osano": ["osano.com"],
    "iubenda": ["iubenda.com"],
    "borlabs": ["borlabs-cookie"],
    "complianz": ["complianz"],
    "cookieyes": [
        "cookieyes.com",
        "cookie-law-info",
    ],
    "usercentrics": ["usercentrics.com"],
    "quantcast": ["quantcast.com/cmp"],
}

# English and Spanish phrases that indicate a cookie notice.
COOKIE_TEXT_SIGNALS = [
    "accept cookies",
    "acepta las cookies",
    "we use cookies",
    "usamos cookies",
    "cookie policy",
    "política de cookies",
    "cookie settings",
    "manage cookies",
    "aceptar todas",
    "rechazar cookies",
]

# Phrases that indicate a privacy policy / legal notice.
PRIVACY_SIGNALS = [
    "privacy policy",
    "política de privacidad",
    "aviso legal",
    "privacy notice",
    "data protection",
]

# Phrases that indicate explicit GDPR / data-protection wording.
GDPR_TEXT_SIGNALS = [
    "rgpd",
    "gdpr",
    "reglamento general de protección",
    "lopd",
    "protección de datos",
    "responsable del tratamiento",
]
|
||||||
|
|
||||||
|
# ── Analytics / webmaster ─────────────────────────────────────────────────────
|
||||||
ANALYTICS = {
|
ANALYTICS = {
|
||||||
"google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
|
"google_analytics": ["gtag('config'", "google-analytics.com/analytics.js", "G-"],
|
||||||
"google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
|
"google_tag_manager": ["googletagmanager.com/gtm.js", "GTM-"],
|
||||||
@@ -36,13 +70,13 @@ ANALYTICS = {
|
|||||||
"hotjar": ["static.hotjar.com"],
|
"hotjar": ["static.hotjar.com"],
|
||||||
"clarity": ["clarity.ms/tag"],
|
"clarity": ["clarity.ms/tag"],
|
||||||
}
|
}
|
||||||
|
|
||||||
WEBMASTER = {
|
WEBMASTER = {
|
||||||
"google_search_console": ['google-site-verification'],
|
"google_search_console": ["google-site-verification"],
|
||||||
"bing_webmaster": ['msvalidate.01'],
|
"bing_webmaster": ["msvalidate.01"],
|
||||||
"yandex": ['yandex-verification'],
|
"yandex": ["yandex-verification"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ── Kit Digital ───────────────────────────────────────────────────────────────
|
||||||
KIT_IMG_PATS = [
|
KIT_IMG_PATS = [
|
||||||
"digitalizadores", "kit-digital", "kitdigital", "kit_digital",
|
"digitalizadores", "kit-digital", "kitdigital", "kit_digital",
|
||||||
"fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
|
"fondos-europeos", "fondos_europeos", "nextgeneration", "next-generation",
|
||||||
@@ -56,59 +90,78 @@ KIT_TEXT_PATS = [
|
|||||||
|
|
||||||
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
|
EMAIL_RE = re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
|
||||||
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
|
PHONE_RE = re.compile(r"(?:\+34[\s\-]?)?(?:6|7|8|9)\d{2}[\s\-]?\d{3}[\s\-]?\d{3}")
|
||||||
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com", "twitter.com", "x.com", "tiktok.com"]
|
SOCIAL_DOM = ["facebook.com", "instagram.com", "linkedin.com",
|
||||||
|
"twitter.com", "x.com", "tiktok.com", "youtube.com"]
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_hosting_info(domain: str) -> dict:
    """Resolve *domain* to an IP and look up its hosting details on ip-api.com.

    Returns a dict with the keys ``ip``, ``asn``, ``org``, ``isp``,
    ``ip_country``, ``ip_region`` and ``eu_hosted``.  Every value starts as
    ``None`` and stays ``None`` when it could not be determined;
    ``eu_hosted`` is a bool only when the lookup returned a country code.

    Never raises: any failure is logged at DEBUG level and whatever was
    collected up to that point is returned.
    """
    info = {"ip": None, "asn": None, "org": None, "isp": None,
            "ip_country": None, "ip_region": None, "eu_hosted": None}
    try:
        # gethostbyname() blocks and returns a single IPv4 address, so it
        # is run in the default executor.  get_running_loop() is the
        # supported way to obtain the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        ip = await loop.run_in_executor(None, socket.gethostbyname, domain)
        info["ip"] = ip

        # NOTE(review): plain HTTP; ip-api.com's free endpoint reportedly
        # does not serve HTTPS and is rate limited — confirm.
        async with httpx.AsyncClient(timeout=6) as client:
            r = await client.get(
                f"http://ip-api.com/json/{ip}",
                params={"fields": "status,country,countryCode,regionName,org,as,isp"},
            )
            if r.status_code == 200:
                d = r.json()
                if d.get("status") == "success":
                    country = d.get("countryCode")
                    info.update({
                        "asn": d.get("as"),
                        "org": d.get("org"),
                        "isp": d.get("isp"),
                        "ip_country": country,
                        "ip_region": d.get("regionName"),
                        # No country code means "unknown", not "non-EU".
                        "eu_hosted": (country in EU_COUNTRIES) if country else None,
                    })
    except Exception as e:
        logger.debug("Hosting lookup failed for %s: %s", domain, e)
    return info
|
||||||
|
|
||||||
|
|
||||||
async def analyze_site(domain: str) -> dict:
|
async def analyze_site(domain: str) -> dict:
|
||||||
"""Fetch and deeply analyse a site. Returns a rich dict for the AI prompt."""
|
|
||||||
result = {
|
result = {
|
||||||
"domain": domain,
|
"domain": domain,
|
||||||
"reachable": False,
|
"reachable": False, "load_time_ms": None, "status_code": None,
|
||||||
"load_time_ms": None,
|
"final_url": None, "page_size_kb": None, "server": None, "cms": None,
|
||||||
"status_code": None,
|
# Hosting
|
||||||
"final_url": None,
|
"ip": None, "asn": None, "org": None, "isp": None,
|
||||||
"page_size_kb": None,
|
"ip_country": None, "ip_region": None, "eu_hosted": None,
|
||||||
"server": None,
|
# SSL
|
||||||
"cms": None,
|
"ssl_valid": False, "ssl_expiry_days": None,
|
||||||
"ssl_valid": False,
|
|
||||||
"ssl_expiry_days": None,
|
|
||||||
# Content quality
|
# Content quality
|
||||||
"has_lorem_ipsum": False,
|
"has_lorem_ipsum": False, "lorem_matches": [],
|
||||||
"lorem_matches": [],
|
"has_placeholder": False, "placeholder_matches": [],
|
||||||
"has_placeholder": False,
|
"word_count": 0, "image_count": 0, "script_count": 0,
|
||||||
"placeholder_matches": [],
|
|
||||||
"word_count": 0,
|
|
||||||
"image_count": 0,
|
|
||||||
"broken_images": 0,
|
|
||||||
"script_count": 0,
|
|
||||||
"has_mobile_viewport": False,
|
"has_mobile_viewport": False,
|
||||||
"page_title": None,
|
"page_title": None, "meta_description": None, "h1_text": None,
|
||||||
"meta_description": None,
|
|
||||||
"h1_text": None,
|
|
||||||
"visible_text_snippet": "",
|
"visible_text_snippet": "",
|
||||||
# SEO / webmaster
|
# SEO
|
||||||
"has_sitemap": False,
|
"has_sitemap": False, "has_robots": False, "robots_disallows_google": False,
|
||||||
"has_robots": False,
|
"analytics_present": [], "webmaster_verified": [],
|
||||||
"robots_disallows_google": False,
|
"canonical_url": None, "og_title": None,
|
||||||
"analytics_present": [],
|
# GDPR / cookies
|
||||||
"webmaster_verified": [],
|
"cookie_tool": None, "has_cookie_notice": False,
|
||||||
"canonical_url": None,
|
"has_privacy_policy": False, "has_gdpr_text": False,
|
||||||
"og_title": None,
|
# Accessibility
|
||||||
|
"html_lang": None, "images_missing_alt": 0,
|
||||||
|
"has_skip_nav": False, "empty_links": 0,
|
||||||
|
"inputs_without_labels": 0,
|
||||||
# Kit Digital
|
# Kit Digital
|
||||||
"kit_digital": False,
|
"kit_digital": False, "kit_digital_signals": [],
|
||||||
"kit_digital_signals": [],
|
|
||||||
# Contacts
|
# Contacts
|
||||||
"emails": [],
|
"emails": [], "phones": [], "whatsapp": [], "social_links": [],
|
||||||
"phones": [],
|
|
||||||
"whatsapp": [],
|
|
||||||
"social_links": [],
|
|
||||||
# Errors
|
|
||||||
"error": None,
|
"error": None,
|
||||||
}
|
}
|
||||||
|
|
||||||
# ── Fetch main page ───────────────────────────────────────────────────────
|
# ── Fetch + hosting (parallel) ────────────────────────────────────────────
|
||||||
try:
|
async def _fetch():
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
try:
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
timeout=15, follow_redirects=True, verify=False,
|
timeout=15, follow_redirects=True, verify=False,
|
||||||
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
|
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"},
|
||||||
@@ -116,12 +169,20 @@ async def analyze_site(domain: str) -> dict:
|
|||||||
resp = await client.get(f"https://{domain}")
|
resp = await client.get(f"https://{domain}")
|
||||||
if resp.status_code >= 400:
|
if resp.status_code >= 400:
|
||||||
resp = await client.get(f"http://{domain}")
|
resp = await client.get(f"http://{domain}")
|
||||||
|
return resp, int((time.monotonic() - t0) * 1000)
|
||||||
|
except Exception as e:
|
||||||
|
return None, int((time.monotonic() - t0) * 1000)
|
||||||
|
|
||||||
load_ms = int((time.monotonic() - t0) * 1000)
|
(resp, load_ms), hosting = await asyncio.gather(_fetch(), _get_hosting_info(domain))
|
||||||
|
result.update(hosting)
|
||||||
|
result["load_time_ms"] = load_ms
|
||||||
|
|
||||||
|
if resp is None:
|
||||||
|
result["error"] = "Failed to fetch site"
|
||||||
|
else:
|
||||||
html = resp.text
|
html = resp.text
|
||||||
result.update({
|
result.update({
|
||||||
"reachable": resp.status_code < 400,
|
"reachable": resp.status_code < 400,
|
||||||
"load_time_ms": load_ms,
|
|
||||||
"status_code": resp.status_code,
|
"status_code": resp.status_code,
|
||||||
"final_url": str(resp.url),
|
"final_url": str(resp.url),
|
||||||
"page_size_kb": round(len(resp.content) / 1024, 1),
|
"page_size_kb": round(len(resp.content) / 1024, 1),
|
||||||
@@ -131,46 +192,42 @@ async def analyze_site(domain: str) -> dict:
|
|||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
hl = html.lower()
|
hl = html.lower()
|
||||||
|
|
||||||
# Title, meta
|
# ── Basic metadata ────────────────────────────────────────────────────
|
||||||
title_tag = soup.find("title")
|
result["html_lang"] = (soup.find("html") or {}).get("lang")
|
||||||
result["page_title"] = title_tag.get_text(strip=True)[:200] if title_tag else None
|
t = soup.find("title")
|
||||||
meta_desc = soup.find("meta", attrs={"name": "description"})
|
result["page_title"] = t.get_text(strip=True)[:200] if t else None
|
||||||
result["meta_description"] = (meta_desc.get("content") or "")[:300] if meta_desc else None
|
md = soup.find("meta", attrs={"name": "description"})
|
||||||
|
result["meta_description"] = (md.get("content") or "")[:300] if md else None
|
||||||
h1 = soup.find("h1")
|
h1 = soup.find("h1")
|
||||||
result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
|
result["h1_text"] = h1.get_text(strip=True)[:200] if h1 else None
|
||||||
|
|
||||||
# Mobile viewport
|
|
||||||
result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
|
result["has_mobile_viewport"] = bool(soup.find("meta", attrs={"name": "viewport"}))
|
||||||
|
c = soup.find("link", rel="canonical")
|
||||||
# Canonical + OG
|
result["canonical_url"] = c.get("href") if c else None
|
||||||
canon = soup.find("link", rel="canonical")
|
|
||||||
result["canonical_url"] = canon.get("href") if canon else None
|
|
||||||
og = soup.find("meta", property="og:title")
|
og = soup.find("meta", property="og:title")
|
||||||
result["og_title"] = og.get("content") if og else None
|
result["og_title"] = og.get("content") if og else None
|
||||||
|
|
||||||
# Visible text
|
# ── Visible text ──────────────────────────────────────────────────────
|
||||||
for tag in soup(["script", "style", "noscript"]):
|
for tag in soup(["script", "style", "noscript"]):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
visible_text = soup.get_text(separator=" ", strip=True)
|
visible = soup.get_text(separator=" ", strip=True)
|
||||||
words = visible_text.split()
|
vl = visible.lower()
|
||||||
|
words = visible.split()
|
||||||
result["word_count"] = len(words)
|
result["word_count"] = len(words)
|
||||||
result["visible_text_snippet"] = " ".join(words[:500])
|
result["visible_text_snippet"] = " ".join(words[:600])
|
||||||
|
|
||||||
# Lorem ipsum / placeholder detection
|
# ── Content quality ───────────────────────────────────────────────────
|
||||||
vl = visible_text.lower()
|
|
||||||
lorem_hits = [p for p in LOREM_PHRASES if p in vl]
|
lorem_hits = [p for p in LOREM_PHRASES if p in vl]
|
||||||
result["has_lorem_ipsum"] = len(lorem_hits) > 0
|
result["has_lorem_ipsum"] = len(lorem_hits) > 0
|
||||||
result["lorem_matches"] = lorem_hits[:5]
|
result["lorem_matches"] = lorem_hits[:6]
|
||||||
ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
|
ph_hits = [p for p in PLACEHOLDER_PHRASES if p in vl]
|
||||||
result["has_placeholder"] = len(ph_hits) > 0
|
result["has_placeholder"] = len(ph_hits) > 0
|
||||||
result["placeholder_matches"] = ph_hits[:3]
|
result["placeholder_matches"] = ph_hits[:3]
|
||||||
|
|
||||||
# Images & scripts
|
|
||||||
imgs = soup.find_all("img")
|
imgs = soup.find_all("img")
|
||||||
result["image_count"] = len(imgs)
|
result["image_count"] = len(imgs)
|
||||||
result["script_count"] = len(soup.find_all("script", src=True))
|
result["script_count"] = len(soup.find_all("script", src=True))
|
||||||
|
|
||||||
# Analytics / webmaster tags
|
# ── Analytics / webmaster ─────────────────────────────────────────────
|
||||||
for name, sigs in ANALYTICS.items():
|
for name, sigs in ANALYTICS.items():
|
||||||
if any(s.lower() in hl for s in sigs):
|
if any(s.lower() in hl for s in sigs):
|
||||||
result["analytics_present"].append(name)
|
result["analytics_present"].append(name)
|
||||||
@@ -178,12 +235,42 @@ async def analyze_site(domain: str) -> dict:
|
|||||||
if any(s.lower() in hl for s in sigs):
|
if any(s.lower() in hl for s in sigs):
|
||||||
result["webmaster_verified"].append(name)
|
result["webmaster_verified"].append(name)
|
||||||
|
|
||||||
# Kit Digital
|
# ── GDPR / cookies ────────────────────────────────────────────────────
|
||||||
|
for tool, sigs in COOKIE_TOOLS.items():
|
||||||
|
if any(s.lower() in hl for s in sigs):
|
||||||
|
result["cookie_tool"] = tool
|
||||||
|
result["has_cookie_notice"] = True
|
||||||
|
break
|
||||||
|
if not result["has_cookie_notice"]:
|
||||||
|
result["has_cookie_notice"] = any(s in vl for s in COOKIE_TEXT_SIGNALS)
|
||||||
|
result["has_privacy_policy"] = any(s in vl for s in PRIVACY_SIGNALS) or bool(
|
||||||
|
soup.find("a", href=lambda h: h and ("privacidad" in h.lower() or "privacy" in h.lower()))
|
||||||
|
)
|
||||||
|
result["has_gdpr_text"] = any(s in vl for s in GDPR_TEXT_SIGNALS)
|
||||||
|
|
||||||
|
# ── Accessibility ─────────────────────────────────────────────────────
|
||||||
|
result["images_missing_alt"] = sum(
|
||||||
|
1 for img in imgs if not img.get("alt") and img.get("alt") != ""
|
||||||
|
)
|
||||||
|
result["has_skip_nav"] = bool(
|
||||||
|
soup.find("a", href=lambda h: h and h.lower() in ("#main", "#content", "#maincontent", "#skip"))
|
||||||
|
)
|
||||||
|
result["empty_links"] = sum(
|
||||||
|
1 for a in soup.find_all("a") if not a.get_text(strip=True) and not a.find("img")
|
||||||
|
)
|
||||||
|
all_inputs = soup.find_all("input", type=lambda t: t not in ("hidden", "submit", "button", None) or t is None)
|
||||||
|
labeled_ids = {lbl.get("for") for lbl in soup.find_all("label") if lbl.get("for")}
|
||||||
|
result["inputs_without_labels"] = sum(
|
||||||
|
1 for inp in all_inputs
|
||||||
|
if inp.get("id") not in labeled_ids and not inp.get("aria-label") and not inp.get("aria-labelledby")
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Kit Digital ───────────────────────────────────────────────────────
|
||||||
kd_signals = []
|
kd_signals = []
|
||||||
for img in imgs:
|
for img in soup.find_all("img"):
|
||||||
combined = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
|
comb = ((img.get("src") or "") + (img.get("alt") or "") + (img.get("srcset") or "")).lower()
|
||||||
for p in KIT_IMG_PATS:
|
for p in KIT_IMG_PATS:
|
||||||
if p in combined:
|
if p in comb:
|
||||||
kd_signals.append(f"img:{p}")
|
kd_signals.append(f"img:{p}")
|
||||||
break
|
break
|
||||||
for p in KIT_TEXT_PATS:
|
for p in KIT_TEXT_PATS:
|
||||||
@@ -197,7 +284,7 @@ async def analyze_site(domain: str) -> dict:
|
|||||||
result["kit_digital"] = len(kd_signals) > 0
|
result["kit_digital"] = len(kd_signals) > 0
|
||||||
result["kit_digital_signals"] = kd_signals
|
result["kit_digital_signals"] = kd_signals
|
||||||
|
|
||||||
# Contacts
|
# ── Contacts ──────────────────────────────────────────────────────────
|
||||||
for a in soup.find_all("a", href=True):
|
for a in soup.find_all("a", href=True):
|
||||||
href = a["href"]
|
href = a["href"]
|
||||||
if href.startswith("mailto:"):
|
if href.startswith("mailto:"):
|
||||||
@@ -220,25 +307,36 @@ async def analyze_site(domain: str) -> dict:
|
|||||||
break
|
break
|
||||||
for em in EMAIL_RE.findall(html[:80000]):
|
for em in EMAIL_RE.findall(html[:80000]):
|
||||||
em = em.lower()
|
em = em.lower()
|
||||||
if em not in result["emails"] and not any(em.endswith(x) for x in [".png", ".jpg", ".css", ".js", ".svg"]):
|
if em not in result["emails"] and not any(em.endswith(x) for x in [".png",".jpg",".css",".js",".svg"]):
|
||||||
result["emails"].append(em)
|
result["emails"].append(em)
|
||||||
for ph in PHONE_RE.findall(visible_text):
|
for ph in PHONE_RE.findall(visible):
|
||||||
ph_c = re.sub(r"[\s\-]", "", ph)
|
ph_c = re.sub(r"[\s\-]", "", ph)
|
||||||
if ph_c not in result["phones"]:
|
if ph_c not in result["phones"]:
|
||||||
result["phones"].append(ph_c)
|
result["phones"].append(ph_c)
|
||||||
# Cap
|
|
||||||
for k in ["emails", "phones", "whatsapp", "social_links"]:
|
for k in ["emails", "phones", "whatsapp", "social_links"]:
|
||||||
result[k] = list(dict.fromkeys(result[k]))[:5]
|
result[k] = list(dict.fromkeys(result[k]))[:5]
|
||||||
|
|
||||||
# CMS
|
# ── CMS ───────────────────────────────────────────────────────────────
|
||||||
from app.enricher import detect_cms
|
CMS_SIGS = {
|
||||||
result["cms"] = detect_cms(html, dict(resp.headers))
|
"wordpress": ["/wp-content/", "/wp-includes/", 'content="WordPress'],
|
||||||
|
"joomla": ["/components/com_", "Joomla!", 'content="Joomla'],
|
||||||
except Exception as e:
|
"drupal": ["/sites/default/files/", "Drupal.settings"],
|
||||||
result["error"] = str(e)[:300]
|
"wix": ["static.wixstatic.com", "X-Wix-"],
|
||||||
|
"squarespace": ["squarespace.com", "X-Squarespace-"],
|
||||||
|
"shopify": ["cdn.shopify.com", "Shopify.theme"],
|
||||||
|
"prestashop": ["PrestaShop", "/modules/prestashop"],
|
||||||
|
"magento": ["Mage.Cookies", "X-Magento-"],
|
||||||
|
"typo3": ["typo3temp", "TYPO3 CMS"],
|
||||||
|
"opencart": ["route=common/home", "OpenCart"],
|
||||||
|
}
|
||||||
|
combined_check = html[:60000] + " ".join(f"{k}:{v}" for k, v in resp.headers.items())
|
||||||
|
for cms, sigs in CMS_SIGS.items():
|
||||||
|
if any(s.lower() in combined_check.lower() for s in sigs):
|
||||||
|
result["cms"] = cms
|
||||||
|
break
|
||||||
|
|
||||||
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
|
# ── Sitemap & robots (parallel) ───────────────────────────────────────────
|
||||||
async def _check_url(url: str) -> Optional[str]:
|
async def _get(url):
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
|
async with httpx.AsyncClient(timeout=6, follow_redirects=True, verify=False) as c:
|
||||||
r = await c.get(url)
|
r = await c.get(url)
|
||||||
@@ -247,24 +345,22 @@ async def analyze_site(domain: str) -> dict:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
sitemap_txt, robots_txt = await asyncio.gather(
|
sitemap_txt, robots_txt = await asyncio.gather(
|
||||||
_check_url(f"https://{domain}/sitemap.xml"),
|
_get(f"https://{domain}/sitemap.xml"),
|
||||||
_check_url(f"https://{domain}/robots.txt"),
|
_get(f"https://{domain}/robots.txt"),
|
||||||
)
|
)
|
||||||
result["has_sitemap"] = sitemap_txt is not None
|
result["has_sitemap"] = sitemap_txt is not None
|
||||||
result["has_robots"] = robots_txt is not None
|
result["has_robots"] = robots_txt is not None
|
||||||
if robots_txt:
|
if robots_txt:
|
||||||
robots_lower = robots_txt.lower()
|
rl = robots_txt.lower()
|
||||||
result["robots_disallows_google"] = (
|
result["robots_disallows_google"] = "disallow: /" in rl and "googlebot" in rl
|
||||||
"disallow: /" in robots_lower and "googlebot" in robots_lower
|
|
||||||
)
|
|
||||||
|
|
||||||
# ── SSL ───────────────────────────────────────────────────────────────────
|
# ── SSL ───────────────────────────────────────────────────────────────────
|
||||||
import ssl as _ssl, socket as _socket
|
import ssl as _ssl
|
||||||
try:
|
try:
|
||||||
def _ssl_check():
|
def _ssl_check():
|
||||||
import datetime as _dt
|
import datetime as _dt
|
||||||
ctx = _ssl.create_default_context()
|
ctx = _ssl.create_default_context()
|
||||||
with _socket.create_connection((domain, 443), timeout=5) as s:
|
with socket.create_connection((domain, 443), timeout=5) as s:
|
||||||
with ctx.wrap_socket(s, server_hostname=domain) as ss:
|
with ctx.wrap_socket(s, server_hostname=domain) as ss:
|
||||||
cert = ss.getpeercert()
|
cert = ss.getpeercert()
|
||||||
exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
|
exp = _dt.datetime.strptime(cert["notAfter"], "%b %d %H:%M:%S %Y %Z")
|
||||||
|
|||||||
@@ -179,7 +179,9 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">Reasoning</span><span x-text="modal.ai.lead_reasoning||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">KD notes</span><span x-text="modal.ai.kit_digital_reasoning||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">Performance</span><span x-text="modal.ai.performance_notes||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">SEO status</span><span x-text="modal.ai.seo_status||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">SEO</span><span x-text="modal.ai.seo_status||'—'"></span></div>
|
||||||
|
<div class="mrow"><span class="mlabel">Hosting</span><span x-text="(modal.sa?.org||'?') + ' / ' + (modal.sa?.ip_country||'?') + (modal.sa?.eu_hosted===false?' ❌ Non-EU':modal.sa?.eu_hosted?' ✅ EU':'')"></span></div>
|
||||||
|
<div class="mrow"><span class="mlabel">GDPR</span><span :style="(!modal.sa?.has_cookie_notice)?'color:var(--danger)':''" x-text="modal.ai.gdpr_compliance||'—'"></span></div>
|
||||||
|
|
||||||
<!-- Content issues -->
|
<!-- Content issues -->
|
||||||
<div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
|
<div x-show="(modal.ai.content_issues||[]).length>0" style="margin:8px 0">
|
||||||
@@ -225,6 +227,14 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
|
<div style="font-size:13px;font-style:italic;color:var(--accent2)" x-text="modal.ai.pitch_angle||'—'"></div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Accessibility issues -->
|
||||||
|
<div x-show="(modal.ai.accessibility_issues||[]).length>0" style="margin:8px 0">
|
||||||
|
<div style="font-size:10px;color:var(--muted);text-transform:uppercase;margin-bottom:4px">Accessibility Issues</div>
|
||||||
|
<template x-for="issue in (modal.ai.accessibility_issues||[])">
|
||||||
|
<div style="font-size:12px;color:var(--warn);padding:2px 0">♿ <span x-text="issue"></span></div>
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">Services</span><span x-text="(modal.ai.services_needed||[]).join(', ')||'—'"></span></div>
|
||||||
<div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
|
<div class="mrow"><span class="mlabel">Notes</span><span x-text="modal.ai.outreach_notes||'—'"></span></div>
|
||||||
|
|
||||||
@@ -431,7 +441,11 @@ tr:hover td{background:rgba(255,255,255,.025)}
|
|||||||
<div style="font-size:12px;color:var(--muted);margin-bottom:8px">
|
<div style="font-size:12px;color:var(--muted);margin-bottom:8px">
|
||||||
Auto-assesses enriched domains via Gemini. Detects Kit Digital confirmation, extracts best contact channel, writes pitch.
|
Auto-assesses enriched domains via Gemini. Detects Kit Digital confirmation, extracts best contact channel, writes pitch.
|
||||||
</div>
|
</div>
|
||||||
|
<div style="display:flex;gap:6px;flex-wrap:wrap">
|
||||||
<button class="btn bai" @click="aiAssessAllKD()">🤖 AI Assess all Kit Digital domains</button>
|
<button class="btn bai" @click="aiAssessAllKD()">🤖 AI Assess all Kit Digital domains</button>
|
||||||
|
<button class="btn bg sm" @click="restartAiWorker()">↺ Restart AI worker</button>
|
||||||
|
<a class="btn bg sm" href="/api/ai/debug" target="_blank" style="text-decoration:none">🔍 Debug AI queue</a>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div style="margin-top:18px;padding-top:14px;border-top:1px solid var(--border)">
|
<div style="margin-top:18px;padding-top:14px;border-top:1px solid var(--border)">
|
||||||
@@ -616,6 +630,7 @@ function app() {
|
|||||||
try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
|
try { this.qst = await fetch('/api/enrich/status').then(r=>r.json()); } catch(e){}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
async restartAiWorker() { const r=await fetch('/api/ai/worker/restart',{method:'POST'}); this.notify('AI worker restarted','info'); await this.loadAiStatus(); },
|
||||||
async startEnrich() { await fetch('/api/enrich/resume',{method:'POST'}); this.notify('Worker started','success'); await this.loadQueue(); },
|
async startEnrich() { await fetch('/api/enrich/resume',{method:'POST'}); this.notify('Worker started','success'); await this.loadQueue(); },
|
||||||
async pauseEnrich() { await fetch('/api/enrich/pause',{method:'POST'}); this.notify('Worker paused','success'); await this.loadQueue(); },
|
async pauseEnrich() { await fetch('/api/enrich/pause',{method:'POST'}); this.notify('Worker paused','success'); await this.loadQueue(); },
|
||||||
async retryFailed() { await fetch('/api/enrich/retry',{method:'POST'}); this.notify('Retrying failed','success'); await this.loadQueue(); },
|
async retryFailed() { await fetch('/api/enrich/retry',{method:'POST'}); this.notify('Retrying failed','success'); await this.loadQueue(); },
|
||||||
|
|||||||
Reference in New Issue
Block a user