"""Fetch a web page's content as markdown via a Crawl4AI service, with a
r.jina.ai reader-proxy fallback when the service is unavailable."""
import argparse
|
|
import os
|
|
import time
|
|
import requests
|
|
from dotenv import load_dotenv
|
|
from pyworker.utils.app_logging import setup_logging
|
|
from typing import Any
|
|
|
|
# Module-level logger for this crawler utility.
logger = setup_logging(__name__)


# Load environment variables from .env file
load_dotenv()

# Include API token header if set
CRAWL4AI_API_TOKEN = os.environ.get("CRAWL4AI_API_TOKEN", "")
HEADERS = (
    {"Authorization": f"Bearer {CRAWL4AI_API_TOKEN}"} if CRAWL4AI_API_TOKEN else {}
)

# Base URL of the Crawl4AI service (docker-compose hostname by default —
# presumably this code runs alongside a `crawl4ai` container; confirm).
CRAWL4AI_BASE_URL = os.environ.get("CRAWL4AI_BASE_URL", "http://crawl4ai:11235")
# Overall crawl-task deadline, in seconds.
CRAWL4AI_TIMEOUT = int(os.environ.get("CRAWL4AI_TIMEOUT", 300))
# Delay between task-status polls, in seconds.
CRAWL4AI_POLL_INTERVAL = int(os.environ.get("CRAWL4AI_POLL_INTERVAL", 2))
|
|
|
|
|
|
def fetch_fallback(url: str, timeout: int = 80) -> str:
    """Fetch a page as text/markdown via the r.jina.ai reader proxy.

    Used when the Crawl4AI service is unconfigured or a crawl fails.

    Args:
        url: The page URL to fetch. Must be non-empty.
        timeout: Request timeout in seconds (was a hard-coded 80; now
            a backward-compatible parameter).

    Returns:
        The response body returned by the reader proxy.

    Raises:
        ValueError: If ``url`` is empty.
        requests.exceptions.RequestException: On HTTP errors or timeouts.
    """
    if not url:
        raise ValueError("URL must not be empty")
    logger.info(f"Fetching fallback for {url}")
    # r.jina.ai expects the target URL appended directly to its path.
    fallback_url = f"https://r.jina.ai/{url.lstrip('/')}"
    response = requests.get(fallback_url, timeout=timeout)
    response.raise_for_status()
    return response.text
|
|
|
|
|
|
def fetch_markdown(url: str, wait_for_dynamic_content: bool = True) -> str:
    """Crawl *url* through the Crawl4AI service and return its markdown.

    Submits a crawl task, polls until completion, and returns the result
    formatted with the page URL and metadata. Falls back to
    ``fetch_fallback`` when the API token is unset, or when the crawl
    fails, errors, or times out.

    Args:
        url: The page URL to crawl.
        wait_for_dynamic_content: When True, ask Crawl4AI to wait for
            network-idle / lazy-loaded content before extracting.

    Returns:
        Markdown content wrapped with the URL and page metadata.
    """
    if not CRAWL4AI_API_TOKEN:
        return fetch_fallback(url)

    try:
        payload: dict[str, Any] = {"urls": url}
        if wait_for_dynamic_content:
            # According to Crawl4AI docs, wait_for_images=True also waits
            # for network idle state, which is helpful for JS-generated
            # content. Adding scan_full_page and scroll_delay helps
            # trigger lazy-loaded content.
            payload["config"] = {
                "wait_for_images": True,
                "scan_full_page": True,
                "scroll_delay": 0.5,
                "magic": True,
            }

        response = requests.post(
            f"{CRAWL4AI_BASE_URL}/crawl",
            json=payload,
            headers=HEADERS,
            # Without an explicit timeout, a hung connection could defeat
            # the overall CRAWL4AI_TIMEOUT deadline below.
            timeout=CRAWL4AI_TIMEOUT,
        )
        response.raise_for_status()
        task_id = response.json().get("task_id")
        if not task_id:
            # Submit succeeded but no task id was returned; the polling
            # loop below could never complete, so fail over immediately.
            raise TimeoutError("Crawl4AI returned no task_id")

        start_time = time.time()
        while True:
            if time.time() - start_time > CRAWL4AI_TIMEOUT:
                raise TimeoutError(f"Task {task_id} timeout")
            status_resp = requests.get(
                f"{CRAWL4AI_BASE_URL}/task/{task_id}",
                headers=HEADERS,
                timeout=30,
            )
            status_resp.raise_for_status()
            status = status_resp.json()
            if status.get("status") == "completed":
                markdown = status["result"].get("markdown", "")
                metadata = status["result"].get("metadata", {})
                return f"""
URL: {url}
Page Metadata: `{metadata}`

Markdown Content
----------------
{markdown}
"""
            if status.get("status") == "failed":
                # Previously a failed task would spin here until the
                # overall timeout elapsed; surface it now so the
                # fallback path runs without the long wait.
                raise TimeoutError(f"Task {task_id} failed")
            time.sleep(CRAWL4AI_POLL_INTERVAL)
    except (requests.exceptions.RequestException, TimeoutError):
        # Best-effort service path failed; log why, then use the proxy.
        logger.exception(f"Crawl4AI fetch failed for {url}; using fallback")
        return fetch_fallback(url)
|
|
|
|
|
|
def main():
    """CLI entry point: crawl a single URL and print its markdown."""
    arg_parser = argparse.ArgumentParser(
        description="Crawl a URL and print its markdown content."
    )
    arg_parser.add_argument("--url", required=True, help="The URL to crawl")
    cli_args = arg_parser.parse_args()

    print(f"Crawling {cli_args.url}...")
    content = fetch_markdown(cli_args.url)
    print("\n--- Markdown Content ---")
    print(content)
|
|
|
|
|
|
# Allow running this module directly as a command-line script.
if __name__ == "__main__":
    main()
|