/* * Copyright (c) 2026 by Christian Kellner. * Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause */ import { launch } from 'cloakbrowser/puppeteer'; import { debug, botDetected } from './utils.js'; import { getPreLaunchConfig } from './botPrevention.js'; import logger from '../logger.js'; /** * Launch a CloakBrowser/Puppeteer browser instance with stealth and humanizer enabled. * * CloakBrowser applies 49 C++ source-level patches (canvas, WebGL, audio, WebRTC, * navigator.*, automation signals) that are indistinguishable from a real browser. * All fingerprinting and human-behaviour simulation is handled natively; no CDP * overrides (setUserAgent, setExtraHTTPHeaders, evaluateOnNewDocument) are applied * here because they would create detectable inconsistencies on top of the C++ patches. * * @param {string} url - Initial URL (used to derive locale/timezone hints). * @param {object} [options] * @param {boolean} [options.puppeteerHeadless] * @param {number} [options.puppeteerTimeout] * @param {string} [options.proxyUrl] * @param {string} [options.timezone] * @param {string} [options.acceptLanguage] * @param {object} [options.viewport] * @returns {Promise} */ export async function launchBrowser(url, options) { const preCfg = getPreLaunchConfig(url, options || {}); // Docker requires --no-sandbox; CloakBrowser handles all stealth args internally. // --ignore-certificate-errors is needed because CloakBrowser ships its own Chromium // binary with an independent CA bundle that may not trust proxies or interceptors // present in the host environment. const args = [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--no-first-run', '--no-default-browser-check', '--ignore-certificate-errors', // Disables the zygote process model. Required in some container environments // (e.g. limited kernel namespaces) where the zygote cannot acquire the // locks it needs and exits with "Invalid file descriptor to ICU data received". '--no-zygote', preCfg.windowSizeArg, ]; const browser = await launch({ headless: options?.puppeteerHeadless ?? true, humanize: true, args, // locale sets Accept-Language headers and JS navigator.language consistently locale: preCfg.langForFlag, ...(options?.proxyUrl ? { proxy: options.proxyUrl } : {}), ...(preCfg.timezone ? { timezone: preCfg.timezone } : {}), }); return browser; } /** * Close a browser instance returned by {@link launchBrowser}. * * @param {import('puppeteer-core').Browser | null} browser */ export async function closeBrowser(browser) { if (!browser) return; try { await browser.close(); } catch { // ignore } } /** * Open a page in a (possibly reused) browser, navigate to `url`, and return the HTML source. * Returns `null` when a bot-detection page is encountered or on timeout. * * @param {string} url * @param {string | null} waitForSelector * @param {object} [options] * @returns {Promise} */ export default async function execute(url, waitForSelector, options) { let browser = options?.browser; let isExternalBrowser = !!browser; let page; let result; try { debug(`Sending request to ${url} using CloakBrowser.`); if (!isExternalBrowser) { browser = await launchBrowser(url, options); } page = await browser.newPage(); if (Array.isArray(options?.cookies) && options.cookies.length > 0) { await page.setCookie(...options.cookies); } // Warm-up navigation: visit a trusted page first so the site sees an // established session before the actual target URL. Silently ignored on // failure so it never blocks the main request. if (options?.preNavigateUrl) { try { await page.goto(options.preNavigateUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 }); await new Promise((r) => setTimeout(r, 1500 + Math.random() * 2000)); } catch { // ignore } } const response = await page.goto(url, { waitUntil: options?.waitUntil || 'domcontentloaded', timeout: options?.puppeteerTimeout || 60000, }); // Optional second idle wait: useful for React SPAs that trigger API calls // after domcontentloaded. Times out silently so we use whatever is rendered. if (options?.waitForNetworkIdle) { try { await page.waitForNetworkIdle({ timeout: options?.waitForNetworkIdleTimeout ?? 60_000 }); } catch { // ignore — we proceed with whatever the DOM contains at this point } } let pageSource; if (waitForSelector != null) { const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000; await page.waitForSelector(waitForSelector, { timeout: selectorTimeout }); pageSource = await page.evaluate((selector) => { const el = document.querySelector(selector); return el ? el.innerHTML : ''; }, waitForSelector); } else { pageSource = await page.content(); } const statusCode = response?.status?.() ?? 200; if (botDetected(pageSource, statusCode)) { logger.warn('We have been detected as a bot :-/ Tried url: => ', url); result = null; } else { result = pageSource || (await page.content()); } } catch (error) { if (error?.name?.includes('Timeout')) { logger.debug('Error executing with CloakBrowser executor', error); } else { logger.warn('Error executing with CloakBrowser executor', error); } result = null; } finally { try { if (page) { await page.close(); } } catch { // ignore } if (browser != null && !isExternalBrowser) { await closeBrowser(browser); } } return result; }