mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
182 lines
6.1 KiB
JavaScript
182 lines
6.1 KiB
JavaScript
/*
|
|
* Copyright (c) 2026 by Christian Kellner.
|
|
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
|
*/
|
|
|
|
import { launch } from 'cloakbrowser/puppeteer';
|
|
import { debug, botDetected } from './utils.js';
|
|
import { getPreLaunchConfig } from './botPrevention.js';
|
|
import logger from '../logger.js';
|
|
import { trackPoi } from '../tracking/Tracker.js';
|
|
import { TRACKING_POIS } from '../../TRACKING_POIS.js';
|
|
|
|
/**
|
|
* Launch a CloakBrowser/Puppeteer browser instance with stealth and humanizer enabled.
|
|
*
|
|
* CloakBrowser applies 49 C++ source-level patches (canvas, WebGL, audio, WebRTC,
|
|
* navigator.*, automation signals) that are indistinguishable from a real browser.
|
|
* All fingerprinting and human-behaviour simulation is handled natively; no CDP
|
|
* overrides (setUserAgent, setExtraHTTPHeaders, evaluateOnNewDocument) are applied
|
|
* here because they would create detectable inconsistencies on top of the C++ patches.
|
|
*
|
|
* @param {string} url - Initial URL (used to derive locale/timezone hints).
|
|
* @param {object} [options]
|
|
* @param {boolean} [options.puppeteerHeadless]
|
|
* @param {number} [options.puppeteerTimeout]
|
|
* @param {string} [options.proxyUrl]
|
|
* @param {string} [options.timezone]
|
|
* @param {string} [options.acceptLanguage]
|
|
* @param {object} [options.viewport]
|
|
* @returns {Promise<import('puppeteer-core').Browser>}
|
|
*/
|
|
export async function launchBrowser(url, options) {
|
|
const preCfg = getPreLaunchConfig(url, options || {});
|
|
|
|
// Docker requires --no-sandbox; CloakBrowser handles all stealth args internally.
|
|
// --ignore-certificate-errors is needed because CloakBrowser ships its own Chromium
|
|
// binary with an independent CA bundle that may not trust proxies or interceptors
|
|
// present in the host environment.
|
|
const args = [
|
|
'--no-sandbox',
|
|
'--disable-setuid-sandbox',
|
|
'--disable-dev-shm-usage',
|
|
'--no-first-run',
|
|
'--no-default-browser-check',
|
|
'--ignore-certificate-errors',
|
|
// Disables the zygote process model. Required in some container environments
|
|
// (e.g. limited kernel namespaces) where the zygote cannot acquire the
|
|
// locks it needs and exits with "Invalid file descriptor to ICU data received".
|
|
'--no-zygote',
|
|
preCfg.windowSizeArg,
|
|
];
|
|
|
|
const browser = await launch({
|
|
headless: options?.puppeteerHeadless ?? true,
|
|
humanize: true,
|
|
args,
|
|
// locale sets Accept-Language headers and JS navigator.language consistently
|
|
locale: preCfg.langForFlag,
|
|
...(options?.proxyUrl ? { proxy: options.proxyUrl } : {}),
|
|
...(preCfg.timezone ? { timezone: preCfg.timezone } : {}),
|
|
});
|
|
|
|
return browser;
|
|
}
|
|
|
|
/**
|
|
* Close a browser instance returned by {@link launchBrowser}.
|
|
*
|
|
* @param {import('puppeteer-core').Browser | null} browser
|
|
*/
|
|
export async function closeBrowser(browser) {
|
|
if (!browser) return;
|
|
try {
|
|
await browser.close();
|
|
} catch {
|
|
// ignore
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Open a page in a (possibly reused) browser, navigate to `url`, and return the HTML source.
|
|
* Returns `null` when a bot-detection page is encountered or on timeout.
|
|
*
|
|
* @param {string} url
|
|
* @param {string | null} waitForSelector
|
|
* @param {object} [options]
|
|
* @returns {Promise<string | null>}
|
|
*/
|
|
export default async function execute(url, waitForSelector, options) {
|
|
let browser = options?.browser;
|
|
let isExternalBrowser = !!browser;
|
|
let page;
|
|
let result;
|
|
try {
|
|
debug(`Sending request to ${url} using CloakBrowser.`);
|
|
|
|
if (!isExternalBrowser) {
|
|
browser = await launchBrowser(url, options);
|
|
}
|
|
|
|
page = await browser.newPage();
|
|
|
|
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
|
|
await page.setCookie(...options.cookies);
|
|
}
|
|
|
|
// Warm-up navigation: visit a trusted page first so the site sees an
|
|
// established session before the actual target URL. Silently ignored on
|
|
// failure so it never blocks the main request.
|
|
if (options?.preNavigateUrl) {
|
|
try {
|
|
await page.goto(options.preNavigateUrl, { waitUntil: 'domcontentloaded', timeout: 30_000 });
|
|
await new Promise((r) => setTimeout(r, 1500 + Math.random() * 2000));
|
|
} catch {
|
|
// ignore
|
|
}
|
|
}
|
|
|
|
const response = await page.goto(url, {
|
|
waitUntil: options?.waitUntil || 'domcontentloaded',
|
|
timeout: options?.puppeteerTimeout || 60000,
|
|
});
|
|
|
|
// Optional second idle wait: useful for React SPAs that trigger API calls
|
|
// after domcontentloaded. Times out silently so we use whatever is rendered.
|
|
if (options?.waitForNetworkIdle) {
|
|
try {
|
|
await page.waitForNetworkIdle({ timeout: options?.waitForNetworkIdleTimeout ?? 60_000 });
|
|
} catch {
|
|
// ignore — we proceed with whatever the DOM contains at this point
|
|
}
|
|
}
|
|
|
|
let pageSource;
|
|
if (waitForSelector != null) {
|
|
const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
|
|
await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
|
|
pageSource = await page.evaluate((selector) => {
|
|
const el = document.querySelector(selector);
|
|
return el ? el.innerHTML : '';
|
|
}, waitForSelector);
|
|
} else {
|
|
pageSource = await page.content();
|
|
}
|
|
|
|
const statusCode = response?.status?.() ?? 200;
|
|
|
|
if (botDetected(pageSource, statusCode)) {
|
|
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
|
|
|
if (options != null && options.name != null) {
|
|
await trackPoi(TRACKING_POIS.DETECTED_AS_BOT + '_' + options.name);
|
|
} else {
|
|
await trackPoi(TRACKING_POIS.DETECTED_AS_BOT);
|
|
}
|
|
|
|
result = null;
|
|
} else {
|
|
result = pageSource || (await page.content());
|
|
}
|
|
} catch (error) {
|
|
if (error?.name?.includes('Timeout')) {
|
|
logger.debug('Error executing with CloakBrowser executor', error);
|
|
} else {
|
|
logger.warn('Error executing with CloakBrowser executor', error);
|
|
}
|
|
result = null;
|
|
} finally {
|
|
try {
|
|
if (page) {
|
|
await page.close();
|
|
}
|
|
} catch {
|
|
// ignore
|
|
}
|
|
if (browser != null && !isExternalBrowser) {
|
|
await closeBrowser(browser);
|
|
}
|
|
}
|
|
return result;
|
|
}
|