mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
161 lines
4.5 KiB
JavaScript
161 lines
4.5 KiB
JavaScript
/*
|
|
* Copyright (c) 2026 by Christian Kellner.
|
|
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
|
*/
|
|
|
|
import puppeteer from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import { debug, botDetected } from './utils.js';
|
|
import {
|
|
getPreLaunchConfig,
|
|
applyBotPreventionToPage,
|
|
applyLanguagePersistence,
|
|
applyPostNavigationHumanSignals,
|
|
} from './botPrevention.js';
|
|
import logger from '../logger.js';
|
|
import fs from 'fs';
|
|
import os from 'os';
|
|
import path from 'path';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance at module load,
// so it is already in place when `launchBrowser` calls `puppeteer.launch`.
puppeteer.use(StealthPlugin());
|
|
|
|
/**
 * Launches a Puppeteer browser using the launch settings from `getPreLaunchConfig`.
 *
 * If `options.userDataDir` is not set, a fresh temporary profile directory
 * (`<tmpdir>/puppeteer-fredy-XXXXXX`) is created and flagged for removal. The
 * directory and the flag are attached to the returned browser as
 * `__fredy_userDataDir` and `__fredy_removeUserDataDir`, which `closeBrowser` reads.
 *
 * @param {string} url - Target URL, forwarded to `getPreLaunchConfig`.
 * @param {object} [options] - Optional settings. Read here: `proxyUrl`,
 *   `userDataDir`, `executablePath`, `puppeteerHeadless` (default `true`) and
 *   `puppeteerTimeout` (launch timeout in ms, default 45000; an explicit `0` is
 *   passed through to Puppeteer unchanged).
 * @returns {Promise<object>} The launched browser, tagged with the two
 *   `__fredy_*` properties described above.
 * @throws Rethrows whatever `puppeteer.launch` rejects with, after removing a
 *   temporary profile directory that was created by this call.
 */
export async function launchBrowser(url, options) {
  const preCfg = getPreLaunchConfig(url, options || {});
  const launchArgs = [
    '--no-sandbox',
    '--disable-gpu',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-crash-reporter',
    '--no-first-run',
    '--no-default-browser-check',
    preCfg.langArg,
    preCfg.windowSizeArg,
    ...preCfg.extraArgs,
  ];
  if (options?.proxyUrl) {
    launchArgs.push(`--proxy-server=${options.proxyUrl}`);
  }

  let userDataDir;
  let removeUserDataDir = false;
  if (options?.userDataDir) {
    // Caller-owned profile directory: use it, never delete it.
    userDataDir = options.userDataDir;
  } else {
    const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
    userDataDir = fs.mkdtempSync(prefix);
    removeUserDataDir = true;
  }

  // On ARM64 Docker, Chrome for Testing has no native binary - use system Chromium instead.
  const executablePath =
    options?.executablePath ||
    (process.arch === 'arm64' && process.env.IS_DOCKER === 'true' ? '/usr/bin/chromium' : undefined);

  let browser;
  try {
    browser = await puppeteer.launch({
      headless: options?.puppeteerHeadless ?? true,
      args: launchArgs,
      // `??` instead of `||` so that an explicit 0 is not replaced by the default.
      timeout: options?.puppeteerTimeout ?? 45_000,
      userDataDir,
      executablePath,
    });
  } catch (error) {
    // The caller never gets a browser to hand to closeBrowser(), so a temp
    // profile directory created above would otherwise be left behind.
    if (removeUserDataDir) {
      try {
        await fs.promises.rm(userDataDir, { recursive: true, force: true });
      } catch {
        // Best effort only: the launch error below is the one worth reporting.
      }
    }
    throw error;
  }

  browser.__fredy_userDataDir = userDataDir;
  browser.__fredy_removeUserDataDir = removeUserDataDir;

  return browser;
}
|
|
|
|
/**
 * Closes a browser and, if it was flagged for removal, deletes its profile directory.
 *
 * Reads the `__fredy_userDataDir` and `__fredy_removeUserDataDir` properties from
 * the browser object. Both steps are best effort: failures are logged at debug
 * level and never thrown, so this is safe to call from a `finally` block.
 *
 * @param {object|null|undefined} browser - Browser to close; a falsy value is a no-op.
 * @returns {Promise<void>}
 */
export async function closeBrowser(browser) {
  if (!browser) return;

  // Read the bookkeeping properties before closing the browser.
  const userDataDir = browser.__fredy_userDataDir;
  const removeUserDataDir = browser.__fredy_removeUserDataDir;

  try {
    await browser.close();
  } catch (error) {
    logger.debug('Error closing puppeteer browser', error);
  }

  if (!removeUserDataDir || !userDataDir) return;

  try {
    // Retry a few times on transient errors such as EBUSY/ENOTEMPTY instead of
    // giving up after a single attempt and leaving the directory behind.
    await fs.promises.rm(userDataDir, {
      recursive: true,
      force: true,
      maxRetries: 3,
      retryDelay: 100,
    });
  } catch (error) {
    logger.debug(`Error removing puppeteer user data dir ${userDataDir}`, error);
  }
}
|
|
|
|
/**
 * Loads `url` in a Puppeteer page and returns the resulting HTML.
 *
 * If `options.browser` is given it is reused and left open; otherwise a browser is
 * launched via `launchBrowser` and closed via `closeBrowser` before returning.
 * The page opened here is always closed.
 *
 * @param {string} url - Page to load.
 * @param {string|null|undefined} waitForSelector - When not null/undefined, waits
 *   for this selector and returns the inner HTML of the first matching element
 *   (falling back to the full page HTML if that inner HTML is empty). Otherwise
 *   the full page HTML is returned.
 * @param {object} [options] - Optional settings. Read here: `browser`, `cookies`
 *   (array passed to `page.setCookie`), `waitUntil` (default `'domcontentloaded'`),
 *   `puppeteerTimeout` (navigation timeout in ms, default 60000) and
 *   `puppeteerSelectorTimeout` (falls back to `puppeteerTimeout`, then 30000).
 *   An explicit `0` for either timeout is passed through to Puppeteer unchanged.
 * @returns {Promise<string|null>} The HTML, or `null` if `botDetected` flags the
 *   response or any step throws. Errors are logged, never rethrown.
 */
export default async function execute(url, waitForSelector, options) {
  let browser = options?.browser;
  // A browser handed in by the caller is reused and must not be closed here.
  const isExternalBrowser = !!browser;
  let page;
  let result = null;
  try {
    debug(`Sending request to ${url} using Puppeteer.`);

    if (!isExternalBrowser) {
      browser = await launchBrowser(url, options);
    }

    page = await browser.newPage();
    const preCfg = getPreLaunchConfig(url, options || {});
    await applyBotPreventionToPage(page, preCfg);
    // Provide languages value before navigation
    await applyLanguagePersistence(page, preCfg);

    // Optional cookies
    if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
      await page.setCookie(...options.cookies);
    }

    // Navigation
    const response = await page.goto(url, {
      waitUntil: options?.waitUntil || 'domcontentloaded',
      // `??` (not `||`) so an explicit 0 is honoured here exactly as it is for
      // the selector timeout below.
      timeout: options?.puppeteerTimeout ?? 60000,
    });

    // Optionally wait and add subtle human-like interactions
    await applyPostNavigationHumanSignals(page, preCfg);

    let pageSource;
    // if we're extracting data from a SPA, we must wait for the selector
    if (waitForSelector != null) {
      const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
      await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
      pageSource = await page.evaluate((selector) => {
        const el = document.querySelector(selector);
        return el ? el.innerHTML : '';
      }, waitForSelector);
    } else {
      pageSource = await page.content();
    }

    // Treat a missing response object as a successful load.
    const statusCode = response?.status?.() ?? 200;

    if (botDetected(pageSource, statusCode)) {
      logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
      result = null;
    } else {
      // An empty extraction falls back to the full page HTML.
      result = pageSource || (await page.content());
    }
  } catch (error) {
    // Errors whose name contains "Timeout" are logged at debug level only;
    // everything else is logged as a warning.
    if (error?.name?.includes('Timeout')) {
      logger.debug('Error executing with puppeteer executor', error);
    } else {
      logger.warn('Error executing with puppeteer executor', error);
    }
    result = null;
  } finally {
    try {
      if (page) {
        await page.close();
      }
    } catch {
      // ignore: closing the page is best effort and must not mask the result
    }
    if (browser != null && !isExternalBrowser) {
      await closeBrowser(browser);
    }
  }
  return result;
}
|