Files
fredy/lib/services/extractor/puppeteerExtractor.js
2025-11-16 19:59:08 +01:00

171 lines
5.2 KiB
JavaScript

import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
import logger from '../logger.js';
import fs from 'fs';
import os from 'os';
import path from 'path';
import { URL } from 'url';
// Register the stealth plugin on the puppeteer-extra instance once, at module load,
// so it is already in place when execute() below calls puppeteer.launch().
puppeteer.use(StealthPlugin());
/**
 * Loads `url` in a freshly launched Chromium (puppeteer-extra with the stealth plugin)
 * and returns the resulting HTML.
 *
 * A new browser is launched for every call and is always closed again, together with
 * its temporary profile directory, before the function returns. The function never
 * rejects: any failure is logged as a warning and reported as `null`.
 *
 * @param {string} url - Absolute URL to load. A malformed URL yields `null` without
 *   launching a browser.
 * @param {string|null|undefined} waitForSelector - When set, wait for this CSS selector
 *   and return the `innerHTML` of the first matching element (falling back to the full
 *   page when that is empty). When `null`/`undefined`, return the full page content.
 * @param {object} [options]
 * @param {string} [options.userDataDir] - Existing Chrome profile directory to use
 *   instead of a temporary one.
 * @param {boolean} [options.cleanupUserDataDir] - Remove `options.userDataDir` afterwards.
 * @param {string} [options.proxyUrl] - Passed to Chrome as `--proxy-server`.
 * @param {boolean|string} [options.puppeteerHeadless=true] - Forwarded as `headless`.
 * @param {number} [options.puppeteerTimeout=30000] - Browser launch timeout in ms; also
 *   the fallback for the selector timeout. `0` is forwarded unchanged.
 * @param {number} [options.puppeteerSelectorTimeout] - Timeout in ms for `waitForSelector`.
 * @param {string} [options.executablePath] - Browser binary to launch.
 * @param {string} [options.userAgent] - User agent string; defaults to a desktop Chrome UA.
 * @param {string} [options.acceptLanguage] - `Accept-Language` header value; its first
 *   segment (before the first `;`) is also exposed as `navigator.languages`.
 * @param {string} [options.referer] - `Referer` header; defaults to the origin of `url`.
 * @param {string} [options.timezone='Europe/Berlin'] - Timezone id to emulate.
 * @param {object[]} [options.cookies] - Cookies handed to `page.setCookie` before navigation.
 * @param {string|string[]} [options.waitUntil='domcontentloaded'] - Navigation lifecycle event.
 * @param {boolean} [options.humanDelay] - Pass `false` to skip the random 200-599 ms pause
 *   after navigation.
 * @returns {Promise<string|null>} The HTML, or `null` on error or when `botDetected`
 *   flags the response.
 */
export default async function execute(url, waitForSelector, options) {
  let browser;
  let page;
  let result = null;
  let userDataDir;
  let removeUserDataDir = false;
  try {
    debug(`Sending request to ${url} using Puppeteer.`);

    // Parse the URL first: a malformed URL throws here, before a temp dir is created
    // or a Chrome process is spawned for a request that can never succeed.
    const { hostname } = new URL(url);

    // Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
    if (options?.userDataDir) {
      userDataDir = options.userDataDir;
      removeUserDataDir = Boolean(options.cleanupUserDataDir);
    } else {
      userDataDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'puppeteer-fredy-'));
      removeUserDataDir = true;
    }

    const launchArgs = [
      '--no-sandbox',
      '--disable-gpu',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-crash-reporter',
      '--no-first-run',
      '--no-default-browser-check',
    ];
    if (options?.proxyUrl) {
      launchArgs.push(`--proxy-server=${options.proxyUrl}`);
    }

    browser = await puppeteer.launch({
      headless: options?.puppeteerHeadless ?? true,
      args: launchArgs,
      // `??` (not `||`) so an explicit 0 is forwarded, consistent with the selector timeout below
      timeout: options?.puppeteerTimeout ?? 30_000,
      userDataDir,
      executablePath: options?.executablePath, // allow using system Chrome
    });
    page = await browser.newPage();

    // Set a realistic modern user agent unless provided
    const userAgent =
      options?.userAgent ||
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
    await page.setUserAgent(userAgent);

    // Viewport and device scale for typical desktop
    await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });

    // Extra HTTP headers with localized Accept-Language
    const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
    await page.setExtraHTTPHeaders({
      ...DEFAULT_HEADER,
      'Accept-Language': acceptLanguage,
      'User-Agent': userAgent,
      Referer: options?.referer || `https://${hostname}/`,
      Connection: 'keep-alive',
      DNT: '1',
    });

    // Timezone emulation is best effort: an id the browser rejects must not abort the request.
    try {
      await page.emulateTimezone(options?.timezone || 'Europe/Berlin');
    } catch (error) {
      debug(`Could not emulate timezone: ${error?.message ?? error}`);
    }

    // Harden navigator properties (stealth already covers many, but we ensure critical ones).
    // The language list is passed in as an argument instead of being round-tripped through
    // localStorage: reading localStorage throws on opaque-origin documents, and writing a
    // helper key into the site's storage is itself detectable.
    const languages = (acceptLanguage.split(';')[0] || 'de-DE,de').split(',');
    await page.evaluateOnNewDocument((langs) => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
      // Plugins and mimeTypes
      // @ts-ignore
      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
      const frozenLangs = Object.freeze([...langs]);
      // @ts-ignore
      Object.defineProperty(navigator, 'languages', { get: () => frozenLangs });
    }, languages);

    // Optional cookies
    if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
      await page.setCookie(...options.cookies);
    }

    // Navigation
    const response = await page.goto(url, {
      waitUntil: options?.waitUntil || 'domcontentloaded',
    });

    // Optionally wait a random small delay to mimic human rendering time
    if (options?.humanDelay !== false) {
      const delay = 200 + Math.floor(Math.random() * 400);
      await new Promise((res) => setTimeout(res, delay));
    }

    let pageSource;
    // if we're extracting data from a SPA, we must wait for the selector
    if (waitForSelector != null) {
      const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
      await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
      pageSource = await page.evaluate((selector) => {
        const el = document.querySelector(selector);
        return el ? el.innerHTML : '';
      }, waitForSelector);
    } else {
      pageSource = await page.content();
    }

    // A missing response object is treated as a 200
    const statusCode = response?.status?.() ?? 200;
    if (botDetected(pageSource, statusCode)) {
      logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
      result = null;
    } else {
      // An empty selector match falls back to the full document
      result = pageSource || (await page.content());
    }
  } catch (error) {
    logger.warn('Error executing with puppeteer executor', error);
    result = null;
  } finally {
    // Each cleanup step is isolated so a failure in one never skips the next.
    try {
      if (page) {
        await page.close();
      }
    } catch {
      // ignore: the browser is closed right after anyway
    }
    try {
      if (browser != null) {
        await browser.close();
      }
    } catch {
      // ignore: nothing more we can do about a browser that refuses to close
    }
    try {
      if (removeUserDataDir && userDataDir) {
        // Chrome may still hold files for a moment after close(); retry transient
        // EBUSY/ENOTEMPTY/EPERM errors instead of leaking the directory.
        await fs.promises.rm(userDataDir, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
      }
    } catch (error) {
      debug(`Could not remove userDataDir ${userDataDir}: ${error?.message ?? error}`);
    }
  }
  return result;
}