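Improve Puppeteer handling: use a dedicated temporary userDataDir, add a selector wait timeout, and always close the page and browser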

This commit is contained in:
orangecoding
2025-09-22 10:08:59 +02:00
parent 33175ffb81
commit 2d461756c1
2 changed files with 57 additions and 12 deletions

View File

@@ -2,30 +2,56 @@ import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
import logger from '../logger.js';
import fs from 'fs';
import os from 'os';
import path from 'path';
// Register the stealth plugin on the imported puppeteer-extra instance at module load,
// so it is already installed when execute() launches a browser.
puppeteer.use(StealthPlugin());
/**
 * Loads `url` in a browser launched through Puppeteer and extracts HTML from the page.
 *
 * NOTE(review): this listing appears to interleave the pre- and post-change lines of a
 * diff (e.g. two `args:` entries, two `page` assignments, two `waitForSelector` calls)
 * and contains a hunk header in the middle of the body. The comments below describe the
 * statements as written; confirm against the real file before relying on them.
 *
 * @param {string} url - Address passed to `page.goto`.
 * @param {?string} waitForSelector - When not null/undefined, the function waits for this
 *   selector and extracts the `innerHTML` of the first matching element instead of the
 *   whole page content.
 * @param {object} options - Settings read by this function: `userDataDir`,
 *   `cleanupUserDataDir`, `puppeteerHeadless`, `puppeteerTimeout` and
 *   `puppeteerSelectorTimeout`.
 * @returns {Promise<?string>} The extracted HTML, or `null` when `botDetected` reports a
 *   detection or when any step throws (errors are logged, not rethrown).
 */
export default async function execute(url, waitForSelector, options) {
let browser;
let page;
let result = null;
let userDataDir;
// Only true when this function should delete userDataDir in the finally block.
let removeUserDataDir = false;
try {
debug(`Sending request to ${url} using Puppeteer.`);
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
if (options && options.userDataDir) {
// Caller-supplied directory: it is only removed afterwards if the caller opts in.
userDataDir = options.userDataDir;
removeUserDataDir = !!options.cleanupUserDataDir;
} else {
// No directory supplied: create a fresh one under the OS temp dir and always remove it.
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
userDataDir = fs.mkdtempSync(prefix);
removeUserDataDir = true;
}
// NOTE(review): `options` is guarded above (`options && ...`) but dereferenced without a
// guard in the launch call below; an undefined `options` would throw here and end up in
// the catch block, yielding null.
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
args: [
'--no-sandbox',
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
],
// NOTE(review): `||` replaces a configured timeout of 0 with 30_000, whereas the selector
// timeout further down uses `??` — confirm whether 0 is meant to be a valid value.
timeout: options.puppeteerTimeout || 30_000,
userDataDir,
});
let page = await browser.newPage();
page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
// Navigation resolves once the DOM content has loaded; `response` is not used again in the
// visible lines.
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
// if we're extracting data from a SPA, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
// Selector wait timeout: the dedicated option first, then the general Puppeteer timeout,
// then 30_000.
const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
pageSource = await page.evaluate((selector) => {
return document.querySelector(selector).innerHTML;
// Yield an empty string instead of throwing when no element matches the selector.
const el = document.querySelector(selector);
return el ? el.innerHTML : '';
}, waitForSelector);
} else {
pageSource = await page.content();
// NOTE(review): the next line looks like a diff hunk header, so source lines are elided
// here. `statusCode` used below is not defined in the visible lines — presumably derived
// from `response`; confirm.
@@ -35,16 +61,35 @@ export default async function execute(url, waitForSelector, options) {
if (botDetected(pageSource, statusCode)) {
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
result = null;
} else {
// An empty/falsy pageSource falls back to the full page content.
result = pageSource || (await page.content());
}
return await page.content();
} catch (error) {
// Errors are logged and turned into a null result rather than propagated to the caller.
logger.error('Error executing with puppeteer executor', error);
return null;
result = null;
} finally {
if (browser != null) {
await browser.close();
// Each cleanup step has its own try/catch so that a failure in one step does not prevent
// the following steps from running.
try {
if (page) {
await page.close();
}
} catch {
// ignore: a failed page close must not stop the browser from being closed below
}
try {
if (browser != null) {
await browser.close();
}
} catch {
// ignore: a failed browser close must not stop the directory removal below
}
try {
// The directory is removed after the browser close has been attempted.
if (removeUserDataDir && userDataDir) {
await fs.promises.rm(userDataDir, { recursive: true, force: true });
}
} catch {
// ignore: directory removal failures are deliberately not surfaced
}
}
return result;
}

View File

@@ -1,6 +1,6 @@
{
"name": "fredy",
"version": "12.1.5",
"version": "12.1.6",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"prepare": "husky",