From 2d461756c172d16029883077240c0707ec06d9cf Mon Sep 17 00:00:00 2001 From: orangecoding Date: Mon, 22 Sep 2025 10:08:59 +0200 Subject: [PATCH] improve puppeteer handling --- lib/services/extractor/puppeteerExtractor.js | 67 ++++++++++++++++---- package.json | 2 +- 2 files changed, 57 insertions(+), 12 deletions(-) diff --git a/lib/services/extractor/puppeteerExtractor.js b/lib/services/extractor/puppeteerExtractor.js index b8f7259..0818a5d 100644 --- a/lib/services/extractor/puppeteerExtractor.js +++ b/lib/services/extractor/puppeteerExtractor.js @@ -2,30 +2,56 @@ import puppeteer from 'puppeteer-extra'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; import { debug, DEFAULT_HEADER, botDetected } from './utils.js'; import logger from '../logger.js'; +import fs from 'fs'; +import os from 'os'; +import path from 'path'; puppeteer.use(StealthPlugin()); export default async function execute(url, waitForSelector, options) { let browser; + let page; + let result = null; + let userDataDir; + let removeUserDataDir = false; try { debug(`Sending request to ${url} using Puppeteer.`); + // Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs + if (options && options.userDataDir) { + userDataDir = options.userDataDir; + removeUserDataDir = !!options.cleanupUserDataDir; + } else { + const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-'); + userDataDir = fs.mkdtempSync(prefix); + removeUserDataDir = true; + } + browser = await puppeteer.launch({ headless: options.puppeteerHeadless ?? true, - args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'], + args: [ + '--no-sandbox', + '--disable-gpu', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-crash-reporter', + ], timeout: options.puppeteerTimeout || 30_000, + userDataDir, }); - let page = await browser.newPage(); + page = await browser.newPage(); await page.setExtraHTTPHeaders(DEFAULT_HEADER); const response = await page.goto(url, { waitUntil: 'domcontentloaded', }); let pageSource; - //if we're extracting data from a spa, we must wait for the selector + // if we're extracting data from a SPA, we must wait for the selector if (waitForSelector != null) { - await page.waitForSelector(waitForSelector); + const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000; + await page.waitForSelector(waitForSelector, { timeout: selectorTimeout }); pageSource = await page.evaluate((selector) => { - return document.querySelector(selector).innerHTML; + const el = document.querySelector(selector); + return el ? el.innerHTML : ''; }, waitForSelector); } else { pageSource = await page.content(); @@ -35,16 +61,35 @@ export default async function execute(url, waitForSelector, options) { if (botDetected(pageSource, statusCode)) { logger.warn('We have been detected as a bot :-/ Tried url: => ', url); - return null; + result = null; + } else { + result = pageSource || (await page.content()); } - - return await page.content(); } catch (error) { logger.error('Error executing with puppeteer executor', error); - return null; + result = null; } finally { - if (browser != null) { - await browser.close(); + try { + if (page) { + await page.close(); + } + } catch { + // ignore + } + try { + if (browser != null) { + await browser.close(); + } + } catch { + // ignore + } + try { + if (removeUserDataDir && userDataDir) { + await fs.promises.rm(userDataDir, { recursive: true, force: true }); + } + } catch { + // ignore } } + return result; } diff --git a/package.json b/package.json index 386ebc7..3e2e3a1 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "12.1.5", + "version": "12.1.6", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "prepare": "husky",