Puppeteer improvements (#186)

* improving puppeteer handling

* upgrade dependencies

* reduce logging

* upgrade nanoid
This commit is contained in:
Christian Kellner
2025-09-22 20:53:00 +02:00
committed by GitHub
parent c839f3abc9
commit 11fd18e76a
5 changed files with 65 additions and 20 deletions

View File

@@ -23,7 +23,7 @@ const config = {
url: null,
crawlContainer: '.col-12.mb-4',
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
waitForSelector: '.nbk-section',
waitForSelector: 'div[data-live-name-value="SearchList"]',
crawlFields: {
id: 'a@href',
title: 'a@title | removeNewline | trim',

View File

@@ -9,12 +9,12 @@ export function loadParser(text) {
export function parse(crawlContainer, crawlFields, text, url) {
if (!text) {
logger.warn('No content found for ', url);
logger.debug('No content found for ', url);
return null;
}
if (!crawlContainer || !crawlFields) {
logger.warn('Cannot parse, selector was empty for url ', url);
logger.debug('Cannot parse, selector was empty for url ', url);
return null;
}

View File

@@ -2,30 +2,56 @@ import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
import logger from '../logger.js';
import fs from 'fs';
import os from 'os';
import path from 'path';
puppeteer.use(StealthPlugin());
export default async function execute(url, waitForSelector, options) {
let browser;
let page;
let result = null;
let userDataDir;
let removeUserDataDir = false;
try {
debug(`Sending request to ${url} using Puppeteer.`);
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
if (options && options.userDataDir) {
userDataDir = options.userDataDir;
removeUserDataDir = !!options.cleanupUserDataDir;
} else {
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
userDataDir = fs.mkdtempSync(prefix);
removeUserDataDir = true;
}
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
args: [
'--no-sandbox',
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
],
timeout: options.puppeteerTimeout || 30_000,
userDataDir,
});
let page = await browser.newPage();
page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
// if we're extracting data from a SPA, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
pageSource = await page.evaluate((selector) => {
return document.querySelector(selector).innerHTML;
const el = document.querySelector(selector);
return el ? el.innerHTML : '';
}, waitForSelector);
} else {
pageSource = await page.content();
@@ -35,16 +61,35 @@ export default async function execute(url, waitForSelector, options) {
if (botDetected(pageSource, statusCode)) {
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
result = null;
} else {
result = pageSource || (await page.content());
}
return await page.content();
} catch (error) {
logger.error('Error executing with puppeteer executor', error);
return null;
result = null;
} finally {
if (browser != null) {
await browser.close();
try {
if (page) {
await page.close();
}
} catch {
// ignore
}
try {
if (browser != null) {
await browser.close();
}
} catch {
// ignore
}
try {
if (removeUserDataDir && userDataDir) {
await fs.promises.rm(userDataDir, { recursive: true, force: true });
}
} catch {
// ignore
}
}
return result;
}

View File

@@ -1,6 +1,6 @@
{
"name": "fredy",
"version": "12.1.5",
"version": "12.1.6",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"prepare": "husky",
@@ -70,7 +70,7 @@
"handlebars": "4.7.8",
"lodash": "4.17.21",
"markdown": "^0.5.0",
"nanoid": "5.1.5",
"nanoid": "5.1.6",
"node-cron": "^4.2.1",
"node-fetch": "3.3.2",
"node-mailjet": "6.0.9",

View File

@@ -5406,10 +5406,10 @@ nano-spawn@1.0.3:
resolved "https://registry.yarnpkg.com/nano-spawn/-/nano-spawn-1.0.3.tgz#ef8d89a275eebc8657e67b95fc312a6527a05b8d"
integrity sha512-jtpsQDetTnvS2Ts1fiRdci5rx0VYws5jGyC+4IYOTnIQ/wwdf6JdomlHBwqC3bJYOvaKu0C2GSZ1A60anrYpaA==
nanoid@5.1.5:
version "5.1.5"
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.5.tgz#f7597f9d9054eb4da9548cdd53ca70f1790e87de"
integrity sha512-Ir/+ZpE9fDsNH0hQ3C68uyThDXzYcim2EqcZ8zn8Chtt1iylPT9xXJB0kPCnqzgcEGikO9RxSrh63MsmVCU7Fw==
nanoid@5.1.6:
version "5.1.6"
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.6.tgz#30363f664797e7d40429f6c16946d6bd7a3f26c9"
integrity sha512-c7+7RQ+dMB5dPwwCp4ee1/iV/q2P6aK1mTZcfr1BTuVlyW9hJYiMPybJCcnBlQtuSmTIWNeazm/zqNoZSSElBg==
nanoid@^3.3.11:
version "3.3.11"