Puppeteer improvements (#270)

* improve puppeteer handling. Now only 1 puppeteer instance is being used which is WAY more efficient

* removing package-lock

* reduce logging

* removing problematic docker command

* Remove Immonet. They now belong to immowelt
This commit is contained in:
Christian Kellner
2026-02-18 20:05:02 +01:00
committed by GitHub
parent 05218800d2
commit 00d6a12b30
10 changed files with 106 additions and 13729 deletions

View File

@@ -63,13 +63,15 @@ class FredyPipelineExecutioner {
* @param {string} providerId The ID of the provider currently in use.
* @param {string} jobKey Key of the job that is currently running (from within the config).
* @param {SimilarityCache} similarityCache Cache instance for checking similar entries.
* @param browser
*/
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache, browser) {
this._providerConfig = providerConfig;
this._notificationConfig = notificationConfig;
this._providerId = providerId;
this._jobKey = jobKey;
this._similarityCache = similarityCache;
this._browser = browser;
}
/**
@@ -119,7 +121,7 @@ class FredyPipelineExecutioner {
* @returns {Promise<Listing[]>} Resolves with an array of listings (empty when none found).
*/
_getListings(url) {
const extractor = new Extractor();
const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser });
return new Promise((resolve, reject) => {
extractor
.execute(url, this._providerConfig.waitForSelector)

View File

@@ -1,53 +0,0 @@
/*
* Copyright (c) 2026 by Christian Kellner.
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
*/
import { isOneOf, buildHash } from '../utils.js';
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
let appliedBlackList = [];
function normalize(o) {
const size = o.size != null ? o.size.replace('Wohnfläche ', '') : 'N/A m²';
const price = o.price.replace('Kaufpreis ', '');
const address = o.address?.split(' • ')?.pop() ?? null;
const title = o.title || 'No title available';
const link = o.link != null ? decodeURIComponent(o.link) : config.url;
const id = buildHash(title, price);
return Object.assign(o, { id, address, price, size, title, link });
}
function applyBlacklist(o) {
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
return titleNotBlacklisted && descNotBlacklisted;
}
const config = {
url: null,
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'sortby=19',
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
crawlFields: {
id: 'button@title |trim',
title: 'button@title |trim',
price: 'div[data-testid="cardmfe-price-testid"] | trim',
size: 'div[data-testid="cardmfe-keyfacts-testid"] | trim',
address: 'div[data-testid="cardmfe-description-box-address"] | trim',
image: 'div[data-testid="cardmfe-picture-box-test-id"] img@src',
link: 'button@data-base',
description: 'div[data-testid="cardmfe-description-text-test-id"] | trim',
},
normalize: normalize,
filter: applyBlacklist,
activeTester: checkIfListingIsActive,
};
export const init = (sourceConfig, blacklist) => {
config.enabled = sourceConfig.enabled;
config.url = sourceConfig.url;
appliedBlackList = blacklist || [];
};
export const metaInformation = {
name: 'Immonet',
baseUrl: 'https://www.immonet.de/',
id: 'immonet',
};
export { config };

View File

@@ -19,52 +19,80 @@ import path from 'path';
puppeteer.use(StealthPlugin());
export default async function execute(url, waitForSelector, options) {
let browser;
let page;
let result;
export async function launchBrowser(url, options) {
const preCfg = getPreLaunchConfig(url, options || {});
const launchArgs = [
'--no-sandbox',
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
'--no-first-run',
'--no-default-browser-check',
preCfg.langArg,
preCfg.windowSizeArg,
...preCfg.extraArgs,
];
if (options?.proxyUrl) {
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
}
let userDataDir;
let removeUserDataDir = false;
if (options && options.userDataDir) {
userDataDir = options.userDataDir;
} else {
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
userDataDir = fs.mkdtempSync(prefix);
removeUserDataDir = true;
}
const browser = await puppeteer.launch({
headless: options?.puppeteerHeadless ?? true,
args: launchArgs,
timeout: options?.puppeteerTimeout || 30_000,
userDataDir,
executablePath: options?.executablePath,
});
browser.__fredy_userDataDir = userDataDir;
browser.__fredy_removeUserDataDir = removeUserDataDir;
return browser;
}
export async function closeBrowser(browser) {
if (!browser) return;
const userDataDir = browser.__fredy_userDataDir;
const removeUserDataDir = browser.__fredy_removeUserDataDir;
try {
await browser.close();
} catch {
// ignore
}
if (removeUserDataDir && userDataDir) {
try {
await fs.promises.rm(userDataDir, { recursive: true, force: true });
} catch {
// ignore
}
}
}
export default async function execute(url, waitForSelector, options) {
let browser = options?.browser;
let isExternalBrowser = !!browser;
let page;
let result;
try {
debug(`Sending request to ${url} using Puppeteer.`);
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
if (options && options.userDataDir) {
userDataDir = options.userDataDir;
removeUserDataDir = !!options.cleanupUserDataDir;
} else {
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
userDataDir = fs.mkdtempSync(prefix);
removeUserDataDir = true;
if (!isExternalBrowser) {
browser = await launchBrowser(url, options);
}
const launchArgs = [
'--no-sandbox',
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
'--no-first-run',
'--no-default-browser-check',
];
if (options?.proxyUrl) {
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
}
// Prepare bot prevention pre-launch config
const preCfg = getPreLaunchConfig(url, options || {});
launchArgs.push(preCfg.langArg);
launchArgs.push(preCfg.windowSizeArg);
launchArgs.push(...preCfg.extraArgs);
browser = await puppeteer.launch({
headless: options?.puppeteerHeadless ?? true,
args: launchArgs,
timeout: options?.puppeteerTimeout || 30_000,
userDataDir,
executablePath: options?.executablePath, // allow using system Chrome
});
page = await browser.newPage();
const preCfg = getPreLaunchConfig(url, options || {});
await applyBotPreventionToPage(page, preCfg);
// Provide languages value before navigation
await applyLanguagePersistence(page, preCfg);
@@ -104,7 +132,7 @@ export default async function execute(url, waitForSelector, options) {
result = pageSource || (await page.content());
}
} catch (error) {
if (error?.message?.includes('Timeout')) {
if (error?.name?.includes('Timeout')) {
logger.debug('Error executing with puppeteer executor', error);
} else {
logger.warn('Error executing with puppeteer executor', error);
@@ -118,19 +146,8 @@ export default async function execute(url, waitForSelector, options) {
} catch {
// ignore
}
try {
if (browser != null) {
await browser.close();
}
} catch {
// ignore
}
try {
if (removeUserDataDir && userDataDir) {
await fs.promises.rm(userDataDir, { recursive: true, force: true });
}
} catch {
// ignore
if (browser != null && !isExternalBrowser) {
await closeBrowser(browser);
}
}
return result;

View File

@@ -13,6 +13,7 @@ import FredyPipelineExecutioner from '../../FredyPipelineExecutioner.js';
import * as similarityCache from '../similarity-check/similarityCache.js';
import { isRunning, markFinished, markRunning } from './run-state.js';
import { sendToUsers } from '../sse/sse-broker.js';
import * as puppeteerExtractor from '../extractor/puppeteerExtractor.js';
/**
* Initializes the job execution service.
@@ -94,7 +95,7 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
* @param {{userId?: string, isAdmin?: boolean}} [context] - Who requested the run; determines job filtering.
* @returns {void}
*/
function runAll(respectWorkingHours = true, context = undefined) {
async function runAll(respectWorkingHours = true, context = undefined) {
if (settings.demoMode) return;
const now = Date.now();
const withinHours = duringWorkingHoursOrNotSet(settings, now);
@@ -103,15 +104,18 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
return;
}
settings.lastRun = now;
jobStorage
const jobs = jobStorage
.getJobs()
.filter((job) => job.enabled)
.filter((job) => {
if (!context) return true; // startup/cron → all
if (context.isAdmin) return true; // admin → all
return context.userId ? job.userId === context.userId : false; // user → own
})
.forEach((job) => executeJob(job));
});
for (const job of jobs) {
await executeJob(job);
}
}
/**
@@ -154,28 +158,36 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
} catch (err) {
logger.warn('Failed to emit start status for job', job.id, err);
}
let browser;
try {
const jobProviders = job.provider.filter(
(p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null,
);
const executions = jobProviders.map(async (prov) => {
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
matchedProvider.init(prov, job.blacklist);
await new FredyPipelineExecutioner(
matchedProvider.config,
job.notificationAdapter,
prov.id,
job.id,
similarityCache,
).execute();
});
const results = await Promise.allSettled(executions);
for (const r of results) {
if (r.status === 'rejected') {
logger.error(r.reason);
for (const prov of jobProviders) {
try {
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
matchedProvider.init(prov, job.blacklist);
if (!browser && matchedProvider.config.getListings == null) {
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {});
}
await new FredyPipelineExecutioner(
matchedProvider.config,
job.notificationAdapter,
prov.id,
job.id,
similarityCache,
browser,
).execute();
} catch (err) {
logger.error(err);
}
}
} finally {
if (browser) {
await puppeteerExtractor.closeBrowser(browser);
}
markFinished(job.id);
try {
bus.emit('jobs:status', { jobId: job.id, running: false });