mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Puppeteer improvements (#270)
* improve puppeteer handling. Now only 1 puppeteer instance is being used which is WAY more efficient * removing package-lock * reduce logging * removing problematic docker command * Remove Immonet. They now belong to immowelt
This commit is contained in:
committed by
GitHub
parent
05218800d2
commit
00d6a12b30
@@ -19,52 +19,80 @@ import path from 'path';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
export default async function execute(url, waitForSelector, options) {
|
||||
let browser;
|
||||
let page;
|
||||
let result;
|
||||
export async function launchBrowser(url, options) {
|
||||
const preCfg = getPreLaunchConfig(url, options || {});
|
||||
const launchArgs = [
|
||||
'--no-sandbox',
|
||||
'--disable-gpu',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-crash-reporter',
|
||||
'--no-first-run',
|
||||
'--no-default-browser-check',
|
||||
preCfg.langArg,
|
||||
preCfg.windowSizeArg,
|
||||
...preCfg.extraArgs,
|
||||
];
|
||||
if (options?.proxyUrl) {
|
||||
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
|
||||
}
|
||||
|
||||
let userDataDir;
|
||||
let removeUserDataDir = false;
|
||||
if (options && options.userDataDir) {
|
||||
userDataDir = options.userDataDir;
|
||||
} else {
|
||||
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
|
||||
userDataDir = fs.mkdtempSync(prefix);
|
||||
removeUserDataDir = true;
|
||||
}
|
||||
|
||||
const browser = await puppeteer.launch({
|
||||
headless: options?.puppeteerHeadless ?? true,
|
||||
args: launchArgs,
|
||||
timeout: options?.puppeteerTimeout || 30_000,
|
||||
userDataDir,
|
||||
executablePath: options?.executablePath,
|
||||
});
|
||||
|
||||
browser.__fredy_userDataDir = userDataDir;
|
||||
browser.__fredy_removeUserDataDir = removeUserDataDir;
|
||||
|
||||
return browser;
|
||||
}
|
||||
|
||||
export async function closeBrowser(browser) {
|
||||
if (!browser) return;
|
||||
const userDataDir = browser.__fredy_userDataDir;
|
||||
const removeUserDataDir = browser.__fredy_removeUserDataDir;
|
||||
try {
|
||||
await browser.close();
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
if (removeUserDataDir && userDataDir) {
|
||||
try {
|
||||
await fs.promises.rm(userDataDir, { recursive: true, force: true });
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export default async function execute(url, waitForSelector, options) {
|
||||
let browser = options?.browser;
|
||||
let isExternalBrowser = !!browser;
|
||||
let page;
|
||||
let result;
|
||||
try {
|
||||
debug(`Sending request to ${url} using Puppeteer.`);
|
||||
|
||||
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
|
||||
if (options && options.userDataDir) {
|
||||
userDataDir = options.userDataDir;
|
||||
removeUserDataDir = !!options.cleanupUserDataDir;
|
||||
} else {
|
||||
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
|
||||
userDataDir = fs.mkdtempSync(prefix);
|
||||
removeUserDataDir = true;
|
||||
if (!isExternalBrowser) {
|
||||
browser = await launchBrowser(url, options);
|
||||
}
|
||||
|
||||
const launchArgs = [
|
||||
'--no-sandbox',
|
||||
'--disable-gpu',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-crash-reporter',
|
||||
'--no-first-run',
|
||||
'--no-default-browser-check',
|
||||
];
|
||||
if (options?.proxyUrl) {
|
||||
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
|
||||
}
|
||||
// Prepare bot prevention pre-launch config
|
||||
const preCfg = getPreLaunchConfig(url, options || {});
|
||||
launchArgs.push(preCfg.langArg);
|
||||
launchArgs.push(preCfg.windowSizeArg);
|
||||
launchArgs.push(...preCfg.extraArgs);
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: options?.puppeteerHeadless ?? true,
|
||||
args: launchArgs,
|
||||
timeout: options?.puppeteerTimeout || 30_000,
|
||||
userDataDir,
|
||||
executablePath: options?.executablePath, // allow using system Chrome
|
||||
});
|
||||
|
||||
page = await browser.newPage();
|
||||
const preCfg = getPreLaunchConfig(url, options || {});
|
||||
await applyBotPreventionToPage(page, preCfg);
|
||||
// Provide languages value before navigation
|
||||
await applyLanguagePersistence(page, preCfg);
|
||||
@@ -104,7 +132,7 @@ export default async function execute(url, waitForSelector, options) {
|
||||
result = pageSource || (await page.content());
|
||||
}
|
||||
} catch (error) {
|
||||
if (error?.message?.includes('Timeout')) {
|
||||
if (error?.name?.includes('Timeout')) {
|
||||
logger.debug('Error executing with puppeteer executor', error);
|
||||
} else {
|
||||
logger.warn('Error executing with puppeteer executor', error);
|
||||
@@ -118,19 +146,8 @@ export default async function execute(url, waitForSelector, options) {
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
if (removeUserDataDir && userDataDir) {
|
||||
await fs.promises.rm(userDataDir, { recursive: true, force: true });
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
if (browser != null && !isExternalBrowser) {
|
||||
await closeBrowser(browser);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
||||
@@ -13,6 +13,7 @@ import FredyPipelineExecutioner from '../../FredyPipelineExecutioner.js';
|
||||
import * as similarityCache from '../similarity-check/similarityCache.js';
|
||||
import { isRunning, markFinished, markRunning } from './run-state.js';
|
||||
import { sendToUsers } from '../sse/sse-broker.js';
|
||||
import * as puppeteerExtractor from '../extractor/puppeteerExtractor.js';
|
||||
|
||||
/**
|
||||
* Initializes the job execution service.
|
||||
@@ -94,7 +95,7 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
|
||||
* @param {{userId?: string, isAdmin?: boolean}} [context] - Who requested the run; determines job filtering.
|
||||
* @returns {void}
|
||||
*/
|
||||
function runAll(respectWorkingHours = true, context = undefined) {
|
||||
async function runAll(respectWorkingHours = true, context = undefined) {
|
||||
if (settings.demoMode) return;
|
||||
const now = Date.now();
|
||||
const withinHours = duringWorkingHoursOrNotSet(settings, now);
|
||||
@@ -103,15 +104,18 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
|
||||
return;
|
||||
}
|
||||
settings.lastRun = now;
|
||||
jobStorage
|
||||
const jobs = jobStorage
|
||||
.getJobs()
|
||||
.filter((job) => job.enabled)
|
||||
.filter((job) => {
|
||||
if (!context) return true; // startup/cron → all
|
||||
if (context.isAdmin) return true; // admin → all
|
||||
return context.userId ? job.userId === context.userId : false; // user → own
|
||||
})
|
||||
.forEach((job) => executeJob(job));
|
||||
});
|
||||
|
||||
for (const job of jobs) {
|
||||
await executeJob(job);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -154,28 +158,36 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
|
||||
} catch (err) {
|
||||
logger.warn('Failed to emit start status for job', job.id, err);
|
||||
}
|
||||
let browser;
|
||||
try {
|
||||
const jobProviders = job.provider.filter(
|
||||
(p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null,
|
||||
);
|
||||
const executions = jobProviders.map(async (prov) => {
|
||||
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
|
||||
matchedProvider.init(prov, job.blacklist);
|
||||
await new FredyPipelineExecutioner(
|
||||
matchedProvider.config,
|
||||
job.notificationAdapter,
|
||||
prov.id,
|
||||
job.id,
|
||||
similarityCache,
|
||||
).execute();
|
||||
});
|
||||
const results = await Promise.allSettled(executions);
|
||||
for (const r of results) {
|
||||
if (r.status === 'rejected') {
|
||||
logger.error(r.reason);
|
||||
for (const prov of jobProviders) {
|
||||
try {
|
||||
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
|
||||
matchedProvider.init(prov, job.blacklist);
|
||||
|
||||
if (!browser && matchedProvider.config.getListings == null) {
|
||||
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {});
|
||||
}
|
||||
|
||||
await new FredyPipelineExecutioner(
|
||||
matchedProvider.config,
|
||||
job.notificationAdapter,
|
||||
prov.id,
|
||||
job.id,
|
||||
similarityCache,
|
||||
browser,
|
||||
).execute();
|
||||
} catch (err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (browser) {
|
||||
await puppeteerExtractor.closeBrowser(browser);
|
||||
}
|
||||
markFinished(job.id);
|
||||
try {
|
||||
bus.emit('jobs:status', { jobId: job.id, running: false });
|
||||
|
||||
Reference in New Issue
Block a user