mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Puppeteer improvements (#186)
* improving puppeteer handling * upgrade dependencies * reduce logging * upgrade nanoid
This commit is contained in:
committed by
GitHub
parent
c839f3abc9
commit
11fd18e76a
@@ -9,12 +9,12 @@ export function loadParser(text) {
|
||||
|
||||
export function parse(crawlContainer, crawlFields, text, url) {
|
||||
if (!text) {
|
||||
logger.warn('No content found for ', url);
|
||||
logger.debug('No content found for ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!crawlContainer || !crawlFields) {
|
||||
logger.warn('Cannot parse, selector was empty for url ', url);
|
||||
logger.debug('Cannot parse, selector was empty for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,30 +2,56 @@ import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
|
||||
import logger from '../logger.js';
|
||||
import fs from 'fs';
|
||||
import os from 'os';
|
||||
import path from 'path';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
export default async function execute(url, waitForSelector, options) {
|
||||
let browser;
|
||||
let page;
|
||||
let result = null;
|
||||
let userDataDir;
|
||||
let removeUserDataDir = false;
|
||||
try {
|
||||
debug(`Sending request to ${url} using Puppeteer.`);
|
||||
|
||||
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
|
||||
if (options && options.userDataDir) {
|
||||
userDataDir = options.userDataDir;
|
||||
removeUserDataDir = !!options.cleanupUserDataDir;
|
||||
} else {
|
||||
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
|
||||
userDataDir = fs.mkdtempSync(prefix);
|
||||
removeUserDataDir = true;
|
||||
}
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: options.puppeteerHeadless ?? true,
|
||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
|
||||
args: [
|
||||
'--no-sandbox',
|
||||
'--disable-gpu',
|
||||
'--disable-setuid-sandbox',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-crash-reporter',
|
||||
],
|
||||
timeout: options.puppeteerTimeout || 30_000,
|
||||
userDataDir,
|
||||
});
|
||||
let page = await browser.newPage();
|
||||
page = await browser.newPage();
|
||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: 'domcontentloaded',
|
||||
});
|
||||
let pageSource;
|
||||
//if we're extracting data from a spa, we must wait for the selector
|
||||
// if we're extracting data from a SPA, we must wait for the selector
|
||||
if (waitForSelector != null) {
|
||||
await page.waitForSelector(waitForSelector);
|
||||
const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
|
||||
await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
|
||||
pageSource = await page.evaluate((selector) => {
|
||||
return document.querySelector(selector).innerHTML;
|
||||
const el = document.querySelector(selector);
|
||||
return el ? el.innerHTML : '';
|
||||
}, waitForSelector);
|
||||
} else {
|
||||
pageSource = await page.content();
|
||||
@@ -35,16 +61,35 @@ export default async function execute(url, waitForSelector, options) {
|
||||
|
||||
if (botDetected(pageSource, statusCode)) {
|
||||
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
||||
return null;
|
||||
result = null;
|
||||
} else {
|
||||
result = pageSource || (await page.content());
|
||||
}
|
||||
|
||||
return await page.content();
|
||||
} catch (error) {
|
||||
logger.error('Error executing with puppeteer executor', error);
|
||||
return null;
|
||||
result = null;
|
||||
} finally {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
try {
|
||||
if (page) {
|
||||
await page.close();
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
try {
|
||||
if (removeUserDataDir && userDataDir) {
|
||||
await fs.promises.rm(userDataDir, { recursive: true, force: true });
|
||||
}
|
||||
} catch {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user