adding puppeteer timeout and fixing waitForSelector

This commit is contained in:
Christian Kellner
2025-01-07 12:37:50 +01:00
parent 26127eeac1
commit e1db3840f6
4 changed files with 45 additions and 43 deletions

View File

@@ -1,4 +1,4 @@
import utils, {buildHash} from '../utils.js';
import utils, { buildHash } from '../utils.js';
let appliedBlackList = [];
/**
@@ -26,7 +26,7 @@ const config = {
url: null,
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'sortby=19',
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
crawlFields: {
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
title: 'button@title |trim',

View File

@@ -18,7 +18,7 @@ const config = {
crawlContainer:
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'order=DateDesc',
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
crawlFields: {
id: 'a@href',
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',

View File

@@ -21,6 +21,7 @@ export function parse(crawlContainer, crawlFields, text, url) {
if ($(crawlContainer).length === 0) {
console.error('No elements in crawl container found for url ', url);
return null;
}
$(crawlContainer).each((_, element) => {

View File

@@ -1,48 +1,49 @@
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {debug, DEFAULT_HEADER, botDetected} from './utils.js';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
puppeteer.use(StealthPlugin());
export default async function execute(url, waitForSelector, options) {
let browser;
try {
debug(`Sending request to ${url} using Puppeteer.`);
let browser;
try {
debug(`Sending request to ${url} using Puppeteer.`);
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox']
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded'
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate(selector => {
return document.querySelector(selector).innerHTML;
}, waitForSelector);
} else {
pageSource = await page.content();
}
const statusCode = response.status();
if (botDetected(pageSource, statusCode)) {
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
}
return await page.content();
} catch (error) {
console.error('Error executing with puppeteer executor', error);
return null;
} finally {
if (browser != null) {
await browser.close();
}
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
timeout: options.puppeteerTimeout || 30_000,
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate((selector) => {
return document.querySelector(selector).innerHTML;
}, waitForSelector);
} else {
pageSource = await page.content();
}
}
const statusCode = response.status();
if (botDetected(pageSource, statusCode)) {
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
}
return await page.content();
} catch (error) {
console.error('Error executing with puppeteer executor', error);
return null;
} finally {
if (browser != null) {
await browser.close();
}
}
}