From e1db3840f64a85d15927769c6704ca6654773846 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Tue, 7 Jan 2025 12:37:50 +0100 Subject: [PATCH] adding puppeteer timeout and fixing waitForSelector --- lib/provider/immonet.js | 4 +- lib/provider/immowelt.js | 2 +- lib/services/extractor/parser/parser.js | 1 + lib/services/extractor/puppeteerExtractor.js | 81 ++++++++++---------- 4 files changed, 45 insertions(+), 43 deletions(-) diff --git a/lib/provider/immonet.js b/lib/provider/immonet.js index cb53ee5..1abf91a 100755 --- a/lib/provider/immonet.js +++ b/lib/provider/immonet.js @@ -1,4 +1,4 @@ -import utils, {buildHash} from '../utils.js'; +import utils, { buildHash } from '../utils.js'; let appliedBlackList = []; /** @@ -26,7 +26,7 @@ const config = { url: null, crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]', sortByDateParam: 'sortby=19', - waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]', + waitForSelector: 'div[data-testid="serp-resultscount-testid"]', crawlFields: { id: 'button@title |trim', // immonet is a piece of sh*t. See comment above title: 'button@title |trim', diff --git a/lib/provider/immowelt.js b/lib/provider/immowelt.js index 393ddbd..6be7254 100755 --- a/lib/provider/immowelt.js +++ b/lib/provider/immowelt.js @@ -18,7 +18,7 @@ const config = { crawlContainer: 'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]', sortByDateParam: 'order=DateDesc', - waitForSelector: 'div[data-testid="cardmfe-price-testid"]', + waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]', crawlFields: { id: 'a@href', price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim', diff --git a/lib/services/extractor/parser/parser.js b/lib/services/extractor/parser/parser.js index 194200b..3023b98 100644 --- a/lib/services/extractor/parser/parser.js +++ b/lib/services/extractor/parser/parser.js @@ -21,6 +21,7 @@ export function parse(crawlContainer, crawlFields, text, url) { if ($(crawlContainer).length === 0) { console.error('No elements in crawl container found for url ', url); + return null; } $(crawlContainer).each((_, element) => { diff --git a/lib/services/extractor/puppeteerExtractor.js b/lib/services/extractor/puppeteerExtractor.js index 8a3200d..7d858de 100644 --- a/lib/services/extractor/puppeteerExtractor.js +++ b/lib/services/extractor/puppeteerExtractor.js @@ -1,48 +1,49 @@ import puppeteer from 'puppeteer-extra'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; -import {debug, DEFAULT_HEADER, botDetected} from './utils.js'; +import { debug, DEFAULT_HEADER, botDetected } from './utils.js'; puppeteer.use(StealthPlugin()); export default async function execute(url, waitForSelector, options) { - let browser; - try { - debug(`Sending request to ${url} using Puppeteer.`); + let browser; + try { + debug(`Sending request to ${url} using Puppeteer.`); - browser = await puppeteer.launch({ - headless: options.puppeteerHeadless ?? true, - args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'] - }); - let page = await browser.newPage(); - await page.setExtraHTTPHeaders(DEFAULT_HEADER); - const response = await page.goto(url, { - waitUntil: 'domcontentloaded' - }); - let pageSource; - //if we're extracting data from a spa, we must wait for the selector - if (waitForSelector != null) { - await page.waitForSelector(waitForSelector); - pageSource = await page.evaluate(selector => { - return document.querySelector(selector).innerHTML; - }, waitForSelector); - } else { - pageSource = await page.content(); - } - - const statusCode = response.status(); - - if (botDetected(pageSource, statusCode)) { - console.warn('We have been detected as a bot :-/ Tried url: => ', url); - return null; - } - - return await page.content(); - } catch (error) { - console.error('Error executing with puppeteer executor', error); - return null; - } finally { - if (browser != null) { - await browser.close(); - } + browser = await puppeteer.launch({ + headless: options.puppeteerHeadless ?? true, + args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'], + timeout: options.puppeteerTimeout || 30_000, + }); + let page = await browser.newPage(); + await page.setExtraHTTPHeaders(DEFAULT_HEADER); + const response = await page.goto(url, { + waitUntil: 'domcontentloaded', + }); + let pageSource; + //if we're extracting data from a spa, we must wait for the selector + if (waitForSelector != null) { + await page.waitForSelector(waitForSelector); + pageSource = await page.evaluate((selector) => { + return document.querySelector(selector).innerHTML; + }, waitForSelector); + } else { + pageSource = await page.content(); } -} \ No newline at end of file + + const statusCode = response.status(); + + if (botDetected(pageSource, statusCode)) { + console.warn('We have been detected as a bot :-/ Tried url: => ', url); + return null; + } + + return await page.content(); + } catch (error) { + console.error('Error executing with puppeteer executor', error); + return null; + } finally { + if (browser != null) { + await browser.close(); + } + } +}