diff --git a/lib/provider/neubauKompass.js b/lib/provider/neubauKompass.js index a6cf3cc..de4808f 100755 --- a/lib/provider/neubauKompass.js +++ b/lib/provider/neubauKompass.js @@ -23,7 +23,7 @@ const config = { url: null, crawlContainer: '.col-12.mb-4', sortByDateParam: 'Sortierung=Id&Richtung=DESC', - waitForSelector: '.nbk-section', + waitForSelector: 'div[data-live-name-value="SearchList"]', crawlFields: { id: 'a@href', title: 'a@title | removeNewline | trim', diff --git a/lib/services/extractor/parser/parser.js b/lib/services/extractor/parser/parser.js index 82d27f7..0666ac6 100644 --- a/lib/services/extractor/parser/parser.js +++ b/lib/services/extractor/parser/parser.js @@ -9,12 +9,12 @@ export function loadParser(text) { export function parse(crawlContainer, crawlFields, text, url) { if (!text) { - logger.warn('No content found for ', url); + logger.debug('No content found for ', url); return null; } if (!crawlContainer || !crawlFields) { - logger.warn('Cannot parse, selector was empty for url ', url); + logger.debug('Cannot parse, selector was empty for url ', url); return null; } diff --git a/lib/services/extractor/puppeteerExtractor.js b/lib/services/extractor/puppeteerExtractor.js index b8f7259..0818a5d 100644 --- a/lib/services/extractor/puppeteerExtractor.js +++ b/lib/services/extractor/puppeteerExtractor.js @@ -2,30 +2,56 @@ import puppeteer from 'puppeteer-extra'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; import { debug, DEFAULT_HEADER, botDetected } from './utils.js'; import logger from '../logger.js'; +import fs from 'fs'; +import os from 'os'; +import path from 'path'; puppeteer.use(StealthPlugin()); export default async function execute(url, waitForSelector, options) { let browser; + let page; + let result = null; + let userDataDir; + let removeUserDataDir = false; try { debug(`Sending request to ${url} using Puppeteer.`); + // Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs + if (options && options.userDataDir) { + userDataDir = options.userDataDir; + removeUserDataDir = !!options.cleanupUserDataDir; + } else { + const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-'); + userDataDir = fs.mkdtempSync(prefix); + removeUserDataDir = true; + } + browser = await puppeteer.launch({ headless: options.puppeteerHeadless ?? true, - args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'], + args: [ + '--no-sandbox', + '--disable-gpu', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-crash-reporter', + ], timeout: options.puppeteerTimeout || 30_000, + userDataDir, }); - let page = await browser.newPage(); + page = await browser.newPage(); await page.setExtraHTTPHeaders(DEFAULT_HEADER); const response = await page.goto(url, { waitUntil: 'domcontentloaded', }); let pageSource; - //if we're extracting data from a spa, we must wait for the selector + // if we're extracting data from a SPA, we must wait for the selector if (waitForSelector != null) { - await page.waitForSelector(waitForSelector); + const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000; + await page.waitForSelector(waitForSelector, { timeout: selectorTimeout }); pageSource = await page.evaluate((selector) => { - return document.querySelector(selector).innerHTML; + const el = document.querySelector(selector); + return el ? el.innerHTML : ''; }, waitForSelector); } else { pageSource = await page.content(); @@ -35,16 +61,35 @@ export default async function execute(url, waitForSelector, options) { if (botDetected(pageSource, statusCode)) { logger.warn('We have been detected as a bot :-/ Tried url: => ', url); - return null; + result = null; + } else { + result = pageSource || (await page.content()); } - - return await page.content(); } catch (error) { logger.error('Error executing with puppeteer executor', error); - return null; + result = null; } finally { - if (browser != null) { - await browser.close(); + try { + if (page) { + await page.close(); + } + } catch { + // ignore + } + try { + if (browser != null) { + await browser.close(); + } + } catch { + // ignore + } + try { + if (removeUserDataDir && userDataDir) { + await fs.promises.rm(userDataDir, { recursive: true, force: true }); + } + } catch { + // ignore } } + return result; } diff --git a/package.json b/package.json index 386ebc7..99d8e56 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "12.1.5", + "version": "12.1.6", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "prepare": "husky", @@ -70,7 +70,7 @@ "handlebars": "4.7.8", "lodash": "4.17.21", "markdown": "^0.5.0", - "nanoid": "5.1.5", + "nanoid": "5.1.6", "node-cron": "^4.2.1", "node-fetch": "3.3.2", "node-mailjet": "6.0.9", diff --git a/yarn.lock b/yarn.lock index df4458f..9b7f570 100644 --- a/yarn.lock +++ b/yarn.lock @@ -5406,10 +5406,10 @@ nano-spawn@1.0.3: resolved "https://registry.yarnpkg.com/nano-spawn/-/nano-spawn-1.0.3.tgz#ef8d89a275eebc8657e67b95fc312a6527a05b8d" integrity sha512-jtpsQDetTnvS2Ts1fiRdci5rx0VYws5jGyC+4IYOTnIQ/wwdf6JdomlHBwqC3bJYOvaKu0C2GSZ1A60anrYpaA== -nanoid@5.1.5: - version "5.1.5" - resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.5.tgz#f7597f9d9054eb4da9548cdd53ca70f1790e87de" - integrity sha512-Ir/+ZpE9fDsNH0hQ3C68uyThDXzYcim2EqcZ8zn8Chtt1iylPT9xXJB0kPCnqzgcEGikO9RxSrh63MsmVCU7Fw== +nanoid@5.1.6: + version "5.1.6" + resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.6.tgz#30363f664797e7d40429f6c16946d6bd7a3f26c9" + integrity sha512-c7+7RQ+dMB5dPwwCp4ee1/iV/q2P6aK1mTZcfr1BTuVlyW9hJYiMPybJCcnBlQtuSmTIWNeazm/zqNoZSSElBg== nanoid@^3.3.11: version "3.3.11"