diff --git a/lib/FredyPipelineExecutioner.js b/lib/FredyPipelineExecutioner.js index 1ab3453..a9b5c90 100755 --- a/lib/FredyPipelineExecutioner.js +++ b/lib/FredyPipelineExecutioner.js @@ -227,7 +227,7 @@ class FredyPipelineExecutioner { const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser }); return new Promise((resolve, reject) => { extractor - .execute(url, this._providerConfig.waitForSelector) + .execute(url, this._providerConfig.waitForSelector, this._jobKey) .then(() => { const listings = extractor.parseResponseText( this._providerConfig.crawlContainer, diff --git a/lib/provider/immobilienDe.js b/lib/provider/immobilienDe.js index 40fbb2d..040f65f 100644 --- a/lib/provider/immobilienDe.js +++ b/lib/provider/immobilienDe.js @@ -26,7 +26,7 @@ function parseId(shortenedLink) { async function fetchDetails(listing, browser) { try { - const html = await puppeteerExtractor(listing.link, null, { browser }); + const html = await puppeteerExtractor(listing.link, null, { browser, name: 'immobilienDe_details' }); if (!html) return listing; const $ = cheerio.load(html); diff --git a/lib/provider/immowelt.js b/lib/provider/immowelt.js index 09e1ff1..722896b 100755 --- a/lib/provider/immowelt.js +++ b/lib/provider/immowelt.js @@ -16,7 +16,7 @@ let appliedBlackList = []; async function fetchDetails(listing, browser) { try { - const html = await puppeteerExtractor(listing.link, null, { browser }); + const html = await puppeteerExtractor(listing.link, null, { browser, name: 'immowelt_details' }); if (!html) return listing; const $ = cheerio.load(html); diff --git a/lib/provider/kleinanzeigen.js b/lib/provider/kleinanzeigen.js index a132d11..6ec3965 100755 --- a/lib/provider/kleinanzeigen.js +++ b/lib/provider/kleinanzeigen.js @@ -128,7 +128,7 @@ async function enrichListingFromDetails(listing, browser) { if (!absoluteLink) return listing; try { - const html = await puppeteerExtractor(absoluteLink, null, { browser }); + const html = await puppeteerExtractor(absoluteLink, null, { browser, name: 'kleinanzeigen_details' }); if (!html) return { ...listing, link: absoluteLink }; const { detailAddress, detailDescription } = extractDetailFromHtml(html); diff --git a/lib/provider/sparkasse.js b/lib/provider/sparkasse.js index 71c77c4..84ea195 100755 --- a/lib/provider/sparkasse.js +++ b/lib/provider/sparkasse.js @@ -16,7 +16,7 @@ let appliedBlackList = []; async function fetchDetails(listing, browser) { try { - const html = await puppeteerExtractor(listing.link, 'body', { browser }); + const html = await puppeteerExtractor(listing.link, 'body', { browser, name: 'sparkasse_details' }); const $ = cheerio.load(html); const nextDataRaw = $('#__NEXT_DATA__').text; diff --git a/lib/provider/wgGesucht.js b/lib/provider/wgGesucht.js index cf1257b..b20c84d 100755 --- a/lib/provider/wgGesucht.js +++ b/lib/provider/wgGesucht.js @@ -16,7 +16,7 @@ let appliedBlackList = []; async function fetchDetails(listing, browser) { try { - const html = await puppeteerExtractor(listing.link, null, { browser }); + const html = await puppeteerExtractor(listing.link, null, { browser, name: 'wgGesucht_details' }); if (!html) return listing; const $ = cheerio.load(html); diff --git a/lib/services/extractor/extractor.js b/lib/services/extractor/extractor.js index 79cb0ea..2dd630c 100644 --- a/lib/services/extractor/extractor.js +++ b/lib/services/extractor/extractor.js @@ -29,11 +29,12 @@ export default class Extractor { * your response will never contain what you are really looking for * @param url * @param waitForSelector + * @param jobKey */ - execute = async (url, waitForSelector = null) => { + execute = async (url, waitForSelector = null, jobKey = null) => { this.responseText = null; try { - this.responseText = await puppeteerExtractor(url, waitForSelector, this.options); + this.responseText = await puppeteerExtractor(url, waitForSelector, { ...this.options, name: jobKey }); if (this.responseText != null) { loadParser(this.responseText); } diff --git a/lib/services/extractor/puppeteerExtractor.js b/lib/services/extractor/puppeteerExtractor.js index 96727eb..fb4147e 100644 --- a/lib/services/extractor/puppeteerExtractor.js +++ b/lib/services/extractor/puppeteerExtractor.js @@ -148,7 +148,11 @@ export default async function execute(url, waitForSelector, options) { if (botDetected(pageSource, statusCode)) { logger.warn('We have been detected as a bot :-/ Tried url: => ', url); - await trackPoi(TRACKING_POIS.DETECTED_AS_BOT); + if (options != null && options.name != null) { + await trackPoi(TRACKING_POIS.DETECTED_AS_BOT + '_' + options.name); + } else { + await trackPoi(TRACKING_POIS.DETECTED_AS_BOT); + } result = null; } else { diff --git a/package.json b/package.json index dce50c1..049f157 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "22.0.5", + "version": "22.0.6", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "prepare": "husky", diff --git a/tools/testFixtures/downloadFixtures.js b/tools/testFixtures/downloadFixtures.js index 0d08441..8322b6c 100644 --- a/tools/testFixtures/downloadFixtures.js +++ b/tools/testFixtures/downloadFixtures.js @@ -95,7 +95,10 @@ async function downloadHtmlProvider(name, providerConfig, launchBrowser, closeBr const browser = await launchBrowser(providerConfig.url, {}); try { - const html = await puppeteerExtractor(providerConfig.url, providerConfig.waitForSelector, { browser }); + const html = await puppeteerExtractor(providerConfig.url, providerConfig.waitForSelector, { + browser, + name: 'dowload_fixtures', + }); if (!html) { console.warn(` Failed to download ${name}`);