Formatting pass (2-space indent, spaced import braces, trailing commas) over
the runtime, job router and extractor modules, plus three functional changes:
the puppeteer timeout goes from 20_000 to 60_000, the crawled url is threaded
through Extractor.parseResponseText() into parse() so its log messages name
the page, and two unused imports are dropped from jobRouter.js.

Note: in parser.js the no-unused-vars suppression around the `_` placeholder
is closed with eslint-enable so the rule does not stay off for the rest of the
file; the post-image blob id on that file's index line predates this
correction.

diff --git a/lib/FredyRuntime.js b/lib/FredyRuntime.js
index 949c700..639c74d 100755
--- a/lib/FredyRuntime.js
+++ b/lib/FredyRuntime.js
@@ -1,118 +1,124 @@
-import {NoNewListingsWarning} from './errors.js';
-import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js';
+import { NoNewListingsWarning } from './errors.js';
+import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js';
 import * as notify from './notification/notify.js';
 import Extractor from './services/extractor/extractor.js';
 import urlModifier from './services/queryStringMutator.js';
 
 class FredyRuntime {
-    /**
-     *
-     * @param providerConfig the config for the specific provider, we're going to query at the moment
-     * @param notificationConfig the config for all notifications
-     * @param providerId the id of the provider currently in use
-     * @param jobKey key of the job that is currently running (from within the config)
-     * @param similarityCache cache instance holding values to check for similarity of entries
-     */
-    constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
-        this._providerConfig = providerConfig;
-        this._notificationConfig = notificationConfig;
-        this._providerId = providerId;
-        this._jobKey = jobKey;
-        this._similarityCache = similarityCache;
-    }
+  /**
+   *
+   * @param providerConfig the config for the specific provider, we're going to query at the moment
+   * @param notificationConfig the config for all notifications
+   * @param providerId the id of the provider currently in use
+   * @param jobKey key of the job that is currently running (from within the config)
+   * @param similarityCache cache instance holding values to check for similarity of entries
+   */
+  constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
+    this._providerConfig = providerConfig;
+    this._notificationConfig = notificationConfig;
+    this._providerId = providerId;
+    this._jobKey = jobKey;
+    this._similarityCache = similarityCache;
+  }
 
-    execute() {
-        return (
-            //modify the url to make sure search order is correctly set
-            Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
-                //scraping the site and try finding new listings
-                .then(this._getListings.bind(this))
-                //bring them in a proper form (dictated by the provider)
-                .then(this._normalize.bind(this))
-                //filter listings with stuff tagged by the blacklist of the provider
-                .then(this._filter.bind(this))
-                //check if new listings available. if so proceed
-                .then(this._findNew.bind(this))
-                //store everything in db
-                .then(this._save.bind(this))
-                //check for similar listings. if found, remove them before notifying
-                .then(this._filterBySimilarListings.bind(this))
-                //notify the user using the configured notification adapter
-                .then(this._notify.bind(this))
-                //if an error occurred on the way, handle it here.
-                .catch(this._handleError.bind(this))
-        );
-    }
+  execute() {
+    return (
+      //modify the url to make sure search order is correctly set
+      Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
+        //scraping the site and try finding new listings
+        .then(this._getListings.bind(this))
+        //bring them in a proper form (dictated by the provider)
+        .then(this._normalize.bind(this))
+        //filter listings with stuff tagged by the blacklist of the provider
+        .then(this._filter.bind(this))
+        //check if new listings available. if so proceed
+        .then(this._findNew.bind(this))
+        //store everything in db
+        .then(this._save.bind(this))
+        //check for similar listings. if found, remove them before notifying
+        .then(this._filterBySimilarListings.bind(this))
+        //notify the user using the configured notification adapter
+        .then(this._notify.bind(this))
+        //if an error occurred on the way, handle it here.
+        .catch(this._handleError.bind(this))
+    );
+  }
 
-    _getListings(url) {
-        const extractor = new Extractor();
-        return new Promise((resolve, reject) => {
-            extractor.execute(url,this._providerConfig.waitForSelector)
-                .then(() => {
-                    const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields);
-                    resolve(listings == null ? [] : listings);
-                }).catch(err => {
-                    reject(err);
-                    /* eslint-disable no-console */
-                    console.error(err);
-                    /* eslint-enable no-console */
-                });
+  _getListings(url) {
+    const extractor = new Extractor();
+    return new Promise((resolve, reject) => {
+      extractor
+        .execute(url, this._providerConfig.waitForSelector)
+        .then(() => {
+          const listings = extractor.parseResponseText(
+            this._providerConfig.crawlContainer,
+            this._providerConfig.crawlFields,
+            url,
+          );
+          resolve(listings == null ? [] : listings);
+        })
+        .catch((err) => {
+          reject(err);
+          /* eslint-disable no-console */
+          console.error(err);
+          /* eslint-enable no-console */
         });
-    }
+    });
+  }
 
-    _normalize(listings) {
-        return listings.map(this._providerConfig.normalize);
-    }
+  _normalize(listings) {
+    return listings.map(this._providerConfig.normalize);
+  }
 
-    _filter(listings) {
-        //only return those where all the fields have been found
-        const keys = Object.keys(this._providerConfig.crawlFields);
-        const filteredListings = listings.filter((item) => keys.every((key) => key in item));
-        return filteredListings.filter(this._providerConfig.filter);
-    }
+  _filter(listings) {
+    //only return those where all the fields have been found
+    const keys = Object.keys(this._providerConfig.crawlFields);
+    const filteredListings = listings.filter((item) => keys.every((key) => key in item));
+    return filteredListings.filter(this._providerConfig.filter);
+  }
 
-    _findNew(listings) {
-        const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
-        if (newListings.length === 0) {
-            throw new NoNewListingsWarning();
-        }
-        return newListings;
+  _findNew(listings) {
+    const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
+    if (newListings.length === 0) {
+      throw new NoNewListingsWarning();
     }
+    return newListings;
+  }
 
-    _notify(newListings) {
-        if (newListings.length === 0) {
-            throw new NoNewListingsWarning();
-        }
-        const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
-        return Promise.all(sendNotifications).then(() => newListings);
+  _notify(newListings) {
+    if (newListings.length === 0) {
+      throw new NoNewListingsWarning();
     }
+    const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
+    return Promise.all(sendNotifications).then(() => newListings);
+  }
 
-    _save(newListings) {
-        const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
-        newListings.forEach((listing) => {
-            currentListings[listing.id] = Date.now();
-        });
-        setKnownListings(this._jobKey, this._providerId, currentListings);
-        return newListings;
-    }
+  _save(newListings) {
+    const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
+    newListings.forEach((listing) => {
+      currentListings[listing.id] = Date.now();
+    });
+    setKnownListings(this._jobKey, this._providerId, currentListings);
+    return newListings;
+  }
 
-    _filterBySimilarListings(listings) {
-        const filteredList = listings.filter((listing) => {
-            const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
-            if (similar) {
-                /* eslint-disable no-console */
-                console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
-                /* eslint-enable no-console */
-            }
-            return !similar;
-        });
-        filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
-        return filteredList;
-    }
+  _filterBySimilarListings(listings) {
+    const filteredList = listings.filter((listing) => {
+      const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
+      if (similar) {
+        /* eslint-disable no-console */
+        console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
+        /* eslint-enable no-console */
+      }
+      return !similar;
+    });
+    filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
+    return filteredList;
+  }
 
-    _handleError(err) {
-        if (err.name !== 'NoNewListingsWarning') console.error(err);
-    }
+  _handleError(err) {
+    if (err.name !== 'NoNewListingsWarning') console.error(err);
+  }
 }
 
 export default FredyRuntime;
diff --git a/lib/api/routes/jobRouter.js b/lib/api/routes/jobRouter.js
index d2c9f21..6b6f6a6 100644
--- a/lib/api/routes/jobRouter.js
+++ b/lib/api/routes/jobRouter.js
@@ -1,11 +1,9 @@
 import restana from 'restana';
-import fetch from 'node-fetch';
 import * as jobStorage from '../../services/storage/jobStorage.js';
 import * as userStorage from '../../services/storage/userStorage.js';
-import * as immoscoutProvider from '../../provider/immoscout.js';
 import { config } from '../../utils.js';
 import { isAdmin } from '../security.js';
-import {trackDemoJobCreated} from '../../services/tracking/Tracker.js';
+import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
 const service = restana();
 const jobRouter = service.newRouter();
 function doesJobBelongsToUser(job, req) {
@@ -28,7 +26,7 @@ jobRouter.get('/', async (req, res) => {
 jobRouter.get('/processingTimes', async (req, res) => {
   res.body = {
     interval: config.interval,
-    lastRun: config.lastRun || null
+    lastRun: config.lastRun || null,
   };
   res.send();
 });
@@ -51,7 +49,7 @@ jobRouter.post('/', async (req, res) => {
   trackDemoJobCreated({
     name,
     provider,
-    adapter: notificationAdapter
+    adapter: notificationAdapter,
   });
   res.send();
 });
diff --git a/lib/services/extractor/extractor.js b/lib/services/extractor/extractor.js
index a17d193..8e5cbaf 100644
--- a/lib/services/extractor/extractor.js
+++ b/lib/services/extractor/extractor.js
@@ -1,45 +1,43 @@
-import {setDebug} from './utils.js';
+import { setDebug } from './utils.js';
 import puppeteerExtractor from './puppeteerExtractor.js';
-import {loadParser, parse} from './parser/parser.js';
+import { loadParser, parse } from './parser/parser.js';
 
 const DEFAULT_OPTIONS = {
-    debug: false,
-    puppeteerTimeout: 20_000,
-    puppeteerHeadless: true
-
+  debug: false,
+  puppeteerTimeout: 60_000,
+  puppeteerHeadless: true,
 };
 
 export default class Extractor {
-    constructor(options) {
-        this.options = {
-            ...DEFAULT_OPTIONS,
-            ...options
-        };
-        this.responseText = null;
-        setDebug(this.options);
+  constructor(options) {
+    this.options = {
+      ...DEFAULT_OPTIONS,
+      ...options,
+    };
+    this.responseText = null;
+    setDebug(this.options);
+  }
+
+  /**
+   * if you are extracting data from a SPA, you must provide a selector, otherwise
+   * your response will never contain what you are really looking for
+   * @param url
+   * @param waitForSelector
+   */
+  execute = async (url, waitForSelector = null) => {
+    this.responseText = null;
+    try {
+      this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
+      if (this.responseText != null) {
+        loadParser(this.responseText);
+      }
+    } catch (error) {
+      console.error('Error trying to load page.', error);
     }
+    return this;
+  };
 
-    /**
-     * if you are extracting data from a SPA, you must provide a selector, otherwise
-     * your response will never contain what you are really looking for
-     * @param url
-     * @param waitForSelector
-     */
-    execute = async (url, waitForSelector = null) => {
-        this.responseText = null;
-        try {
-            this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
-            if(this.responseText != null) {
-                loadParser(this.responseText);
-            }
-        } catch (error) {
-            console.error('Error trying to load page.', error);
-        }
-        return this;
-    };
-
-
-    parseResponseText = (crawlContainer, crawlFields) => {
-        return parse(crawlContainer, crawlFields, this.responseText);
-    };
+  parseResponseText = (crawlContainer, crawlFields, url) => {
+    return parse(crawlContainer, crawlFields, this.responseText, url);
+  };
 }
diff --git a/lib/services/extractor/parser/parser.js b/lib/services/extractor/parser/parser.js
index 456fe5a..194200b 100644
--- a/lib/services/extractor/parser/parser.js
+++ b/lib/services/extractor/parser/parser.js
@@ -3,92 +3,94 @@ import * as cheerio from 'cheerio';
 let $ = null;
 
 export function loadParser(text) {
-    $ = cheerio.load(text);
+  $ = cheerio.load(text);
 }
 
-export function parse(crawlContainer, crawlFields, text) {
-    if (!text) {
-        console.warn('Cannot parse, text was empty.');
-        return null;
-    }
+export function parse(crawlContainer, crawlFields, text, url) {
+  if (!text) {
+    console.warn('Cannot parse, text was empty for url ', url);
+    return null;
+  }
 
-    if (!crawlContainer || !crawlFields) {
-        console.warn('Cannot parse, selector was empty.');
-        return null;
-    }
+  if (!crawlContainer || !crawlFields) {
+    console.warn('Cannot parse, selector was empty for url ', url);
+    return null;
+  }
 
-    const result = [];
+  const result = [];
 
-    if ($(crawlContainer).length === 0) {
-        console.error('No elements in crawl container found!');
-    }
+  if ($(crawlContainer).length === 0) {
+    console.error('No elements in crawl container found for url ', url);
+  }
 
-    $(crawlContainer).each((_, element) => {
-        const container = $(element);
-        const parsedObject = {};
+  $(crawlContainer).each((_, element) => {
+    const container = $(element);
+    const parsedObject = {};
 
-        // Parse fields based on crawlFields
-        for (const [key, fieldSelector] of Object.entries(crawlFields)) {
-            let value;
+    // Parse fields based on crawlFields
+    for (const [key, fieldSelector] of Object.entries(crawlFields)) {
+      let value;
 
-            try {
+      try {
+        const selector = fieldSelector.includes('|')
+          ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
+          : fieldSelector;
 
-                const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector;
-
-                if (selector.includes('@')) {
-                    const [sel, attr] = selector.split('@');
-                    if (sel.length === 0) {
-                        value = container.attr(attr.trim());
-                    } else {
-                        value = container.find(sel.trim()).attr(attr.trim());
-                    }
-                } else {
-                    value = container.find(selector.trim()).text();
-                }
-
-                // Apply modifiers if specified
-                if (fieldSelector.includes('|')) {
-                    const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim());
-                    value = applyModifiers(value, modifiers);
-                }
-
-                parsedObject[key] = value || null;
-            } catch (error) {
-                console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
-                parsedObject[key] = null;
-            }
-        }
-
-        if (parsedObject.id != null) {
-            result.push(parsedObject);
+        if (selector.includes('@')) {
+          const [sel, attr] = selector.split('@');
+          if (sel.length === 0) {
+            value = container.attr(attr.trim());
+          } else {
+            value = container.find(sel.trim()).attr(attr.trim());
+          }
         } else {
-            console.warn('ID not found. Not relaying object.');
+          value = container.find(selector.trim()).text();
         }
-    });
 
-    return result;
+        // Apply modifiers if specified
+        if (fieldSelector.includes('|')) {
+          /* eslint-disable no-unused-vars */
+          const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
+          /* eslint-enable no-unused-vars */
+          value = applyModifiers(value, modifiers);
+        }
+
+        parsedObject[key] = value || null;
+      } catch (error) {
+        console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
+        parsedObject[key] = null;
+      }
+    }
+
+    if (parsedObject.id != null) {
+      result.push(parsedObject);
+    } else {
+      console.warn('ID not found. Not relaying object.');
+    }
+  });
+
+  return result;
 }
 
 // Helper function to apply modifiers
 function applyModifiers(value, modifiers) {
-    if (!value) return value;
+  if (!value) return value;
 
-    modifiers.forEach(modifier => {
-        switch (modifier) {
-            case 'int':
-                value = parseInt(value, 10);
-                break;
-            case 'trim':
-                value = value.replace(/\s+/g, ' ').trim();
-                break;
-            case 'removeNewline':
-                value = value.replace(/\n/g, ' ');
-                break;
-            default:
-                console.warn(`Unknown modifier: ${modifier}`);
-        }
-    });
+  modifiers.forEach((modifier) => {
+    switch (modifier) {
+      case 'int':
+        value = parseInt(value, 10);
+        break;
+      case 'trim':
+        value = value.replace(/\s+/g, ' ').trim();
+        break;
+      case 'removeNewline':
+        value = value.replace(/\n/g, ' ');
+        break;
+      default:
+        console.warn(`Unknown modifier: ${modifier}`);
+    }
+  });
 
-    return value;
+  return value;
 }
-
diff --git a/lib/services/extractor/utils.js b/lib/services/extractor/utils.js
index b28cb05..a356ece 100644
--- a/lib/services/extractor/utils.js
+++ b/lib/services/extractor/utils.js
@@ -1,35 +1,32 @@
 let debuggingOn = false;
 
 export const DEFAULT_HEADER = {
-    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-    'Accept-Language': 'en-US,en;q=0.5',
-    'Connection': 'keep-alive',
-    'Upgrade-Insecure-Requests': '1',
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
+  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+  'Accept-Language': 'en-US,en;q=0.5',
+  Connection: 'keep-alive',
+  'Upgrade-Insecure-Requests': '1',
+  'User-Agent':
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
 };
 
-export const setDebug = options => {
-    debuggingOn = !!options?.debug;
+export const setDebug = (options) => {
+  debuggingOn = !!options?.debug;
 };
 
 export const debug = (message) => {
-    if(debuggingOn) {
-        console.debug(message);
-    }
+  if (debuggingOn) {
+    /* eslint-disable no-console */
+    console.debug(message);
+    /* eslint-enable no-console */
+  }
 };
 
 export const botDetected = (pageSource, statusCode) => {
-    const suspiciousStatusCodes = [
-        403, 429
-    ];
-    const botDetectionPatterns = [
-        /verify you are human/i,
-        /access denied/i,
-        /x-amz-cf-id/i,
-    ];
+  const suspiciousStatusCodes = [403, 429];
+  const botDetectionPatterns = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i];
 
-    const detectedInSource = botDetectionPatterns.some(pattern => pattern.test(pageSource));
-    const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
+  const detectedInSource = botDetectionPatterns.some((pattern) => pattern.test(pageSource));
+  const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
 
-    return detectedInSource || detectedByStatus;
-};
\ No newline at end of file
+  return detectedInSource || detectedByStatus;
+};