mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
better logging, fixing code smells
This commit is contained in:
@@ -1,118 +1,124 @@
|
||||
import {NoNewListingsWarning} from './errors.js';
|
||||
import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js';
|
||||
import { NoNewListingsWarning } from './errors.js';
|
||||
import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js';
|
||||
import * as notify from './notification/notify.js';
|
||||
import Extractor from './services/extractor/extractor.js';
|
||||
import urlModifier from './services/queryStringMutator.js';
|
||||
|
||||
class FredyRuntime {
|
||||
/**
|
||||
*
|
||||
* @param providerConfig the config for the specific provider, we're going to query at the moment
|
||||
* @param notificationConfig the config for all notifications
|
||||
* @param providerId the id of the provider currently in use
|
||||
* @param jobKey key of the job that is currently running (from within the config)
|
||||
* @param similarityCache cache instance holding values to check for similarity of entries
|
||||
*/
|
||||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
|
||||
this._providerConfig = providerConfig;
|
||||
this._notificationConfig = notificationConfig;
|
||||
this._providerId = providerId;
|
||||
this._jobKey = jobKey;
|
||||
this._similarityCache = similarityCache;
|
||||
}
|
||||
/**
|
||||
*
|
||||
* @param providerConfig the config for the specific provider, we're going to query at the moment
|
||||
* @param notificationConfig the config for all notifications
|
||||
* @param providerId the id of the provider currently in use
|
||||
* @param jobKey key of the job that is currently running (from within the config)
|
||||
* @param similarityCache cache instance holding values to check for similarity of entries
|
||||
*/
|
||||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
|
||||
this._providerConfig = providerConfig;
|
||||
this._notificationConfig = notificationConfig;
|
||||
this._providerId = providerId;
|
||||
this._jobKey = jobKey;
|
||||
this._similarityCache = similarityCache;
|
||||
}
|
||||
|
||||
execute() {
|
||||
return (
|
||||
//modify the url to make sure search order is correctly set
|
||||
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
|
||||
//scraping the site and try finding new listings
|
||||
.then(this._getListings.bind(this))
|
||||
//bring them in a proper form (dictated by the provider)
|
||||
.then(this._normalize.bind(this))
|
||||
//filter listings with stuff tagged by the blacklist of the provider
|
||||
.then(this._filter.bind(this))
|
||||
//check if new listings available. if so proceed
|
||||
.then(this._findNew.bind(this))
|
||||
//store everything in db
|
||||
.then(this._save.bind(this))
|
||||
//check for similar listings. if found, remove them before notifying
|
||||
.then(this._filterBySimilarListings.bind(this))
|
||||
//notify the user using the configured notification adapter
|
||||
.then(this._notify.bind(this))
|
||||
//if an error occurred on the way, handle it here.
|
||||
.catch(this._handleError.bind(this))
|
||||
);
|
||||
}
|
||||
execute() {
|
||||
return (
|
||||
//modify the url to make sure search order is correctly set
|
||||
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
|
||||
//scraping the site and try finding new listings
|
||||
.then(this._getListings.bind(this))
|
||||
//bring them in a proper form (dictated by the provider)
|
||||
.then(this._normalize.bind(this))
|
||||
//filter listings with stuff tagged by the blacklist of the provider
|
||||
.then(this._filter.bind(this))
|
||||
//check if new listings available. if so proceed
|
||||
.then(this._findNew.bind(this))
|
||||
//store everything in db
|
||||
.then(this._save.bind(this))
|
||||
//check for similar listings. if found, remove them before notifying
|
||||
.then(this._filterBySimilarListings.bind(this))
|
||||
//notify the user using the configured notification adapter
|
||||
.then(this._notify.bind(this))
|
||||
//if an error occurred on the way, handle it here.
|
||||
.catch(this._handleError.bind(this))
|
||||
);
|
||||
}
|
||||
|
||||
_getListings(url) {
|
||||
const extractor = new Extractor();
|
||||
return new Promise((resolve, reject) => {
|
||||
extractor.execute(url,this._providerConfig.waitForSelector)
|
||||
.then(() => {
|
||||
const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields);
|
||||
resolve(listings == null ? [] : listings);
|
||||
}).catch(err => {
|
||||
reject(err);
|
||||
/* eslint-disable no-console */
|
||||
console.error(err);
|
||||
/* eslint-enable no-console */
|
||||
});
|
||||
_getListings(url) {
|
||||
const extractor = new Extractor();
|
||||
return new Promise((resolve, reject) => {
|
||||
extractor
|
||||
.execute(url, this._providerConfig.waitForSelector)
|
||||
.then(() => {
|
||||
const listings = extractor.parseResponseText(
|
||||
this._providerConfig.crawlContainer,
|
||||
this._providerConfig.crawlFields,
|
||||
url,
|
||||
);
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
/* eslint-disable no-console */
|
||||
console.error(err);
|
||||
/* eslint-enable no-console */
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
_normalize(listings) {
|
||||
return listings.map(this._providerConfig.normalize);
|
||||
}
|
||||
_normalize(listings) {
|
||||
return listings.map(this._providerConfig.normalize);
|
||||
}
|
||||
|
||||
_filter(listings) {
|
||||
//only return those where all the fields have been found
|
||||
const keys = Object.keys(this._providerConfig.crawlFields);
|
||||
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
|
||||
return filteredListings.filter(this._providerConfig.filter);
|
||||
}
|
||||
_filter(listings) {
|
||||
//only return those where all the fields have been found
|
||||
const keys = Object.keys(this._providerConfig.crawlFields);
|
||||
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
|
||||
return filteredListings.filter(this._providerConfig.filter);
|
||||
}
|
||||
|
||||
_findNew(listings) {
|
||||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
return newListings;
|
||||
_findNew(listings) {
|
||||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
return newListings;
|
||||
}
|
||||
|
||||
_notify(newListings) {
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
|
||||
return Promise.all(sendNotifications).then(() => newListings);
|
||||
_notify(newListings) {
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
|
||||
return Promise.all(sendNotifications).then(() => newListings);
|
||||
}
|
||||
|
||||
_save(newListings) {
|
||||
const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
|
||||
newListings.forEach((listing) => {
|
||||
currentListings[listing.id] = Date.now();
|
||||
});
|
||||
setKnownListings(this._jobKey, this._providerId, currentListings);
|
||||
return newListings;
|
||||
}
|
||||
_save(newListings) {
|
||||
const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
|
||||
newListings.forEach((listing) => {
|
||||
currentListings[listing.id] = Date.now();
|
||||
});
|
||||
setKnownListings(this._jobKey, this._providerId, currentListings);
|
||||
return newListings;
|
||||
}
|
||||
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredList = listings.filter((listing) => {
|
||||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
|
||||
if (similar) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
return !similar;
|
||||
});
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
|
||||
return filteredList;
|
||||
}
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredList = listings.filter((listing) => {
|
||||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
|
||||
if (similar) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
return !similar;
|
||||
});
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
|
||||
return filteredList;
|
||||
}
|
||||
|
||||
_handleError(err) {
|
||||
if (err.name !== 'NoNewListingsWarning') console.error(err);
|
||||
}
|
||||
_handleError(err) {
|
||||
if (err.name !== 'NoNewListingsWarning') console.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
export default FredyRuntime;
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import restana from 'restana';
|
||||
import fetch from 'node-fetch';
|
||||
import * as jobStorage from '../../services/storage/jobStorage.js';
|
||||
import * as userStorage from '../../services/storage/userStorage.js';
|
||||
import * as immoscoutProvider from '../../provider/immoscout.js';
|
||||
import { config } from '../../utils.js';
|
||||
import { isAdmin } from '../security.js';
|
||||
import {trackDemoJobCreated} from '../../services/tracking/Tracker.js';
|
||||
import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
|
||||
const service = restana();
|
||||
const jobRouter = service.newRouter();
|
||||
function doesJobBelongsToUser(job, req) {
|
||||
@@ -28,7 +26,7 @@ jobRouter.get('/', async (req, res) => {
|
||||
jobRouter.get('/processingTimes', async (req, res) => {
|
||||
res.body = {
|
||||
interval: config.interval,
|
||||
lastRun: config.lastRun || null
|
||||
lastRun: config.lastRun || null,
|
||||
};
|
||||
res.send();
|
||||
});
|
||||
@@ -51,7 +49,7 @@ jobRouter.post('/', async (req, res) => {
|
||||
trackDemoJobCreated({
|
||||
name,
|
||||
provider,
|
||||
adapter: notificationAdapter
|
||||
adapter: notificationAdapter,
|
||||
});
|
||||
res.send();
|
||||
});
|
||||
|
||||
@@ -1,45 +1,43 @@
|
||||
import {setDebug} from './utils.js';
|
||||
import { setDebug } from './utils.js';
|
||||
import puppeteerExtractor from './puppeteerExtractor.js';
|
||||
import {loadParser, parse} from './parser/parser.js';
|
||||
import { loadParser, parse } from './parser/parser.js';
|
||||
|
||||
const DEFAULT_OPTIONS = {
|
||||
debug: false,
|
||||
puppeteerTimeout: 20_000,
|
||||
puppeteerHeadless: true
|
||||
|
||||
debug: false,
|
||||
puppeteerTimeout: 60_000,
|
||||
puppeteerHeadless: true,
|
||||
};
|
||||
|
||||
export default class Extractor {
|
||||
constructor(options) {
|
||||
this.options = {
|
||||
...DEFAULT_OPTIONS,
|
||||
...options
|
||||
};
|
||||
this.responseText = null;
|
||||
setDebug(this.options);
|
||||
constructor(options) {
|
||||
this.options = {
|
||||
...DEFAULT_OPTIONS,
|
||||
...options,
|
||||
};
|
||||
this.responseText = null;
|
||||
setDebug(this.options);
|
||||
}
|
||||
|
||||
/**
|
||||
* if you are extracting data from a SPA, you must provide a selector, otherwise
|
||||
* your response will never contain what you are really looking for
|
||||
* @param url
|
||||
* @param waitForSelector
|
||||
*/
|
||||
execute = async (url, waitForSelector = null) => {
|
||||
this.responseText = null;
|
||||
try {
|
||||
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
|
||||
if (this.responseText != null) {
|
||||
loadParser(this.responseText);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error trying to load page.', error);
|
||||
}
|
||||
return this;
|
||||
};
|
||||
|
||||
/**
|
||||
* if you are extracting data from a SPA, you must provide a selector, otherwise
|
||||
* your response will never contain what you are really looking for
|
||||
* @param url
|
||||
* @param waitForSelector
|
||||
*/
|
||||
execute = async (url, waitForSelector = null) => {
|
||||
this.responseText = null;
|
||||
try {
|
||||
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
|
||||
if(this.responseText != null) {
|
||||
loadParser(this.responseText);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error trying to load page.', error);
|
||||
}
|
||||
return this;
|
||||
};
|
||||
|
||||
|
||||
parseResponseText = (crawlContainer, crawlFields) => {
|
||||
return parse(crawlContainer, crawlFields, this.responseText);
|
||||
};
|
||||
parseResponseText = (crawlContainer, crawlFields, url) => {
|
||||
return parse(crawlContainer, crawlFields, this.responseText, url);
|
||||
};
|
||||
}
|
||||
|
||||
@@ -3,92 +3,94 @@ import * as cheerio from 'cheerio';
|
||||
let $ = null;
|
||||
|
||||
export function loadParser(text) {
|
||||
$ = cheerio.load(text);
|
||||
$ = cheerio.load(text);
|
||||
}
|
||||
|
||||
export function parse(crawlContainer, crawlFields, text) {
|
||||
if (!text) {
|
||||
console.warn('Cannot parse, text was empty.');
|
||||
return null;
|
||||
}
|
||||
export function parse(crawlContainer, crawlFields, text, url) {
|
||||
if (!text) {
|
||||
console.warn('Cannot parse, text was empty for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!crawlContainer || !crawlFields) {
|
||||
console.warn('Cannot parse, selector was empty.');
|
||||
return null;
|
||||
}
|
||||
if (!crawlContainer || !crawlFields) {
|
||||
console.warn('Cannot parse, selector was empty for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
const result = [];
|
||||
const result = [];
|
||||
|
||||
if ($(crawlContainer).length === 0) {
|
||||
console.error('No elements in crawl container found!');
|
||||
}
|
||||
if ($(crawlContainer).length === 0) {
|
||||
console.error('No elements in crawl container found for url ', url);
|
||||
}
|
||||
|
||||
$(crawlContainer).each((_, element) => {
|
||||
const container = $(element);
|
||||
const parsedObject = {};
|
||||
$(crawlContainer).each((_, element) => {
|
||||
const container = $(element);
|
||||
const parsedObject = {};
|
||||
|
||||
// Parse fields based on crawlFields
|
||||
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
|
||||
let value;
|
||||
// Parse fields based on crawlFields
|
||||
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
|
||||
let value;
|
||||
|
||||
try {
|
||||
try {
|
||||
const selector = fieldSelector.includes('|')
|
||||
? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
|
||||
: fieldSelector;
|
||||
|
||||
const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector;
|
||||
|
||||
if (selector.includes('@')) {
|
||||
const [sel, attr] = selector.split('@');
|
||||
if (sel.length === 0) {
|
||||
value = container.attr(attr.trim());
|
||||
} else {
|
||||
value = container.find(sel.trim()).attr(attr.trim());
|
||||
}
|
||||
} else {
|
||||
value = container.find(selector.trim()).text();
|
||||
}
|
||||
|
||||
// Apply modifiers if specified
|
||||
if (fieldSelector.includes('|')) {
|
||||
const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim());
|
||||
value = applyModifiers(value, modifiers);
|
||||
}
|
||||
|
||||
parsedObject[key] = value || null;
|
||||
} catch (error) {
|
||||
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
|
||||
parsedObject[key] = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (parsedObject.id != null) {
|
||||
result.push(parsedObject);
|
||||
if (selector.includes('@')) {
|
||||
const [sel, attr] = selector.split('@');
|
||||
if (sel.length === 0) {
|
||||
value = container.attr(attr.trim());
|
||||
} else {
|
||||
value = container.find(sel.trim()).attr(attr.trim());
|
||||
}
|
||||
} else {
|
||||
console.warn('ID not found. Not relaying object.');
|
||||
value = container.find(selector.trim()).text();
|
||||
}
|
||||
});
|
||||
|
||||
return result;
|
||||
// Apply modifiers if specified
|
||||
if (fieldSelector.includes('|')) {
|
||||
/* eslint-disable no-unused-vars */
|
||||
const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
|
||||
/* eslint-disable no-unused-vars */
|
||||
value = applyModifiers(value, modifiers);
|
||||
}
|
||||
|
||||
parsedObject[key] = value || null;
|
||||
} catch (error) {
|
||||
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
|
||||
parsedObject[key] = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (parsedObject.id != null) {
|
||||
result.push(parsedObject);
|
||||
} else {
|
||||
console.warn('ID not found. Not relaying object.');
|
||||
}
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper function to apply modifiers
|
||||
function applyModifiers(value, modifiers) {
|
||||
if (!value) return value;
|
||||
if (!value) return value;
|
||||
|
||||
modifiers.forEach(modifier => {
|
||||
switch (modifier) {
|
||||
case 'int':
|
||||
value = parseInt(value, 10);
|
||||
break;
|
||||
case 'trim':
|
||||
value = value.replace(/\s+/g, ' ').trim();
|
||||
break;
|
||||
case 'removeNewline':
|
||||
value = value.replace(/\n/g, ' ');
|
||||
break;
|
||||
default:
|
||||
console.warn(`Unknown modifier: ${modifier}`);
|
||||
}
|
||||
});
|
||||
modifiers.forEach((modifier) => {
|
||||
switch (modifier) {
|
||||
case 'int':
|
||||
value = parseInt(value, 10);
|
||||
break;
|
||||
case 'trim':
|
||||
value = value.replace(/\s+/g, ' ').trim();
|
||||
break;
|
||||
case 'removeNewline':
|
||||
value = value.replace(/\n/g, ' ');
|
||||
break;
|
||||
default:
|
||||
console.warn(`Unknown modifier: ${modifier}`);
|
||||
}
|
||||
});
|
||||
|
||||
return value;
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,35 +1,32 @@
|
||||
let debuggingOn = false;
|
||||
|
||||
export const DEFAULT_HEADER = {
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
|
||||
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
Connection: 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||
};
|
||||
|
||||
export const setDebug = options => {
|
||||
debuggingOn = !!options?.debug;
|
||||
export const setDebug = (options) => {
|
||||
debuggingOn = !!options?.debug;
|
||||
};
|
||||
|
||||
export const debug = (message) => {
|
||||
if(debuggingOn) {
|
||||
console.debug(message);
|
||||
}
|
||||
if (debuggingOn) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(message);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
};
|
||||
|
||||
export const botDetected = (pageSource, statusCode) => {
|
||||
const suspiciousStatusCodes = [
|
||||
403, 429
|
||||
];
|
||||
const botDetectionPatterns = [
|
||||
/verify you are human/i,
|
||||
/access denied/i,
|
||||
/x-amz-cf-id/i,
|
||||
];
|
||||
const suspiciousStatusCodes = [403, 429];
|
||||
const botDetectionPatterns = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i];
|
||||
|
||||
const detectedInSource = botDetectionPatterns.some(pattern => pattern.test(pageSource));
|
||||
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
|
||||
const detectedInSource = botDetectionPatterns.some((pattern) => pattern.test(pageSource));
|
||||
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
|
||||
|
||||
return detectedInSource || detectedByStatus;
|
||||
};
|
||||
return detectedInSource || detectedByStatus;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user