mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
adding similarity check (#29)
* adding similarity check * adding paging * fixing tests * docu * better error handling * fixing tests * adjusting page limit * fixing login screen * cleanup * upgrade browser list * prevent spamming the log * fixing tests * removing job listings when removing a job or the user
This commit is contained in:
committed by
GitHub
parent
88c046dbd4
commit
59e6d287fc
@@ -1,4 +1,4 @@
|
||||
const { NoNewListingsError } = require('./errors');
|
||||
const { NoNewListingsWarning } = require('./errors');
|
||||
const { setKnownListings, getKnownListings } = require('./services/storage/listingsStorage');
|
||||
|
||||
const notify = require('./notification/notify');
|
||||
@@ -12,12 +12,14 @@ class FredyRuntime {
|
||||
* @param notificationConfig the config for all notifications
|
||||
* @param providerId the id of the provider currently in use
|
||||
* @param jobKey key of the job that is currently running (from within the config)
|
||||
* @param similarityCache cache instance holding values to check for similarity of entries
|
||||
*/
|
||||
constructor(providerConfig, notificationConfig, providerId, jobKey) {
|
||||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
|
||||
this._providerConfig = providerConfig;
|
||||
this._notificationConfig = notificationConfig;
|
||||
this._providerId = providerId;
|
||||
this._jobKey = jobKey;
|
||||
this._similarityCache = similarityCache;
|
||||
}
|
||||
|
||||
execute() {
|
||||
@@ -33,6 +35,8 @@ class FredyRuntime {
|
||||
.then(this._findNew.bind(this))
|
||||
//store everything in db
|
||||
.then(this._save.bind(this))
|
||||
//check for similar listings. if found, remove them before notifying
|
||||
.then(this._filterBySimilarListings.bind(this))
|
||||
//notify the user using the configured notification adapter
|
||||
.then(this._notify.bind(this))
|
||||
//if an error occurred on the way, handle it here.
|
||||
@@ -53,14 +57,29 @@ class FredyRuntime {
|
||||
}
|
||||
const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
|
||||
try {
|
||||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
|
||||
.then((listings) => {
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
console.error(err);
|
||||
});
|
||||
if (this._providerConfig.paginate != null) {
|
||||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
|
||||
//the first 2 pages should be enough here
|
||||
//TODO: Think about automagically sort by date
|
||||
.limit(2)
|
||||
.paginate(this._providerConfig.paginate)
|
||||
.then((listings) => {
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
console.error(err);
|
||||
});
|
||||
} else {
|
||||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
|
||||
.then((listings) => {
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
console.error(err);
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
console.error(error);
|
||||
@@ -80,7 +99,7 @@ class FredyRuntime {
|
||||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
|
||||
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsError();
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
|
||||
return newListings;
|
||||
@@ -100,8 +119,22 @@ class FredyRuntime {
|
||||
return newListings;
|
||||
}
|
||||
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredList = listings.filter((listing) => {
|
||||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
|
||||
if (similar) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
return !similar;
|
||||
});
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
|
||||
return filteredList;
|
||||
}
|
||||
|
||||
_handleError(err) {
|
||||
if (err.name !== 'NoNewListingsError') console.error(err);
|
||||
if (err.name !== 'NoNewListingsWarning') console.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,6 @@ class ExtendableError extends Error {
|
||||
}
|
||||
}
|
||||
|
||||
class NoNewListingsError extends ExtendableError {}
|
||||
class NoNewListingsWarning extends ExtendableError {}
|
||||
|
||||
module.exports = { NoNewListingsError };
|
||||
module.exports = { NoNewListingsWarning };
|
||||
|
||||
@@ -30,7 +30,6 @@ const config = {
|
||||
title: '.tabelle .inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
|
||||
description: '.tabelle .inner_object_data .objekt_beschreibung | removeNewline | trim',
|
||||
},
|
||||
paginate: '.pagination_blocks div:last a@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -20,7 +20,7 @@ function applyBlacklist(o) {
|
||||
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '#srchrslt-adtable .ad-listitem',
|
||||
crawlContainer: '#srchrslt-adtable .ad-listitem ',
|
||||
crawlFields: {
|
||||
id: '.aditem@data-adid | int',
|
||||
price: '.aditem-main--middle--price | removeNewline | trim',
|
||||
|
||||
@@ -24,7 +24,6 @@ const config = {
|
||||
title: '.truncate_title a |removeNewline |trim',
|
||||
link: '.truncate_title a@href',
|
||||
},
|
||||
paginate: '.pagination-sm:first a:last@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
36
lib/services/similarity-check/SimilarityCacheEntry.js
Normal file
36
lib/services/similarity-check/SimilarityCacheEntry.js
Normal file
@@ -0,0 +1,36 @@
|
||||
const stringSimilarity = require('string-similarity');
|
||||
|
||||
//if the score is equal to or higher than this, it will be considered a match
|
||||
const MAX_DICE_INDEX = 0.7;
|
||||
|
||||
/**
|
||||
* The similarity check is based on the dice coefficient. => https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
|
||||
*
|
||||
* @type {module.SimilarityCacheEntry}
|
||||
*/
|
||||
module.exports = class SimilarityCacheEntry {
|
||||
constructor(time) {
|
||||
this.time = time;
|
||||
this.values = [];
|
||||
}
|
||||
|
||||
setCacheEntry = (entry) => {
|
||||
this.values.push(entry);
|
||||
};
|
||||
|
||||
getTime = () => {
|
||||
return this.time;
|
||||
};
|
||||
|
||||
hasSimilarEntries = (value) => {
|
||||
if (this.values.length > 0) {
|
||||
for (let i = 0; i < this.values.length; i++) {
|
||||
const index = stringSimilarity.compareTwoStrings(value, this.values[i]);
|
||||
if (index >= MAX_DICE_INDEX) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
};
|
||||
63
lib/services/similarity-check/similarityCache.js
Normal file
63
lib/services/similarity-check/similarityCache.js
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Each job that runs scrapes all providers. This cache holds the titles of the found listing(s) and provides
|
||||
* a similarity check. if this check returns true, it will not be forwarded to the notification adapter, thus
|
||||
* the user won't see any duplicates
|
||||
*
|
||||
* The retention of this cache is 5 minutes by default, but is reduced to half of the interval if the interval is <= 5 mins.
|
||||
*
|
||||
* @type {module.SimilarityCacheEntry|{}}
|
||||
*/
|
||||
const SimilarityCacheEntry = require('./SimilarityCacheEntry');
|
||||
const config = require('../../../conf/config.json');
|
||||
|
||||
//default retention: 5 minutes
const DEFAULT_RETENTION_IN_MS = 5 * 60 * 1000;

//config.interval is multiplied up from minutes to milliseconds here
const intervalInMs = config.interval * 60 * 1000;

//an interval of 5 mins or less sounds crazy, but there are people out there doing crazy things.
//In that case only half of the interval is used as retention.
const retention = intervalInMs <= DEFAULT_RETENTION_IN_MS ? Math.floor(intervalInMs / 2) : DEFAULT_RETENTION_IN_MS;
|
||||
|
||||
//jobid -> SimilarityCacheEntry
//Created without a prototype, so job ids such as "constructor" or "toString" never resolve to
//inherited Object members when they are used as lookup keys.
const cache = Object.create(null);

//handle of the periodic cleanup timer
let intervalId;
|
||||
|
||||
exports.addCacheEntry = (jobId, value) => {
|
||||
cache[jobId] = cache[jobId] || new SimilarityCacheEntry(Date.now());
|
||||
cache[jobId].setCacheEntry(value);
|
||||
};
|
||||
|
||||
exports.hasSimilarEntries = (jobId, value) => {
|
||||
if (cache[jobId] == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return cache[jobId].hasSimilarEntries(value);
|
||||
};
|
||||
|
||||
/**
 * cleanup: every 10 seconds, drop all cache entries whose creation time plus retention lies in the past
 */
intervalId = setInterval(() => {
  const now = Date.now();

  //collect first, delete afterwards
  const expiredKeys = Object.keys(cache).filter((key) => cache[key].getTime() + retention < now);

  for (const key of expiredKeys) {
    delete cache[key];
  }
}, 10000);
|
||||
|
||||
/**
 * Stops the periodic cleanup timer (mostly used for tests).
 */
exports.stopCacheCleanup = () => void clearInterval(intervalId);
|
||||
@@ -61,12 +61,18 @@ exports.setJobStatus = ({ jobId, status }) => {
|
||||
};
|
||||
|
||||
exports.removeJob = (jobId) => {
|
||||
listingStorage.removeListings(jobId);
|
||||
db.get('jobs')
|
||||
.remove((job) => job.id === jobId)
|
||||
.write();
|
||||
};
|
||||
|
||||
exports.removeJobsByUserId = (userId) => {
|
||||
db.get('jobs')
|
||||
.value()
|
||||
.filter((job) => job.userId === userId)
|
||||
.forEach((job) => listingStorage.removeListings(job.id));
|
||||
|
||||
db.get('jobs')
|
||||
.remove((job) => job.userId === userId)
|
||||
.write();
|
||||
|
||||
@@ -47,3 +47,7 @@ exports.setLastJobExecution = (jobId) => {
|
||||
const key = buildKey(jobId, null, 'lastExecution');
|
||||
return db.set(key, Date.now()).write();
|
||||
};
|
||||
|
||||
exports.removeListings = (jobId) => {
|
||||
db.unset(jobId).write();
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user