adding similarity check (#29)

* adding similarity check

* adding paging

* fixing tests

* docu

* better error handling

* fixing tests

* adjusting page limit

* fixing login screen

* cleanup

* upgrade browser list

* prevent spamming the log

* fixing tests

* removing job listings when removing a job or the user
This commit is contained in:
Christian Kellner
2021-06-28 08:52:09 +02:00
committed by GitHub
parent 88c046dbd4
commit 59e6d287fc
26 changed files with 1114 additions and 878 deletions

View File

@@ -1,4 +1,4 @@
const { NoNewListingsError } = require('./errors');
const { NoNewListingsWarning } = require('./errors');
const { setKnownListings, getKnownListings } = require('./services/storage/listingsStorage');
const notify = require('./notification/notify');
@@ -12,12 +12,14 @@ class FredyRuntime {
* @param notificationConfig the config for all notifications
* @param providerId the id of the provider currently in use
* @param jobKey key of the job that is currently running (from within the config)
* @param similarityCache cache instance holding values to check for similarity of entries
*/
constructor(providerConfig, notificationConfig, providerId, jobKey) {
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
this._providerConfig = providerConfig;
this._notificationConfig = notificationConfig;
this._providerId = providerId;
this._jobKey = jobKey;
this._similarityCache = similarityCache;
}
execute() {
@@ -33,6 +35,8 @@ class FredyRuntime {
.then(this._findNew.bind(this))
//store everything in db
.then(this._save.bind(this))
//check for similar listings. if found, remove them before notifying
.then(this._filterBySimilarListings.bind(this))
//notify the user using the configured notification adapter
.then(this._notify.bind(this))
//if an error occurred on the way, handle it here.
@@ -53,14 +57,29 @@ class FredyRuntime {
}
const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
try {
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
.then((listings) => {
resolve(listings == null ? [] : listings);
})
.catch((err) => {
reject(err);
console.error(err);
});
if (this._providerConfig.paginate != null) {
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
//the first 2 pages should be enough here
//TODO: Think about automagically sort by date
.limit(2)
.paginate(this._providerConfig.paginate)
.then((listings) => {
resolve(listings == null ? [] : listings);
})
.catch((err) => {
reject(err);
console.error(err);
});
} else {
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
.then((listings) => {
resolve(listings == null ? [] : listings);
})
.catch((err) => {
reject(err);
console.error(err);
});
}
} catch (error) {
reject(error);
console.error(error);
@@ -80,7 +99,7 @@ class FredyRuntime {
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
if (newListings.length === 0) {
throw new NoNewListingsError();
throw new NoNewListingsWarning();
}
return newListings;
@@ -100,8 +119,22 @@ class FredyRuntime {
return newListings;
}
_filterBySimilarListings(listings) {
const filteredList = listings.filter((listing) => {
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
if (similar) {
/* eslint-disable no-console */
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
/* eslint-enable no-console */
}
return !similar;
});
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
return filteredList;
}
_handleError(err) {
if (err.name !== 'NoNewListingsError') console.error(err);
if (err.name !== 'NoNewListingsWarning') console.error(err);
}
}

View File

@@ -10,6 +10,6 @@ class ExtendableError extends Error {
}
}
class NoNewListingsError extends ExtendableError {}
class NoNewListingsWarning extends ExtendableError {}
module.exports = { NoNewListingsError };
module.exports = { NoNewListingsWarning };

View File

@@ -30,7 +30,6 @@ const config = {
title: '.tabelle .inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
description: '.tabelle .inner_object_data .objekt_beschreibung | removeNewline | trim',
},
paginate: '.pagination_blocks div:last a@href',
normalize: normalize,
filter: applyBlacklist,
};

View File

@@ -20,7 +20,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#srchrslt-adtable .ad-listitem',
crawlContainer: '#srchrslt-adtable .ad-listitem ',
crawlFields: {
id: '.aditem@data-adid | int',
price: '.aditem-main--middle--price | removeNewline | trim',

View File

@@ -24,7 +24,6 @@ const config = {
title: '.truncate_title a |removeNewline |trim',
link: '.truncate_title a@href',
},
paginate: '.pagination-sm:first a:last@href',
normalize: normalize,
filter: applyBlacklist,
};

View File

@@ -0,0 +1,36 @@
const stringSimilarity = require('string-similarity');
//if the score is equal to or higher than this, it will be considered a match
const MAX_DICE_INDEX = 0.7;
/**
* The similarity check is based on the dice coefficient. => https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
*
* @type {module.SimilarityCacheEntry}
*/
module.exports = class SimilarityCacheEntry {
constructor(time) {
this.time = time;
this.values = [];
}
setCacheEntry = (entry) => {
this.values.push(entry);
};
getTime = () => {
return this.time;
};
hasSimilarEntries = (value) => {
if (this.values.length > 0) {
for (let i = 0; i < this.values.length; i++) {
const index = stringSimilarity.compareTwoStrings(value, this.values[i]);
if (index >= MAX_DICE_INDEX) {
return true;
}
}
}
return false;
};
};

View File

@@ -0,0 +1,63 @@
/**
* Each job that runs scrapes all providers. This cache holds the titles of the found listing(s) and provides
* a similarity check. If this check returns true, the listing will not be forwarded to the notification adapter,
* thus the user won't see any duplicates.
*
* The retention of this cache is 5 minutes by default, but it is reduced to half the job interval if that interval is <= 5 mins.
*
* @type {module.SimilarityCacheEntry|{}}
*/
const SimilarityCacheEntry = require('./SimilarityCacheEntry');
const config = require('../../../conf/config.json');
const MS_PER_MINUTE = 60 * 1000;
//default retention: 5 minutes
const DEFAULT_RETENTION_IN_MS = 5 * MS_PER_MINUTE;
const intervalInMs = config.interval * MS_PER_MINUTE;
//an interval of 5 mins or less sounds crazy, but it happens. In that case only keep entries for half the interval.
const retention = intervalInMs <= DEFAULT_RETENTION_IN_MS ? Math.floor(intervalInMs / 2) : DEFAULT_RETENTION_IN_MS;
//jobid -> SimilarityCacheEntry
const cache = {};
//handle of the cleanup timer, needed to be able to stop it again
let intervalId;
exports.addCacheEntry = (jobId, value) => {
cache[jobId] = cache[jobId] || new SimilarityCacheEntry(Date.now());
cache[jobId].setCacheEntry(value);
};
exports.hasSimilarEntries = (jobId, value) => {
if (cache[jobId] == null) {
return false;
}
return cache[jobId].hasSimilarEntries(value);
};
/**
 * cleanup: every 10 seconds, drop all cache entries whose timestamp is older than the retention
 */
intervalId = setInterval(() => {
  const now = Date.now();
  Object.keys(cache)
    .filter((jobId) => cache[jobId].getTime() + retention < now)
    .forEach((jobId) => delete cache[jobId]);
}, 10000);
/**
 * Stops the periodic cleanup timer (mostly used for tests).
 */
exports.stopCacheCleanup = () => clearInterval(intervalId);

View File

@@ -61,12 +61,18 @@ exports.setJobStatus = ({ jobId, status }) => {
};
exports.removeJob = (jobId) => {
listingStorage.removeListings(jobId);
db.get('jobs')
.remove((job) => job.id === jobId)
.write();
};
exports.removeJobsByUserId = (userId) => {
db.get('jobs')
.value()
.filter((job) => job.userId === userId)
.forEach((job) => listingStorage.removeListings(job.id));
db.get('jobs')
.remove((job) => job.userId === userId)
.write();

View File

@@ -47,3 +47,7 @@ exports.setLastJobExecution = (jobId) => {
const key = buildKey(jobId, null, 'lastExecution');
return db.set(key, Date.now()).write();
};
exports.removeListings = (jobId) => {
db.unset(jobId).write();
};