adding similarity check (#29)

* adding similarity check

* adding paging

* fixing tests

* docu

* better error handling

* fixing tests

* adjusting page limit

* fixing login screen

* cleanup

* upgrade browser list

* prevent spamming the log

* fixing tests

* removing job listings when removing a job or the user
This commit is contained in:
Christian Kellner
2021-06-28 08:52:09 +02:00
committed by GitHub
parent 88c046dbd4
commit 59e6d287fc
26 changed files with 1114 additions and 878 deletions

View File

@@ -0,0 +1,36 @@
const stringSimilarity = require('string-similarity');
//if the score is higher than this, it will be considered a match
const MAX_DICE_INDEX = 0.7;
/**
* The similarity check is based on the dice coefficient. => https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
*
* @type {module.SimilarityCacheEntry}
*/
module.exports = class SimilarityCacheEntry {
constructor(time) {
this.time = time;
this.values = [];
}
setCacheEntry = (entry) => {
this.values.push(entry);
};
getTime = () => {
return this.time;
};
hasSimilarEntries = (value) => {
if (this.values.length > 0) {
for (let i = 0; i < this.values.length; i++) {
const index = stringSimilarity.compareTwoStrings(value, this.values[i]);
if (index >= MAX_DICE_INDEX) {
return true;
}
}
}
return false;
};
};

View File

@@ -0,0 +1,63 @@
/**
* each job that runs scrapes all provider. This cache holds the titles of the found listing(s) and provides
* a similarity check. if this check returns true, it will not be forwarded to the notification adapter, thus
* the user won't see any duplicates
*
* The retention of this cache is per default 5 minutes, but can be smaller if the interval is > 5 mins.
*
* @type {module.SimilarityCacheEntry|{}}
*/
const SimilarityCacheEntry = require('./SimilarityCacheEntry');
const config = require('../../../conf/config.json');
//5 minutes
let retention = 5 * 60 * 1000;
const intervalInMs = config.interval * 60 * 1000;
//an interval below 5 mins sounds crazy, but there are ppl out there doing crazy shit.
if (intervalInMs <= retention) {
retention = Math.floor(intervalInMs / 2);
}
//jobid -> SimilarityCacheEntry
const cache = {};
let intervalId;
exports.addCacheEntry = (jobId, value) => {
cache[jobId] = cache[jobId] || new SimilarityCacheEntry(Date.now());
cache[jobId].setCacheEntry(value);
};
exports.hasSimilarEntries = (jobId, value) => {
if (cache[jobId] == null) {
return false;
}
return cache[jobId].hasSimilarEntries(value);
};
/**
* cleanup
*/
intervalId = setInterval(() => {
const keysToBeRemoved = [];
const now = Date.now();
Object.keys(cache).forEach((key) => {
if (cache[key].getTime() + retention < now) {
keysToBeRemoved.push(key);
}
});
if (keysToBeRemoved.length > 0) {
keysToBeRemoved.forEach((key) => delete cache[key]);
}
}, 10000);
/**
* mostly used for tests
*/
exports.stopCacheCleanup = () => {
clearInterval(intervalId);
};

View File

@@ -61,12 +61,18 @@ exports.setJobStatus = ({ jobId, status }) => {
};
exports.removeJob = (jobId) => {
listingStorage.removeListings(jobId);
db.get('jobs')
.remove((job) => job.id === jobId)
.write();
};
exports.removeJobsByUserId = (userId) => {
db.get('jobs')
.value()
.filter((job) => job.userId === userId)
.forEach((job) => listingStorage.removeListings(job.id));
db.get('jobs')
.remove((job) => job.userId === userId)
.write();

View File

@@ -47,3 +47,7 @@ exports.setLastJobExecution = (jobId) => {
const key = buildKey(jobId, null, 'lastExecution');
return db.set(key, Date.now()).write();
};
exports.removeListings = (jobId) => {
db.unset(jobId).write();
};