mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
adding similarity check (#29)
* adding similarity check * adding paging * fixing tests * docu * better error handling * fixing tests * adjusting page limit * fixing login screen * cleanup * upgrade browser list * prevent spamming the log * fixing tests * removing job listings when removing a job or the user
This commit is contained in:
committed by
GitHub
parent
88c046dbd4
commit
59e6d287fc
36
lib/services/similarity-check/SimilarityCacheEntry.js
Normal file
36
lib/services/similarity-check/SimilarityCacheEntry.js
Normal file
@@ -0,0 +1,36 @@
|
||||
const stringSimilarity = require('string-similarity');
|
||||
|
||||
//if the score is equal to or higher than this, it will be considered a match
|
||||
const MAX_DICE_INDEX = 0.7;
|
||||
|
||||
/**
|
||||
* The similarity check is based on the dice coefficient. => https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient
|
||||
*
|
||||
* @type {module.SimilarityCacheEntry}
|
||||
*/
|
||||
module.exports = class SimilarityCacheEntry {
|
||||
constructor(time) {
|
||||
this.time = time;
|
||||
this.values = [];
|
||||
}
|
||||
|
||||
setCacheEntry = (entry) => {
|
||||
this.values.push(entry);
|
||||
};
|
||||
|
||||
getTime = () => {
|
||||
return this.time;
|
||||
};
|
||||
|
||||
hasSimilarEntries = (value) => {
|
||||
if (this.values.length > 0) {
|
||||
for (let i = 0; i < this.values.length; i++) {
|
||||
const index = stringSimilarity.compareTwoStrings(value, this.values[i]);
|
||||
if (index >= MAX_DICE_INDEX) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
};
|
||||
};
|
||||
63
lib/services/similarity-check/similarityCache.js
Normal file
63
lib/services/similarity-check/similarityCache.js
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Each job run scrapes all providers. This cache holds the titles of the listings that were found and provides
|
||||
* a similarity check. if this check returns true, it will not be forwarded to the notification adapter, thus
|
||||
* the user won't see any duplicates
|
||||
*
|
||||
* The retention of this cache is 5 minutes by default; if the configured interval is 5 minutes or less, it is reduced to half of that interval.
|
||||
*
|
||||
* @type {module.SimilarityCacheEntry|{}}
|
||||
*/
|
||||
const SimilarityCacheEntry = require('./SimilarityCacheEntry');
|
||||
const config = require('../../../conf/config.json');
|
||||
|
||||
// default retention: 5 minutes
const DEFAULT_RETENTION_IN_MS = 5 * 60 * 1000;

// config.interval is given in minutes
const intervalInMs = config.interval * 60 * 1000;

// An interval of 5 minutes or less is unusual, but it can be configured. In that case the
// retention is cut down to half of the interval.
const retention = intervalInMs <= DEFAULT_RETENTION_IN_MS ? Math.floor(intervalInMs / 2) : DEFAULT_RETENTION_IN_MS;
|
||||
|
||||
// jobid -> SimilarityCacheEntry
// Created without a prototype so that a job id such as "constructor" or "toString" can never
// resolve to an inherited Object member instead of a cache entry.
const cache = Object.create(null);

// handle of the periodic cleanup timer; assigned where the timer is started
let intervalId;
|
||||
|
||||
exports.addCacheEntry = (jobId, value) => {
|
||||
cache[jobId] = cache[jobId] || new SimilarityCacheEntry(Date.now());
|
||||
cache[jobId].setCacheEntry(value);
|
||||
};
|
||||
|
||||
exports.hasSimilarEntries = (jobId, value) => {
|
||||
if (cache[jobId] == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return cache[jobId].hasSimilarEntries(value);
|
||||
};
|
||||
|
||||
/**
 * cleanup: every 10 seconds, drop all cache entries whose timestamp is older than the retention
 */
intervalId = setInterval(() => {
  const now = Date.now();

  Object.keys(cache)
    .filter((jobId) => cache[jobId].getTime() + retention < now)
    .forEach((jobId) => {
      delete cache[jobId];
    });
}, 10000);
|
||||
|
||||
/**
|
||||
* mostly used for tests
|
||||
*/
|
||||
exports.stopCacheCleanup = () => {
|
||||
clearInterval(intervalId);
|
||||
};
|
||||
@@ -61,12 +61,18 @@ exports.setJobStatus = ({ jobId, status }) => {
|
||||
};
|
||||
|
||||
exports.removeJob = (jobId) => {
|
||||
listingStorage.removeListings(jobId);
|
||||
db.get('jobs')
|
||||
.remove((job) => job.id === jobId)
|
||||
.write();
|
||||
};
|
||||
|
||||
exports.removeJobsByUserId = (userId) => {
|
||||
db.get('jobs')
|
||||
.value()
|
||||
.filter((job) => job.userId === userId)
|
||||
.forEach((job) => listingStorage.removeListings(job.id));
|
||||
|
||||
db.get('jobs')
|
||||
.remove((job) => job.userId === userId)
|
||||
.write();
|
||||
|
||||
@@ -47,3 +47,7 @@ exports.setLastJobExecution = (jobId) => {
|
||||
const key = buildKey(jobId, null, 'lastExecution');
|
||||
return db.set(key, Date.now()).write();
|
||||
};
|
||||
|
||||
exports.removeListings = (jobId) => {
|
||||
db.unset(jobId).write();
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user