mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Improve similarity cache. It now checks for similarities independently of jobs.
This commit is contained in:
@@ -102,15 +102,15 @@ class FredyRuntime {
|
||||
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredList = listings.filter((listing) => {
|
||||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
|
||||
const similar = this._similarityCache.hasSimilarEntries(listing.title, listing.address);
|
||||
if (similar) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
|
||||
console.debug(`Filtering similar entry for title: ${listing.title} and address ${listing.address}`);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
return !similar;
|
||||
});
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(filter.title, listings.address));
|
||||
return filteredList;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
import stringSimilarity from 'string-similarity';
|
||||
//if the score is higher than this, it will be considered a match
|
||||
const MAX_DICE_INDEX = 0.7;
|
||||
/**
 * Collects the values recorded for one cache slot together with the
 * timestamp handed to the constructor.
 */
export default (class SimilarityCacheEntry {
  constructor(time) {
    this.time = time;
    this.values = [];
  }

  /** Append a value to the list that later lookups are compared against. */
  setCacheEntry = (entry) => {
    this.values.push(entry);
  };

  /** @returns the timestamp passed to the constructor */
  getTime = () => this.time;

  /**
   * Compare `value` with every stored value, stopping at the first one whose
   * score from `stringSimilarity.compareTwoStrings` reaches MAX_DICE_INDEX.
   *
   * @param value - candidate to look up
   * @returns {boolean} true if any stored value reaches the threshold
   */
  hasSimilarEntries = (value) =>
    this.values.some((known) => stringSimilarity.compareTwoStrings(value, known) >= MAX_DICE_INDEX);
});
|
||||
@@ -1,40 +1,116 @@
|
||||
import SimilarityCacheEntry from './SimilarityCacheEntry.js';
|
||||
import { config } from '../../utils.js';
|
||||
import crypto from 'crypto';

/**
 * How long an entry stays valid after it was added or refreshed:
 * one hour, expressed in milliseconds.
 * @type {number}
 */
const retention = 60 * 60 * 1000;

/**
 * Internal cache storage.
 * Maps a SHA-256 hash (string) to its expiry timestamp (number in ms).
 * @type {Map<string, number>}
 */
const entries = new Map();

/**
 * Reference to the currently scheduled cleanup timer.
 * @type {NodeJS.Timeout | null}
 */
let timer = null;
|
||||
|
||||
/**
 * Generate a SHA-256 hash from a list of input values.
 *
 * Each argument keeps its position in the hashed key. Falsy values (null,
 * undefined, '') contribute an empty slot instead of being dropped, and every
 * part is prefixed with its length. Because of that, ('a|b') and ('a', 'b'),
 * or (x, null) and (null, x), no longer produce the same hash.
 * Non-string values are converted with String().
 *
 * @param {...(string|null|undefined)} strings - Input values to hash
 * @returns {string} Hexadecimal hash
 */
function toHash(...strings) {
  const key = strings
    .map((value) => {
      const text = value ? String(value) : '';
      // The length prefix makes part boundaries unambiguous even if a value contains '|'.
      return `${text.length}:${text}`;
    })
    .join('|');
  return crypto.createHash('sha256').update(key).digest('hex');
}
|
||||
|
||||
/**
 * Remove every cache entry whose expiry timestamp has been reached and then
 * let scheduleNext() decide whether another cleanup run has to be scheduled.
 * This function is used as the callback of the cleanup timer.
 *
 * @private
 */
function runCleanup() {
  const now = Date.now();
  for (const [hash, expiry] of entries) {
    // Deleting the current key while iterating a Map is safe.
    if (expiry <= now) entries.delete(hash);
  }
  scheduleNext();
}
|
||||
|
||||
/**
 * Re-arm the one-shot cleanup timer.
 *
 * Any timer that is currently pending is cancelled first. The entries are then
 * scanned for the smallest expiry timestamp that still lies in the future, and
 * runCleanup is scheduled for that moment. If no such entry exists, no timer
 * is set.
 *
 * @private
 */
function scheduleNext() {
  if (timer) {
    clearTimeout(timer);
    timer = null;
  }

  const now = Date.now();
  let soonest = Infinity;
  entries.forEach((expiry) => {
    if (expiry > now && expiry < soonest) {
      soonest = expiry;
    }
  });

  if (soonest === Infinity) {
    return;
  }
  timer = setTimeout(runCleanup, Math.max(0, soonest - now));
}
|
||||
|
||||
/**
 * Store (or refresh) the cache entry for a title/address pair.
 * The entry's expiry is set to the current time plus the retention window,
 * after which the cleanup timer is re-scheduled.
 *
 * @param {string} title - The title used to build the cache key
 * @param {string} address - The address used to build the cache key
 */
export function addCacheEntry(title, address) {
  entries.set(toHash(title, address), Date.now() + retention);
  scheduleNext();
}
|
||||
|
||||
/**
 * Check if a cache entry with the same title and address exists
 * and is still valid (not expired).
 *
 * An entry that is found but already expired is removed on the spot and the
 * cleanup timer is re-scheduled.
 *
 * @param {string} title - The title used to build the cache key
 * @param {string} address - The address used to build the cache key
 * @returns {boolean} True if a valid cache entry exists, false otherwise
 */
export function hasSimilarEntries(title, address) {
  const hash = toHash(title, address);
  const expiry = entries.get(hash);
  if (expiry == null) return false;
  if (expiry <= Date.now()) {
    entries.delete(hash);
    scheduleNext();
    return false;
  }
  return true;
}
|
||||
|
||||
/**
 * Cancel the pending cleanup timer, if there is one, and forget its handle.
 * Cache entries themselves are not touched by this function.
 */
export function stopCacheCleanup() {
  const pending = timer;
  timer = null;
  if (pending) {
    clearTimeout(pending);
  }
}
|
||||
|
||||
/**
 * Test helper only: overwrite the expiry of every entry with 0 and run the
 * cleanup right away.
 */
export function invalidateAllForTest() {
  entries.forEach((_expiry, key) => {
    entries.set(key, 0);
  });
  runCleanup();
}
|
||||
|
||||
@@ -90,7 +90,6 @@
|
||||
"restana": "5.1.0",
|
||||
"serve-static": "2.2.0",
|
||||
"slack": "11.0.2",
|
||||
"string-similarity": "^4.0.4",
|
||||
"vite": "7.1.4",
|
||||
"x-var": "^2.1.0"
|
||||
},
|
||||
|
||||
@@ -1,40 +1,30 @@
|
||||
import SimilarityCacheEntry from '../../lib/services/similarity-check/SimilarityCacheEntry.js';
|
||||
import { expect } from 'chai';
|
||||
import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js';
|
||||
|
||||
describe('similarityCheck', () => {
  describe('#similarityCheck()', () => {
    // The cache is module-level state. Expire everything after each test so no
    // test depends on entries (or a pending cleanup timer) left by another one.
    afterEach(() => {
      similarityCache.invalidateAllForTest();
    });

    it('should return true on duplicate', () => {
      similarityCache.addCacheEntry('Hello World', 'Test');
      expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.true;
    });

    it('should return true even if one value is null', () => {
      similarityCache.addCacheEntry('Hello World', null);
      expect(similarityCache.hasSimilarEntries('Hello World', null)).to.be.true;
    });

    it('should return true even if one value is an obj', () => {
      similarityCache.addCacheEntry('Hello World', [{ TR: 'OLOLO' }]);
      expect(similarityCache.hasSimilarEntries('Hello World', [{ TR: 'OLOLO' }])).to.be.true;
    });

    it('should return false when no duplicate', () => {
      similarityCache.addCacheEntry('Hello World__', 'Test');
      expect(similarityCache.hasSimilarEntries('Hello World___', 'Test')).to.be.false;
    });

    it('should return false after all entries were invalidated', () => {
      // Create the entry here instead of relying on one added by an earlier test.
      similarityCache.addCacheEntry('Hello World', 'Test');
      expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.true;
      similarityCache.invalidateAllForTest();
      expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.false;
    });
  });
});
|
||||
|
||||
@@ -6899,11 +6899,6 @@ string-argv@^0.3.2:
|
||||
resolved "https://registry.yarnpkg.com/string-argv/-/string-argv-0.3.2.tgz#2b6d0ef24b656274d957d54e0a4bbf6153dc02b6"
|
||||
integrity sha512-aqD2Q0144Z+/RqG52NeHEkZauTAUWJO8c6yTftGJKO3Tja5tUgIfmIl6kExvhtxSDP7fXB6DvzkfMpCd/F3G+Q==
|
||||
|
||||
string-similarity@^4.0.4:
|
||||
version "4.0.4"
|
||||
resolved "https://registry.yarnpkg.com/string-similarity/-/string-similarity-4.0.4.tgz#42d01ab0b34660ea8a018da8f56a3309bb8b2a5b"
|
||||
integrity sha512-/q/8Q4Bl4ZKAPjj8WerIBJWALKkaPRfrvhfF8k/B23i4nzrlRj2/go1m90In7nG/3XDSbOo0+pu6RvCTM9RGMQ==
|
||||
|
||||
"string-width-cjs@npm:string-width@^4.2.0":
|
||||
version "4.2.3"
|
||||
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"
|
||||
|
||||
Reference in New Issue
Block a user