improve similarity cache. It now checks for similarities independend from jobs

This commit is contained in:
orangecoding
2025-09-07 22:15:14 +02:00
parent 7fa9a265ef
commit 09c6ce1d0b
6 changed files with 138 additions and 104 deletions

View File

@@ -102,15 +102,15 @@ class FredyRuntime {
_filterBySimilarListings(listings) {
const filteredList = listings.filter((listing) => {
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
const similar = this._similarityCache.hasSimilarEntries(listing.title, listing.address);
if (similar) {
/* eslint-disable no-console */
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
console.debug(`Filtering similar entry for title: ${listing.title} and address ${listing.address}`);
/* eslint-enable no-console */
}
return !similar;
});
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(filter.title, listings.address));
return filteredList;
}

View File

@@ -1,26 +0,0 @@
import stringSimilarity from 'string-similarity';
//if the score is higher than this, it will be considered a match
const MAX_DICE_INDEX = 0.7;
export default (class SimilarityCacheEntry {
constructor(time) {
this.time = time;
this.values = [];
}
setCacheEntry = (entry) => {
this.values.push(entry);
};
getTime = () => {
return this.time;
};
hasSimilarEntries = (value) => {
if (this.values.length > 0) {
for (let i = 0; i < this.values.length; i++) {
const index = stringSimilarity.compareTwoStrings(value, this.values[i]);
if (index >= MAX_DICE_INDEX) {
return true;
}
}
}
return false;
};
});

View File

@@ -1,40 +1,116 @@
import SimilarityCacheEntry from './SimilarityCacheEntry.js';
import { config } from '../../utils.js';
//5 minutes
let retention = 5 * 60 * 1000;
const intervalInMs = config.interval * 60 * 1000;
//an interval below 5 mins sounds crazy, but there are ppl out there doing crazy shit.
if (intervalInMs <= retention) {
retention = Math.floor(intervalInMs / 2);
}
//jobid -> SimilarityCacheEntry
const cache = {};
let intervalId;
import crypto from 'crypto';
const retention = 60 * 60 * 1000;
/**
* cleanup
* Internal cache storage.
* Maps a SHA-256 hash (string) to its expiry timestamp (number in ms).
* @type {Map<string, number>}
*/
intervalId = setInterval(() => {
const keysToBeRemoved = [];
const entries = new Map();
/**
* Reference to the currently scheduled cleanup timer.
* @type {NodeJS.Timeout | null}
*/
let timer = null;
/**
* Generate a SHA-256 hash from a list of input strings.
* Null or undefined values are ignored.
*
* @param {...(string|null|undefined)} strings - Input values to hash
* @returns {string} Hexadecimal hash
*/
function toHash(...strings) {
return crypto.createHash('sha256').update(strings.filter(Boolean).join('|')).digest('hex');
}
/**
* Cleanup expired cache entries and schedule the next cleanup run.
* This function is invoked automatically by scheduled timers.
*
* @private
*/
function runCleanup() {
const now = Date.now();
Object.keys(cache).forEach((key) => {
if (cache[key].getTime() + retention < now) {
keysToBeRemoved.push(key);
}
});
if (keysToBeRemoved.length > 0) {
keysToBeRemoved.forEach((key) => delete cache[key]);
for (const [hash, expiry] of entries) {
if (expiry <= now) entries.delete(hash);
}
}, 10000);
export const addCacheEntry = (jobId, value) => {
cache[jobId] = cache[jobId] || new SimilarityCacheEntry(Date.now());
cache[jobId].setCacheEntry(value);
};
export const hasSimilarEntries = (jobId, value) => {
if (cache[jobId] == null) {
scheduleNext();
}
/**
* Find the soonest expiry timestamp among all cache entries
* and schedule a one-shot timer that will trigger at that time.
* Cancels any existing timer before scheduling a new one.
*
* @private
*/
function scheduleNext() {
if (timer) {
clearTimeout(timer);
timer = null;
}
let next = Infinity;
const now = Date.now();
for (const expiry of entries.values()) {
if (expiry > now && expiry < next) next = expiry;
}
if (next !== Infinity) {
timer = setTimeout(runCleanup, Math.max(0, next - now));
}
}
/**
* Add or refresh a cache entry for the given title and address.
* The entry will automatically expire after the configured retention window.
*
* @param {string} title - The title used to build the cache key
* @param {string} address - The address used to build the cache key
*/
export function addCacheEntry(title, address) {
const hash = toHash(title, address);
const expiry = Date.now() + retention;
entries.set(hash, expiry);
scheduleNext();
}
/**
* Check if a cache entry with the same title and address exists
* and is still valid (not expired).
*
* @param {string} title - The title used to build the cache key
* @param {string} address - The address used to build the cache key
* @returns {boolean} True if a valid cache entry exists, false otherwise
*/
export function hasSimilarEntries(title, address) {
const hash = toHash(title, address);
const expiry = entries.get(hash);
if (expiry == null) return false;
if (expiry <= Date.now()) {
entries.delete(hash);
scheduleNext();
return false;
}
return cache[jobId].hasSimilarEntries(value);
};
export const stopCacheCleanup = () => {
clearInterval(intervalId);
};
return true;
}
/**
* Stop any scheduled cleanup timers and prevent further automatic cleanup.
* Entries that are already in the cache will remain until removed manually
* or until cleanup is started again by adding new entries.
*/
export function stopCacheCleanup() {
if (timer) clearTimeout(timer);
timer = null;
}
/**
* this is only for test purposes
*/
export function invalidateAllForTest() {
for (const key of entries.keys()) {
entries.set(key, 0);
}
runCleanup();
}

View File

@@ -90,7 +90,6 @@
"restana": "5.1.0",
"serve-static": "2.2.0",
"slack": "11.0.2",
"string-similarity": "^4.0.4",
"vite": "7.1.4",
"x-var": "^2.1.0"
},

View File

@@ -1,40 +1,30 @@
import SimilarityCacheEntry from '../../lib/services/similarity-check/SimilarityCacheEntry.js';
import { expect } from 'chai';
import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js';
describe('similarityCheck', () => {
describe('#similarityCheck()', () => {
it('should be false', () => {
const check = new SimilarityCacheEntry(0);
check.setCacheEntry('Hallo');
expect(check.hasSimilarEntries('Welt')).to.be.false;
});
it('should be true', () => {
const check = new SimilarityCacheEntry(0);
check.setCacheEntry('Hallo');
expect(check.hasSimilarEntries('hallo')).to.be.true;
});
it('should be true', () => {
const check = new SimilarityCacheEntry(0);
check.setCacheEntry('Selling an incredible house in san francisco');
expect(check.hasSimilarEntries('incredible house in san francisco for sale')).to.be.true;
});
it('should be true', () => {
const check = new SimilarityCacheEntry(0);
check.setCacheEntry('a');
check.setCacheEntry('b');
check.setCacheEntry('c');
check.setCacheEntry('d');
expect(check.hasSimilarEntries('b')).to.be.true;
});
it('should be false', () => {
const check = new SimilarityCacheEntry(0);
check.setCacheEntry(
'The index is known by several other names, especially SørensenDice index,[3] Sørensen index and Dice\'s coefficient. Other variations include the "similarity coefficient" or "index", such as Dice similarity coefficient (DSC). Common alternate spellings for Sørensen are Sorenson, Soerenson and Sörenson, and all three can also be seen with the sen ending.',
);
check.setCacheEntry(
'where |X| and |Y| are the cardinalities of the two sets (i.e. the number of elements in each set). The Sørensen index equals twice the number of elements common to both sets divided by the sum of the number of elements in each set.',
);
expect(check.hasSimilarEntries('unrelated text')).to.be.false;
});
it('should return true on duplicate', () => {
similarityCache.addCacheEntry('Hello World', 'Test');
expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.true;
});
it('should return true even if one value is null', () => {
similarityCache.addCacheEntry('Hello World', null);
expect(similarityCache.hasSimilarEntries('Hello World', null)).to.be.true;
});
it('should return true even if one value is an obj', () => {
similarityCache.addCacheEntry('Hello World', [{ TR: 'OLOLO' }]);
expect(similarityCache.hasSimilarEntries('Hello World', [{ TR: 'OLOLO' }])).to.be.true;
});
it('should return false when no duplicate', () => {
similarityCache.addCacheEntry('Hello World__', 'Test');
expect(similarityCache.hasSimilarEntries('Hello World___', 'Test')).to.be.false;
});
it('should return false when no duplicate', () => {
expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.true;
similarityCache.invalidateAllForTest();
expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.false;
});
});

View File

@@ -6899,11 +6899,6 @@ string-argv@^0.3.2:
resolved "https://registry.yarnpkg.com/string-argv/-/string-argv-0.3.2.tgz#2b6d0ef24b656274d957d54e0a4bbf6153dc02b6"
integrity sha512-aqD2Q0144Z+/RqG52NeHEkZauTAUWJO8c6yTftGJKO3Tja5tUgIfmIl6kExvhtxSDP7fXB6DvzkfMpCd/F3G+Q==
string-similarity@^4.0.4:
version "4.0.4"
resolved "https://registry.yarnpkg.com/string-similarity/-/string-similarity-4.0.4.tgz#42d01ab0b34660ea8a018da8f56a3309bb8b2a5b"
integrity sha512-/q/8Q4Bl4ZKAPjj8WerIBJWALKkaPRfrvhfF8k/B23i4nzrlRj2/go1m90In7nG/3XDSbOo0+pu6RvCTM9RGMQ==
"string-width-cjs@npm:string-width@^4.2.0":
version "4.2.3"
resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"