Improve similarity cache. It now checks for similarities independently of jobs

This commit is contained in:
orangecoding
2025-09-07 22:15:14 +02:00
parent 7fa9a265ef
commit 09c6ce1d0b
6 changed files with 138 additions and 104 deletions

View File

@@ -102,15 +102,15 @@ class FredyRuntime {
_filterBySimilarListings(listings) {
const filteredList = listings.filter((listing) => {
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
const similar = this._similarityCache.hasSimilarEntries(listing.title, listing.address);
if (similar) {
/* eslint-disable no-console */
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
console.debug(`Filtering similar entry for title: ${listing.title} and address ${listing.address}`);
/* eslint-enable no-console */
}
return !similar;
});
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(filter.title, listings.address));
return filteredList;
}

View File

@@ -1,26 +0,0 @@
import stringSimilarity from 'string-similarity';
//if the score is higher than this, it will be considered a match
const MAX_DICE_INDEX = 0.7;
/**
 * Holds the values collected for one cache key together with the timestamp
 * the entry was created with, and answers whether a candidate string is
 * similar to any stored value.
 */
export default (class SimilarityCacheEntry {
  /**
   * @param {number} time - Timestamp stored for this entry and returned by getTime()
   */
  constructor(time) {
    this.time = time;
    this.values = [];
  }

  /**
   * Append a value to the list of known values.
   * @param {string} entry - Value to remember
   */
  setCacheEntry = (entry) => {
    this.values.push(entry);
  };

  /**
   * @returns {number} The timestamp passed to the constructor
   */
  getTime = () => this.time;

  /**
   * Compare the given value against every stored value.
   *
   * @param {string} value - Candidate string
   * @returns {boolean} True as soon as one stored value scores at or above MAX_DICE_INDEX
   */
  hasSimilarEntries = (value) => {
    for (const known of this.values) {
      const score = stringSimilarity.compareTwoStrings(value, known);
      if (score >= MAX_DICE_INDEX) {
        return true;
      }
    }
    return false;
  };
});

View File

@@ -1,40 +1,116 @@
import SimilarityCacheEntry from './SimilarityCacheEntry.js';
import { config } from '../../utils.js';
import crypto from 'crypto';

// Retention window of a cache entry: one hour, in milliseconds.
const retention = 60 * 60 * 1000;
/**
 * Internal cache storage.
 * Maps a SHA-256 hash (string) to its expiry timestamp (number in ms).
 * @type {Map<string, number>}
 */
const entries = new Map();
/**
 * Reference to the currently scheduled cleanup timer.
 * @type {NodeJS.Timeout | null}
 */
let timer = null;
/**
 * Build a SHA-256 hex digest over the given values.
 * Falsy values (null, undefined, empty string, ...) are skipped; the
 * remaining values are joined with '|' before hashing.
 *
 * @param {...(string|null|undefined)} strings - Input values to hash
 * @returns {string} Hexadecimal hash
 */
function toHash(...strings) {
  const parts = [];
  for (const part of strings) {
    if (part) {
      parts.push(part);
    }
  }
  const hasher = crypto.createHash('sha256');
  hasher.update(parts.join('|'));
  return hasher.digest('hex');
}
/**
 * Cleanup expired cache entries and schedule the next cleanup run.
 * This function is used as the callback of the timer created in
 * scheduleNext().
 *
 * @private
 */
function runCleanup() {
  const now = Date.now();
  // Deleting from a Map while iterating over it is safe.
  for (const [hash, expiry] of entries) {
    if (expiry <= now) entries.delete(hash);
  }
  // Re-arm the timer for whatever entry expires next (if any is left).
  scheduleNext();
}
/**
 * (Re-)arm the one-shot cleanup timer.
 * Any pending timer is cancelled first. The new timer fires at the earliest
 * expiry timestamp that still lies in the future; if no such entry exists,
 * no timer is scheduled.
 *
 * @private
 */
function scheduleNext() {
  if (timer) {
    clearTimeout(timer);
    timer = null;
  }

  const now = Date.now();
  let soonest = Infinity;
  entries.forEach((expiry) => {
    const isPending = expiry > now;
    if (isPending && expiry < soonest) {
      soonest = expiry;
    }
  });

  if (soonest === Infinity) {
    return;
  }
  const delay = Math.max(0, soonest - now);
  timer = setTimeout(runCleanup, delay);
}
/**
 * Store (or refresh) the cache entry for a title/address pair.
 * The entry's expiry is set to the current time plus the retention window,
 * and the cleanup timer is rescheduled afterwards.
 *
 * @param {string} title - The title used to build the cache key
 * @param {string} address - The address used to build the cache key
 */
export function addCacheEntry(title, address) {
  const key = toHash(title, address);
  entries.set(key, Date.now() + retention);
  scheduleNext();
}
/**
 * Check if a cache entry with the same title and address exists
 * and is still valid (not expired).
 *
 * The lookup is an exact match on the hash of title and address. An entry
 * found to be expired is removed right away and the cleanup timer is
 * rescheduled.
 *
 * @param {string} title - The title used to build the cache key
 * @param {string} address - The address used to build the cache key
 * @returns {boolean} True if a valid cache entry exists, false otherwise
 */
export function hasSimilarEntries(title, address) {
  const hash = toHash(title, address);
  const expiry = entries.get(hash);
  if (expiry == null) return false;
  if (expiry <= Date.now()) {
    entries.delete(hash);
    scheduleNext();
    return false;
  }
  return true;
}
/**
 * Cancel the pending cleanup timer, if there is one, and reset the timer
 * reference. Cached entries are left untouched; adding a new entry
 * schedules cleanup again.
 */
export function stopCacheCleanup() {
  if (timer) {
    clearTimeout(timer);
  }
  timer = null;
}
/**
 * Test helper only: mark every cached entry as expired (expiry 0) and run
 * the cleanup immediately.
 */
export function invalidateAllForTest() {
  entries.forEach((_expiry, key) => {
    entries.set(key, 0);
  });
  runCleanup();
}