mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
improving similarity cache
This commit is contained in:
@@ -1,116 +1,92 @@
|
||||
import crypto from 'crypto';
|
||||
|
||||
// Retention window for cache entries: one hour, expressed in milliseconds.
const retention = 1000 * 60 * 60;
|
||||
/**
|
||||
* Internal cache storage.
|
||||
* Maps a SHA-256 hash (string) to its expiry timestamp (number in ms).
|
||||
* @type {Map<string, number>}
|
||||
*/
|
||||
const entries = new Map();
|
||||
|
||||
/**
|
||||
* Reference to the currently scheduled cleanup timer.
|
||||
* @type {NodeJS.Timeout | null}
|
||||
*/
|
||||
let timer = null;
|
||||
|
||||
/**
 * Similarity cache
 *
 * Maintains an in-memory Set of content hashes to detect whether a listing
 * (identified by a tuple of title, price and address) has been seen before.
 *
 * Design notes:
 * - The cache is refreshed periodically from persistent storage. To avoid
 *   modification-during-iteration issues, the refresh builds a new Set and
 *   atomically swaps the reference instead of mutating in place.
 * - Hashing ignores null/undefined values but preserves falsy-yet-valid values
 *   like 0. Non-string values are coerced to strings before hashing.
 *
 * This module has no persistence of its own; it relies on
 * getAllEntriesFromListings() for data hydration.
 * @module similarityCache
 */
|
||||
import crypto from 'crypto';
|
||||
import { getAllEntriesFromListings } from '../storage/listingsStorage.js';
|
||||
|
||||
/** @type {number} Refresh interval in milliseconds (defaults to one hour). */
|
||||
const reloadCycle = 1000 * 60 * 60; // refresh the cache once per hour
|
||||
|
||||
/**
|
||||
* Internal cache of content hashes for known listings.
|
||||
*
|
||||
* Each entry is an SHA-256 hex digest produced by toHash(title, price, address).
|
||||
* @type {Set<string>}
|
||||
*/
|
||||
let cache = new Set();
|
||||
|
||||
// Periodically refresh the cache from storage, once every `reloadCycle` ms.
// The arrow wrapper is required: initSimilarityCache is a `const` declared
// further down in this module, so it may only be dereferenced when the timer
// fires, not while this statement is being evaluated.
setInterval(() => {
  initSimilarityCache();
}, reloadCycle);
|
||||
|
||||
/**
 * Initialize or refresh the similarity cache from persistent storage.
 *
 * Loads every stored listing through getAllEntriesFromListings(), hashes its
 * (title, price, address) tuple with toHash(), and then replaces the module's
 * `cache` Set with the freshly built one. The previous Set is never mutated;
 * the reference is only swapped once the new Set is complete, so an error
 * thrown while reading or hashing leaves the old cache in place.
 *
 * Note: hashes that checkAndAddEntry() added since the last refresh are
 * discarded by the swap unless storage returns the corresponding listing.
 *
 * @returns {void}
 */
export const initSimilarityCache = () => {
  const freshCache = new Set();
  for (const entry of getAllEntriesFromListings()) {
    // Optional chaining tolerates null/undefined rows.
    freshCache.add(toHash(entry?.title, entry?.price, entry?.address));
  }
  // Swap the reference only after the new Set is fully populated.
  cache = freshCache;
};
|
||||
|
||||
/**
 * Check if a listing is already known and add it to the cache if not.
 *
 * The listing is identified by the hash of its title, price and address
 * (in that order). How individual fields are normalized before hashing is
 * defined by toHash().
 *
 * @param {Object} params - Listing fields
 * @param {string|undefined|null} params.title - The listing title
 * @param {string|undefined|null} params.address - The listing address
 * @param {number|string|undefined|null} params.price - The listing price
 * @returns {boolean} true if the entry already existed in the cache (duplicate), otherwise false
 */
export const checkAndAddEntry = ({ title, address, price }) => {
  const hash = toHash(title, price, address);
  const isKnown = cache.has(hash);
  if (!isKnown) {
    // First sighting: remember it so the next identical listing is a duplicate.
    cache.add(hash);
  }
  return isKnown;
};
|
||||
|
||||
/**
 * Generate an SHA-256 hash from a list of input values.
 *
 * Only null and undefined are skipped. Falsy but valid values such as 0 or
 * the empty string are kept, so a listing with price 0 hashes differently
 * from one without a price. Every remaining value is coerced with String()
 * and the parts are joined with '|' before hashing.
 *
 * @param {...(string|number|null|undefined)} strings - Input values to hash
 * @returns {string} Hexadecimal hash
 */
function toHash(...strings) {
  // `!= null` matches both null and undefined, and nothing else.
  const normalized = strings.filter((value) => value != null).map((value) => String(value));
  return crypto.createHash('sha256').update(normalized.join('|')).digest('hex');
}
|
||||
|
||||
/**
 * Remove expired cache entries and schedule the next cleanup run.
 *
 * An entry counts as expired when its expiry timestamp is less than or equal
 * to the current time. This function is passed to setTimeout by
 * scheduleNext() and is also called directly by invalidateAllForTest().
 *
 * @private
 */
function runCleanup() {
  const now = Date.now();
  // Deleting from a Map while iterating over it is well-defined in JavaScript.
  for (const [hash, expiry] of entries) {
    if (expiry <= now) {
      entries.delete(hash);
    }
  }
  scheduleNext();
}
|
||||
|
||||
/**
 * Find the soonest expiry timestamp among all cache entries and schedule a
 * one-shot timer that runs runCleanup() at that time.
 *
 * Any previously scheduled timer is cancelled first. When the cache is empty
 * no timer is scheduled. An entry whose expiry has already passed results in
 * a zero-delay timer, so it is removed on the next tick instead of lingering
 * until some later entry expires.
 *
 * @private
 */
function scheduleNext() {
  if (timer) {
    clearTimeout(timer);
    timer = null;
  }

  let next = Infinity;
  for (const expiry of entries.values()) {
    if (expiry < next) next = expiry;
  }
  if (next === Infinity) return;

  // Clamp to 0 so an already-expired entry triggers an immediate cleanup.
  timer = setTimeout(runCleanup, Math.max(0, next - Date.now()));
}
|
||||
|
||||
/**
 * Add or refresh a cache entry for the given title and address.
 *
 * The entry is keyed by toHash(title, address) and expires `retention`
 * milliseconds from now. Adding an entry that already exists overwrites its
 * expiry. The cleanup timer is rescheduled afterwards.
 *
 * @param {string} title - The title used to build the cache key
 * @param {string} address - The address used to build the cache key
 */
export function addCacheEntry(title, address) {
  entries.set(toHash(title, address), Date.now() + retention);
  scheduleNext();
}
|
||||
|
||||
/**
 * Check if a cache entry with the same title and address exists and is
 * still valid (not expired).
 *
 * An entry that is found but already expired is deleted on the spot and the
 * cleanup timer is rescheduled before returning false.
 *
 * @param {string} title - The title used to build the cache key
 * @param {string} address - The address used to build the cache key
 * @returns {boolean} True if a valid cache entry exists, false otherwise
 */
export function hasSimilarEntries(title, address) {
  const hash = toHash(title, address);
  const expiry = entries.get(hash);

  // Unknown key.
  if (expiry == null) return false;
  // Known and still within its retention window.
  if (expiry > Date.now()) return true;

  // Known but expired: evict lazily.
  entries.delete(hash);
  scheduleNext();
  return false;
}
|
||||
|
||||
/**
 * Cancel the currently scheduled cleanup timer, if any.
 *
 * Entries already in the cache are left untouched. Cleanup resumes the next
 * time scheduleNext() runs, for example when addCacheEntry() is called.
 */
export function stopCacheCleanup() {
  if (timer) {
    clearTimeout(timer);
  }
  timer = null;
}
|
||||
|
||||
/**
 * Test helper: expire every cache entry immediately.
 *
 * Sets the expiry of each existing entry to 0 and then calls runCleanup().
 * This is only for test purposes.
 */
export function invalidateAllForTest() {
  // Overwriting values of existing keys does not disturb the key iteration.
  for (const key of entries.keys()) {
    entries.set(key, 0);
  }
  runCleanup();
}
|
||||
|
||||
@@ -310,8 +310,8 @@ export const deleteListingsByJobId = (jobId) => {
|
||||
if (!jobId) return;
|
||||
return SqliteConnection.execute(
|
||||
`DELETE
|
||||
FROM listings
|
||||
WHERE job_id = @jobId`,
|
||||
FROM listings
|
||||
WHERE job_id = @jobId`,
|
||||
{ jobId },
|
||||
);
|
||||
};
|
||||
@@ -332,3 +332,13 @@ export const deleteListingsById = (ids) => {
|
||||
ids,
|
||||
);
|
||||
};
|
||||
|
||||
/**
 * Return all listings, reduced to the fields title, address and price.
 *
 * Runs a single unfiltered SELECT over the `listings` table and returns
 * whatever SqliteConnection.query() yields for it.
 * NOTE(review): presumably an array of row objects; confirm against
 * SqliteConnection.query().
 *
 * @returns {{title: string|null, address: string|null, price: number|null}[]}
 */
export const getAllEntriesFromListings = () => {
  return SqliteConnection.query(`SELECT title, address, price FROM listings`);
};
|
||||
|
||||
Reference in New Issue
Block a user