diff --git a/index.js b/index.js index eb70746..840c475 100755 --- a/index.js +++ b/index.js @@ -37,6 +37,8 @@ await runMigrations(); // Load provider modules once at startup const providers = await getProviders(); +similarityCache.initSimilarityCache(); + //assuming interval is always in minutes const INTERVAL = config.interval * 60 * 1000; diff --git a/lib/FredyPipeline.js b/lib/FredyPipeline.js index 77e5099..eb20168 100755 --- a/lib/FredyPipeline.js +++ b/lib/FredyPipeline.js @@ -183,8 +183,12 @@ class FredyPipeline { * @returns {Listing[]} Listings considered unique enough to keep. */ _filterBySimilarListings(listings) { - const filteredList = listings.filter((listing) => { - const similar = this._similarityCache.hasSimilarEntries(listing.title, listing.address); + return listings.filter((listing) => { + const similar = this._similarityCache.checkAndAddEntry({ + title: listing.title, + address: listing.address, + price: listing.price, + }); if (similar) { logger.debug( `Filtering similar entry for title '${listing.title}' and address '${listing.address}' (Provider: '${this._providerId}')`, @@ -192,8 +196,6 @@ class FredyPipeline { } return !similar; }); - filteredList.forEach((filter) => this._similarityCache.addCacheEntry(filter.title, filter.address)); - return filteredList; } /** diff --git a/lib/services/similarity-check/similarityCache.js b/lib/services/similarity-check/similarityCache.js index f89f8f3..4175bb6 100644 --- a/lib/services/similarity-check/similarityCache.js +++ b/lib/services/similarity-check/similarityCache.js @@ -1,116 +1,92 @@ -import crypto from 'crypto'; - -const retention = 60 * 60 * 1000; /** - * Internal cache storage. - * Maps a SHA-256 hash (string) to its expiry timestamp (number in ms). - * @type {Map} - */ -const entries = new Map(); - -/** - * Reference to the currently scheduled cleanup timer. - * @type {NodeJS.Timeout | null} - */ -let timer = null; - -/** - * Generate a SHA-256 hash from a list of input strings. - * Null or undefined values are ignored. + * Similarity cache * - * @param {...(string|null|undefined)} strings - Input values to hash + * Maintains an in-memory Set of content hashes to detect whether a listing + * (identified by a tuple of title, price and address) has been seen before. + * + * Design notes: + * - The cache is refreshed periodically from persistent storage. To avoid + * modification-during-iteration issues, the refresh builds a new Set and + * atomically swaps the reference instead of mutating in place. + * - Hashing ignores null/undefined values but preserves falsy-yet-valid values + * like 0. Non-string values are coerced to strings before hashing. + * + * This module has no persistence of its own; it relies on + * getAllEntriesFromListings() for data hydration. + * @module similarityCache + */ +import crypto from 'crypto'; +import { getAllEntriesFromListings } from '../storage/listingsStorage.js'; + +/** @type {number} Refresh interval in milliseconds (defaults to one hour). */ +const reloadCycle = 60 * 60 * 1000; // every hour, refresh + +/** + * Internal cache of content hashes for known listings. + * + * Each entry is an SHA-256 hex digest produced by toHash(title, price, address). + * @type {Set} + */ +let cache = new Set(); + +// Periodically refresh the cache from storage +setInterval(() => { + initSimilarityCache(); +}, reloadCycle); + +/** + * Initialize or refresh the similarity cache from persistent storage. + * + * Reads all stored listings via getAllEntriesFromListings(), computes a hash for + * each, and swaps the in-memory Set atomically to avoid in-place mutations that + * could interfere with concurrent iteration. + * + * This function is idempotent and safe to call at any time. + * @returns {void} + */ +export const initSimilarityCache = () => { + const allEntries = getAllEntriesFromListings(); + const newCache = new Set(); + for (const entry of allEntries) { + newCache.add(toHash(entry?.title, entry?.price, entry?.address)); + } + // Atomic swap to avoid mutating the cache while it may be iterated elsewhere + cache = newCache; +}; + +/** + * Check if a listing is already known and add it to the cache if not. + * + * The listing is identified by the combination of its title, price and + * address. Null/undefined fields are ignored during hashing. Falsy-but-valid + * values (e.g., price 0) are preserved. + * + * @param {Object} params - Listing fields + * @param {string|undefined|null} params.title - The listing title + * @param {string|undefined|null} params.address - The listing address + * @param {number|string|undefined|null} params.price - The listing price + * @returns {boolean} true if the entry already existed in the cache (duplicate), otherwise false + */ +export const checkAndAddEntry = ({ title, address, price }) => { + const hash = toHash(title, price, address); + if (cache.has(hash)) { + return true; + } + cache.add(hash); + return false; +}; + +/** + * Generate an SHA-256 hash from a list of input values. + * Null or undefined values are ignored. Falsy but valid values like 0 are preserved. + * Non-string values are coerced to strings prior to hashing. + * + * @param {...(string|number|null|undefined)} strings - Input values to hash * @returns {string} Hexadecimal hash */ function toHash(...strings) { - return crypto.createHash('sha256').update(strings.filter(Boolean).join('|')).digest('hex'); -} - -/** - * Cleanup expired cache entries and schedule the next cleanup run. - * This function is invoked automatically by scheduled timers. - * - * @private - */ -function runCleanup() { - const now = Date.now(); - for (const [hash, expiry] of entries) { - if (expiry <= now) entries.delete(hash); - } - scheduleNext(); -} - -/** - * Find the soonest expiry timestamp among all cache entries - * and schedule a one-shot timer that will trigger at that time. - * Cancels any existing timer before scheduling a new one. - * - * @private - */ -function scheduleNext() { - if (timer) { - clearTimeout(timer); - timer = null; - } - let next = Infinity; - const now = Date.now(); - for (const expiry of entries.values()) { - if (expiry > now && expiry < next) next = expiry; - } - if (next !== Infinity) { - timer = setTimeout(runCleanup, Math.max(0, next - now)); - } -} - -/** - * Add or refresh a cache entry for the given title and address. - * The entry will automatically expire after the configured retention window. - * - * @param {string} title - The title used to build the cache key - * @param {string} address - The address used to build the cache key - */ -export function addCacheEntry(title, address) { - const hash = toHash(title, address); - const expiry = Date.now() + retention; - entries.set(hash, expiry); - scheduleNext(); -} - -/** - * Check if a cache entry with the same title and address exists - * and is still valid (not expired). - * - * @param {string} title - The title used to build the cache key - * @param {string} address - The address used to build the cache key - * @returns {boolean} True if a valid cache entry exists, false otherwise - */ -export function hasSimilarEntries(title, address) { - const hash = toHash(title, address); - const expiry = entries.get(hash); - if (expiry == null) return false; - if (expiry <= Date.now()) { - entries.delete(hash); - scheduleNext(); - return false; - } - return true; -} - -/** - * Stop any scheduled cleanup timers and prevent further automatic cleanup. - * Entries that are already in the cache will remain until removed manually - * or until cleanup is started again by adding new entries. - */ -export function stopCacheCleanup() { - if (timer) clearTimeout(timer); - timer = null; -} - -/** - * this is only for test purposes - */ -export function invalidateAllForTest() { - for (const key of entries.keys()) { - entries.set(key, 0); - } - runCleanup(); + const normalized = strings + .filter((v) => v !== null && v !== undefined) + .map((v) => (typeof v === 'string' ? v : String(v))); + return crypto.createHash('sha256').update(normalized.join('|')).digest('hex'); } diff --git a/lib/services/storage/listingsStorage.js b/lib/services/storage/listingsStorage.js index 5a74bbe..94e96f8 100755 --- a/lib/services/storage/listingsStorage.js +++ b/lib/services/storage/listingsStorage.js @@ -310,8 +310,8 @@ export const deleteListingsByJobId = (jobId) => { if (!jobId) return; return SqliteConnection.execute( `DELETE - FROM listings - WHERE job_id = @jobId`, + FROM listings + WHERE job_id = @jobId`, { jobId }, ); }; @@ -332,3 +332,13 @@ export const deleteListingsById = (ids) => { ids, ); }; + +/** + * Return all listings with only the fields: title, address, and price. + * This is the single helper requested for simple consumers. + * + * @returns {{title: string|null, address: string|null, price: number|null}[]} + */ +export const getAllEntriesFromListings = () => { + return SqliteConnection.query(`SELECT title, address, price FROM listings`); +}; diff --git a/package.json b/package.json index 5f0a706..248ab6f 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "14.2.2", + "version": "14.3.0", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "prepare": "husky", diff --git a/test/FredyPipeline/FredyPipeline.test.js b/test/FredyPipeline/FredyPipeline.test.js deleted file mode 100644 index 500b7e4..0000000 --- a/test/FredyPipeline/FredyPipeline.test.js +++ /dev/null @@ -1,53 +0,0 @@ -import { expect } from 'chai'; -import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js'; -import { mockFredy } from '../utils.js'; - -describe('FredyPipeline', () => { - afterEach(() => { - similarityCache.invalidateAllForTest(); - }); - - after(() => { - similarityCache.stopCacheCleanup(); - }); - - describe('_filterBySimilarListings', () => { - let fredyRuntime; - - beforeEach(async () => { - const FredyRuntime = await mockFredy(); - fredyRuntime = new FredyRuntime({}, null, 'dummy-provider', 'dummy-job', similarityCache); - }); - - it('should filter out listings with similar title and address already in cache', () => { - similarityCache.addCacheEntry('Penthouse', 'Mustermann Straße 1'); - - const listings = [ - { id: '1', title: 'Penthouse', address: 'Mustermann Straße 1' }, - { id: '2', title: 'Nice apartment', address: 'Mustermann Straße 15' }, - ]; - - const result = fredyRuntime._filterBySimilarListings(listings); - - expect(result).to.have.length(1); - expect(result[0].id).to.equal('2'); - expect(result[0].title).to.equal('Nice apartment'); - - expect(similarityCache.hasSimilarEntries('Nice apartment', 'Mustermann Straße 15')).to.be.true; - }); - - it('should handle listings with null or undefined address', () => { - const listings = [ - { id: '1', title: 'Penthouse', address: null }, - { id: '2', title: 'Nice apartment', address: undefined }, - ]; - - const result = fredyRuntime._filterBySimilarListings(listings); - - expect(result).to.have.length(2); - - expect(similarityCache.hasSimilarEntries('Penthouse', null)).to.be.true; - expect(similarityCache.hasSimilarEntries('Nice apartment', undefined)).to.be.true; - }); - }); -}); diff --git a/test/provider/einsAImmobilien.test.js b/test/provider/einsAImmobilien.test.js index 4efba2a..ad6933a 100644 --- a/test/provider/einsAImmobilien.test.js +++ b/test/provider/einsAImmobilien.test.js @@ -5,9 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/einsAImmobilien.js'; describe('#einsAImmobilien testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); provider.init(providerConfig.einsAImmobilien, [], []); it('should test einsAImmobilien provider', async () => { const Fredy = await mockFredy(); diff --git a/test/provider/immobilienDe.test.js b/test/provider/immobilienDe.test.js index 6383bb6..887a06a 100644 --- a/test/provider/immobilienDe.test.js +++ b/test/provider/immobilienDe.test.js @@ -5,9 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/immobilienDe.js'; describe('#immobilien.de testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); provider.init(providerConfig.immobilienDe, [], []); it('should test immobilien.de provider', async () => { const Fredy = await mockFredy(); diff --git a/test/provider/immonet.test.js b/test/provider/immonet.test.js index b730c56..5350360 100644 --- a/test/provider/immonet.test.js +++ b/test/provider/immonet.test.js @@ -5,10 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/immonet.js'; describe('#immonet testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); - it('should test immonet provider', async () => { const Fredy = await mockFredy(); provider.init(providerConfig.immonet, [], []); diff --git a/test/provider/immoscout.test.js b/test/provider/immoscout.test.js index ec6911d..322ebc3 100644 --- a/test/provider/immoscout.test.js +++ b/test/provider/immoscout.test.js @@ -5,10 +5,6 @@ import { get } from '../mocks/mockNotification.js'; import * as provider from '../../lib/provider/immoscout.js'; describe('#immoscout provider testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); - provider.init(providerConfig.immoscout, [], []); it('should test immoscout provider', async () => { const Fredy = await mockFredy(); diff --git a/test/provider/immoswp.test.js b/test/provider/immoswp.test.js index 603bba9..f0f987c 100644 --- a/test/provider/immoswp.test.js +++ b/test/provider/immoswp.test.js @@ -5,9 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/immoswp.js'; describe('#immoswp testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); provider.init(providerConfig.immoswp, [], []); it('should test immoswp provider', async () => { const Fredy = await mockFredy(); diff --git a/test/provider/immowelt.test.js b/test/provider/immowelt.test.js index fb81f24..2285b71 100644 --- a/test/provider/immowelt.test.js +++ b/test/provider/immowelt.test.js @@ -5,10 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/immowelt.js'; describe('#immowelt testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); - it('should test immowelt provider', async () => { const Fredy = await mockFredy(); provider.init(providerConfig.immowelt, [], []); diff --git a/test/provider/kleinanzeigen.test.js b/test/provider/kleinanzeigen.test.js index b573361..c228790 100644 --- a/test/provider/kleinanzeigen.test.js +++ b/test/provider/kleinanzeigen.test.js @@ -5,9 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/kleinanzeigen.js'; describe('#kleinanzeigen testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); it('should test kleinanzeigen provider', async () => { const Fredy = await mockFredy(); provider.init(providerConfig.kleinanzeigen, [], []); diff --git a/test/provider/mcMakler.test.js b/test/provider/mcMakler.test.js index 2dbfa31..b48c40d 100644 --- a/test/provider/mcMakler.test.js +++ b/test/provider/mcMakler.test.js @@ -5,10 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/mcMakler.js'; describe('#mcMakler testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); - it('should test mcMakler provider', async () => { const Fredy = await mockFredy(); provider.init(providerConfig.mcMakler, []); diff --git a/test/provider/neubauKompass.test.js b/test/provider/neubauKompass.test.js index 4682bd0..5e5a445 100644 --- a/test/provider/neubauKompass.test.js +++ b/test/provider/neubauKompass.test.js @@ -5,9 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/neubauKompass.js'; describe('#neubauKompass testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); provider.init(providerConfig.neubauKompass, [], []); it('should test neubauKompass provider', async () => { const Fredy = await mockFredy(); diff --git a/test/provider/regionalimmobilien24.test.js b/test/provider/regionalimmobilien24.test.js index 19cbe57..6dafb08 100644 --- a/test/provider/regionalimmobilien24.test.js +++ b/test/provider/regionalimmobilien24.test.js @@ -5,10 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/regionalimmobilien24.js'; describe('#regionalimmobilien24 testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); - it('should test regionalimmobilien24 provider', async () => { const Fredy = await mockFredy(); provider.init(providerConfig.regionalimmobilien24, []); diff --git a/test/provider/sparkasse.test.js b/test/provider/sparkasse.test.js index ac65e4f..0b16c3c 100644 --- a/test/provider/sparkasse.test.js +++ b/test/provider/sparkasse.test.js @@ -5,10 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/sparkasse.js'; describe('#sparkasse testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); - it('should test sparkasse provider', async () => { const Fredy = await mockFredy(); provider.init(providerConfig.sparkasse, []); diff --git a/test/provider/wgGesucht.test.js b/test/provider/wgGesucht.test.js index 116a34f..4bde924 100644 --- a/test/provider/wgGesucht.test.js +++ b/test/provider/wgGesucht.test.js @@ -5,9 +5,6 @@ import { expect } from 'chai'; import * as provider from '../../lib/provider/wgGesucht.js'; describe('#wgGesucht testsuite()', () => { - after(() => { - similarityCache.stopCacheCleanup(); - }); provider.init(providerConfig.wgGesucht, [], []); it('should test wgGesucht provider', async () => { const Fredy = await mockFredy(); diff --git a/test/similarity/similarity.test.js b/test/similarity/similarity.test.js deleted file mode 100644 index 418e43e..0000000 --- a/test/similarity/similarity.test.js +++ /dev/null @@ -1,30 +0,0 @@ -import { expect } from 'chai'; -import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js'; - -describe('similarityCheck', () => { - it('should return true on duplicate', () => { - similarityCache.addCacheEntry('Hello World', 'Test'); - expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.true; - }); - - it('should return true even if one value is null', () => { - similarityCache.addCacheEntry('Hello World', null); - expect(similarityCache.hasSimilarEntries('Hello World', null)).to.be.true; - }); - - it('should return true even if one value is an obj', () => { - similarityCache.addCacheEntry('Hello World', [{ TR: 'OLOLO' }]); - expect(similarityCache.hasSimilarEntries('Hello World', [{ TR: 'OLOLO' }])).to.be.true; - }); - - it('should return false when no duplicate', () => { - similarityCache.addCacheEntry('Hello World__', 'Test'); - expect(similarityCache.hasSimilarEntries('Hello World___', 'Test')).to.be.false; - }); - - it('should return false when no duplicate', () => { - expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.true; - similarityCache.invalidateAllForTest(); - expect(similarityCache.hasSimilarEntries('Hello World', 'Test')).to.be.false; - }); -}); diff --git a/test/similarity/similarityCache.test.js b/test/similarity/similarityCache.test.js new file mode 100644 index 0000000..100ea1c --- /dev/null +++ b/test/similarity/similarityCache.test.js @@ -0,0 +1,62 @@ +import { expect } from 'chai'; +import esmock from 'esmock'; + +// Helper to create module under test with mocks +async function loadModuleWith({ entries = [] } = {}) { + const mod = await esmock('../../lib/services/similarity-check/similarityCache.js', { + // Mock the storage to return our controlled entries + '../../lib/services/storage/listingsStorage.js': { + getAllEntriesFromListings: () => entries, + }, + }); + return mod; +} + +describe('similarityCache', () => { + it('initSimilarityCache builds cache from storage and enables duplicate detection', async () => { + const entries = [ + { title: 'A', price: 1000, address: 'Main 1' }, + { title: 'B', price: 0, address: 'Zero St' }, + ]; + + const { initSimilarityCache, checkAndAddEntry } = await loadModuleWith({ entries }); + + // Initially, duplicates should not be detected for new data + expect(checkAndAddEntry({ title: 'X', price: 200, address: 'Y' })).to.equal(false); + + // Now initialize from storage + initSimilarityCache(); + + // Exact duplicates should be detected + expect(checkAndAddEntry({ title: 'A', price: 1000, address: 'Main 1' })).to.equal(true); + // Ensure falsy-but-valid price 0 is preserved by hashing and detected as duplicate + expect(checkAndAddEntry({ title: 'B', price: 0, address: 'Zero St' })).to.equal(true); + }); + + it('checkAndAddEntry returns false for new entry then true for duplicate on second call', async () => { + const { checkAndAddEntry } = await loadModuleWith(); + + const first = checkAndAddEntry({ title: 'C', price: 300, address: 'Road 3' }); + const second = checkAndAddEntry({ title: 'C', price: 300, address: 'Road 3' }); + + expect(first).to.equal(false); + expect(second).to.equal(true); + }); + + it('hashing ignores null/undefined but preserves 0 via behavior', async () => { + const { checkAndAddEntry } = await loadModuleWith(); + + // Add baseline (null address ignored) + const add1 = checkAndAddEntry({ title: 'T', price: 1, address: null }); + expect(add1).to.equal(false); + // Duplicate with undefined address should match + const dup = checkAndAddEntry({ title: 'T', price: 1, address: undefined }); + expect(dup).to.equal(true); + + // Now test that price 0 is preserved (not filtered out) + const addZero = checkAndAddEntry({ title: 'Z', price: 0, address: 'Zero' }); + expect(addZero).to.equal(false); + const dupZero = checkAndAddEntry({ title: 'Z', price: 0, address: 'Zero' }); + expect(dupZero).to.equal(true); + }); +});