mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Check if a listing is still active (#184)
* check if a listing is still active * upgrade dependencies
This commit is contained in:
committed by
GitHub
parent
28eddc5d7f
commit
c839f3abc9
@@ -1,7 +1,7 @@
|
||||
import { removeJobsByUserId } from './storage/jobStorage.js';
|
||||
import { config } from '../utils.js';
|
||||
import { getUsers } from './storage/userStorage.js';
|
||||
import logger from './logger.js';
|
||||
import { removeJobsByUserId } from '../storage/jobStorage.js';
|
||||
import { config } from '../../utils.js';
|
||||
import { getUsers } from '../storage/userStorage.js';
|
||||
import logger from '../logger.js';
|
||||
import cron from 'node-cron';
|
||||
|
||||
/**
|
||||
13
lib/services/crons/listing-alive-cron.js
Normal file
13
lib/services/crons/listing-alive-cron.js
Normal file
@@ -0,0 +1,13 @@
|
||||
import cron from 'node-cron';
|
||||
import runActiveChecker from '../listings/listingActiveService.js';
|
||||
|
||||
async function runTask() {
|
||||
await runActiveChecker();
|
||||
}
|
||||
|
||||
export async function initActiveCheckerCron() {
|
||||
//run directly on start
|
||||
await runTask();
|
||||
// then every day at 1 am
|
||||
cron.schedule('0 1 * * *', runTask);
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
import cron from 'node-cron';
|
||||
import { config, inDevMode } from '../../utils.js';
|
||||
import { trackMainEvent } from './Tracker.js';
|
||||
import { trackMainEvent } from '../tracking/Tracker.js';
|
||||
|
||||
async function runTask() {
|
||||
//make sure to only send tracking events if the user gave us the green light and we are not in dev mode
|
||||
@@ -60,6 +60,7 @@ https://api.mobile.immobilienscout24.de/search/map/v3?publishedafter=2025-05-14T
|
||||
https://api.mobile.immobilienscout24.de/search/map/v3?features=disableNHBGrouping,nextGen,fairPrice,listingsInListFirstSummary,xxlListingType,contactDetails&publishedafter=2025-05-14T09:19:43&sorting=standard&pagesize=300&searchType=shape&realEstateType=housebuy&pagenumber=1&shape=%7D%7BjwHy%7Cqh@jCKdCgAvB_BdB%7DBzAaCjAqCfAqC~@uCt@iCh@eCZkCLyC?_EO%7DEa@%7DEa@iE_@%7BD%5DaDe@gDi@gDo@uCu@kBcB_AeDOiE?iDCgCMuBOkDCkG?yFRgD%60@cB%5C%7BA%60@eBx@aB%7C@kAbAy@rAe@bBUxCAhE?dFh@fGlAzGbBbHlBxGdB%60FrAhDz@xBh@nAf@l@RNNXkCkMJR~B%7CEnCpErCnDtClCvC~ApCh@rCJpC?
|
||||
*/
|
||||
import queryString from 'query-string';
|
||||
import { nullOrEmpty } from '../../utils.js';
|
||||
|
||||
const PARAM_NAME_MAP = {
|
||||
heatingtypes: 'heatingtypes',
|
||||
@@ -193,3 +194,14 @@ export function convertWebToMobile(webUrl) {
|
||||
|
||||
return `https://api.mobile.immobilienscout24.de/search/list?${mobileQuery}`;
|
||||
}
|
||||
|
||||
export function convertImmoscoutListingToMobileListing(url) {
|
||||
if (nullOrEmpty(url)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return url.replace(
|
||||
/^https:\/\/www\.immobilienscout24\.de\/expose\//,
|
||||
'https://api.mobile.immobilienscout24.de/expose/',
|
||||
);
|
||||
}
|
||||
|
||||
104
lib/services/listings/listingActiveService.js
Normal file
104
lib/services/listings/listingActiveService.js
Normal file
@@ -0,0 +1,104 @@
|
||||
import { deactivateListings, getActiveOrUnknownListings } from '../storage/listingsStorage.js';
|
||||
import { getProviders } from '../../utils.js';
|
||||
import logger from '../../services/logger.js';
|
||||
|
||||
/**
|
||||
* Runs the active-listing checker:
|
||||
* 1) Loads all listings with unknown or active status.
|
||||
* 2) Resolves each listing's provider and calls its `activeTester(link)`.
|
||||
* 3) Collects listings that are no longer active and deactivates them in one batch.
|
||||
*
|
||||
* Concurrency: network-bound checks are executed with a configurable concurrency limit.
|
||||
*
|
||||
* @param {object} [opts]
|
||||
* @param {number} [opts.concurrency=8] Max number of parallel activeTester calls.
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
export default async function runActiveChecker(opts = {}) {
|
||||
const { concurrency = 4 } = opts;
|
||||
|
||||
const listings = getActiveOrUnknownListings();
|
||||
if (!Array.isArray(listings) || listings.length === 0) {
|
||||
logger.debug('No listings to check.');
|
||||
return;
|
||||
}
|
||||
|
||||
const providers = await getProviders();
|
||||
if (!Array.isArray(providers) || providers.length === 0) {
|
||||
logger.warn('No providers available. Skipping active checks.');
|
||||
return;
|
||||
}
|
||||
|
||||
// Build a map for O(1) provider lookup by id
|
||||
/** @type {Record<string, any>} */
|
||||
const providerById = Object.create(null);
|
||||
for (const p of providers) {
|
||||
const id = p?.metaInformation?.id;
|
||||
if (id) providerById[id] = p;
|
||||
}
|
||||
|
||||
// Small generic mapLimit to cap concurrency without extra deps
|
||||
/**
|
||||
* @template T, R
|
||||
* @param {T[]} items
|
||||
* @param {number} limit
|
||||
* @param {(item: T, index: number) => Promise<R>} worker
|
||||
* @returns {Promise<R[]>}
|
||||
*/
|
||||
async function mapLimit(items, limit, worker) {
|
||||
const results = new Array(items.length);
|
||||
let next = 0;
|
||||
|
||||
async function runOne() {
|
||||
while (next < items.length) {
|
||||
const i = next++;
|
||||
try {
|
||||
results[i] = await worker(items[i], i);
|
||||
} catch (err) {
|
||||
results[i] = /** @type {any} */ (err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const runners = Array.from({ length: Math.min(limit, items.length) }, runOne);
|
||||
await Promise.all(runners);
|
||||
return results;
|
||||
}
|
||||
|
||||
/** @type {string[]} */
|
||||
const listingsSetToInactive = [];
|
||||
|
||||
await mapLimit(listings, concurrency, async (listing) => {
|
||||
const { provider: listingProviderId, link, id } = listing || {};
|
||||
|
||||
const matchedProvider = providerById[listingProviderId];
|
||||
if (!matchedProvider) {
|
||||
logger.warn('Could not find matching provider for', listingProviderId);
|
||||
return;
|
||||
}
|
||||
const tester = matchedProvider?.config?.activeTester;
|
||||
if (typeof tester !== 'function') {
|
||||
logger.warn('No activeTester configured for', listingProviderId);
|
||||
return;
|
||||
}
|
||||
|
||||
// Contract: activeTester(link) returns 1 if active, 0 if inactive
|
||||
let result;
|
||||
try {
|
||||
result = await tester(link);
|
||||
} catch {
|
||||
result = -1;
|
||||
}
|
||||
|
||||
if (result === 0 && id) {
|
||||
listingsSetToInactive.push(id);
|
||||
}
|
||||
});
|
||||
|
||||
if (listingsSetToInactive.length > 0) {
|
||||
logger.info(`Setting ${listingsSetToInactive.length} listings to inactive.`);
|
||||
deactivateListings(listingsSetToInactive);
|
||||
} else {
|
||||
logger.debug('No listings need to be set inactive.');
|
||||
}
|
||||
}
|
||||
68
lib/services/listings/listingActiveTester.js
Normal file
68
lib/services/listings/listingActiveTester.js
Normal file
@@ -0,0 +1,68 @@
|
||||
import fetch from 'node-fetch';
|
||||
import { randomBetween, sleep } from '../../utils.js';
|
||||
|
||||
const maxAttempts = 3;
|
||||
|
||||
/**
|
||||
* Check if a listing is still active with up to 3 attempts and exponential backoff.
|
||||
* Backoff waits are capped and the last wait is at most 2000 ms.
|
||||
*
|
||||
* Rules:
|
||||
* - HTTP 200 => return 1
|
||||
* - HTTP 401/403 => return -1 (most certainly detected as a bot)
|
||||
* - HTTP 404 => return 0
|
||||
* - Other statuses or network errors => retry until attempts are exhausted
|
||||
*
|
||||
* @returns {Promise<Integer>} 1 if active, o if not active and -1 if detected as bot
|
||||
*/
|
||||
export default async function checkIfListingIsActive(link) {
|
||||
await sleep(randomBetween(50, 100));
|
||||
|
||||
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
||||
try {
|
||||
const res = await fetch(link, {
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
|
||||
'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8',
|
||||
},
|
||||
});
|
||||
|
||||
if (res.status === 200) {
|
||||
return 1;
|
||||
}
|
||||
if (res.status === 401) return -1;
|
||||
if (res.status === 403) return -1;
|
||||
if (res.status === 404) return 0;
|
||||
|
||||
// For any other status, only retry if attempts remain
|
||||
if (attempt < maxAttempts) {
|
||||
await sleep(backoffDelay(attempt));
|
||||
continue;
|
||||
}
|
||||
|
||||
return 0;
|
||||
} catch {
|
||||
// Network error: retry if attempts remain
|
||||
if (attempt < maxAttempts) {
|
||||
await sleep(backoffDelay(attempt));
|
||||
continue;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Exponential backoff delay with cap.
|
||||
* attempt: 1 -> 500ms, 2 -> 1000ms, 3 -> 2000ms (cap)
|
||||
* @param {number} attempt 1-based attempt index
|
||||
* @returns {number} delay in ms
|
||||
*/
|
||||
function backoffDelay(attempt) {
|
||||
const base = 500;
|
||||
const cap = 2000;
|
||||
return Math.min(base * 2 ** (attempt - 1), cap);
|
||||
}
|
||||
@@ -53,6 +53,36 @@ export const getKnownListingHashesForJobAndProvider = (jobId, providerId) => {
|
||||
).map((r) => r.hash);
|
||||
};
|
||||
|
||||
/**
|
||||
* Return a list of listing that either are active or have an unknown status
|
||||
* to constantly check if they are still online
|
||||
*
|
||||
* @returns {string[]} Array of listings
|
||||
*/
|
||||
export const getActiveOrUnknownListings = () => {
|
||||
return SqliteConnection.query(
|
||||
`SELECT *
|
||||
FROM listings
|
||||
WHERE is_active is null OR is_active = 1 ORDER BY provider`,
|
||||
);
|
||||
};
|
||||
|
||||
/**
|
||||
* Deactivates listings by setting is_active = 0 for all matching IDs.
|
||||
*
|
||||
* @param {string[]} ids - Array of listing IDs to deactivate.
|
||||
* @returns {object[]} Result of the SQLite query execution.
|
||||
*/
|
||||
export const deactivateListings = (ids) => {
|
||||
const placeholders = ids.map(() => '?').join(',');
|
||||
return SqliteConnection.execute(
|
||||
`UPDATE listings
|
||||
SET is_active = 0
|
||||
WHERE id IN (${placeholders})`,
|
||||
ids,
|
||||
);
|
||||
};
|
||||
|
||||
/**
|
||||
* Persist a batch of scraped listings for a given job and provider.
|
||||
*
|
||||
@@ -86,9 +116,9 @@ export const storeListings = (jobId, providerId, listings) => {
|
||||
SqliteConnection.withTransaction((db) => {
|
||||
const stmt = db.prepare(
|
||||
`INSERT INTO listings (id, hash, provider, job_id, price, size, title, image_url, description, address,
|
||||
link, created_at)
|
||||
link, created_at, is_active)
|
||||
VALUES (@id, @hash, @provider, @job_id, @price, @size, @title, @image_url, @description, @address, @link,
|
||||
@created_at)
|
||||
@created_at, 1)
|
||||
ON CONFLICT(job_id, hash) DO NOTHING`,
|
||||
);
|
||||
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
// Migration: there needs to be a unique index on job_id and hash as only
|
||||
// this makes the listing indeed unique
|
||||
|
||||
export function up(db) {
|
||||
db.exec(`
|
||||
ALTER TABLE listings ADD COLUMN is_active INTEGER DEFAULT 1;
|
||||
`);
|
||||
}
|
||||
Reference in New Issue
Block a user