mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
139 lines
4.5 KiB
JavaScript
Executable File
139 lines
4.5 KiB
JavaScript
Executable File
import { nullOrEmpty } from '../../utils.js';
|
|
import SqliteConnection from './SqliteConnection.js';
|
|
import { nanoid } from 'nanoid';
|
|
|
|
/**
|
|
* Build analytics data for a given job by grouping all listings by provider and
|
|
* mapping each listing hash to its creation timestamp.
|
|
*
|
|
* SQL shape:
|
|
* SELECT json_group_object(provider, json_object(hash, created_at)) AS result
|
|
* FROM listings WHERE job_id = @jobId;
|
|
*
|
|
* The resulting object has the shape:
|
|
* {
|
|
* providerA: { "<hash1>": <created_at_ms>, "<hash2>": <created_at_ms>, ... },
|
|
* providerB: { ... }
|
|
* }
|
|
*
|
|
* @param {string} jobId - ID of the job whose listings should be aggregated.
|
|
* @returns {Record<string, Record<string, number>>} Object grouped by provider mapping listing-hash -> created_at epoch ms.
|
|
*/
|
|
export const getListingProviderDataForAnalytics = (jobId) => {
|
|
const row = SqliteConnection.query(
|
|
`SELECT COALESCE(
|
|
json_group_object(provider, json(provider_map)),
|
|
json('{}')
|
|
) AS result
|
|
FROM (SELECT provider,
|
|
json_group_object(hash, created_at) AS provider_map
|
|
FROM listings
|
|
WHERE job_id = @jobId
|
|
GROUP BY provider);`,
|
|
{ jobId },
|
|
);
|
|
|
|
return row?.length > 0 ? JSON.parse(row[0].result) : {};
|
|
};
|
|
|
|
/**
|
|
* Return a list of known listing hashes for a given job and provider.
|
|
* Useful to de-duplicate before inserting new listings.
|
|
*
|
|
* @param {string} jobId - The job identifier.
|
|
* @param {string} providerId - The provider identifier (e.g., 'immoscout').
|
|
* @returns {string[]} Array of listing hashes.
|
|
*/
|
|
export const getKnownListingHashesForJobAndProvider = (jobId, providerId) => {
|
|
return SqliteConnection.query(
|
|
`SELECT hash
|
|
FROM listings
|
|
WHERE job_id = @jobId AND provider = @providerId`,
|
|
{ jobId, providerId },
|
|
).map((r) => r.hash);
|
|
};
|
|
|
|
/**
|
|
* Persist a batch of scraped listings for a given job and provider.
|
|
*
|
|
* - Empty or non-array inputs are ignored.
|
|
* - Each listing is inserted with ON CONFLICT(hash) DO NOTHING to avoid duplicates.
|
|
* - Performs inserts in a single transaction for performance.
|
|
*
|
|
* Listing input shape (minimal expected):
|
|
* {
|
|
* id: string, // unique id
|
|
* hash: string // stable hash/id of the listing (used as unique hash)
|
|
* price?: string, // e.g., "1.234 €" or "1,234€"
|
|
* size?: string, // e.g., "70 m²"
|
|
* title?: string,
|
|
* image?: string, // image URL
|
|
* description?: string,
|
|
* address?: string, // free-text address possibly containing parentheses
|
|
* link?: string
|
|
* }
|
|
*
|
|
* @param {string} jobId - The job identifier.
|
|
* @param {string} providerId - The provider identifier.
|
|
* @param {Array<Object>} listings - Array of listing objects as described above.
|
|
* @returns {void}
|
|
*/
|
|
export const storeListings = (jobId, providerId, listings) => {
|
|
if (!Array.isArray(listings) || listings.length === 0) {
|
|
return;
|
|
}
|
|
|
|
SqliteConnection.withTransaction((db) => {
|
|
const stmt = db.prepare(
|
|
`INSERT INTO listings (id, hash, provider, job_id, price, size, title, image_url, description, address,
|
|
link, created_at)
|
|
VALUES (@id, @hash, @provider, @job_id, @price, @size, @title, @image_url, @description, @address, @link,
|
|
@created_at)
|
|
ON CONFLICT(job_id, hash) DO NOTHING`,
|
|
);
|
|
|
|
for (const item of listings) {
|
|
const params = {
|
|
id: nanoid(),
|
|
hash: item.id,
|
|
provider: providerId,
|
|
job_id: jobId,
|
|
price: extractNumber(item.price),
|
|
size: extractNumber(item.size),
|
|
title: item.title,
|
|
image_url: item.image,
|
|
description: item.description,
|
|
address: removeParentheses(item.address),
|
|
link: item.link,
|
|
created_at: Date.now(),
|
|
};
|
|
stmt.run(params);
|
|
}
|
|
});
|
|
|
|
/**
|
|
* Extract the first number from a string like "1.234 €" or "70 m²".
|
|
* Removes dots/commas before parsing. Returns null on invalid input.
|
|
* @param {string|undefined|null} str
|
|
* @returns {number|null}
|
|
*/
|
|
function extractNumber(str) {
|
|
if (!str) return null;
|
|
const match = str.replace(/[.,]/g, '').match(/\d+/);
|
|
return match ? +match[0] : null;
|
|
}
|
|
|
|
/**
|
|
* Remove any parentheses segments (including surrounding whitespace) from a string.
|
|
* Returns null for empty input.
|
|
* @param {string|undefined|null} str
|
|
* @returns {string|null}
|
|
*/
|
|
function removeParentheses(str) {
|
|
if (nullOrEmpty(str)) {
|
|
return null;
|
|
}
|
|
return str.replace(/\s*\([^)]*\)/g, '');
|
|
}
|
|
};
|