mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Feature/spec filter (#276)
* feat(): create map component, add area filtering to the job config * feat(): filter listings by area filter * chore(): cleanup * feat(): solve feedback * feat(): solve most providers * feat(): solve maybe other providers * feat(): add specFilter config, also add rooms to listing * feat(): change tests * feat(): fix kleinanzeigen parser * feat(): add spec filter switch for listing overviews * feat(): add rooms and size to the overview and detail of a listing * feat(): rem label * feat(): add types, update providers, they now return specs as numbers * feat(): add jsonconfig to enable type checks * feat: add type for providerConfig, add fieldNames per provider * feat: fix tests, provider, add formatListing * chore: remove duplicates * feat(): fix tests * feat: fix immoscout * chore: geojson typing * feat: solve requested changes
This commit is contained in:
@@ -19,22 +19,14 @@ import { distanceMeters } from './services/listings/distanceCalculator.js';
|
||||
import { getUserSettings } from './services/storage/settingsStorage.js';
|
||||
import { updateListingDistance } from './services/storage/listingsStorage.js';
|
||||
import booleanPointInPolygon from '@turf/boolean-point-in-polygon';
|
||||
import { formatListing } from './utils/formatListing.js';
|
||||
|
||||
/**
|
||||
* @typedef {Object} Listing
|
||||
* @property {string} id Stable unique identifier (hash) of the listing.
|
||||
* @property {string} title Title or headline of the listing.
|
||||
* @property {string} [address] Optional address/location text.
|
||||
* @property {string} [price] Optional price text/value.
|
||||
* @property {string} [url] Link to the listing detail page.
|
||||
* @property {any} [meta] Provider-specific additional metadata.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} SimilarityCache
|
||||
* @property {(title:string, address?:string)=>boolean} hasSimilarEntries Returns true if a similar entry is known.
|
||||
* @property {(title:string, address?:string)=>void} addCacheEntry Adds a new entry to the similarity cache.
|
||||
*/
|
||||
/** @import { ParsedListing } from './types/listing.js' */
|
||||
/** @import { Job } from './types/job.js' */
|
||||
/** @import { ProviderConfig } from './types/providerConfig.js' */
|
||||
/** @import { SpecFilter, SpatialFilter } from './types/filter.js' */
|
||||
/** @import { SimilarityCache } from './types/similarityCache.js' */
|
||||
/** @import { Browser } from './types/browser.js' */
|
||||
|
||||
/**
|
||||
* Runtime orchestrator for fetching, normalizing, filtering, deduplicating, storing,
|
||||
@@ -48,43 +40,43 @@ import booleanPointInPolygon from '@turf/boolean-point-in-polygon';
|
||||
* 5) Identify new listings (vs. previously stored hashes)
|
||||
* 6) Persist new listings
|
||||
* 7) Filter out entries similar to already seen ones
|
||||
* 8) Dispatch notifications
|
||||
* 8) Filter out entries that do not match the job's specFilter
|
||||
* 9) Filter out entries that do not match the job's spatialFilter
|
||||
* 10) Dispatch notifications
|
||||
*/
|
||||
class FredyPipelineExecutioner {
|
||||
/**
|
||||
* Create a new runtime instance for a single provider/job execution.
|
||||
*
|
||||
* @param {Object} providerConfig Provider configuration.
|
||||
* @param {string} providerConfig.url Base URL to crawl.
|
||||
* @param {string} [providerConfig.sortByDateParam] Query parameter used to enforce sorting by date (provider-specific).
|
||||
* @param {string} [providerConfig.waitForSelector] CSS selector to wait for before parsing content.
|
||||
* @param {Object.<string, string>} providerConfig.crawlFields Mapping of field names to selectors/paths to extract.
|
||||
* @param {string} providerConfig.crawlContainer CSS selector for the container holding listing items.
|
||||
* @param {(raw:any)=>Listing} providerConfig.normalize Function to convert raw scraped data into a Listing shape.
|
||||
* @param {(listing:Listing)=>boolean} providerConfig.filter Function to filter out unwanted listings.
|
||||
* @param {(url:string, waitForSelector?:string)=>Promise<void>|Promise<Listing[]>} [providerConfig.getListings] Optional override to fetch listings.
|
||||
* @param {(listing:Listing, browser:any)=>Promise<Listing>} [providerConfig.fetchDetails] Optional per-listing detail enrichment. Called in parallel for each new listing after deduplication. Receives the shared browser instance. Must always resolve (never reject).
|
||||
* @param {Object} notificationConfig Notification configuration passed to notification adapters.
|
||||
* @param {Object} spatialFilter Optional spatial filter configuration.
|
||||
* @param {ProviderConfig} providerConfig Provider configuration.
|
||||
* @param {Job} job Job configuration.
|
||||
* @param {string} providerId The ID of the provider currently in use.
|
||||
* @param {string} jobKey Key of the job that is currently running (from within the config).
|
||||
* @param {SimilarityCache} similarityCache Cache instance for checking similar entries.
|
||||
* @param browser
|
||||
* @param {Browser} browser Puppeteer browser instance.
|
||||
*/
|
||||
constructor(providerConfig, notificationConfig, spatialFilter, providerId, jobKey, similarityCache, browser) {
|
||||
constructor(providerConfig, job, providerId, similarityCache, browser) {
|
||||
/** @type {ProviderConfig} */
|
||||
this._providerConfig = providerConfig;
|
||||
this._notificationConfig = notificationConfig;
|
||||
this._spatialFilter = spatialFilter;
|
||||
/** @type {Object} */
|
||||
this._jobNotificationConfig = job.notificationAdapter;
|
||||
/** @type {string} */
|
||||
this._jobKey = job.id;
|
||||
/** @type {SpecFilter | null} */
|
||||
this._jobSpecFilter = job.specFilter;
|
||||
/** @type {SpatialFilter | null} */
|
||||
this._jobSpatialFilter = job.spatialFilter;
|
||||
/** @type {string} */
|
||||
this._providerId = providerId;
|
||||
this._jobKey = jobKey;
|
||||
/** @type {SimilarityCache} */
|
||||
this._similarityCache = similarityCache;
|
||||
/** @type {Browser} */
|
||||
this._browser = browser;
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute the end-to-end pipeline for a single provider run.
|
||||
*
|
||||
* @returns {Promise<Listing[]|void>} Resolves to the list of new (and similarity-filtered) listings
|
||||
* @returns {Promise<ParsedListing[]|void>} Resolves to the list of new (and similarity-filtered) listings
|
||||
* after notifications have been sent; resolves to void when there are no new listings.
|
||||
*/
|
||||
execute() {
|
||||
@@ -98,6 +90,7 @@ class FredyPipelineExecutioner {
|
||||
.then(this._save.bind(this))
|
||||
.then(this._calculateDistance.bind(this))
|
||||
.then(this._filterBySimilarListings.bind(this))
|
||||
.then(this._filterBySpecs.bind(this))
|
||||
.then(this._filterByArea.bind(this))
|
||||
.then(this._notify.bind(this))
|
||||
.catch(this._handleError.bind(this));
|
||||
@@ -132,8 +125,8 @@ class FredyPipelineExecutioner {
|
||||
/**
|
||||
* Geocode new listings.
|
||||
*
|
||||
* @param {Listing[]} newListings New listings to geocode.
|
||||
* @returns {Promise<Listing[]>} Resolves with the listings (potentially with added coordinates).
|
||||
* @param {ParsedListing[]} newListings New listings to geocode.
|
||||
* @returns {Promise<ParsedListing[]>} Resolves with the listings (potentially with added coordinates).
|
||||
*/
|
||||
async _geocode(newListings) {
|
||||
for (const listing of newListings) {
|
||||
@@ -152,18 +145,18 @@ class FredyPipelineExecutioner {
|
||||
* Filter listings by area using the job's spatial filter if available.
|
||||
* Only filters if the job's spatialFilter contains Polygon features AND the listing has coordinates.
|
||||
*
|
||||
* @param {Listing[]} newListings New listings to filter by area.
|
||||
* @returns {Promise<Listing[]>} Resolves with listings that are within the area (or not filtered if no area is set).
|
||||
* @param {ParsedListing[]} newListings New listings to filter by area.
|
||||
* @returns {ParsedListing[]} Resolves with listings that are within the area (or not filtered if no area is set).
|
||||
*/
|
||||
_filterByArea(newListings) {
|
||||
const polygonFeatures = this._spatialFilter?.features?.filter((f) => f.geometry?.type === 'Polygon');
|
||||
const polygonFeatures = this._jobSpatialFilter?.features?.filter((f) => f.geometry?.type === 'Polygon');
|
||||
|
||||
// If no area filter is set, return all listings
|
||||
if (!polygonFeatures?.length) {
|
||||
return newListings;
|
||||
}
|
||||
|
||||
const filteredIds = [];
|
||||
const toDeleteListingByIds = [];
|
||||
// Filter listings by area - keep only those within the polygon
|
||||
const keptListings = newListings.filter((listing) => {
|
||||
// If listing doesn't have coordinates, keep it (don't filter out)
|
||||
@@ -176,14 +169,48 @@ class FredyPipelineExecutioner {
|
||||
const isInPolygon = polygonFeatures.some((feature) => booleanPointInPolygon(point, feature));
|
||||
|
||||
if (!isInPolygon) {
|
||||
filteredIds.push(listing.id);
|
||||
toDeleteListingByIds.push(listing.id);
|
||||
}
|
||||
|
||||
return isInPolygon;
|
||||
});
|
||||
|
||||
if (filteredIds.length > 0) {
|
||||
deleteListingsById(filteredIds);
|
||||
if (toDeleteListingByIds.length > 0) {
|
||||
deleteListingsById(toDeleteListingByIds);
|
||||
}
|
||||
|
||||
return keptListings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter listings based on the job's specFilter (minRooms, minSize, maxPrice). A listing that lacks a value for a given spec is not filtered out by that spec.
|
||||
*
|
||||
* @param {ParsedListing[]} newListings New listings to filter.
|
||||
* @returns {ParsedListing[]} Listings that pass the specification filters.
|
||||
*/
|
||||
_filterBySpecs(newListings) {
|
||||
const { minRooms, minSize, maxPrice } = this._jobSpecFilter || {};
|
||||
|
||||
// If no specs are set, return all listings
|
||||
if (!minRooms && !minSize && !maxPrice) {
|
||||
return newListings;
|
||||
}
|
||||
|
||||
const toDeleteListingByIds = [];
|
||||
const keptListings = newListings.filter((listing) => {
|
||||
const filterOut =
|
||||
(minRooms && listing.rooms && listing.rooms < minRooms) ||
|
||||
(minSize && listing.size && listing.size < minSize) ||
|
||||
(maxPrice && listing.price && listing.price > maxPrice);
|
||||
|
||||
if (filterOut) {
|
||||
toDeleteListingByIds.push(listing.id);
|
||||
}
|
||||
return !filterOut;
|
||||
});
|
||||
|
||||
if (toDeleteListingByIds.length > 0) {
|
||||
deleteListingsById(toDeleteListingByIds);
|
||||
}
|
||||
|
||||
return keptListings;
|
||||
@@ -194,7 +221,7 @@ class FredyPipelineExecutioner {
|
||||
* a provider-specific getListings override is supplied.
|
||||
*
|
||||
* @param {string} url The provider URL to fetch from.
|
||||
* @returns {Promise<Listing[]>} Resolves with an array of listings (empty when none found).
|
||||
* @returns {Promise<ParsedListing[]>} Resolves with an array of listings (empty when none found).
|
||||
*/
|
||||
_getListings(url) {
|
||||
const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser });
|
||||
@@ -217,33 +244,42 @@ class FredyPipelineExecutioner {
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize raw listings into the provider-specific Listing shape.
|
||||
* Normalize raw listings into the provider-specific ParsedListing shape.
|
||||
*
|
||||
* @param {any[]} listings Raw listing entries from the extractor or override.
|
||||
* @returns {Listing[]} Normalized listings.
|
||||
* @returns {ParsedListing[]} Normalized listings.
|
||||
*/
|
||||
_normalize(listings) {
|
||||
return listings.map(this._providerConfig.normalize);
|
||||
return listings.map((listing) => this._providerConfig.normalize(listing));
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter out listings that are missing required fields and those rejected by the
|
||||
* provider's blacklist/filter function.
|
||||
*
|
||||
* @param {Listing[]} listings Listings to filter.
|
||||
* @returns {Listing[]} Filtered listings that pass validation and provider filter.
|
||||
* @param {ParsedListing[]} listings Listings to filter.
|
||||
* @returns {ParsedListing[]} Filtered listings that pass validation and provider filter.
|
||||
*/
|
||||
_filter(listings) {
|
||||
const keys = Object.keys(this._providerConfig.crawlFields);
|
||||
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
|
||||
return filteredListings.filter(this._providerConfig.filter);
|
||||
const requiredKeys = this._providerConfig.fieldNames;
|
||||
const requireValues = ['id', 'link', 'title'];
|
||||
|
||||
const filteredListings = listings
|
||||
// This should never filter any listings out, because the normalize function should always extract all fields.
|
||||
.filter((item) => requiredKeys.every((key) => key in item))
|
||||
// TODO: move blacklist filter to this file, so it will handle for all providers in same way.
|
||||
.filter(this._providerConfig.filter)
|
||||
// filter out listings that are missing required fields
|
||||
.filter((item) => requireValues.every((key) => item[key] != null));
|
||||
|
||||
return filteredListings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine which listings are new by comparing their IDs against stored hashes.
|
||||
*
|
||||
* @param {Listing[]} listings Listings to evaluate for novelty.
|
||||
* @returns {Listing[]} New listings not seen before.
|
||||
* @param {ParsedListing[]} listings Listings to evaluate for novelty.
|
||||
* @returns {ParsedListing[]} New listings not seen before.
|
||||
* @throws {NoNewListingsWarning} When no new listings are found.
|
||||
*/
|
||||
_findNew(listings) {
|
||||
@@ -260,23 +296,30 @@ class FredyPipelineExecutioner {
|
||||
/**
|
||||
* Send notifications for new listings using the configured notification adapter(s).
|
||||
*
|
||||
* @param {Listing[]} newListings New listings to notify about.
|
||||
* @returns {Promise<Listing[]>} Resolves to the provided listings after notifications complete.
|
||||
* @param {ParsedListing[]} newListings New listings to notify about.
|
||||
* @returns {Promise<ParsedListing[]>} Resolves to the provided listings after notifications complete.
|
||||
* @throws {NoNewListingsWarning} When there are no listings to notify about.
|
||||
*/
|
||||
_notify(newListings) {
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
|
||||
// TODO: move this to the notification adapter, so it will handle for all providers in same way.
|
||||
const formattedListings = newListings.map(formatListing);
|
||||
const sendNotifications = notify.send(
|
||||
this._providerId,
|
||||
formattedListings,
|
||||
this._jobNotificationConfig,
|
||||
this._jobKey,
|
||||
);
|
||||
return Promise.all(sendNotifications).then(() => newListings);
|
||||
}
|
||||
|
||||
/**
|
||||
* Persist new listings and pass them through.
|
||||
*
|
||||
* @param {Listing[]} newListings Listings to store.
|
||||
* @returns {Listing[]} The same listings, unchanged.
|
||||
* @param {ParsedListing[]} newListings Listings to store.
|
||||
* @returns {ParsedListing[]} The same listings, unchanged.
|
||||
*/
|
||||
_save(newListings) {
|
||||
logger.debug(`Storing ${newListings.length} new listings (Provider: '${this._providerId}')`);
|
||||
@@ -287,8 +330,8 @@ class FredyPipelineExecutioner {
|
||||
/**
|
||||
* Calculate distance for new listings.
|
||||
*
|
||||
* @param {Listing[]} listings
|
||||
* @returns {Listing[]}
|
||||
* @param {ParsedListing[]} listings
|
||||
* @returns {ParsedListing[]}
|
||||
* @private
|
||||
*/
|
||||
_calculateDistance(listings) {
|
||||
@@ -324,8 +367,8 @@ class FredyPipelineExecutioner {
|
||||
* Remove listings that are similar to already known entries according to the similarity cache.
|
||||
* Adds the remaining listings to the cache.
|
||||
*
|
||||
* @param {Listing[]} listings Listings to filter by similarity.
|
||||
* @returns {Listing[]} Listings considered unique enough to keep.
|
||||
* @param {ParsedListing[]} listings Listings to filter by similarity.
|
||||
* @returns {ParsedListing[]} Listings considered unique enough to keep.
|
||||
*/
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredIds = [];
|
||||
|
||||
@@ -172,6 +172,7 @@ jobRouter.post('/', async (req, res) => {
|
||||
enabled,
|
||||
shareWithUsers = [],
|
||||
spatialFilter = null,
|
||||
specFilter = null,
|
||||
} = req.body;
|
||||
const settings = await getSettings();
|
||||
try {
|
||||
@@ -197,6 +198,7 @@ jobRouter.post('/', async (req, res) => {
|
||||
notificationAdapter,
|
||||
shareWithUsers,
|
||||
spatialFilter,
|
||||
specFilter,
|
||||
});
|
||||
} catch (error) {
|
||||
res.send(new Error(error));
|
||||
|
||||
@@ -5,8 +5,16 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const baseUrl = 'https://www.1a-immobilienmarkt.de';
|
||||
const link = `${baseUrl}/expose/${o.id}.html`;
|
||||
@@ -14,7 +22,17 @@ function normalize(o) {
|
||||
const id = buildHash(o.id, price);
|
||||
const image = baseUrl + o.image;
|
||||
const address = o.address == null ? null : o.address.trim().replaceAll('/', ',');
|
||||
return Object.assign(o, { id, price, link, image, address });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image,
|
||||
description: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -34,13 +52,19 @@ function normalizePrice(price) {
|
||||
}
|
||||
return result[0];
|
||||
}
|
||||
/**
* Decide whether a listing passes the blacklist check.
|
||||
* @param {ParsedListing} o Listing whose title and description are checked.
|
||||
* @returns {boolean} True only when isOneOf reports no match against appliedBlackList for both the title and the description.
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.tabelle',
|
||||
sortByDateParam: 'sort_type=newest',
|
||||
@@ -48,7 +72,8 @@ const config = {
|
||||
crawlFields: {
|
||||
id: '.inner_object_data input[name="marker_objekt_id"]@value | int',
|
||||
price: '.inner_object_data .single_data_price | removeNewline | trim',
|
||||
size: '.tabelle .tabelle_inhalt_infos .single_data_box | removeNewline | trim',
|
||||
size: '.tabelle .tabelle_inhalt_infos .single_data_box:nth-of-type(1) | removeNewline | trim',
|
||||
rooms: '.tabelle .tabelle_inhalt_infos .single_data_box:nth-of-type(2) | removeNewline | trim',
|
||||
title: '.inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
|
||||
image: '.inner_object_pic img@src',
|
||||
address: '.tabelle .tabelle_inhalt_infos .left_information > div:nth-child(2) | removeNewline | trim',
|
||||
|
||||
@@ -5,9 +5,12 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -65,27 +68,44 @@ async function fetchDetails(listing, browser) {
|
||||
return listing;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const baseUrl = 'https://www.immobilien.de';
|
||||
const size = o.size || null;
|
||||
const price = o.price || null;
|
||||
const title = o.title || 'No title available';
|
||||
const title = o.title || '';
|
||||
const address = o.address || null;
|
||||
const shortLink = shortenLink(o.link);
|
||||
const link = shortLink ? (shortLink.startsWith('http') ? shortLink : baseUrl + shortLink) : baseUrl;
|
||||
const image = o.image ? (o.image.startsWith('http') ? o.image : baseUrl + o.image) : null;
|
||||
const id = buildHash(parseId(shortLink), o.price);
|
||||
return Object.assign(o, { id, price, size, title, address, link, image });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title,
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
* Decide whether a listing passes the blacklist check.
|
||||
* @param {ParsedListing} o Listing whose title and description are checked.
|
||||
* @returns {boolean} True only when isOneOf reports no match against appliedBlackList for both the title and the description.
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'a.lr-card',
|
||||
sortByDateParam: 'sort_col=*created_ts&sort_dir=desc',
|
||||
@@ -94,6 +114,7 @@ const config = {
|
||||
id: '@href', //will be transformed later
|
||||
price: '.lr-card__price-amount | trim',
|
||||
size: '.lr-card__fact:has(.lr-card__fact-label:contains("Fläche")) .lr-card__fact-value | trim',
|
||||
rooms: '.zimmer .label_info',
|
||||
title: '.lr-card__title | trim',
|
||||
description: '.description | trim',
|
||||
link: '@href',
|
||||
|
||||
@@ -46,6 +46,10 @@ import {
|
||||
convertWebToMobile,
|
||||
} from '../services/immoscout/immoscout-web-translator.js';
|
||||
import logger from '../services/logger.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
async function getListings(url) {
|
||||
@@ -168,22 +172,44 @@ async function isListingActive(link) {
|
||||
function nullOrEmpty(val) {
|
||||
// Loose `== null` matches both null and undefined; the length check catches values with zero length (e.g. '' or []).
return val == null || val.length === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const title = nullOrEmpty(o.title) ? 'NO TITLE FOUND' : o.title.replace('NEU', '');
|
||||
const title = (o.title || '').replace('NEU', '').trim();
|
||||
const address = nullOrEmpty(o.address) ? 'NO ADDRESS FOUND' : (o.address || '').replace(/\(.*\),.*$/, '').trim();
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, { id, title, address });
|
||||
return {
|
||||
id,
|
||||
link: o.link,
|
||||
title,
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
/**
* Decide whether a listing passes the blacklist check. Only the title is checked here; the description is not consulted.
|
||||
* @param {ParsedListing} o Listing whose title is checked.
|
||||
* @returns {boolean} True when isOneOf reports no match between the title and appliedBlackList.
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
return !isOneOf(o.title, appliedBlackList);
|
||||
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlFields: {
|
||||
id: 'id',
|
||||
title: 'title',
|
||||
price: 'price',
|
||||
size: 'size',
|
||||
rooms: 'rooms',
|
||||
link: 'link',
|
||||
address: 'address',
|
||||
},
|
||||
|
||||
@@ -5,27 +5,46 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const size = o.size || 'N/A m²';
|
||||
const price = (o.price || '--- €').replace('Preis auf Anfrage', '--- €');
|
||||
const title = o.title || 'No title available';
|
||||
const immoId = o.id.substring(o.id.indexOf('-') + 1, o.id.length);
|
||||
const link = `https://immo.swp.de/immobilien/${immoId}`;
|
||||
const description = o.description;
|
||||
const id = buildHash(immoId, price);
|
||||
return Object.assign(o, { id, price, size, title, link, description });
|
||||
const id = buildHash(immoId, o.price);
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
* Decide whether a listing passes the blacklist check.
|
||||
* @param {ParsedListing} o Listing whose title and description are checked.
|
||||
* @returns {boolean} True only when isOneOf reports no match against appliedBlackList for both the title and the description.
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.js-serp-item',
|
||||
sortByDateParam: 's=most_recently_updated_first',
|
||||
@@ -34,9 +53,10 @@ const config = {
|
||||
id: '.js-bookmark-btn@data-id',
|
||||
price: 'div.align-items-start div:first-child | trim',
|
||||
size: 'div.align-items-start div:nth-child(3) | trim',
|
||||
rooms: 'div.align-items-start div:nth-child(2) | trim',
|
||||
address: '.js-bookmark-btn@data-address',
|
||||
title: '.js-item-title-link@title | trim',
|
||||
link: '.ci-search-result__link@href',
|
||||
description: '.js-show-more-item-sm | removeNewline | trim',
|
||||
image: 'img@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
|
||||
@@ -5,9 +5,12 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -48,18 +51,38 @@ async function fetchDetails(listing, browser) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, { id });
|
||||
return {
|
||||
id,
|
||||
link: o.link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
* Decide whether a listing passes the blacklist check.
|
||||
* @param {ParsedListing} o Listing whose title and description are checked.
|
||||
* @returns {boolean} True only when isOneOf reports no match against appliedBlackList for both the title and the description.
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer:
|
||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
||||
@@ -68,7 +91,8 @@ const config = {
|
||||
crawlFields: {
|
||||
id: 'a@href',
|
||||
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] | removeNewline | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] div:nth-of-type(3) | removeNewline | trim',
|
||||
rooms: 'div[data-testid="cardmfe-keyfacts-testid"] div:nth-of-type(1) | removeNewline | trim',
|
||||
title: 'div[data-testid="cardmfe-description-box-text-test-id"] > div:nth-of-type(2)',
|
||||
link: 'a@href',
|
||||
description: 'div[data-testid="cardmfe-description-text-test-id"] > div:nth-of-type(2) | removeNewline | trim',
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import logger from '../services/logger.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
@@ -146,13 +149,33 @@ async function fetchDetails(listing, browser) {
|
||||
return enrichListingFromDetails(listing, browser);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const size = o.size || '--- m²';
|
||||
const parts = (o.tags || '').split('·').map((p) => p.trim());
|
||||
const size = parts.find((p) => p.includes('m²'));
|
||||
const rooms = parts.find((p) => p.includes('Zi.'));
|
||||
const id = buildHash(o.id, o.price);
|
||||
const link = toAbsoluteLink(o.link) || o.link;
|
||||
return Object.assign(o, { id, size, link });
|
||||
|
||||
return {
|
||||
id,
|
||||
title: o.title,
|
||||
link: toAbsoluteLink(o.link) || o.link,
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(size),
|
||||
rooms: extractNumber(rooms),
|
||||
address: o.address,
|
||||
description: o.description,
|
||||
image: o.image,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
@@ -161,16 +184,18 @@ function applyBlacklist(o) {
|
||||
return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '#srchrslt-adtable .ad-listitem ',
|
||||
//sort by date is standard oO
|
||||
sortByDateParam: null,
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.aditem@data-adid | int',
|
||||
id: '.aditem@data-adid',
|
||||
price: '.aditem-main--middle--price-shipping--price | removeNewline | trim',
|
||||
size: '.aditem-main .text-module-end | removeNewline | trim',
|
||||
tags: '.aditem-main--middle--tags | removeNewline | trim',
|
||||
title: '.aditem-main .text-module-begin a | removeNewline | trim',
|
||||
link: '.aditem-main .text-module-begin a@href | removeNewline | trim',
|
||||
description: '.aditem-main .aditem-main--middle--description | removeNewline | trim',
|
||||
|
||||
@@ -5,23 +5,46 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Converts one raw crawl result from mcmakler.de into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `id`, `price`, `tags`, `title`, `address`, `link` and `image` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  // Only the last "/"-separated segment of the raw id goes into the hash.
  const originalId = o.id.split('/').pop();
  const id = buildHash(originalId, o.price);
  // Relative detail link -> absolute URL; a missing link is passed through as is.
  const link = o.link != null ? `https://www.mcmakler.de${o.link}` : o.link;
  // `tags` is split as "<rooms> | <size>". A result without tags must not abort
  // normalization, so an empty string is split instead.
  const [rooms, size] = (o.tags ?? '').split(' | ');
  const address = o.address?.replace(' / ', ' ') || null;

  return {
    id,
    link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(size),
    rooms: extractNumber(rooms),
    address,
    image: o.image,
    description: undefined,
  };
}
|
||||
/**
 * Blacklist filter for normalized listings.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} true when `isOneOf` reports no hit in `appliedBlackList` for
 *   either the title or the description; false otherwise.
 */
function applyBlacklist(o) {
  // Both fields are always checked, title first.
  const titleHit = isOneOf(o.title, appliedBlackList);
  const descriptionHit = isOneOf(o.description, appliedBlackList);
  return !(titleHit || descriptionHit);
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'article[data-testid="propertyCard"]',
|
||||
sortByDateParam: 'sortBy=DATE&sortOn=DESC',
|
||||
@@ -30,7 +53,7 @@ const config = {
|
||||
id: 'h2 a@href',
|
||||
title: 'h2 a | removeNewline | trim',
|
||||
price: 'footer > p:first-of-type | trim',
|
||||
size: 'footer > p:nth-of-type(2) | trim',
|
||||
tags: 'footer > p:nth-of-type(2) | trim',
|
||||
address: 'div > h2 + p | removeNewline | trim',
|
||||
image: 'img@src',
|
||||
link: 'h2 a@href',
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -12,19 +15,39 @@ function nullOrEmpty(val) {
|
||||
return val == null || val.length === 0;
|
||||
}
|
||||
|
||||
/**
 * Converts one raw crawl result from neubaukompass.de into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `link`, `price`, `size`, `rooms`, `title`, `address`,
 *   `image` and `description` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  // Rebuild an absolute link from the "/neubau..." part of the crawled href;
  // results without any link get the 'NO LINK' placeholder.
  const link = nullOrEmpty(o.link)
    ? 'NO LINK'
    : `https://www.neubaukompass.de${o.link.substring(o.link.indexOf('/neubau'))}`;
  // The hash is built from the raw (unmodified) link and the raw price text.
  const id = buildHash(o.link, o.price);

  return {
    id,
    link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(o.size),
    rooms: extractNumber(o.rooms),
    address: o.address,
    image: o.image,
    description: o.description,
  };
}
|
||||
|
||||
/**
 * Blacklist filter for normalized listings; only the title is inspected.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} false when `isOneOf` reports a hit in `appliedBlackList` for the title, true otherwise.
 */
function applyBlacklist(o) {
  if (isOneOf(o.title, appliedBlackList)) {
    return false;
  }
  return true;
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.col-12.mb-4',
|
||||
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
|
||||
@@ -34,7 +57,9 @@ const config = {
|
||||
title: 'a@title | removeNewline | trim',
|
||||
link: 'a@href',
|
||||
address: '.nbk-project-card__description | removeNewline | trim',
|
||||
price: '.nbk-project-card__spec-item .nbk-project-card__spec-value | removeNewline | trim',
|
||||
price: '.nbk-project-card__spec-item:nth-child(1) .nbk-project-card__spec-value | removeNewline | trim',
|
||||
size: '.nbk-project-card__spec-item:nth-child(2) .nbk-project-card__spec-value | removeNewline | trim',
|
||||
rooms: '.nbk-project-card__spec-item:nth-child(3) .nbk-project-card__spec-value | removeNewline | trim',
|
||||
image: '.nbk-project-card__image@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
|
||||
@@ -5,19 +5,43 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Converts one raw crawl result into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `link`, `title`, `price`, `size`, `rooms`, `address`,
 *   `image` and `description` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  // The crawled link is appended to the provider's base URL.
  const link = `${metaInformation.baseUrl}${o.link}`;
  // The hash uses the raw title, the raw link and the raw price text.
  const id = buildHash(o.title, o.link, o.price);

  return {
    id,
    link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(o.size),
    rooms: extractNumber(o.rooms),
    address: o.address,
    image: o.image,
    description: o.description,
  };
}
|
||||
/**
 * Decides whether a normalized listing passes the blacklist.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} true only if `isOneOf` finds neither the title nor the description
 *   in `appliedBlackList`.
 */
function applyBlacklist(o) {
  // Run the check for title and description (in that order); both are always evaluated.
  const [titleHit, descriptionHit] = [o.title, o.description].map((text) => isOneOf(text, appliedBlackList));
  return !titleHit && !descriptionHit;
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'div[data-livecomponent-id*="search/property_list"] .grid > div',
|
||||
sortByDateParam: null,
|
||||
@@ -27,6 +51,7 @@ const config = {
|
||||
title: 'h4 | removeNewline | trim',
|
||||
price: '.text-xl | trim',
|
||||
size: 'div[title="Wohnfläche"] | trim',
|
||||
rooms: 'div[title="Zimmer"] | trim',
|
||||
address: '.text-slate-800 | removeNewline | trim',
|
||||
image: 'img@src',
|
||||
link: 'a@href',
|
||||
|
||||
@@ -5,24 +5,47 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Converts one raw crawl result into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `id`, `price`, `size`, `rooms`, `title`, `address`,
 *   `link`, `image` and `description` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  const id = buildHash(o.id, o.price);
  // Drop a leading "Adresse " label (any casing) from the address text.
  const address = o.address?.replace(/^adresse /i, '') ?? null;
  // The crawled link is URI-encoded; without a link fall back to the configured search URL.
  const link = o.link != null ? decodeURIComponent(o.link) : config.url;
  // The raw image value is expected to contain a CSS `url(...)`; take what is inside the
  // parentheses. A value without `url(...)` yields no image instead of throwing.
  const image = o.image?.match(/url\((.*?)\)/i)?.[1] ?? null;

  return {
    id,
    link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(o.size),
    rooms: extractNumber(o.rooms),
    address,
    image,
    description: o.description,
  };
}
|
||||
/**
 * Blacklist check applied to every normalized listing.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} false if `isOneOf` matches the title or the description against
 *   `appliedBlackList`, true if neither matches.
 */
function applyBlacklist(o) {
  // Title and description are both evaluated before the verdict is formed.
  const blacklistedTitle = isOneOf(o.title, appliedBlackList);
  const blacklistedDescription = isOneOf(o.description, appliedBlackList);
  if (blacklistedTitle) {
    return false;
  }
  return !blacklistedDescription;
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.listentry-content',
|
||||
sortByDateParam: null, // sort by date is standard
|
||||
@@ -32,6 +55,7 @@ const config = {
|
||||
title: 'h2 | trim',
|
||||
price: '.listentry-details-price .listentry-details-v | trim',
|
||||
size: '.listentry-details-size .listentry-details-v | trim',
|
||||
rooms: '.listentry-details-rooms .listentry-details-v | trim',
|
||||
address: '.listentry-adress | trim',
|
||||
image: '.listentry-img@style',
|
||||
link: '.shariff@data-url',
|
||||
|
||||
@@ -8,6 +8,9 @@ import checkIfListingIsActive from '../services/listings/listingActiveTester.js'
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -55,20 +58,39 @@ async function fetchDetails(listing, browser) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts one raw crawl result from immobilien.sparkasse.de into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `id`, `price`, `size`, `rooms`, `title`, `address`,
 *   `link`, `image` and `description` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  // Use the last path segment of the raw id, without its ".html" suffix, for the hash.
  const originalId = o.id.split('/').pop().replace('.html', '');
  const id = buildHash(originalId, o.price);
  // Relative detail link -> absolute URL; without a link fall back to the configured search URL.
  const link = o.link != null ? `https://immobilien.sparkasse.de${o.link}` : config.url;

  return {
    id,
    link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(o.size),
    rooms: extractNumber(o.rooms),
    address: o.address,
    image: o.image,
    description: o.description,
  };
}
|
||||
/**
 * Keeps a normalized listing only if it is not blacklisted.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} true when `isOneOf` reports no match in `appliedBlackList` for the
 *   title and none for the description.
 */
function applyBlacklist(o) {
  // Evaluate both checks up front (title, then description), then combine them.
  const titleMatches = isOneOf(o.title, appliedBlackList);
  const descriptionMatches = isOneOf(o.description, appliedBlackList);
  const anyMatch = titleMatches || descriptionMatches;
  return !anyMatch;
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'div[data-testid="estate-link"]',
|
||||
sortByDateParam: 'sortBy=date_desc',
|
||||
@@ -77,7 +99,8 @@ const config = {
|
||||
id: 'a@href',
|
||||
title: 'h3 | trim',
|
||||
price: '.estate-list-price | trim',
|
||||
size: '.estate-mainfact span | trim',
|
||||
size: '.estate-mainfact:nth-child(1) span | trim',
|
||||
rooms: '.estate-mainfact:nth-child(2) span | trim',
|
||||
address: 'h6 | trim',
|
||||
image: 'img@src',
|
||||
link: 'a@href',
|
||||
|
||||
@@ -5,9 +5,12 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -32,20 +35,39 @@ async function fetchDetails(listing, browser) {
|
||||
return listing;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts one raw crawl result from wg-gesucht.de into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `id`, `price`, `size`, `details`, `title`, `link`,
 *   `image` and `description` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  const id = buildHash(o.id, o.price);
  const link = `https://www.wg-gesucht.de${o.link}`;
  // Swap the first "small" in the image URL for "large".
  const image = o.image != null ? o.image.replace('small', 'large') : null;
  // `details` is split as "<rooms> | <city> | <road>".
  const [rooms, city, road] = o.details?.split(' | ') ?? [];
  // Join only the parts that are present, so missing details never produce
  // "undefined, undefined"; no parts at all means no address.
  const address = [city, road].filter(Boolean).join(', ') || null;

  return {
    id,
    link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(o.size),
    rooms: extractNumber(rooms),
    address,
    image,
    description: o.description,
  };
}
|
||||
|
||||
/**
 * Filter for normalized listings: requires an id and a clean blacklist check.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} true when `o.id` is neither null nor undefined and `isOneOf` reports
 *   no match in `appliedBlackList` for the title or the description.
 */
function applyBlacklist(o) {
  // The blacklist checks run first (title, then description), regardless of the id.
  const titleHit = isOneOf(o.title, appliedBlackList);
  const descriptionHit = isOneOf(o.description, appliedBlackList);
  if (o.id == null) {
    return false;
  }
  return !(titleHit || descriptionHit);
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '#main_column .wgg_card',
|
||||
@@ -56,10 +78,13 @@ const config = {
|
||||
details: '.row .noprint .col-xs-11 |removeNewline |trim',
|
||||
price: '.middle .col-xs-3 |removeNewline |trim',
|
||||
size: '.middle .text-right |removeNewline |trim',
|
||||
rooms: '.middle .text-right |removeNewline |trim',
|
||||
title: '.truncate_title a |removeNewline |trim',
|
||||
link: '.truncate_title a@href',
|
||||
image: '.img-responsive@src',
|
||||
description: '.row .noprint .col-xs-11 |removeNewline |trim',
|
||||
},
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
fetchDetails,
|
||||
|
||||
@@ -5,26 +5,45 @@
|
||||
|
||||
import * as utils from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Converts one raw crawl result into the ParsedListing shape.
 *
 * @param {any} o Raw crawl result; `link`, `title`, `price`, `size`, `rooms`, `image`
 *   and `description` are read.
 * @returns {ParsedListing} New object with price, size and rooms run through `extractNumber`.
 */
function normalize(o) {
  // The description is split at "-" as "<city> - <part>"; either piece may be missing.
  const [city, part] = (o.description || '').split('-').map((v) => v.trim());
  // Join only the pieces that are present, so a missing part or description does not
  // leave a dangling ", " in the address; no pieces at all means no address.
  const address = [part, city].filter(Boolean).join(', ') || null;

  return {
    // The last path segment of the link serves as the id.
    id: o.link.split('/').pop(),
    link: o.link,
    title: o.title || '',
    price: extractNumber(o.price),
    size: extractNumber(o.size),
    rooms: extractNumber(o.rooms),
    address,
    image: o.image,
    description: o.description,
  };
}
|
||||
|
||||
/**
 * Filter for normalized listings: requires id and title and a clean blacklist check.
 *
 * @param {ParsedListing} o Listing to check.
 * @returns {boolean} true when `o.id` and `o.title` are neither null nor undefined and
 *   `utils.isOneOf` reports no match in `appliedBlackList` for the title or the description.
 */
function applyBlacklist(o) {
  // The blacklist checks run first (title, then description), before any other condition.
  const titleHit = utils.isOneOf(o.title, appliedBlackList);
  const descriptionHit = utils.isOneOf(o.description, appliedBlackList);
  if (o.id == null || o.title == null) {
    return false;
  }
  if (titleHit || descriptionHit) {
    return false;
  }
  // NOTE(review): a string always starts with itself, so this is true for every string
  // link and only throws when `o.link` is missing. The comparison target was presumably
  // meant to be something else (a base URL?) - confirm the intent before changing it.
  return o.link.startsWith(o.link);
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
sortByDateParam: null,
|
||||
waitForSelector: 'body',
|
||||
@@ -37,7 +56,7 @@ const config = {
|
||||
size: 'dl:nth-of-type(3) dd | removeNewline | trim',
|
||||
description: 'div.before\\:icon-location_marker | trim',
|
||||
link: '@href',
|
||||
imageUrl: 'img@src',
|
||||
image: 'img@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
|
||||
@@ -178,15 +178,7 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
|
||||
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {});
|
||||
}
|
||||
|
||||
await new FredyPipelineExecutioner(
|
||||
matchedProvider.config,
|
||||
job.notificationAdapter,
|
||||
job.spatialFilter,
|
||||
prov.id,
|
||||
job.id,
|
||||
similarityCache,
|
||||
browser,
|
||||
).execute();
|
||||
await new FredyPipelineExecutioner(matchedProvider.config, job, prov.id, similarityCache, browser).execute();
|
||||
} catch (err) {
|
||||
logger.error(err);
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ export const upsertJob = ({
|
||||
userId,
|
||||
shareWithUsers = [],
|
||||
spatialFilter = null,
|
||||
specFilter = null,
|
||||
}) => {
|
||||
const id = jobId || nanoid();
|
||||
const existing = SqliteConnection.query(`SELECT id, user_id FROM jobs WHERE id = @id LIMIT 1`, { id })[0];
|
||||
@@ -44,7 +45,8 @@ export const upsertJob = ({
|
||||
provider = @provider,
|
||||
notification_adapter = @notification_adapter,
|
||||
shared_with_user = @shareWithUsers,
|
||||
spatial_filter = @spatialFilter
|
||||
spatial_filter = @spatialFilter,
|
||||
spec_filter = @specFilter
|
||||
WHERE id = @id`,
|
||||
{
|
||||
id,
|
||||
@@ -55,12 +57,13 @@ export const upsertJob = ({
|
||||
provider: toJson(provider ?? []),
|
||||
notification_adapter: toJson(notificationAdapter ?? []),
|
||||
spatialFilter: spatialFilter ? toJson(spatialFilter) : null,
|
||||
specFilter: specFilter ? toJson(specFilter) : null,
|
||||
},
|
||||
);
|
||||
} else {
|
||||
SqliteConnection.execute(
|
||||
`INSERT INTO jobs (id, user_id, enabled, name, blacklist, provider, notification_adapter, shared_with_user, spatial_filter)
|
||||
VALUES (@id, @user_id, @enabled, @name, @blacklist, @provider, @notification_adapter, @shareWithUsers, @spatialFilter)`,
|
||||
`INSERT INTO jobs (id, user_id, enabled, name, blacklist, provider, notification_adapter, shared_with_user, spatial_filter, spec_filter)
|
||||
VALUES (@id, @user_id, @enabled, @name, @blacklist, @provider, @notification_adapter, @shareWithUsers, @spatialFilter, @specFilter)`,
|
||||
{
|
||||
id,
|
||||
user_id: ownerId,
|
||||
@@ -71,6 +74,7 @@ export const upsertJob = ({
|
||||
shareWithUsers: toJson(shareWithUsers ?? []),
|
||||
notification_adapter: toJson(notificationAdapter ?? []),
|
||||
spatialFilter: spatialFilter ? toJson(spatialFilter) : null,
|
||||
specFilter: specFilter ? toJson(specFilter) : null,
|
||||
},
|
||||
);
|
||||
}
|
||||
@@ -92,6 +96,7 @@ export const getJob = (jobId) => {
|
||||
j.shared_with_user,
|
||||
j.notification_adapter AS notificationAdapter,
|
||||
j.spatial_filter AS spatialFilter,
|
||||
j.spec_filter AS specFilter,
|
||||
(SELECT COUNT(1) FROM listings l WHERE l.job_id = j.id AND l.is_active = 1 AND l.manually_deleted = 0) AS numberOfFoundListings
|
||||
FROM jobs j
|
||||
WHERE j.id = @id
|
||||
@@ -107,6 +112,7 @@ export const getJob = (jobId) => {
|
||||
shared_with_user: fromJson(row.shared_with_user, []),
|
||||
notificationAdapter: fromJson(row.notificationAdapter, []),
|
||||
spatialFilter: fromJson(row.spatialFilter, null),
|
||||
specFilter: fromJson(row.specFilter, null),
|
||||
};
|
||||
};
|
||||
|
||||
@@ -157,6 +163,7 @@ export const getJobs = () => {
|
||||
j.shared_with_user,
|
||||
j.notification_adapter AS notificationAdapter,
|
||||
j.spatial_filter AS spatialFilter,
|
||||
j.spec_filter AS specFilter,
|
||||
(SELECT COUNT(1) FROM listings l WHERE l.job_id = j.id AND l.is_active = 1 AND l.manually_deleted = 0) AS numberOfFoundListings
|
||||
FROM jobs j
|
||||
WHERE j.enabled = 1
|
||||
@@ -170,6 +177,7 @@ export const getJobs = () => {
|
||||
shared_with_user: fromJson(row.shared_with_user, []),
|
||||
notificationAdapter: fromJson(row.notificationAdapter, []),
|
||||
spatialFilter: fromJson(row.spatialFilter, null),
|
||||
specFilter: fromJson(row.specFilter, null),
|
||||
}));
|
||||
};
|
||||
|
||||
@@ -260,6 +268,7 @@ export const queryJobs = ({
|
||||
j.shared_with_user,
|
||||
j.notification_adapter AS notificationAdapter,
|
||||
j.spatial_filter AS spatialFilter,
|
||||
j.spec_filter AS specFilter,
|
||||
(SELECT COUNT(1) FROM listings l WHERE l.job_id = j.id AND l.is_active = 1 AND l.manually_deleted = 0) AS numberOfFoundListings
|
||||
FROM jobs j
|
||||
${whereSql}
|
||||
@@ -276,6 +285,7 @@ export const queryJobs = ({
|
||||
shared_with_user: fromJson(row.shared_with_user, []),
|
||||
notificationAdapter: fromJson(row.notificationAdapter, []),
|
||||
spatialFilter: fromJson(row.spatialFilter, null),
|
||||
specFilter: fromJson(row.specFilter, null),
|
||||
}));
|
||||
|
||||
return { totalNumber, page: safePage, result };
|
||||
|
||||
@@ -174,9 +174,9 @@ export const storeListings = (jobId, providerId, listings) => {
|
||||
|
||||
SqliteConnection.withTransaction((db) => {
|
||||
const stmt = db.prepare(
|
||||
`INSERT INTO listings (id, hash, provider, job_id, price, size, title, image_url, description, address,
|
||||
`INSERT INTO listings (id, hash, provider, job_id, price, size, rooms, title, image_url, description, address,
|
||||
link, created_at, is_active, latitude, longitude)
|
||||
VALUES (@id, @hash, @provider, @job_id, @price, @size, @title, @image_url, @description, @address, @link,
|
||||
VALUES (@id, @hash, @provider, @job_id, @price, @size, @rooms, @title, @image_url, @description, @address, @link,
|
||||
@created_at, 1, @latitude, @longitude)
|
||||
ON CONFLICT(job_id, hash) DO NOTHING`,
|
||||
);
|
||||
@@ -187,8 +187,9 @@ export const storeListings = (jobId, providerId, listings) => {
|
||||
hash: item.id,
|
||||
provider: providerId,
|
||||
job_id: jobId,
|
||||
price: extractNumber(item.price),
|
||||
size: extractNumber(item.size),
|
||||
price: item.price,
|
||||
size: item.size,
|
||||
rooms: item.rooms,
|
||||
title: item.title,
|
||||
image_url: item.image,
|
||||
description: item.description,
|
||||
@@ -202,19 +203,6 @@ export const storeListings = (jobId, providerId, listings) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
|
||||
* Extract the first number from a string like "1.234 €" or "70 m²".
|
||||
* Removes dots/commas before parsing. Returns null on invalid input.
|
||||
* @param {string|undefined|null} str
|
||||
* @returns {number|null}
|
||||
*/
|
||||
function extractNumber(str) {
|
||||
if (!str) return null;
|
||||
const cleaned = str.replace(/\./g, '').replace(',', '.');
|
||||
const num = parseFloat(cleaned);
|
||||
return isNaN(num) ? null : num;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove any parentheses segments (including surrounding whitespace) from a string.
|
||||
* Returns null for empty input.
|
||||
|
||||
10
lib/services/storage/migrations/sql/15.add-listing-specs.js
Normal file
10
lib/services/storage/migrations/sql/15.add-listing-specs.js
Normal file
@@ -0,0 +1,10 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
 * Migration step: adds the `spec_filter` column to the `jobs` table.
 * The column is declared as JSONB and defaults to NULL.
 *
 * @param {{ exec: (sql: string) => unknown }} db Handle whose `exec` method runs the statement.
 */
export function up(db) {
  const statement = `
ALTER TABLE jobs ADD COLUMN spec_filter JSONB DEFAULT NULL;
`;
  db.exec(statement);
}
|
||||
@@ -0,0 +1,10 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
 * Migration step: adds the `rooms` column (declared INTEGER) to the `listings` table.
 *
 * @param {{ exec: (sql: string) => unknown }} db Handle whose `exec` method runs the statement.
 */
export function up(db) {
  const statement = `
ALTER TABLE listings ADD COLUMN rooms INTEGER;
`;
  db.exec(statement);
}
|
||||
10
lib/types/browser.js
Normal file
10
lib/types/browser.js
Normal file
@@ -0,0 +1,10 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {import('puppeteer').Browser} Browser
|
||||
*/
|
||||
|
||||
export {};
|
||||
19
lib/types/filter.js
Normal file
19
lib/types/filter.js
Normal file
@@ -0,0 +1,19 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} SpecFilter
|
||||
* @property {number} [minRooms] Minimum number of rooms.
|
||||
* @property {number} [minSize] Minimum size in m².
|
||||
* @property {number} [maxPrice] Maximum price.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} SpatialFilter GeoJSON FeatureCollection.
|
||||
* @property {Array<Object>} [features] GeoJSON features for spatial filtering (typically Polygons).
|
||||
* @property {string} [type] Type 'FeatureCollection'.
|
||||
*/
|
||||
|
||||
export {};
|
||||
23
lib/types/job.js
Normal file
23
lib/types/job.js
Normal file
@@ -0,0 +1,23 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/** @import { SpecFilter, SpatialFilter } from './filter.js' */
|
||||
|
||||
/**
|
||||
* @typedef {Object} Job
|
||||
* @property {string} id Job ID.
|
||||
* @property {string} [userId] Owner user id.
|
||||
* @property {string} [name] Job display name.
|
||||
* @property {boolean} [enabled] Whether the job is enabled.
|
||||
* @property {Array<any>} [blacklist] Blacklist entries.
|
||||
* @property {Array<any>} [provider] Provider configuration list.
|
||||
* @property {Object} [notificationAdapter] Notification configuration.
|
||||
* @property {Array<string>} [shared_with_user] Users this job is shared with.
|
||||
* @property {SpatialFilter | null} [spatialFilter] Optional spatial filter configuration as GeoJSON FeatureCollection.
|
||||
* @property {SpecFilter | null} [specFilter] Optional listing specifications.
|
||||
* @property {number} [numberOfFoundListings] Count of active listings for this job.
|
||||
*/
|
||||
|
||||
export {};
|
||||
22
lib/types/listing.js
Normal file
22
lib/types/listing.js
Normal file
@@ -0,0 +1,22 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} ParsedListing
|
||||
* @property {string} id Stable unique identifier (hash) of the listing.
|
||||
* @property {string} link Link to the listing detail page.
|
||||
* @property {string} image Link to the listing image.
|
||||
* @property {string} title Title or headline of the listing.
|
||||
* @property {string} [description] Description of the listing.
|
||||
* @property {string} [address] Optional address/location text.
|
||||
* @property {number} [price] Optional price of the listing.
|
||||
* @property {number} [size] Optional size of the listing.
|
||||
* @property {number} [rooms] Optional number of rooms.
|
||||
* @property {number} [latitude] Optional latitude.
|
||||
* @property {number} [longitude] Optional longitude.
|
||||
* @property {number} [distance_to_destination] Optional distance to destination.
|
||||
*/
|
||||
|
||||
export {};
|
||||
25
lib/types/providerConfig.js
Normal file
25
lib/types/providerConfig.js
Normal file
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/** @import { ParsedListing } from './listing.js' */
|
||||
|
||||
/**
|
||||
* @typedef {Object} ProviderConfig
|
||||
* @property {string} [url] Base URL to crawl.
|
||||
* @property {string} [sortByDateParam] Query parameter used to enforce sorting by date.
|
||||
* @property {string} [waitForSelector] CSS selector to wait for before parsing content.
|
||||
* @property {Object.<string, string>} crawlFields Mapping of field names to selectors/paths.
|
||||
* @property {string[]} fieldNames List of field names that this provider supports.
|
||||
* @property {string} [crawlContainer] CSS selector for the container holding listing items.
|
||||
* @property {(raw: any) => ParsedListing} normalize Function to convert raw scraped data into a ParsedListing shape.
|
||||
* @property {(listing: ParsedListing) => boolean} filter Function to filter out unwanted listings.
|
||||
* @property {(url: string, waitForSelector?: string) => Promise<any[]>} [getListings] Optional override to fetch listings.
|
||||
* @property {(listing:ParsedListing, browser:any)=>Promise<ParsedListing>} [fetchDetails] Optional per-listing detail enrichment. Called in parallel for each new listing after deduplication. Receives the shared browser instance. Must always resolve (never reject).
|
||||
* @property {Object} [puppeteerOptions] Puppeteer specific options.
|
||||
* @property {boolean} [enabled] Whether the provider is enabled.
|
||||
* @property {(url: string) => Promise<number> | number} [activeTester] Function to check if a listing is still active.
|
||||
*/
|
||||
|
||||
export {};
|
||||
11
lib/types/similarityCache.js
Normal file
11
lib/types/similarityCache.js
Normal file
@@ -0,0 +1,11 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
|
||||
* @typedef {Object} SimilarityCache
|
||||
* @property {(params: { title?: string, address?: string, price?: number|string }) => boolean} checkAndAddEntry Checks if a listing is similar and adds it if not.
|
||||
*/
|
||||
|
||||
export {};
|
||||
18
lib/utils/extract-number.js
Normal file
18
lib/utils/extract-number.js
Normal file
@@ -0,0 +1,18 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/**
 * Parse the leading number of a German-formatted string such as "1.234 €" or "70 m²".
 *
 * Every "." is treated as a thousands separator and removed; the first "," is treated
 * as the decimal separator. The cleaned text is handed to `parseFloat`, so the number
 * has to stand at the start of the string (leading whitespace is fine).
 *
 * @param {string|number|undefined|null} str Raw value; numbers are passed through unchanged.
 * @returns {number|null} The parsed number, or null when the input is missing, is NaN,
 *   is neither a string nor a number, or does not start with a number.
 */
export const extractNumber = (str) => {
  if (str == null) return null;
  if (typeof str === 'number') return Number.isNaN(str) ? null : str;
  // Anything else (boolean, object, ...) has no `.replace`; treat it as invalid input
  // instead of throwing.
  if (typeof str !== 'string') return null;
  const cleaned = str.replace(/\./g, '').replace(',', '.');
  const num = Number.parseFloat(cleaned);
  return Number.isNaN(num) ? null : num;
};
|
||||
29
lib/utils/formatListing.js
Normal file
29
lib/utils/formatListing.js
Normal file
@@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
|
||||
/**
|
||||
* @typedef {Omit<import('../types/listing.js').ParsedListing, 'price' | 'size' | 'rooms'> & {
|
||||
* price: string | null,
|
||||
* size: string | null,
|
||||
* rooms: string | null,
|
||||
* }} FormattedListing
|
||||
*/
|
||||
|
||||
/**
 * Builds a display-ready copy of a listing: every property is carried over, but price,
 * size and rooms are turned into strings with a unit suffix ("€", "m²", "Zimmer").
 * A spec that is null or undefined becomes null.
 *
 * @param {import('../types/listing.js').ParsedListing} listing Listing with numeric specs.
 * @returns {FormattedListing} New object; the listing passed in is not modified.
 */
export const formatListing = (listing) => {
  // Appends the unit to a present value; absent values stay null.
  const withUnit = (value, unit) => (value == null ? null : `${value} ${unit}`);
  const { price, size, rooms } = listing;

  return {
    ...listing,
    price: withUnit(price, '€'),
    size: withUnit(size, 'm²'),
    rooms: withUnit(rooms, 'Zimmer'),
  };
};
|
||||
Reference in New Issue
Block a user