mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Feature/spec filter (#276)
* feat(): create map component, add area filtering to the job config * feat(): filter listings by area filter * chore(): cleanup * feat(): solve feedback * feat(): solve most providers * feat(): solve maybe other providers * feat(): add specFilter config, also add rooms to listing * feat(): change tests * feat(): fix kleinanzeigen parser * feat(): add spec filter switch for listing overviiews * feat(): add rooms and size to the overview and detail of a listing * feat(): rem label * feat(): add types, update providers, they now return specs as numbers * feat(): add jsonconfig to enable type checks * feat: add type for prividerConfig, add fieldNames per provider * feat: fix tests, provider, add formatListing * chore: remov duplicates * feat(): fix tests * feat: fix immoscout * chore: geojson typing * feat: solve requested changes
This commit is contained in:
@@ -5,8 +5,16 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const baseUrl = 'https://www.1a-immobilienmarkt.de';
|
||||
const link = `${baseUrl}/expose/${o.id}.html`;
|
||||
@@ -14,7 +22,17 @@ function normalize(o) {
|
||||
const id = buildHash(o.id, price);
|
||||
const image = baseUrl + o.image;
|
||||
const address = o.address == null ? null : o.address.trim().replaceAll('/', ',');
|
||||
return Object.assign(o, { id, price, link, image, address });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image,
|
||||
description: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -34,13 +52,19 @@ function normalizePrice(price) {
|
||||
}
|
||||
return result[0];
|
||||
}
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.tabelle',
|
||||
sortByDateParam: 'sort_type=newest',
|
||||
@@ -48,7 +72,8 @@ const config = {
|
||||
crawlFields: {
|
||||
id: '.inner_object_data input[name="marker_objekt_id"]@value | int',
|
||||
price: '.inner_object_data .single_data_price | removeNewline | trim',
|
||||
size: '.tabelle .tabelle_inhalt_infos .single_data_box | removeNewline | trim',
|
||||
size: '.tabelle .tabelle_inhalt_infos .single_data_box:nth-of-type(1) | removeNewline | trim',
|
||||
rooms: '.tabelle .tabelle_inhalt_infos .single_data_box:nth-of-type(2) | removeNewline | trim',
|
||||
title: '.inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
|
||||
image: '.inner_object_pic img@src',
|
||||
address: '.tabelle .tabelle_inhalt_infos .left_information > div:nth-child(2) | removeNewline | trim',
|
||||
|
||||
@@ -5,9 +5,12 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -65,27 +68,44 @@ async function fetchDetails(listing, browser) {
|
||||
return listing;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const baseUrl = 'https://www.immobilien.de';
|
||||
const size = o.size || null;
|
||||
const price = o.price || null;
|
||||
const title = o.title || 'No title available';
|
||||
const title = o.title || '';
|
||||
const address = o.address || null;
|
||||
const shortLink = shortenLink(o.link);
|
||||
const link = shortLink ? (shortLink.startsWith('http') ? shortLink : baseUrl + shortLink) : baseUrl;
|
||||
const image = o.image ? (o.image.startsWith('http') ? o.image : baseUrl + o.image) : null;
|
||||
const id = buildHash(parseId(shortLink), o.price);
|
||||
return Object.assign(o, { id, price, size, title, address, link, image });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title,
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'a.lr-card',
|
||||
sortByDateParam: 'sort_col=*created_ts&sort_dir=desc',
|
||||
@@ -94,6 +114,7 @@ const config = {
|
||||
id: '@href', //will be transformed later
|
||||
price: '.lr-card__price-amount | trim',
|
||||
size: '.lr-card__fact:has(.lr-card__fact-label:contains("Fläche")) .lr-card__fact-value | trim',
|
||||
rooms: '.zimmer .label_info',
|
||||
title: '.lr-card__title | trim',
|
||||
description: '.description | trim',
|
||||
link: '@href',
|
||||
|
||||
@@ -46,6 +46,10 @@ import {
|
||||
convertWebToMobile,
|
||||
} from '../services/immoscout/immoscout-web-translator.js';
|
||||
import logger from '../services/logger.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
async function getListings(url) {
|
||||
@@ -168,22 +172,44 @@ async function isListingActive(link) {
|
||||
function nullOrEmpty(val) {
|
||||
return val == null || val.length === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const title = nullOrEmpty(o.title) ? 'NO TITLE FOUND' : o.title.replace('NEU', '');
|
||||
const title = (o.title || '').replace('NEU', '').trim();
|
||||
const address = nullOrEmpty(o.address) ? 'NO ADDRESS FOUND' : (o.address || '').replace(/\(.*\),.*$/, '').trim();
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, { id, title, address });
|
||||
return {
|
||||
id,
|
||||
link: o.link,
|
||||
title,
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
return !isOneOf(o.title, appliedBlackList);
|
||||
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlFields: {
|
||||
id: 'id',
|
||||
title: 'title',
|
||||
price: 'price',
|
||||
size: 'size',
|
||||
rooms: 'rooms',
|
||||
link: 'link',
|
||||
address: 'address',
|
||||
},
|
||||
|
||||
@@ -5,27 +5,46 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const size = o.size || 'N/A m²';
|
||||
const price = (o.price || '--- €').replace('Preis auf Anfrage', '--- €');
|
||||
const title = o.title || 'No title available';
|
||||
const immoId = o.id.substring(o.id.indexOf('-') + 1, o.id.length);
|
||||
const link = `https://immo.swp.de/immobilien/${immoId}`;
|
||||
const description = o.description;
|
||||
const id = buildHash(immoId, price);
|
||||
return Object.assign(o, { id, price, size, title, link, description });
|
||||
const id = buildHash(immoId, o.price);
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.js-serp-item',
|
||||
sortByDateParam: 's=most_recently_updated_first',
|
||||
@@ -34,9 +53,10 @@ const config = {
|
||||
id: '.js-bookmark-btn@data-id',
|
||||
price: 'div.align-items-start div:first-child | trim',
|
||||
size: 'div.align-items-start div:nth-child(3) | trim',
|
||||
rooms: 'div.align-items-start div:nth-child(2) | trim',
|
||||
address: '.js-bookmark-btn@data-address',
|
||||
title: '.js-item-title-link@title | trim',
|
||||
link: '.ci-search-result__link@href',
|
||||
description: '.js-show-more-item-sm | removeNewline | trim',
|
||||
image: 'img@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
|
||||
@@ -5,9 +5,12 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -48,18 +51,38 @@ async function fetchDetails(listing, browser) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, { id });
|
||||
return {
|
||||
id,
|
||||
link: o.link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer:
|
||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
||||
@@ -68,7 +91,8 @@ const config = {
|
||||
crawlFields: {
|
||||
id: 'a@href',
|
||||
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] | removeNewline | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] div:nth-of-type(3) | removeNewline | trim',
|
||||
rooms: 'div[data-testid="cardmfe-keyfacts-testid"] div:nth-of-type(1) | removeNewline | trim',
|
||||
title: 'div[data-testid="cardmfe-description-box-text-test-id"] > div:nth-of-type(2)',
|
||||
link: 'a@href',
|
||||
description: 'div[data-testid="cardmfe-description-text-test-id"] > div:nth-of-type(2) | removeNewline | trim',
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import logger from '../services/logger.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
@@ -146,13 +149,33 @@ async function fetchDetails(listing, browser) {
|
||||
return enrichListingFromDetails(listing, browser);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const size = o.size || '--- m²';
|
||||
const parts = (o.tags || '').split('·').map((p) => p.trim());
|
||||
const size = parts.find((p) => p.includes('m²'));
|
||||
const rooms = parts.find((p) => p.includes('Zi.'));
|
||||
const id = buildHash(o.id, o.price);
|
||||
const link = toAbsoluteLink(o.link) || o.link;
|
||||
return Object.assign(o, { id, size, link });
|
||||
|
||||
return {
|
||||
id,
|
||||
title: o.title,
|
||||
link: toAbsoluteLink(o.link) || o.link,
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(size),
|
||||
rooms: extractNumber(rooms),
|
||||
address: o.address,
|
||||
description: o.description,
|
||||
image: o.image,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
@@ -161,16 +184,18 @@ function applyBlacklist(o) {
|
||||
return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '#srchrslt-adtable .ad-listitem ',
|
||||
//sort by date is standard oO
|
||||
sortByDateParam: null,
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.aditem@data-adid | int',
|
||||
id: '.aditem@data-adid',
|
||||
price: '.aditem-main--middle--price-shipping--price | removeNewline | trim',
|
||||
size: '.aditem-main .text-module-end | removeNewline | trim',
|
||||
tags: '.aditem-main--middle--tags | removeNewline | trim',
|
||||
title: '.aditem-main .text-module-begin a | removeNewline | trim',
|
||||
link: '.aditem-main .text-module-begin a@href | removeNewline | trim',
|
||||
description: '.aditem-main .aditem-main--middle--description | removeNewline | trim',
|
||||
|
||||
@@ -5,23 +5,46 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const originalId = o.id.split('/').pop();
|
||||
const id = buildHash(originalId, o.price);
|
||||
const size = o.size ?? 'N/A m²';
|
||||
const title = o.title || 'No title available';
|
||||
const link = o.link != null ? `https://www.mcmakler.de${o.link}` : o.link;
|
||||
const [rooms, size] = o.tags.split(' | ');
|
||||
const address = o.address?.replace(' / ', ' ') || null;
|
||||
const link = o.link != null ? `https://www.mcmakler.de${o.link}` : config.url;
|
||||
return Object.assign(o, { id, size, title, link, address });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(size),
|
||||
rooms: extractNumber(rooms),
|
||||
address,
|
||||
image: o.image,
|
||||
description: undefined,
|
||||
};
|
||||
}
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'article[data-testid="propertyCard"]',
|
||||
sortByDateParam: 'sortBy=DATE&sortOn=DESC',
|
||||
@@ -30,7 +53,7 @@ const config = {
|
||||
id: 'h2 a@href',
|
||||
title: 'h2 a | removeNewline | trim',
|
||||
price: 'footer > p:first-of-type | trim',
|
||||
size: 'footer > p:nth-of-type(2) | trim',
|
||||
tags: 'footer > p:nth-of-type(2) | trim',
|
||||
address: 'div > h2 + p | removeNewline | trim',
|
||||
image: 'img@src',
|
||||
link: 'h2 a@href',
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -12,19 +15,39 @@ function nullOrEmpty(val) {
|
||||
return val == null || val.length === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const link = nullOrEmpty(o.link)
|
||||
? 'NO LINK'
|
||||
: `https://www.neubaukompass.de${o.link.substring(o.link.indexOf('/neubau'))}`;
|
||||
const id = buildHash(o.link, o.price);
|
||||
return Object.assign(o, { id, link });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
return !isOneOf(o.title, appliedBlackList);
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.col-12.mb-4',
|
||||
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
|
||||
@@ -34,7 +57,9 @@ const config = {
|
||||
title: 'a@title | removeNewline | trim',
|
||||
link: 'a@href',
|
||||
address: '.nbk-project-card__description | removeNewline | trim',
|
||||
price: '.nbk-project-card__spec-item .nbk-project-card__spec-value | removeNewline | trim',
|
||||
price: '.nbk-project-card__spec-item:nth-child(1) .nbk-project-card__spec-value | removeNewline | trim',
|
||||
size: '.nbk-project-card__spec-item:nth-child(2) .nbk-project-card__spec-value | removeNewline | trim',
|
||||
rooms: '.nbk-project-card__spec-item:nth-child(3) .nbk-project-card__spec-value | removeNewline | trim',
|
||||
image: '.nbk-project-card__image@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
|
||||
@@ -5,19 +5,43 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const link = metaInformation.baseUrl + o.link;
|
||||
const id = buildHash(o.title, o.link, o.price);
|
||||
return Object.assign(o, { link, id });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'div[data-livecomponent-id*="search/property_list"] .grid > div',
|
||||
sortByDateParam: null,
|
||||
@@ -27,6 +51,7 @@ const config = {
|
||||
title: 'h4 | removeNewline | trim',
|
||||
price: '.text-xl | trim',
|
||||
size: 'div[title="Wohnfläche"] | trim',
|
||||
rooms: 'div[title="Zimmer"] | trim',
|
||||
address: '.text-slate-800 | removeNewline | trim',
|
||||
image: 'img@src',
|
||||
link: 'a@href',
|
||||
|
||||
@@ -5,24 +5,47 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
const address = o.address?.replace(/^adresse /i, '') ?? null;
|
||||
const title = o.title || 'No title available';
|
||||
const link = o.link != null ? decodeURIComponent(o.link) : config.url;
|
||||
|
||||
const urlReg = new RegExp(/url\((.*?)\)/gim);
|
||||
const image = o.image != null ? urlReg.exec(o.image)[1] : null;
|
||||
return Object.assign(o, { id, address, title, link, image });
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: '.listentry-content',
|
||||
sortByDateParam: null, // sort by date is standard
|
||||
@@ -32,6 +55,7 @@ const config = {
|
||||
title: 'h2 | trim',
|
||||
price: '.listentry-details-price .listentry-details-v | trim',
|
||||
size: '.listentry-details-size .listentry-details-v | trim',
|
||||
rooms: '.listentry-details-rooms .listentry-details-v | trim',
|
||||
address: '.listentry-adress | trim',
|
||||
image: '.listentry-img@style',
|
||||
link: '.shariff@data-url',
|
||||
|
||||
@@ -8,6 +8,9 @@ import checkIfListingIsActive from '../services/listings/listingActiveTester.js'
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -55,20 +58,39 @@ async function fetchDetails(listing, browser) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const originalId = o.id.split('/').pop().replace('.html', '');
|
||||
const id = buildHash(originalId, o.price);
|
||||
const size = o.size?.replace(' Wohnfläche', '').replace(' m²', 'm²') ?? null;
|
||||
const title = o.title || 'No title available';
|
||||
const link = o.link != null ? `https://immobilien.sparkasse.de${o.link}` : config.url;
|
||||
return Object.assign(o, { id, size, title, link });
|
||||
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address: o.address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
crawlContainer: 'div[data-testid="estate-link"]',
|
||||
sortByDateParam: 'sortBy=date_desc',
|
||||
@@ -77,7 +99,8 @@ const config = {
|
||||
id: 'a@href',
|
||||
title: 'h3 | trim',
|
||||
price: '.estate-list-price | trim',
|
||||
size: '.estate-mainfact span | trim',
|
||||
size: '.estate-mainfact:nth-child(1) span | trim',
|
||||
rooms: '.estate-mainfact:nth-child(2) span | trim',
|
||||
address: 'h6 | trim',
|
||||
image: 'img@src',
|
||||
link: 'a@href',
|
||||
|
||||
@@ -5,9 +5,12 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -32,20 +35,39 @@ async function fetchDetails(listing, browser) {
|
||||
return listing;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
const link = `https://www.wg-gesucht.de${o.link}`;
|
||||
const image = o.image != null ? o.image.replace('small', 'large') : null;
|
||||
return Object.assign(o, { id, link, image });
|
||||
const [rooms, city, road] = o.details?.split(' | ') || [];
|
||||
return {
|
||||
id,
|
||||
link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(rooms),
|
||||
address: `${city}, ${road}`,
|
||||
image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
|
||||
return o.id != null && titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '#main_column .wgg_card',
|
||||
@@ -56,10 +78,13 @@ const config = {
|
||||
details: '.row .noprint .col-xs-11 |removeNewline |trim',
|
||||
price: '.middle .col-xs-3 |removeNewline |trim',
|
||||
size: '.middle .text-right |removeNewline |trim',
|
||||
rooms: '.middle .text-right |removeNewline |trim',
|
||||
title: '.truncate_title a |removeNewline |trim',
|
||||
link: '.truncate_title a@href',
|
||||
image: '.img-responsive@src',
|
||||
description: '.row .noprint .col-xs-11 |removeNewline |trim',
|
||||
},
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
fetchDetails,
|
||||
|
||||
@@ -5,26 +5,45 @@
|
||||
|
||||
import * as utils from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import { extractNumber } from '../utils/extract-number.js';
|
||||
/** @import { ParsedListing } from '../types/listing.js' */
|
||||
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* @param {any} o
|
||||
* @returns {ParsedListing}
|
||||
*/
|
||||
function normalize(o) {
|
||||
const id = o.link.split('/').pop();
|
||||
const price = o.price;
|
||||
const size = o.size;
|
||||
const rooms = o.rooms;
|
||||
const [city = '', part = ''] = (o.description || '').split('-').map((v) => v.trim());
|
||||
const address = `${part}, ${city}`;
|
||||
return Object.assign(o, { id, price, size, rooms, address });
|
||||
return {
|
||||
id: o.link.split('/').pop(),
|
||||
link: o.link,
|
||||
title: o.title || '',
|
||||
price: extractNumber(o.price),
|
||||
size: extractNumber(o.size),
|
||||
rooms: extractNumber(o.rooms),
|
||||
address,
|
||||
image: o.image,
|
||||
description: o.description,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {ParsedListing} o
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
|
||||
return o.id != null && o.title != null && titleNotBlacklisted && descNotBlacklisted && o.link.startsWith(o.link);
|
||||
}
|
||||
|
||||
/** @type {ProviderConfig} */
|
||||
const config = {
|
||||
fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
|
||||
url: null,
|
||||
sortByDateParam: null,
|
||||
waitForSelector: 'body',
|
||||
@@ -37,7 +56,7 @@ const config = {
|
||||
size: 'dl:nth-of-type(3) dd | removeNewline | trim',
|
||||
description: 'div.before\\:icon-location_marker | trim',
|
||||
link: '@href',
|
||||
imageUrl: 'img@src',
|
||||
image: 'img@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
|
||||
Reference in New Issue
Block a user