Regex for einsAImmobilien price normalization | filter listings that do not have all required keys

This commit is contained in:
weakmap@gmail.com
2024-09-29 16:58:01 +02:00
parent 111ef8be43
commit d3cb3a5881
4 changed files with 27 additions and 7 deletions

View File

@@ -1,4 +1,4 @@
import utils, {buildHash} from '../utils.js';
import utils, { buildHash } from '../utils.js';
let appliedBlackList = [];
function normalize(o) {
@@ -7,10 +7,28 @@ function normalize(o) {
size += ` / / ${o.rooms.trim()}`;
}
const link = `https://www.1a-immobilienmarkt.de/expose/${o.id}.html`;
const id = buildHash(o.id, o.price);
return Object.assign(o, { id, size, link });
const price = normalizePrice(o.price);
const id = buildHash(o.id, price);
return Object.assign(o, { id, price, size, link });
}
/**
 * einsAImmobilien sometimes uses an unusual pricing label such as
 * `775.700,00 EUR Kaufpreis ab 2.475 € mtl`. This extracts only the first
 * actual price (German number format with two decimals, followed by `EUR`
 * or `€`) out of such a string.
 *
 * @param {string|null|undefined} price - Raw price label.
 * @returns {string|null|*} `null` when `price` is null/undefined; the first
 *   matched price (e.g. `775.700,00 EUR`) when one is found; otherwise the
 *   input unchanged (this includes non-string inputs).
 */
function normalizePrice(price) {
  if (price == null) {
    return null;
  }
  // Only strings can be pattern-matched; hand anything else back untouched
  // instead of throwing on a missing `.match`/`.exec`.
  if (typeof price !== 'string') {
    return price;
  }
  // The amount must start at the beginning of the string or after a character
  // that is neither a digit nor a dot. Without this boundary a value such as
  // `1500,00 EUR` would be matched from its second digit and truncated to
  // `500,00 EUR`. The integer part is either dot-grouped thousands
  // (`775.700`) or a plain run of digits (`1500`).
  const priceRegex = /(?:^|[^\d.])((?:\d{1,3}(?:\.\d{3})+|\d+),\d{2}\s?(?:EUR|€))/;
  const match = priceRegex.exec(price);
  return match == null ? price : match[1];
}
function applyBlacklist(o) {
const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
@@ -27,7 +45,6 @@ const config = {
size: '.tabelle .inner_object_data .data_boxes div:nth-child(1)',
rooms: '.tabelle .inner_object_data .data_boxes div:nth-child(2)',
title: '.tabelle .inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
description: '.tabelle .inner_object_data .objekt_beschreibung | removeNewline | trim',
},
normalize: normalize,
filter: applyBlacklist,