upgrading dependencies, fixing immowelt, using hash of price and id as unique identifier for listings

This commit is contained in:
Christian Kellner
2024-09-05 13:34:14 +02:00
parent 1bf012f13e
commit 35feb772d7
15 changed files with 680 additions and 446 deletions

View File

@@ -1,18 +1,22 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
/**
 * Normalizes a crawled listing in place.
 *
 * Strips the ' Wohnfläche ' label from the size, appends the room count when
 * present, builds the expose link from the raw listing id and finally replaces
 * `id` with the value returned by `buildHash(o.id, o.price)`.
 *
 * @param {object} o - Listing; reads `o.id`, `o.price`, `o.size` and `o.rooms`.
 * @returns {object} The same object `o`, with `id`, `size` and `link` overwritten.
 */
function normalize(o) {
  let size = o.size.replace(' Wohnfläche ', '').trim();
  if (o.rooms != null) {
    size += ` / / ${o.rooms.trim()}`;
  }
  // The link must be built from the raw id, before it is replaced by the hash.
  const link = `https://www.1a-immobilienmarkt.de/expose/${o.id}.html`;
  // Id and price together identify a listing (buildHash lives in ../utils.js).
  const id = buildHash(o.id, o.price);
  return Object.assign(o, { id, size, link });
}
/**
 * Filter predicate for listings.
 *
 * @param {object} o - Listing; reads `o.title` and `o.description`.
 * @returns {boolean} `true` when neither the title nor the description is
 *   reported by `utils.isOneOf` as matching the applied blacklist.
 */
function applyBlacklist(o) {
  // Both fields are always checked (no short-circuit), exactly one call each.
  const hits = [o.title, o.description].map((text) => utils.isOneOf(text, appliedBlackList));
  return !hits.some(Boolean);
}
const config = {
url: null,
crawlContainer: '.tabelle',

View File

@@ -1,4 +1,4 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
function shortenLink(link) {
return link.substring(0, link.indexOf('?'));
@@ -7,12 +7,12 @@ function parseId(shortenedLink) {
return shortenedLink.substring(shortenedLink.lastIndexOf('/') + 1);
}
/**
 * Normalizes a crawled listing in place.
 *
 * Fills in placeholder texts for missing size, price, title and address,
 * shortens the link via `shortenLink` and sets `id` to
 * `buildHash(<id parsed from the shortened link>, o.price)`.
 *
 * @param {object} o - Listing; reads `o.link`, `o.size`, `o.price`, `o.title`, `o.address`.
 * @returns {object} The same object `o` with the normalized fields assigned.
 */
function normalize(o) {
  const size = o.size || 'N/A m²';
  const price = o.price || 'N/A €';
  const title = o.title || 'No title available';
  const address = o.address || 'No address available';
  const link = shortenLink(o.link);
  // The hash uses the raw crawled price (o.price), not the placeholder text.
  const id = buildHash(parseId(link), o.price);
  return Object.assign(o, { id, price, size, title, address, link });
}
function applyBlacklist(o) {

View File

@@ -1,12 +1,12 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
/**
 * Normalizes a crawled listing in place.
 *
 * The crawled `o.id` is used as the listing link; its last path segment,
 * together with the cleaned price, is passed to `buildHash` to form the new id.
 *
 * @param {object} o - Listing; reads `o.id`, `o.size`, `o.price`, `o.address`, `o.title`.
 * @returns {object} The same object `o` with `id`, `address`, `price`, `size`,
 *   `title` and `link` assigned.
 */
function normalize(o) {
  const size = o.size != null ? o.size.replace('Wohnfläche ', '') : 'N/A m²';
  const price = o.price.replace('Kaufpreis ', '');
  // Only the last ' • '-separated part of the address is kept.
  const addressParts = o.address.split(' • ');
  const address = addressParts[addressParts.length - 1];
  const title = o.title || 'No title available';
  const link = o.id;
  const id = buildHash(o.id.substring(o.id.lastIndexOf('/') + 1), price);
  return Object.assign(o, { id, address, price, size, title, link });
}
function applyBlacklist(o) {

View File

@@ -1,4 +1,4 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
function nullOrEmpty(val) {
return val == null || val.length === 0;
@@ -7,7 +7,8 @@ function normalize(o) {
const title = nullOrEmpty(o.title) ? 'NO TITLE FOUND' : o.title.replace('NEU', '');
const address = nullOrEmpty(o.address) ? 'NO ADDRESS FOUND' : (o.address || '').replace(/\(.*\),.*$/, '').trim();
const link = nullOrEmpty(o.link) ? 'NO LINK' : `https://www.immobilienscout24.de${o.link.substring(o.link.indexOf('/expose'))}`;
return Object.assign(o, { title, address, link });
const id = buildHash(o.id, o.price);
return Object.assign(o, { id, title, address, link });
}
function applyBlacklist(o) {
return !utils.isOneOf(o.title, appliedBlackList);

View File

@@ -1,14 +1,15 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
/**
 * Normalizes a crawled listing in place.
 *
 * The part of `o.id` after the first '-' is used for the listing link and,
 * together with the normalized price, passed to `buildHash` to form the new id.
 *
 * @param {object} o - Listing; reads `o.id`, `o.size`, `o.price`, `o.title`, `o.description`.
 * @returns {object} The same object `o` with `id`, `price`, `size`, `title`,
 *   `link` and `description` assigned.
 */
function normalize(o) {
  const size = o.size || 'N/A m²';
  const price = (o.price || '--- €').replace('Preis auf Anfrage', '--- €');
  const title = o.title || 'No title available';
  const immoId = o.id.substring(o.id.indexOf('-') + 1);
  const link = `https://immo.swp.de/immobilien/${immoId}`;
  const description = o.description;
  const id = buildHash(immoId, price);
  return Object.assign(o, { id, price, size, title, link, description });
}

View File

@@ -1,37 +1,42 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
/**
 * Replaces the crawled listing id with `buildHash(o.id, o.price)`.
 *
 * @param {object} o - Listing; reads `o.id` and `o.price`.
 * @returns {object} The same object `o` with `id` overwritten.
 */
function normalize(o) {
  const id = buildHash(o.id, o.price);
  return Object.assign(o, { id });
}
/**
 * Filter predicate for listings.
 *
 * NOTE(review): the crawlFields of this provider define no `description`, so
 * `o.description` is likely undefined here — confirm utils.isOneOf tolerates that.
 *
 * @param {object} o - Listing; reads `o.title` and `o.description`.
 * @returns {boolean} `true` when neither field matches the applied blacklist.
 */
function applyBlacklist(o) {
  const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
  const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
  return titleNotBlacklisted && descNotBlacklisted;
}
/**
 * Crawler configuration for Immowelt.
 *
 * `url` and `enabled` are filled in by `init`. The values in `crawlFields`
 * are selector expressions (optionally piped through filters such as
 * `removeNewline` / `trim`) evaluated per `crawlContainer` match.
 */
const config = {
  url: null,
  crawlContainer: 'div[data-testid="serp-card-testid"]',
  sortByDateParam: 'sd=DESC&sf=TIMESTAMP',
  crawlFields: {
    id: 'a@id',
    price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
    size: 'div[data-testid="cardmfe-keyfacts-testid"] | removeNewline | trim',
    // NOTE(review): looks like a generated class name; likely to break on site redeploys.
    title: '.css-1cbj9xw',
    link: 'a@href',
    address: 'div[data-testid="cardmfe-description-box-address"] | removeNewline | trim',
  },
  paginate: '#pnlPaging #nlbPlus@href',
  normalize,
  filter: applyBlacklist,
};
/**
 * Applies the user's source settings to this provider.
 *
 * @param {object} sourceConfig - Provides `enabled` and `url`, copied onto `config`.
 * @param {Array} [blacklist] - Terms used by `applyBlacklist`; a falsy value resets to `[]`.
 */
export const init = (sourceConfig, blacklist) => {
  config.enabled = sourceConfig.enabled;
  config.url = sourceConfig.url;
  appliedBlackList = blacklist || [];
};
export const metaInformation = {
name: 'Immowelt',
baseUrl: 'https://www.immowelt.de/',
id: 'immowelt',
name: 'Immowelt',
baseUrl: 'https://www.immowelt.de/',
id: 'immowelt',
};
export { config };
export {config};

View File

@@ -1,44 +1,49 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
let appliedBlacklistedDistricts = [];
/**
 * Normalizes a crawled listing in place.
 *
 * @param {object} o - Listing; reads `o.id`, `o.price` and `o.size`.
 * @returns {object} The same object `o`, with `size` defaulted to '--- m²' when
 *   falsy and `id` replaced by `buildHash(o.id, o.price)`.
 */
function normalize(o) {
  const size = o.size || '--- m²';
  const id = buildHash(o.id, o.price);
  return Object.assign(o, { id, size });
}
/**
 * Filter predicate for listings.
 *
 * @param {object} o - Listing; reads `o.title` and `o.description`.
 * @returns {boolean} `true` when title and description do not match the applied
 *   blacklist and the description does not match a blacklisted district.
 */
function applyBlacklist(o) {
  const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
  const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
  // The district check is skipped entirely when no districts are configured.
  const isBlacklistedDistrict =
    appliedBlacklistedDistricts.length > 0 && utils.isOneOf(o.description, appliedBlacklistedDistricts);
  return !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
}
/**
 * Crawler configuration for Kleinanzeigen.
 *
 * `url` and `enabled` are filled in by `init`. The values in `crawlFields`
 * are selector expressions (optionally piped through filters such as
 * `int`, `removeNewline` and `trim`) evaluated per `crawlContainer` match.
 */
const config = {
  url: null,
  crawlContainer: '#srchrslt-adtable .ad-listitem ',
  // Results are sorted by date by default, so no sort parameter is needed.
  sortByDateParam: null,
  crawlFields: {
    id: '.aditem@data-adid | int',
    price: '.aditem-main--middle--price-shipping--price | removeNewline | trim',
    size: '.aditem-main .text-module-end span:nth-child(2) | removeNewline | trim',
    title: '.aditem-main .text-module-begin a | removeNewline | trim',
    link: '.aditem-main .text-module-begin a@href | removeNewline | trim',
    description: '.aditem-main p:not(.text-module-end) | removeNewline | trim',
    address: '.aditem-main--top--left | trim | removeNewline',
  },
  paginate: '#srchrslt-pagination .pagination-next@href',
  normalize,
  filter: applyBlacklist,
};
/** Static descriptor of this provider: display name, site base URL and provider id. */
export const metaInformation = {
  name: 'Ebay Kleinanzeigen',
  baseUrl: 'https://www.kleinanzeigen.de/',
  id: 'kleinanzeigen',
};
export const init = (sourceConfig, blacklist, blacklistedDistricts) => {
config.enabled = sourceConfig.enabled;
config.url = sourceConfig.url;
appliedBlacklistedDistricts = blacklistedDistricts || [];
appliedBlackList = blacklist || [];
config.enabled = sourceConfig.enabled;
config.url = sourceConfig.url;
appliedBlacklistedDistricts = blacklistedDistricts || [];
appliedBlackList = blacklist || [];
};
export { config };
export {config};

View File

@@ -1,38 +1,44 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
/**
 * Tells whether a value is missing or has zero length.
 *
 * @param {*} val - Value to inspect.
 * @returns {boolean} `true` for `null`, `undefined`, or anything whose
 *   `length` is exactly 0; `false` otherwise.
 */
function nullOrEmpty(val) {
  if (val == null) {
    return true;
  }
  return val.length === 0;
}
/**
 * Normalizes a crawled listing in place.
 *
 * Rebuilds the link as an absolute neubaukompass.de URL starting at the
 * '/neubau' path (or 'NO LINK' when the crawled link is missing/empty) and
 * replaces `id` with `buildHash(o.id, o.price)`.
 *
 * @param {object} o - Listing; reads `o.id`, `o.price` and `o.link`.
 * @returns {object} The same object `o` with `id` and `link` overwritten.
 */
function normalize(o) {
  const link = nullOrEmpty(o.link)
    ? 'NO LINK'
    : `https://www.neubaukompass.de${o.link.substring(o.link.indexOf('/neubau'))}`;
  const id = buildHash(o.id, o.price);
  return Object.assign(o, { id, link });
}
/**
 * Filter predicate for listings.
 *
 * @param {object} o - Listing; reads `o.title`.
 * @returns {boolean} `true` when the title does not match the applied blacklist.
 */
function applyBlacklist(o) {
  return !utils.isOneOf(o.title, appliedBlackList);
}
/**
 * Crawler configuration for Neubau Kompass.
 *
 * `url` and `enabled` are filled in by `init`. The values in `crawlFields`
 * are selector expressions (optionally piped through filters such as
 * `removeNewline` / `trim`) evaluated per `crawlContainer` match.
 */
const config = {
  url: null,
  crawlContainer: '.nbk-container >div article',
  sortByDateParam: 'Sortierung=Id&Richtung=DESC',
  crawlFields: {
    id: '@id',
    title: 'a.nbk-truncate@title | removeNewline | trim',
    link: 'a.nbk-truncate@href',
    address: 'p.nbk-truncate | removeNewline | trim',
    price: 'p.nbk-mb-0 | removeNewline | trim',
  },
  paginate: '.numbered-pager__bottom .numbered-pager--info li:nth-child(2) a@href',
  normalize,
  filter: applyBlacklist,
};
/**
 * Applies the user's source settings to this provider.
 *
 * @param {object} sourceConfig - Provides `enabled` and `url`, copied onto `config`.
 * @param {Array} [blacklist] - Terms used by `applyBlacklist`; a falsy value resets to `[]`.
 */
export const init = (sourceConfig, blacklist) => {
  config.enabled = sourceConfig.enabled;
  config.url = sourceConfig.url;
  appliedBlackList = blacklist || [];
};
export const metaInformation = {
name: 'Neubau Kompass',
baseUrl: 'https://www.neubaukompass.de/',
id: 'neubauKompass',
name: 'Neubau Kompass',
baseUrl: 'https://www.neubaukompass.de/',
id: 'neubauKompass',
};
export { config };
export {config};

View File

@@ -1,36 +1,41 @@
import utils from '../utils.js';
import utils, {buildHash} from '../utils.js';
let appliedBlackList = [];
/**
 * Replaces the crawled listing id with `buildHash(o.id, o.price)`.
 *
 * @param {object} o - Listing; reads `o.id` and `o.price`.
 * @returns {object} The same object `o` with `id` overwritten.
 */
function normalize(o) {
  const id = buildHash(o.id, o.price);
  return Object.assign(o, { id });
}
/**
 * Filter predicate for listings.
 *
 * NOTE(review): the crawlFields of this provider define no `description`, so
 * `o.description` is likely undefined here — confirm utils.isOneOf tolerates that.
 *
 * @param {object} o - Listing; reads `o.id`, `o.title` and `o.description`.
 * @returns {boolean} `true` when the listing has an id and neither title nor
 *   description matches the applied blacklist.
 */
function applyBlacklist(o) {
  const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
  const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
  return o.id != null && titleNotBlacklisted && descNotBlacklisted;
}
/**
 * Crawler configuration for WG-Gesucht.
 *
 * `url` and `enabled` are filled in by `init`. The values in `crawlFields`
 * are selector expressions (optionally piped through filters such as
 * `removeNewline` / `trim`) evaluated per `crawlContainer` match.
 * This provider defines no `paginate` selector.
 */
const config = {
  url: null,
  crawlContainer: '#main_column .wgg_card',
  sortByDateParam: 'sort_column=0&sort_order=0',
  crawlFields: {
    id: '@data-id',
    details: '.row .noprint .col-xs-11 |removeNewline |trim',
    price: '.middle .col-xs-3 |removeNewline |trim',
    size: '.middle .text-right |removeNewline |trim',
    title: '.truncate_title a |removeNewline |trim',
    link: '.truncate_title a@href',
  },
  normalize,
  filter: applyBlacklist,
};
/**
 * Applies the user's source settings to this provider.
 *
 * @param {object} sourceConfig - Provides `enabled` and `url`, copied onto `config`.
 * @param {Array} [blacklist] - Terms used by `applyBlacklist`; a falsy value resets to `[]`.
 */
export const init = (sourceConfig, blacklist) => {
  config.enabled = sourceConfig.enabled;
  config.url = sourceConfig.url;
  appliedBlackList = blacklist || [];
};
export const metaInformation = {
name: 'Wg gesucht',
baseUrl: 'https://www.wg-gesucht.de/',
id: 'wgGesucht',
name: 'Wg gesucht',
baseUrl: 'https://www.wg-gesucht.de/',
id: 'wgGesucht',
};
export { config };
export {config};