Feature/kleinanzeigen new (#292)

* Feature/Kleinanzeigen addresses (#289)

* upgrade dependencies

* immoscout_details -> provider_details

* fetching details more generic

* removing claude action

* fixing sparkassen selector

* improvements

* fixing immobilienDE test

* upgrading dependencies

* settings for many providers

---------

Co-authored-by: Adrian Bach <65734063+realDayaa@users.noreply.github.com>
This commit is contained in:
Christian Kellner
2026-04-07 19:53:40 +02:00
committed by GitHub
parent 7888c5b340
commit cdc0cbda2f
35 changed files with 1098 additions and 501 deletions

View File

@@ -5,6 +5,9 @@
import { buildHash, isOneOf } from '../utils.js';
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
import * as cheerio from 'cheerio';
import logger from '../services/logger.js';
let appliedBlackList = [];
@@ -18,6 +21,51 @@ function parseId(shortenedLink) {
return shortenedLink.substring(shortenedLink.lastIndexOf('/') + 1);
}
/**
 * Enriches an immobilien.de listing with data read from its detail page.
 *
 * The page markup is obtained via puppeteerExtractor (the given `browser` is
 * passed through as an option). JSON-LD script blocks are consulted first for
 * a description and a street-level address; if no description was found there,
 * the first matching fallback description selector is used instead.
 *
 * @param {object} listing - Listing to enrich; `link`, `address`, `description` and `id` are read.
 * @param {*} browser - Handed to puppeteerExtractor as `{ browser }`.
 * @returns {Promise<object>} A copy of the listing with `address`/`description`
 *   replaced where better values were found, or the original listing object
 *   when no markup was returned or any step failed.
 */
async function fetchDetails(listing, browser) {
  const collapseWhitespace = (text) => text.replace(/\s+/g, ' ').trim();
  const fallbackSelectors = ['.beschreibung', '.freitext', '.objektbeschreibung', '.description'];

  try {
    const html = await puppeteerExtractor(listing.link, null, { browser });
    if (!html) {
      return listing;
    }

    const $ = cheerio.load(html);
    let description = null;
    let address = listing.address;

    // Try JSON-LD first
    $('script[type="application/ld+json"]').each((_, scriptEl) => {
      // Once a description is known, later JSON-LD blocks are not inspected at all.
      if (description) {
        return;
      }
      try {
        const parsed = JSON.parse($(scriptEl).text());
        const entries = Array.isArray(parsed) ? parsed : [parsed];
        for (const entry of entries) {
          if (entry.description && !description) {
            description = collapseWhitespace(String(entry.description));
          }
          const postal = entry.address || entry?.mainEntity?.address;
          // Only overwrite while the address still equals the one from the result list.
          const stillListAddress = address === listing.address;
          if (postal && postal.streetAddress && stillListAddress) {
            const pieces = [postal.streetAddress, postal.postalCode, postal.addressLocality].filter(Boolean);
            if (pieces.length) {
              address = pieces.join(' ');
            }
          }
        }
      } catch {
        // ignore malformed JSON-LD
      }
    });

    // Fallback: common description selectors used by immobilien.de
    if (!description) {
      for (const selector of fallbackSelectors) {
        const matched = $(selector);
        if (matched.length > 0) {
          description = collapseWhitespace(matched.text());
          break;
        }
      }
    }

    return { ...listing, address, description: description || listing.description };
  } catch (error) {
    logger.warn(`Could not fetch immobilien.de detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
function normalize(o) {
const baseUrl = 'https://www.immobilien.de';
const size = o.size || null;
@@ -25,8 +73,8 @@ function normalize(o) {
const title = o.title || 'No title available';
const address = o.address || null;
const shortLink = shortenLink(o.link);
const link = baseUrl + shortLink;
const image = baseUrl + o.image;
const link = shortLink ? (shortLink.startsWith('http') ? shortLink : baseUrl + shortLink) : baseUrl;
const image = o.image ? (o.image.startsWith('http') ? o.image : baseUrl + o.image) : null;
const id = buildHash(parseId(shortLink), o.price);
return Object.assign(o, { id, price, size, title, address, link, image });
}
@@ -39,21 +87,22 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: 'a:has(div.list_entry)',
crawlContainer: 'a.lr-card',
sortByDateParam: 'sort_col=*created_ts&sort_dir=desc',
waitForSelector: 'body',
waitForSelector: null,
crawlFields: {
id: '@href', //will be transformed later
price: '.immo_preis .label_info',
size: '.flaeche .label_info | removeNewline | trim',
title: 'h3 span',
price: '.lr-card__price-amount | trim',
size: '.lr-card__fact:has(.lr-card__fact-label:contains("Fläche")) .lr-card__fact-value | trim',
title: '.lr-card__title | trim',
description: '.description | trim',
link: '@href',
address: '.place',
image: 'img@src',
address: '.lr-card__address span | trim',
image: 'img.lr-card__gallery-img@src',
},
normalize: normalize,
normalize,
filter: applyBlacklist,
fetchDetails,
activeTester: checkIfListingIsActive,
};
export const init = (sourceConfig, blacklist) => {

View File

@@ -46,9 +46,7 @@ import {
convertWebToMobile,
} from '../services/immoscout/immoscout-web-translator.js';
import logger from '../services/logger.js';
import { getUserSettings } from '../services/storage/settingsStorage.js';
let appliedBlackList = [];
let currentUserId = null;
async function getListings(url) {
const response = await fetch(url, {
@@ -68,42 +66,40 @@ async function getListings(url) {
}
const responseBody = await response.json();
return Promise.all(
responseBody.resultListItems
.filter((item) => item.type === 'EXPOSE_RESULT')
.map(async (expose) => {
const item = expose.item;
const [price, size] = item.attributes;
const image = item?.titlePicture?.full ?? item?.titlePicture?.preview ?? null;
let listing = {
id: item.id,
price: price?.value,
size: size?.value,
title: item.title,
link: `${metaInformation.baseUrl}expose/${item.id}`,
address: item.address?.line,
image,
};
if (currentUserId) {
const userSettings = getUserSettings(currentUserId);
if (userSettings.immoscout_details) {
return await pushDetails(listing);
}
}
return listing;
}),
);
return responseBody.resultListItems
.filter((item) => item.type === 'EXPOSE_RESULT')
.map((expose) => {
const item = expose.item;
const [price, size] = item.attributes;
const image = item?.titlePicture?.full ?? item?.titlePicture?.preview ?? null;
return {
id: item.id,
price: price?.value,
size: size?.value,
title: item.title,
link: `${metaInformation.baseUrl}expose/${item.id}`,
address: item.address?.line,
image,
};
});
}
/**
 * Detail-fetch hook for this provider: delegates to pushDetails and resolves
 * with whatever it resolves with.
 *
 * @param {object} listing - Listing handed on unchanged to pushDetails.
 * @returns {Promise<*>} The result of pushDetails for this listing.
 */
async function fetchDetails(listing) {
  const detailedListing = await pushDetails(listing);
  return detailedListing;
}
async function pushDetails(listing) {
const detailed = await fetch(`https://api.mobile.immobilienscout24.de/expose/${listing.id}`, {
const exposeId = listing.link?.split('/').pop();
const detailed = await fetch(`https://api.mobile.immobilienscout24.de/expose/${exposeId}`, {
headers: {
'User-Agent': 'ImmoScout_27.3_26.0_._',
'Content-Type': 'application/json',
},
});
if (!detailed.ok) {
logger.error('Error fetching listing details from ImmoScout Mobile API:', detailed.statusText);
logger.warn(
`Error fetching listing details from ImmoScout Mobile API for id: ${exposeId} Status: ${detailed.statusText}`,
);
return listing;
}
const detailBody = await detailed.json();
@@ -196,13 +192,13 @@ const config = {
normalize: normalize,
filter: applyBlacklist,
getListings: getListings,
fetchDetails: fetchDetails,
activeTester: isListingActive,
};
export const init = (sourceConfig, blacklist) => {
config.enabled = sourceConfig.enabled;
config.url = convertWebToMobile(sourceConfig.url);
appliedBlackList = blacklist || [];
currentUserId = sourceConfig.userId || null;
};
export const metaInformation = {
name: 'Immoscout',

View File

@@ -5,9 +5,49 @@
import { buildHash, isOneOf } from '../utils.js';
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
import * as cheerio from 'cheerio';
import logger from '../services/logger.js';
let appliedBlackList = [];
/**
 * Enriches an immowelt listing with the description and address taken from
 * the JSON found in the detail page's `#__NEXT_DATA__` element.
 *
 * The record is read from `props.pageProps.classified`: `Texts` entries are
 * combined into the description (title and content on separate lines, entries
 * separated by a blank line) and `EstateAddress` is formatted as
 * "Street HouseNumber, ZipCode District-or-City".
 *
 * @param {object} listing - Listing to enrich; `link`, `address`, `description` and `id` are read.
 * @param {*} browser - Handed to puppeteerExtractor as `{ browser }`.
 * @returns {Promise<object>} Enriched copy of the listing, or the original
 *   listing object when the page, the embedded data or the classified record
 *   is missing, or when any step throws.
 */
async function fetchDetails(listing, browser) {
  const joinPresent = (values, separator) => values.filter(Boolean).join(separator);

  try {
    const html = await puppeteerExtractor(listing.link, null, { browser });
    if (!html) {
      return listing;
    }

    const $ = cheerio.load(html);
    const nextDataRaw = $('#__NEXT_DATA__').text();
    if (!nextDataRaw) {
      return listing;
    }

    const classified = JSON.parse(nextDataRaw)?.props?.pageProps?.classified;
    if (!classified) {
      return listing;
    }

    // One section per text entry; entries without title and content are dropped.
    const sections = [];
    for (const entry of classified.Texts || []) {
      const section = joinPresent([entry.Title, entry.Content], '\n');
      if (section) {
        sections.push(section);
      }
    }
    const description = sections.join('\n\n');

    let address = listing.address;
    const estateAddress = classified.EstateAddress;
    if (estateAddress) {
      const { Street, HouseNumber, ZipCode, District, City } = estateAddress;
      const streetLine = joinPresent([Street, HouseNumber], ' ');
      const cityLine = joinPresent([ZipCode, District || City], ' ');
      // Keep the list-page address when the detail page yields nothing usable.
      address = joinPresent([streetLine, cityLine], ', ') || address;
    }

    return { ...listing, address, description: description || listing.description };
  } catch (error) {
    logger.warn(`Could not fetch immowelt detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
function normalize(o) {
const id = buildHash(o.id, o.price);
return Object.assign(o, { id });
@@ -37,6 +77,7 @@ const config = {
},
normalize: normalize,
filter: applyBlacklist,
fetchDetails: fetchDetails,
activeTester: checkIfListingIsActive,
};
export const init = (sourceConfig, blacklist) => {

View File

@@ -5,14 +5,151 @@
import { buildHash, isOneOf } from '../utils.js';
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
import logger from '../services/logger.js';
import * as cheerio from 'cheerio';
let appliedBlackList = [];
let appliedBlacklistedDistricts = [];
/**
 * Turns a listing link into an absolute URL.
 *
 * @param {string|null|undefined} link - Link as scraped; may be relative or already absolute.
 * @returns {string|null} `null` for a missing/empty link, the link unchanged
 *   when it starts with "http", otherwise the link prefixed with the
 *   Kleinanzeigen origin.
 */
function toAbsoluteLink(link) {
  if (link) {
    const isAbsolute = link.startsWith('http');
    if (isAbsolute) {
      return link;
    }
    return `https://www.kleinanzeigen.de${link}`;
  }
  return null;
}
/**
 * Normalizes an arbitrary value to plain single-line text.
 *
 * The value is stringified, anything that looks like a markup tag is replaced
 * by a space, whitespace runs are collapsed to one space and the result is
 * trimmed.
 *
 * @param {*} value - Value to clean; `null` and `undefined` yield an empty string.
 * @returns {string} The cleaned text.
 */
function cleanText(value) {
  if (value === null || value === undefined) {
    return '';
  }
  const withoutTags = String(value).replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Formats a JSON-LD address object as a single address line.
 *
 * Reads `addressLocality`, `addressRegion`, `postalCode` and `streetAddress`
 * (each passed through cleanText) and produces
 * "street, postalCode region - locality", leaving out whatever is empty.
 *
 * @param {*} address - Candidate address node; anything that is not a non-null object is rejected.
 * @returns {string|null} The formatted address, or `null` when the input is
 *   not an object or every component is empty.
 */
function buildAddressFromJsonLd(address) {
  const isObject = Boolean(address) && typeof address === 'object';
  if (!isObject) {
    return null;
  }

  const joinPresent = (parts, separator) => parts.filter(Boolean).join(separator);

  const locality = cleanText(address.addressLocality);
  const region = cleanText(address.addressRegion);
  const postalCode = cleanText(address.postalCode);
  const streetAddress = cleanText(address.streetAddress);

  // "region - locality"; collapses to the single present value when one is missing.
  const cityPart = joinPresent([region, locality], ' - ');
  const tail = joinPresent([postalCode, cityPart], ' ');
  const fullAddress = joinPresent([streetAddress, tail], ', ');

  return fullAddress === '' ? null : fullAddress;
}
/**
 * Collects every object node reachable from a parsed JSON-LD value into a flat list.
 *
 * Arrays are walked element by element. Each object is appended to `acc` and
 * then searched further through its `@graph` array (if it is an array), its
 * `mainEntity` and its `itemOffered`, in that order. Primitives and `null`
 * are ignored.
 *
 * @param {*} node - Parsed JSON-LD value to walk.
 * @param {object[]} [acc=[]] - List the found nodes are appended to (mutated).
 * @returns {object[]} The same `acc` array.
 */
function flattenJsonLdNodes(node, acc = []) {
  if (Array.isArray(node)) {
    for (const entry of node) {
      flattenJsonLdNodes(entry, acc);
    }
    return acc;
  }

  if (node === null || node === undefined || typeof node !== 'object') {
    return acc;
  }

  acc.push(node);

  // Non-array `@graph` values are skipped; missing/falsy children are no-ops in the recursion.
  const graph = Array.isArray(node['@graph']) ? node['@graph'] : null;
  for (const child of [graph, node.mainEntity, node.itemOffered]) {
    flattenJsonLdNodes(child, acc);
  }
  return acc;
}
/**
 * Extracts an address and a description from the markup of a detail page.
 *
 * The address is taken from the rendered `#street-address` / `#viewad-locality`
 * elements when they contain text; otherwise the first JSON-LD node that
 * yields an address (via buildAddressFromJsonLd) is used. The description is
 * the first non-empty, cleaned `description` found on a JSON-LD node or on
 * its `itemOffered`.
 *
 * @param {string} html - Markup of the detail page, loaded with cheerio.
 * @returns {{detailAddress: (string|null), detailDescription: (string|null)}}
 *   The values found; each is `null` when nothing usable was present.
 */
function extractDetailFromHtml(html) {
  const $ = cheerio.load(html);
  const firstText = (selector) => cleanText($(selector).first().text());

  // Prefer the rendered postal address block from the detail page because
  // it contains the street line that is missing from list results.
  const streetFromDom = firstText('#street-address');
  const localityFromDom = firstText('#viewad-locality');
  const domAddress = [streetFromDom, localityFromDom].filter(Boolean).join(' ');

  const nodes = [];
  $('script[type="application/ld+json"]').each((_, element) => {
    const content = $(element).text();
    if (content) {
      try {
        flattenJsonLdNodes(JSON.parse(content), nodes);
      } catch {
        // Ignore broken JSON-LD blocks from ads/trackers and keep trying others.
      }
    }
  });

  let detailAddress = domAddress ? domAddress : null;
  let detailDescription = null;

  for (const node of nodes) {
    if (!detailAddress) {
      const addressNode = node.address || node?.itemOffered?.address || node?.offers?.address;
      const candidateAddress = buildAddressFromJsonLd(addressNode);
      if (candidateAddress) {
        detailAddress = candidateAddress;
      }
    }

    if (!detailDescription) {
      const candidateDescription = cleanText(node.description || node?.itemOffered?.description);
      if (candidateDescription) {
        detailDescription = candidateDescription;
      }
    }

    // Nothing left to look for once both values are known.
    if (detailAddress && detailDescription) {
      break;
    }
  }

  return { detailAddress, detailDescription };
}
/**
 * Loads a listing's detail page and merges the address and description found
 * there into the listing.
 *
 * @param {object} listing - Listing to enrich; `link`, `address`, `description` and `id` are read.
 * @param {*} browser - Handed to puppeteerExtractor as `{ browser }`.
 * @returns {Promise<object>} The original listing object when it has no
 *   usable link; otherwise a copy whose `link` is absolute and whose
 *   `address`/`description` are replaced by the detail-page values where
 *   those are available. A failed page load is logged as a warning and still
 *   yields the copy with the absolute link.
 */
async function enrichListingFromDetails(listing, browser) {
  const absoluteLink = toAbsoluteLink(listing.link);
  if (!absoluteLink) {
    return listing;
  }

  const withAbsoluteLink = { ...listing, link: absoluteLink };

  try {
    const html = await puppeteerExtractor(absoluteLink, null, { browser });
    if (!html) {
      return withAbsoluteLink;
    }

    const details = extractDetailFromHtml(html);
    return {
      ...withAbsoluteLink,
      address: details.detailAddress || listing.address,
      description: details.detailDescription || listing.description,
    };
  } catch (error) {
    logger.warn(`Could not fetch Kleinanzeigen detail page for listing '${listing.id}'.`, error?.message || error);
    return withAbsoluteLink;
  }
}
/**
 * Detail-fetch hook for this provider: delegates to enrichListingFromDetails.
 *
 * @param {object} listing - Listing to enrich.
 * @param {*} browser - Passed through unchanged to enrichListingFromDetails.
 * @returns {Promise<*>} Whatever enrichListingFromDetails resolves with.
 */
async function fetchDetails(listing, browser) {
  const enriched = await enrichListingFromDetails(listing, browser);
  return enriched;
}
function normalize(o) {
const size = o.size || '--- m²';
const id = buildHash(o.id, o.price);
const link = `https://www.kleinanzeigen.de${o.link}`;
const link = toAbsoluteLink(o.link) || o.link;
return Object.assign(o, { id, size, link });
}
@@ -40,12 +177,13 @@ const config = {
address: '.aditem-main--top--left | trim | removeNewline',
image: 'img@src',
},
fetchDetails,
normalize: normalize,
filter: applyBlacklist,
activeTester: checkIfListingIsActive,
};
export const metaInformation = {
name: 'Ebay Kleinanzeigen',
name: 'Kleinanzeigen',
baseUrl: 'https://www.kleinanzeigen.de/',
id: 'kleinanzeigen',
};

View File

@@ -5,12 +5,60 @@
import { isOneOf, buildHash } from '../utils.js';
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
import * as cheerio from 'cheerio';
import logger from '../services/logger.js';
let appliedBlackList = [];
/**
 * Enriches a Sparkasse listing with the description and full address found
 * on its detail page.
 *
 * The page is loaded through puppeteerExtractor (waiting for `body`) and the
 * JSON inside the `#__NEXT_DATA__` element is parsed; the estate record is
 * read from `props.pageProps.estate`. Text content boxes of each
 * `frontendItems` entry are combined into the description (label first, one
 * line per text, entries separated by a blank line) and `estate.address` is
 * formatted as "street streetNumber, zip city".
 *
 * @param {object} listing - Listing to enrich; `link`, `address`, `description` and `id` are read.
 * @param {*} browser - Handed to puppeteerExtractor as `{ browser }`.
 * @returns {Promise<object>} Enriched copy of the listing, or the original
 *   listing object when the page, the embedded data or the estate record is
 *   missing, or when any step throws (a warning is logged in that case).
 */
async function fetchDetails(listing, browser) {
  try {
    const html = await puppeteerExtractor(listing.link, 'body', { browser });
    // Nothing to parse: keep the listing as-is instead of running into the catch branch.
    if (!html) return listing;

    const $ = cheerio.load(html);
    // `.text` has to be *called*; the bare property is the method itself, which is
    // always truthy and makes JSON.parse throw, so no listing was ever enriched.
    const nextDataRaw = $('#__NEXT_DATA__').text();
    if (!nextDataRaw) return listing;

    const estate = JSON.parse(nextDataRaw)?.props?.pageProps?.estate;
    if (!estate) return listing;

    const description = (estate.frontendItems || [])
      .map((item) => {
        // Only plain text entries inside "contentBoxes" blocks contribute to the description.
        const texts = (item.contents || [])
          .filter((c) => c.type === 'contentBoxes')
          .flatMap((c) => c.data || [])
          .filter((d) => d.type === 'text' && d.content)
          .map((d) => d.content);
        if (!texts.length) return null;
        return [item.label, ...texts].filter(Boolean).join('\n');
      })
      .filter(Boolean)
      .join('\n\n');

    const addr = estate.address;
    let address = listing.address;
    if (addr) {
      const street = [addr.street, addr.streetNumber].filter(Boolean).join(' ');
      const cityLine = [addr.zip, addr.city].filter(Boolean).join(' ');
      const full = [street, cityLine].filter(Boolean).join(', ');
      // Keep the list-page address when the detail page yields nothing usable.
      if (full) address = full;
    }

    return {
      ...listing,
      address,
      description: description || listing.description,
    };
  } catch (error) {
    logger.warn(`Could not fetch Sparkasse detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
function normalize(o) {
const originalId = o.id.split('/').pop().replace('.html', '');
const id = buildHash(originalId, o.price);
const size = o.size?.replace(' Wohnfläche', '') ?? null;
const size = o.size?.replace(' Wohnfläche', '').replace(' m²', 'm²') ?? null;
const title = o.title || 'No title available';
const link = o.link != null ? `https://immobilien.sparkasse.de${o.link}` : config.url;
return Object.assign(o, { id, size, title, link });
@@ -22,20 +70,21 @@ function applyBlacklist(o) {
}
const config = {
url: null,
crawlContainer: '.estate-list-item-row',
crawlContainer: 'div[data-testid="estate-link"]',
sortByDateParam: 'sortBy=date_desc',
waitForSelector: 'body',
crawlFields: {
id: 'div[data-testid="estate-link"] a@href',
id: 'a@href',
title: 'h3 | trim',
price: '.estate-list-price | trim',
size: '.estate-mainfact:first-child span | trim',
size: '.estate-mainfact span | trim',
address: 'h6 | trim',
image: '.estate-list-item-image-container img@src',
link: 'div[data-testid="estate-link"] a@href',
image: 'img@src',
link: 'a@href',
},
normalize: normalize,
filter: applyBlacklist,
fetchDetails,
activeTester: (url) => checkIfListingIsActive(url, 'Angebot nicht gefunden'),
};
export const init = (sourceConfig, blacklist) => {

View File

@@ -5,9 +5,34 @@
import { isOneOf, buildHash } from '../utils.js';
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
import * as cheerio from 'cheerio';
import logger from '../services/logger.js';
let appliedBlackList = [];
/**
 * Enriches a wg-gesucht listing with the free-text description and the
 * address shown on its detail page.
 *
 * Script elements inside `#freitext_0` are removed before its text is read;
 * the address is taken from the `.section_panel_detail` element inside the
 * link pointing at `#map_container`. Both texts are whitespace-collapsed and
 * trimmed.
 *
 * @param {object} listing - Listing to enrich; `link`, `address`, `description` and `id` are read.
 * @param {*} browser - Handed to puppeteerExtractor as `{ browser }`.
 * @returns {Promise<object>} Copy of the listing with `address`/`description`
 *   replaced where the page provided non-empty text, or the original listing
 *   object when no markup was returned or any step threw.
 */
async function fetchDetails(listing, browser) {
  const squash = (text) => text.replace(/\s+/g, ' ').trim();

  try {
    const html = await puppeteerExtractor(listing.link, null, { browser });
    if (!html) {
      return listing;
    }

    const $ = cheerio.load(html);

    // Drop embedded scripts first so their source does not end up in the description.
    $('#freitext_0 script').remove();
    const description = squash($('#freitext_0').text());
    const address = squash($('a[href="#map_container"] .section_panel_detail').text());

    const enriched = { ...listing };
    enriched.address = address || listing.address;
    enriched.description = description || listing.description;
    return enriched;
  } catch (error) {
    logger.warn(`Could not fetch wgGesucht detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
function normalize(o) {
const id = buildHash(o.id, o.price);
const link = `https://www.wg-gesucht.de${o.link}`;
@@ -37,6 +62,7 @@ const config = {
},
normalize: normalize,
filter: applyBlacklist,
fetchDetails,
activeTester: checkIfListingIsActive,
};
export const init = (sourceConfig, blacklist) => {