mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
* feat(): create map component, add area filtering to the job config * feat(): filter listings by area filter * chore(): cleanup * feat(): solve feedback * feat(): solve most providers * feat(): solve maybe other providers * feat(): add specFilter config, also add rooms to listing * feat(): change tests * feat(): fix kleinanzeigen parser * feat(): add spec filter switch for listing overviews * feat(): add rooms and size to the overview and detail of a listing * feat(): rem label * feat(): add types, update providers, they now return specs as numbers * feat(): add jsonconfig to enable type checks * feat: add type for providerConfig, add fieldNames per provider * feat: fix tests, provider, add formatListing * chore: remove duplicates * feat(): fix tests * feat: fix immoscout * chore: geojson typing * feat: solve requested changes
222 lines
6.6 KiB
JavaScript
Executable File
222 lines
6.6 KiB
JavaScript
Executable File
/*
|
|
* Copyright (c) 2026 by Christian Kellner.
|
|
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
|
*/
|
|
|
|
import { buildHash, isOneOf } from '../utils.js';
|
|
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
|
import { extractNumber } from '../utils/extract-number.js';
|
|
/** @import { ParsedListing } from '../types/listing.js' */
|
|
/** @import { ProviderConfig } from '../types/providerConfig.js' */
|
|
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
|
import logger from '../services/logger.js';
|
|
import * as cheerio from 'cheerio';
|
|
|
|
/**
 * Blacklist terms matched against a listing's title and description.
 * Replaced on every call to `init`.
 */
let appliedBlackList = [];

/**
 * Blacklisted district names matched against a listing's description.
 * Replaced on every call to `init`.
 */
let appliedBlacklistedDistricts = [];
|
|
|
|
/**
 * Turns a link scraped from Kleinanzeigen into an absolute URL.
 *
 * - Links that already carry an `http://` or `https://` scheme are returned unchanged.
 * - Relative links are resolved against `https://www.kleinanzeigen.de`; a missing
 *   leading slash is added so the path is never glued onto the host name.
 * - Empty or non-string input yields `null`.
 *
 * @param {unknown} link Raw link value (usually an `href` attribute).
 * @returns {string | null} Absolute URL, or `null` when no usable link was given.
 */
function toAbsoluteLink(link) {
  if (typeof link !== 'string' || link === '') return null;

  // Only a real scheme marks an absolute URL; a bare "http" prefix does not.
  if (/^https?:\/\//i.test(link)) return link;

  const path = link.startsWith('/') ? link : `/${link}`;
  return `https://www.kleinanzeigen.de${path}`;
}
|
|
|
|
/**
 * Converts an arbitrary value into a single-line plain-text string.
 *
 * Anything that looks like an HTML tag is replaced by a space, runs of
 * whitespace are collapsed to one space and the result is trimmed.
 *
 * @param {unknown} value Value to clean; `null`/`undefined` become `''`.
 * @returns {string} The cleaned text.
 */
function cleanText(value) {
  if (value === null || value === undefined) {
    return '';
  }

  const withoutTags = String(value).replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
|
|
|
/**
 * Builds a one-line address string from a JSON-LD address object.
 *
 * Reads `streetAddress`, `postalCode`, `addressRegion` and `addressLocality`,
 * cleans each with `cleanText`, and joins the non-empty parts as
 * `"<street>, <postal code> <region> - <locality>"`.
 *
 * @param {unknown} address Candidate address node.
 * @returns {string | null} The formatted address, or `null` when `address` is not
 *   an object or none of the fields yields text.
 */
function buildAddressFromJsonLd(address) {
  const isObjectLike = Boolean(address) && typeof address === 'object';
  if (!isObjectLike) return null;

  const joinPresent = (parts, separator) => parts.filter(Boolean).join(separator);

  const locality = cleanText(address.addressLocality);
  const region = cleanText(address.addressRegion);
  const zip = cleanText(address.postalCode);
  const street = cleanText(address.streetAddress);

  const city = joinPresent([region, locality], ' - ');
  const zipAndCity = joinPresent([zip, city || locality || region], ' ');
  const formatted = joinPresent([street, zipAndCity], ', ');

  return formatted || null;
}
|
|
|
|
/**
 * Collects every object node of a parsed JSON-LD document into a flat list.
 *
 * Arrays are walked element by element. Each object is appended to `acc` and
 * then its `@graph` entries (only when `@graph` is an array), its `mainEntity`
 * and its `itemOffered` are visited in that order. Non-object values are skipped.
 *
 * @param {unknown} node Parsed JSON-LD value (object, array or primitive).
 * @param {object[]} [acc] Accumulator that receives the nodes; it is mutated.
 * @returns {object[]} The same `acc` array.
 */
function flattenJsonLdNodes(node, acc = []) {
  if (Array.isArray(node)) {
    for (const entry of node) {
      flattenJsonLdNodes(entry, acc);
    }
    return acc;
  }

  if (node === null || node === undefined || typeof node !== 'object') {
    return acc;
  }

  acc.push(node);

  // Recursing into a missing or primitive child is a no-op, so no per-child guards are needed.
  const graph = node['@graph'];
  const children = [...(Array.isArray(graph) ? graph : []), node.mainEntity, node.itemOffered];
  for (const child of children) {
    flattenJsonLdNodes(child, acc);
  }

  return acc;
}
|
|
|
|
/**
 * Extracts address and description from the HTML of a listing detail page.
 *
 * The address rendered in `#street-address` / `#viewad-locality` wins when
 * present; otherwise the first usable address found in the page's JSON-LD
 * blocks is used. The description is taken from the first JSON-LD node that
 * provides one.
 *
 * @param {string} html Markup of the detail page.
 * @returns {{ detailAddress: string | null, detailDescription: string | null }}
 *   The extracted values; each is `null` when nothing was found.
 */
function extractDetailFromHtml(html) {
  const $ = cheerio.load(html);

  // The rendered address block carries the street line that the list results lack,
  // so it takes precedence over anything found in JSON-LD.
  const street = cleanText($('#street-address').first().text());
  const locality = cleanText($('#viewad-locality').first().text());
  const addressFromDom = [street, locality].filter(Boolean).join(' ');

  const jsonLdNodes = [];
  $('script[type="application/ld+json"]').each((_, scriptElement) => {
    const raw = $(scriptElement).text();
    if (!raw) return;

    try {
      flattenJsonLdNodes(JSON.parse(raw), jsonLdNodes);
    } catch {
      // A malformed JSON-LD block (e.g. from ads/trackers) is skipped on purpose;
      // the remaining blocks are still inspected.
    }
  });

  let detailAddress = addressFromDom ? addressFromDom : null;
  let detailDescription = null;

  for (const node of jsonLdNodes) {
    const addressSource = node.address || node?.itemOffered?.address || node?.offers?.address;
    const addressCandidate = buildAddressFromJsonLd(addressSource);
    if (!detailAddress && addressCandidate) {
      detailAddress = addressCandidate;
    }

    const descriptionCandidate = cleanText(node.description || node?.itemOffered?.description);
    if (!detailDescription && descriptionCandidate) {
      detailDescription = descriptionCandidate;
    }

    // Both values found: no need to look at further nodes.
    if (detailAddress && detailDescription) break;
  }

  return { detailAddress, detailDescription };
}
|
|
|
|
/**
 * Loads a listing's detail page and merges the address and description found
 * there into the listing.
 *
 * The listing is returned untouched when it has no usable link. When the page
 * cannot be fetched or yields no HTML, the listing is returned with only its
 * link made absolute; a fetch or parse error is logged as a warning and not
 * rethrown.
 *
 * @param {object} listing Listing to enrich; `link`, `id`, `address` and `description` are read.
 * @param {unknown} browser Browser handle forwarded to `puppeteerExtractor` as the `browser` option.
 * @returns {Promise<object>} A new listing object, or the original `listing` when it has no link.
 */
async function enrichListingFromDetails(listing, browser) {
  const absoluteLink = toAbsoluteLink(listing.link);
  if (!absoluteLink) {
    return listing;
  }

  const withAbsoluteLink = () => ({ ...listing, link: absoluteLink });

  try {
    const html = await puppeteerExtractor(absoluteLink, null, { browser });
    if (!html) {
      return withAbsoluteLink();
    }

    const details = extractDetailFromHtml(html);

    // Detail-page values win; the values from the list result are the fallback.
    return {
      ...withAbsoluteLink(),
      address: details.detailAddress || listing.address,
      description: details.detailDescription || listing.description,
    };
  } catch (error) {
    logger.warn(`Could not fetch Kleinanzeigen detail page for listing '${listing.id}'.`, error?.message || error);
    return withAbsoluteLink();
  }
}
|
|
|
|
/**
 * Detail-fetch hook registered as `config.fetchDetails`.
 * Delegates directly to `enrichListingFromDetails`.
 *
 * @param {object} listing Listing to enrich.
 * @param {unknown} browser Browser handle passed through unchanged.
 * @returns {Promise<object>} Whatever `enrichListingFromDetails` resolves to.
 */
async function fetchDetails(listing, browser) {
  const enriched = enrichListingFromDetails(listing, browser);
  return enriched;
}
|
|
|
|
/**
 * Maps a raw crawled entry onto the listing shape used by this provider.
 *
 * `size` and `rooms` are taken from the `·`-separated `tags` string: the first
 * part containing `m²` and the first part containing `Zi.` respectively. The
 * id is a hash built from the raw id and the raw price.
 *
 * @param {any} o Raw crawled entry.
 * @returns {ParsedListing}
 */
function normalize(o) {
  const tagParts = (o.tags || '').split('·').map((part) => part.trim());
  const firstTagWith = (marker) => tagParts.find((part) => part.includes(marker));

  const sizeTag = firstTagWith('m²');
  const roomsTag = firstTagWith('Zi.');

  return {
    id: buildHash(o.id, o.price),
    title: o.title,
    // Fall back to the raw link when it cannot be made absolute.
    link: toAbsoluteLink(o.link) || o.link,
    price: extractNumber(o.price),
    size: extractNumber(sizeTag),
    rooms: extractNumber(roomsTag),
    address: o.address,
    description: o.description,
    image: o.image,
  };
}
|
|
|
|
/**
 * Decides whether a listing passes the configured blacklists.
 *
 * A listing is kept only if it has a title, neither its title nor its
 * description matches `appliedBlackList`, and its description does not match
 * `appliedBlacklistedDistricts` (that check is skipped when the list is empty).
 *
 * @param {ParsedListing} o
 * @returns {boolean} `true` when the listing should be kept.
 */
function applyBlacklist(o) {
  const titleBlacklisted = isOneOf(o.title, appliedBlackList);
  const descriptionBlacklisted = isOneOf(o.description, appliedBlackList);
  const districtBlacklisted =
    appliedBlacklistedDistricts.length !== 0 && isOneOf(o.description, appliedBlacklistedDistricts);

  const hasTitle = o.title != null;
  return hasTitle && !(districtBlacklisted || titleBlacklisted || descriptionBlacklisted);
}
|
|
|
|
/**
 * Provider configuration for Kleinanzeigen: the selectors used to crawl the
 * result list plus the hooks for normalising, filtering and enriching listings.
 * `url` (and `enabled`) are filled in by `init`.
 *
 * @type {ProviderConfig}
 */
const config = {
  fieldNames: ['id', 'link', 'title', 'price', 'size', 'rooms', 'address', 'image', 'description'],
  url: null,
  crawlContainer: '#srchrslt-adtable .ad-listitem ',
  // Results are sorted by date by default, so no sort parameter is set.
  sortByDateParam: null,
  waitForSelector: 'body',
  crawlFields: {
    id: '.aditem@data-adid',
    price: '.aditem-main--middle--price-shipping--price | removeNewline | trim',
    tags: '.aditem-main--middle--tags | removeNewline | trim',
    title: '.aditem-main .text-module-begin a | removeNewline | trim',
    link: '.aditem-main .text-module-begin a@href | removeNewline | trim',
    description: '.aditem-main .aditem-main--middle--description | removeNewline | trim',
    address: '.aditem-main--top--left | trim | removeNewline',
    image: 'img@src',
  },
  fetchDetails,
  normalize,
  filter: applyBlacklist,
  activeTester: checkIfListingIsActive,
};
|
|
/**
 * Static descriptor of this provider: display name, base URL and identifier.
 */
export const metaInformation = {
  name: 'Kleinanzeigen',
  baseUrl: 'https://www.kleinanzeigen.de/',
  id: 'kleinanzeigen',
};
|
|
/**
 * Applies job-specific settings to this provider.
 *
 * Copies `enabled` and `url` from `sourceConfig` onto `config` and replaces the
 * module-level blacklists; a falsy blacklist argument is treated as an empty list.
 *
 * @param {{ enabled: any, url: any }} sourceConfig Source settings for this provider.
 * @param {any[]} [blacklist] Terms used to reject listings by title/description.
 * @param {any[]} [blacklistedDistricts] Districts used to reject listings by description.
 * @returns {void}
 */
export const init = (sourceConfig, blacklist, blacklistedDistricts) => {
  const { enabled, url } = sourceConfig;
  config.enabled = enabled;
  config.url = url;

  appliedBlackList = blacklist || [];
  appliedBlacklistedDistricts = blacklistedDistricts || [];
};
|
|
export { config };
|