mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Feature/kleinanzeigen new (#292)
* Feature/Kleinanzeigen addresses (#289) * upgrade dependencies * immoscout_details -> provider_details * fetching details more generic * removing claude action * fixing sparkassen selector * improvements * fixing immobilienDE test * upgrading dependencies * settings for many provider --------- Co-authored-by: Adrian Bach <65734063+realDayaa@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
7888c5b340
commit
cdc0cbda2f
@@ -63,6 +63,7 @@ class FredyPipelineExecutioner {
|
||||
* @param {(raw:any)=>Listing} providerConfig.normalize Function to convert raw scraped data into a Listing shape.
|
||||
* @param {(listing:Listing)=>boolean} providerConfig.filter Function to filter out unwanted listings.
|
||||
* @param {(url:string, waitForSelector?:string)=>Promise<void>|Promise<Listing[]>} [providerConfig.getListings] Optional override to fetch listings.
|
||||
* @param {(listing:Listing, browser:any)=>Promise<Listing>} [providerConfig.fetchDetails] Optional per-listing detail enrichment. Called sequentially, once per new listing, after deduplication. Receives the shared browser instance. Must always resolve (never reject).
|
||||
* @param {Object} notificationConfig Notification configuration passed to notification adapters.
|
||||
* @param {Object} spatialFilter Optional spatial filter configuration.
|
||||
* @param {string} providerId The ID of the provider currently in use.
|
||||
@@ -92,6 +93,7 @@ class FredyPipelineExecutioner {
|
||||
.then(this._normalize.bind(this))
|
||||
.then(this._filter.bind(this))
|
||||
.then(this._findNew.bind(this))
|
||||
.then(this._fetchDetails.bind(this))
|
||||
.then(this._geocode.bind(this))
|
||||
.then(this._save.bind(this))
|
||||
.then(this._calculateDistance.bind(this))
|
||||
@@ -101,6 +103,32 @@ class FredyPipelineExecutioner {
|
||||
.catch(this._handleError.bind(this));
|
||||
}
|
||||
|
||||
/**
|
||||
* Optionally enrich new listings with data from their detail pages.
|
||||
* Only called when the provider config defines a `fetchDetails` function.
|
||||
* Runs all fetches in parallel. Each individual fetch must handle its own errors
|
||||
* and always resolve (never reject) to avoid aborting other listings.
|
||||
*
|
||||
* @param {Listing[]} newListings New listings to enrich.
|
||||
* @returns {Promise<Listing[]>} Resolves with enriched listings.
|
||||
*/
|
||||
async _fetchDetails(newListings) {
|
||||
if (typeof this._providerConfig.fetchDetails !== 'function') {
|
||||
return newListings;
|
||||
}
|
||||
const userId = getJob(this._jobKey)?.userId;
|
||||
const enabledProviders = getUserSettings(userId)?.provider_details ?? [];
|
||||
if (!userId || !Array.isArray(enabledProviders) || !enabledProviders.includes(this._providerId)) {
|
||||
return newListings;
|
||||
}
|
||||
const listingsToEnrich = process.env.NODE_ENV === 'test' ? newListings.slice(0, 1) : newListings;
|
||||
const enriched = [];
|
||||
for (const listing of listingsToEnrich) {
|
||||
enriched.push(await this._providerConfig.fetchDetails(listing, this._browser));
|
||||
}
|
||||
return enriched;
|
||||
}
|
||||
|
||||
/**
|
||||
* Geocode new listings.
|
||||
*
|
||||
|
||||
@@ -97,9 +97,9 @@ userSettingsRouter.post('/news-hash', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
userSettingsRouter.post('/immoscout-details', async (req, res) => {
|
||||
userSettingsRouter.post('/provider-details', async (req, res) => {
|
||||
const userId = req.session.currentUser;
|
||||
const { immoscout_details } = req.body;
|
||||
const { provider_details } = req.body;
|
||||
|
||||
const globalSettings = await getSettings();
|
||||
if (globalSettings.demoMode) {
|
||||
@@ -108,11 +108,17 @@ userSettingsRouter.post('/immoscout-details', async (req, res) => {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!Array.isArray(provider_details)) {
|
||||
res.statusCode = 400;
|
||||
res.send({ error: 'provider_details must be an array of provider ids.' });
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
upsertSettings({ immoscout_details: !!immoscout_details }, userId);
|
||||
upsertSettings({ provider_details }, userId);
|
||||
res.send({ success: true });
|
||||
} catch (error) {
|
||||
logger.error('Error updating immoscout details setting', error);
|
||||
logger.error('Error updating provider details setting', error);
|
||||
res.statusCode = 500;
|
||||
res.send({ error: error.message });
|
||||
}
|
||||
|
||||
@@ -5,6 +5,9 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
@@ -18,6 +21,51 @@ function parseId(shortenedLink) {
|
||||
return shortenedLink.substring(shortenedLink.lastIndexOf('/') + 1);
|
||||
}
|
||||
|
||||
/**
 * Enrich an immobilien.de listing with the description and street address found
 * on its detail page.
 *
 * JSON-LD blocks are consulted first; if they yield no description, a handful of
 * description selectors are tried. Any failure is logged as a warning and the
 * original listing is returned, so this function never rejects.
 *
 * @param {Object} listing Listing to enrich; `listing.link` is the detail page URL.
 * @param {any} browser Shared browser instance handed to the extractor.
 * @returns {Promise<Object>} The enriched listing, or the original one on failure.
 */
async function fetchDetails(listing, browser) {
  const collapseWhitespace = (text) => text.replace(/\s+/g, ' ').trim();
  const fallbackSelectors = ['.beschreibung', '.freitext', '.objektbeschreibung', '.description'];

  try {
    const html = await puppeteerExtractor(listing.link, null, { browser });
    if (!html) {
      return listing;
    }

    const $ = cheerio.load(html);
    let description = null;
    let address = listing.address;

    // Try JSON-LD first
    const readJsonLdScript = (_, scriptEl) => {
      // A description from an earlier script block wins; later blocks are not inspected.
      if (description) {
        return;
      }
      try {
        const parsed = JSON.parse($(scriptEl).text());
        const entries = Array.isArray(parsed) ? parsed : [parsed];
        for (const entry of entries) {
          if (entry.description && !description) {
            description = collapseWhitespace(String(entry.description));
          }
          const postal = entry.address || entry?.mainEntity?.address;
          // Only the first address that carries a street line replaces the list-page address.
          if (postal && postal.streetAddress && address === listing.address) {
            const pieces = [postal.streetAddress, postal.postalCode, postal.addressLocality].filter(Boolean);
            if (pieces.length) {
              address = pieces.join(' ');
            }
          }
        }
      } catch {
        // ignore malformed JSON-LD
      }
    };
    $('script[type="application/ld+json"]').each(readJsonLdScript);

    // Fallback: common description selectors used by immobilien.de
    if (!description) {
      const matchingSelector = fallbackSelectors.find((selector) => $(selector).length > 0);
      if (matchingSelector) {
        description = collapseWhitespace($(matchingSelector).text());
      }
    }

    const enriched = { ...listing, address, description: description || listing.description };
    return enriched;
  } catch (error) {
    logger.warn(`Could not fetch immobilien.de detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
|
||||
|
||||
function normalize(o) {
|
||||
const baseUrl = 'https://www.immobilien.de';
|
||||
const size = o.size || null;
|
||||
@@ -25,8 +73,8 @@ function normalize(o) {
|
||||
const title = o.title || 'No title available';
|
||||
const address = o.address || null;
|
||||
const shortLink = shortenLink(o.link);
|
||||
const link = baseUrl + shortLink;
|
||||
const image = baseUrl + o.image;
|
||||
const link = shortLink ? (shortLink.startsWith('http') ? shortLink : baseUrl + shortLink) : baseUrl;
|
||||
const image = o.image ? (o.image.startsWith('http') ? o.image : baseUrl + o.image) : null;
|
||||
const id = buildHash(parseId(shortLink), o.price);
|
||||
return Object.assign(o, { id, price, size, title, address, link, image });
|
||||
}
|
||||
@@ -39,21 +87,22 @@ function applyBlacklist(o) {
|
||||
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: 'a:has(div.list_entry)',
|
||||
crawlContainer: 'a.lr-card',
|
||||
sortByDateParam: 'sort_col=*created_ts&sort_dir=desc',
|
||||
waitForSelector: 'body',
|
||||
waitForSelector: null,
|
||||
crawlFields: {
|
||||
id: '@href', //will be transformed later
|
||||
price: '.immo_preis .label_info',
|
||||
size: '.flaeche .label_info | removeNewline | trim',
|
||||
title: 'h3 span',
|
||||
price: '.lr-card__price-amount | trim',
|
||||
size: '.lr-card__fact:has(.lr-card__fact-label:contains("Fläche")) .lr-card__fact-value | trim',
|
||||
title: '.lr-card__title | trim',
|
||||
description: '.description | trim',
|
||||
link: '@href',
|
||||
address: '.place',
|
||||
image: 'img@src',
|
||||
address: '.lr-card__address span | trim',
|
||||
image: 'img.lr-card__gallery-img@src',
|
||||
},
|
||||
normalize: normalize,
|
||||
normalize,
|
||||
filter: applyBlacklist,
|
||||
fetchDetails,
|
||||
activeTester: checkIfListingIsActive,
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
|
||||
@@ -46,9 +46,7 @@ import {
|
||||
convertWebToMobile,
|
||||
} from '../services/immoscout/immoscout-web-translator.js';
|
||||
import logger from '../services/logger.js';
|
||||
import { getUserSettings } from '../services/storage/settingsStorage.js';
|
||||
let appliedBlackList = [];
|
||||
let currentUserId = null;
|
||||
|
||||
async function getListings(url) {
|
||||
const response = await fetch(url, {
|
||||
@@ -68,42 +66,40 @@ async function getListings(url) {
|
||||
}
|
||||
|
||||
const responseBody = await response.json();
|
||||
return Promise.all(
|
||||
responseBody.resultListItems
|
||||
.filter((item) => item.type === 'EXPOSE_RESULT')
|
||||
.map(async (expose) => {
|
||||
const item = expose.item;
|
||||
const [price, size] = item.attributes;
|
||||
const image = item?.titlePicture?.full ?? item?.titlePicture?.preview ?? null;
|
||||
let listing = {
|
||||
id: item.id,
|
||||
price: price?.value,
|
||||
size: size?.value,
|
||||
title: item.title,
|
||||
link: `${metaInformation.baseUrl}expose/${item.id}`,
|
||||
address: item.address?.line,
|
||||
image,
|
||||
};
|
||||
if (currentUserId) {
|
||||
const userSettings = getUserSettings(currentUserId);
|
||||
if (userSettings.immoscout_details) {
|
||||
return await pushDetails(listing);
|
||||
}
|
||||
}
|
||||
return listing;
|
||||
}),
|
||||
);
|
||||
return responseBody.resultListItems
|
||||
.filter((item) => item.type === 'EXPOSE_RESULT')
|
||||
.map((expose) => {
|
||||
const item = expose.item;
|
||||
const [price, size] = item.attributes;
|
||||
const image = item?.titlePicture?.full ?? item?.titlePicture?.preview ?? null;
|
||||
return {
|
||||
id: item.id,
|
||||
price: price?.value,
|
||||
size: size?.value,
|
||||
title: item.title,
|
||||
link: `${metaInformation.baseUrl}expose/${item.id}`,
|
||||
address: item.address?.line,
|
||||
image,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Enrich an ImmoScout listing with data from the mobile expose API.
 *
 * `fetchDetails` implementations are expected to always resolve. `pushDetails`
 * can still reject (for example when the request itself fails or the response
 * body is not valid JSON), so any such error is logged as a warning and the
 * unmodified listing is returned instead.
 *
 * @param {Object} listing Listing to enrich.
 * @returns {Promise<Object>} The enriched listing, or the original listing on failure.
 */
async function fetchDetails(listing) {
  try {
    return await pushDetails(listing);
  } catch (error) {
    logger.warn(`Could not fetch ImmoScout details for listing '${listing?.id}'.`, error?.message || error);
    return listing;
  }
}
|
||||
|
||||
async function pushDetails(listing) {
|
||||
const detailed = await fetch(`https://api.mobile.immobilienscout24.de/expose/${listing.id}`, {
|
||||
const exposeId = listing.link?.split('/').pop();
|
||||
const detailed = await fetch(`https://api.mobile.immobilienscout24.de/expose/${exposeId}`, {
|
||||
headers: {
|
||||
'User-Agent': 'ImmoScout_27.3_26.0_._',
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
});
|
||||
if (!detailed.ok) {
|
||||
logger.error('Error fetching listing details from ImmoScout Mobile API:', detailed.statusText);
|
||||
logger.warn(
|
||||
`Error fetching listing details from ImmoScout Mobile API for id: ${exposeId} Status: ${detailed.statusText}`,
|
||||
);
|
||||
return listing;
|
||||
}
|
||||
const detailBody = await detailed.json();
|
||||
@@ -196,13 +192,13 @@ const config = {
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
getListings: getListings,
|
||||
fetchDetails: fetchDetails,
|
||||
activeTester: isListingActive,
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
config.enabled = sourceConfig.enabled;
|
||||
config.url = convertWebToMobile(sourceConfig.url);
|
||||
appliedBlackList = blacklist || [];
|
||||
currentUserId = sourceConfig.userId || null;
|
||||
};
|
||||
export const metaInformation = {
|
||||
name: 'Immoscout',
|
||||
|
||||
@@ -5,9 +5,49 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Enrich an immowelt listing with the description and full address taken from
 * the JSON stored in the detail page's `#__NEXT_DATA__` element
 * (`props.pageProps.classified`).
 *
 * Any failure is logged as a warning and the original listing is returned, so
 * this function never rejects.
 *
 * @param {Object} listing Listing to enrich; `listing.link` is the detail page URL.
 * @param {any} browser Shared browser instance handed to the extractor.
 * @returns {Promise<Object>} The enriched listing, or the original one on failure.
 */
async function fetchDetails(listing, browser) {
  const joinTruthy = (parts, separator) => parts.filter(Boolean).join(separator);

  try {
    const html = await puppeteerExtractor(listing.link, null, { browser });
    if (!html) {
      return listing;
    }

    const nextDataRaw = cheerio.load(html)('#__NEXT_DATA__').text();
    if (!nextDataRaw) {
      return listing;
    }

    const classified = JSON.parse(nextDataRaw)?.props?.pageProps?.classified;
    if (!classified) {
      return listing;
    }

    // One paragraph per text entry: title line followed by content, blank line between entries.
    const paragraphs = (classified.Texts || []).map((text) => joinTruthy([text.Title, text.Content], '\n'));
    const description = joinTruthy(paragraphs, '\n\n');

    const estateAddress = classified.EstateAddress;
    let fullAddress = '';
    if (estateAddress) {
      const street = joinTruthy([estateAddress.Street, estateAddress.HouseNumber], ' ');
      const cityLine = joinTruthy([estateAddress.ZipCode, estateAddress.District || estateAddress.City], ' ');
      fullAddress = joinTruthy([street, cityLine], ', ');
    }

    return {
      ...listing,
      // Keep the list-page values whenever the detail page yields nothing.
      address: fullAddress || listing.address,
      description: description || listing.description,
    };
  } catch (error) {
    logger.warn(`Could not fetch immowelt detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
|
||||
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, { id });
|
||||
@@ -37,6 +77,7 @@ const config = {
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
fetchDetails: fetchDetails,
|
||||
activeTester: checkIfListingIsActive,
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
|
||||
@@ -5,14 +5,151 @@
|
||||
|
||||
import { buildHash, isOneOf } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import logger from '../services/logger.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
let appliedBlackList = [];
|
||||
let appliedBlacklistedDistricts = [];
|
||||
|
||||
/**
 * Turn a Kleinanzeigen link into an absolute URL.
 *
 * @param {string|null|undefined} link Link as scraped; may be site-relative.
 * @returns {string|null} `null` for a falsy link, the link itself when it starts
 *   with `http`, otherwise the link prefixed with the Kleinanzeigen origin.
 */
function toAbsoluteLink(link) {
  if (link) {
    if (link.startsWith('http')) {
      return link;
    }
    return `https://www.kleinanzeigen.de${link}`;
  }
  return null;
}
|
||||
|
||||
/**
 * Convert a value to single-line plain text.
 *
 * Anything that looks like a tag (`<...>`) is replaced by a space, runs of
 * whitespace are collapsed into one space and the result is trimmed.
 *
 * @param {any} value Value to clean; `null` and `undefined` yield an empty string.
 * @returns {string} The cleaned text.
 */
function cleanText(value) {
  if (value === null || value === undefined) {
    return '';
  }
  const withoutTags = String(value).replace(/<[^>]*>/g, ' ');
  const singleSpaced = withoutTags.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
|
||||
|
||||
/**
 * Build a one-line address string from a JSON-LD address object.
 *
 * Format: `<streetAddress>, <postalCode> <addressRegion> - <addressLocality>`,
 * with empty parts (and their separators) left out.
 *
 * @param {Object|null|undefined} address JSON-LD address node.
 * @returns {string|null} The formatted address, or `null` when the input is not
 *   an object or no part is filled.
 */
function buildAddressFromJsonLd(address) {
  const isObject = Boolean(address) && typeof address === 'object';
  if (!isObject) {
    return null;
  }

  const joinFilled = (parts, separator) => parts.filter(Boolean).join(separator);

  const locality = cleanText(address.addressLocality);
  const region = cleanText(address.addressRegion);
  const postalCode = cleanText(address.postalCode);
  const streetAddress = cleanText(address.streetAddress);

  // `cityPart` is empty only when both region and locality are empty, so no
  // further fallback to either of them is needed.
  const cityPart = joinFilled([region, locality], ' - ');
  const tail = joinFilled([postalCode, cityPart], ' ');
  const fullAddress = joinFilled([streetAddress, tail], ', ');

  return fullAddress === '' ? null : fullAddress;
}
|
||||
|
||||
/**
 * Collect every object node of a parsed JSON-LD document into a flat list.
 *
 * Arrays are expanded element by element. Each object is appended and then its
 * `@graph` array, `mainEntity` and `itemOffered` are visited, in that order.
 * Non-object values are ignored.
 *
 * @param {any} node Parsed JSON-LD value (object, array or primitive).
 * @param {Object[]} [acc=[]] Accumulator the nodes are appended to.
 * @returns {Object[]} The accumulator (same array instance as `acc`).
 */
function flattenJsonLdNodes(node, acc = []) {
  if (Array.isArray(node)) {
    for (const entry of node) {
      flattenJsonLdNodes(entry, acc);
    }
    return acc;
  }

  if (node != null && typeof node === 'object') {
    acc.push(node);

    const graph = node['@graph'];
    if (Array.isArray(graph)) {
      for (const entry of graph) {
        flattenJsonLdNodes(entry, acc);
      }
    }

    for (const key of ['mainEntity', 'itemOffered']) {
      if (node[key]) {
        flattenJsonLdNodes(node[key], acc);
      }
    }
  }

  return acc;
}
|
||||
|
||||
/**
 * Extract address and description from the HTML of a Kleinanzeigen detail page.
 *
 * The address rendered in the page (`#street-address` + `#viewad-locality`) is
 * preferred; JSON-LD data is only used for the address when the DOM yields
 * nothing. The description always comes from JSON-LD.
 *
 * @param {string} html Detail page HTML.
 * @returns {{detailAddress: (string|null), detailDescription: (string|null)}}
 *   Values found, `null` for anything that could not be determined.
 */
function extractDetailFromHtml(html) {
  const $ = cheerio.load(html);
  const firstText = (selector) => cleanText($(selector).first().text());

  // Prefer the rendered postal address block from the detail page because
  // it contains the street line that is missing from list results.
  const domAddress = [firstText('#street-address'), firstText('#viewad-locality')].filter(Boolean).join(' ');

  const jsonLdNodes = [];
  $('script[type="application/ld+json"]').each((_, scriptEl) => {
    const raw = $(scriptEl).text();
    if (raw) {
      try {
        flattenJsonLdNodes(JSON.parse(raw), jsonLdNodes);
      } catch {
        // Ignore broken JSON-LD blocks from ads/trackers and keep trying others.
      }
    }
  });

  let detailAddress = domAddress ? domAddress : null;
  let detailDescription = null;

  for (const node of jsonLdNodes) {
    const addressNode = node.address || node?.itemOffered?.address || node?.offers?.address;
    const candidateAddress = buildAddressFromJsonLd(addressNode);
    if (candidateAddress && !detailAddress) {
      detailAddress = candidateAddress;
    }

    const candidateDescription = cleanText(node.description || node?.itemOffered?.description);
    if (candidateDescription && !detailDescription) {
      detailDescription = candidateDescription;
    }

    // Both values found: the remaining nodes cannot change the result.
    if (detailAddress && detailDescription) {
      break;
    }
  }

  return { detailAddress, detailDescription };
}
|
||||
|
||||
/**
 * Load a Kleinanzeigen detail page and merge its address and description into
 * the listing.
 *
 * A listing without a link is returned unchanged. In every other case the
 * result carries the absolute link, even when the page could not be fetched or
 * parsed (a warning is logged then). This function never rejects.
 *
 * @param {Object} listing Listing to enrich.
 * @param {any} browser Shared browser instance handed to the extractor.
 * @returns {Promise<Object>} The enriched listing.
 */
async function enrichListingFromDetails(listing, browser) {
  const absoluteLink = toAbsoluteLink(listing.link);
  if (!absoluteLink) {
    return listing;
  }

  // Fallback result: the list-page data with only the link made absolute.
  const withAbsoluteLink = () => ({ ...listing, link: absoluteLink });

  try {
    const html = await puppeteerExtractor(absoluteLink, null, { browser });
    if (!html) {
      return withAbsoluteLink();
    }

    const details = extractDetailFromHtml(html);
    return {
      ...withAbsoluteLink(),
      address: details.detailAddress || listing.address,
      description: details.detailDescription || listing.description,
    };
  } catch (error) {
    logger.warn(`Could not fetch Kleinanzeigen detail page for listing '${listing.id}'.`, error?.message || error);
    return withAbsoluteLink();
  }
}
|
||||
|
||||
/**
 * Provider hook for per-listing detail enrichment; delegates to
 * `enrichListingFromDetails`.
 *
 * @param {Object} listing Listing to enrich.
 * @param {any} browser Shared browser instance.
 * @returns {Promise<Object>} The enriched listing.
 */
async function fetchDetails(listing, browser) {
  const enriched = await enrichListingFromDetails(listing, browser);
  return enriched;
}
|
||||
|
||||
function normalize(o) {
|
||||
const size = o.size || '--- m²';
|
||||
const id = buildHash(o.id, o.price);
|
||||
const link = `https://www.kleinanzeigen.de${o.link}`;
|
||||
const link = toAbsoluteLink(o.link) || o.link;
|
||||
return Object.assign(o, { id, size, link });
|
||||
}
|
||||
|
||||
@@ -40,12 +177,13 @@ const config = {
|
||||
address: '.aditem-main--top--left | trim | removeNewline',
|
||||
image: 'img@src',
|
||||
},
|
||||
fetchDetails,
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
activeTester: checkIfListingIsActive,
|
||||
};
|
||||
export const metaInformation = {
|
||||
name: 'Ebay Kleinanzeigen',
|
||||
name: 'Kleinanzeigen',
|
||||
baseUrl: 'https://www.kleinanzeigen.de/',
|
||||
id: 'kleinanzeigen',
|
||||
};
|
||||
|
||||
@@ -5,12 +5,60 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Enrich a Sparkasse listing with the description and full address taken from
 * the JSON stored in the detail page's `#__NEXT_DATA__` element
 * (`props.pageProps.estate`).
 *
 * Any failure is logged as a warning and the original listing is returned, so
 * this function never rejects.
 *
 * @param {Object} listing Listing to enrich; `listing.link` is the detail page URL.
 * @param {any} browser Shared browser instance handed to the extractor.
 * @returns {Promise<Object>} The enriched listing, or the original one when the
 *   page is empty, lacks the expected data, or cannot be fetched or parsed.
 */
async function fetchDetails(listing, browser) {
  try {
    const html = await puppeteerExtractor(listing.link, 'body', { browser });
    // Nothing was fetched: keep the list-page data instead of parsing an empty document.
    if (!html) return listing;

    const $ = cheerio.load(html);
    // `.text` is a method and must be called; passing the function itself to
    // JSON.parse throws, which made every enrichment fall into the catch branch.
    const nextDataRaw = $('#__NEXT_DATA__').text();
    if (!nextDataRaw) return listing;

    const estate = JSON.parse(nextDataRaw)?.props?.pageProps?.estate;
    if (!estate) return listing;

    // One paragraph per frontend item that has text content: label line first,
    // then each text entry of its `contentBoxes`, items separated by a blank line.
    const description = (estate.frontendItems || [])
      .map((item) => {
        const texts = (item.contents || [])
          .filter((c) => c.type === 'contentBoxes')
          .flatMap((c) => c.data || [])
          .filter((d) => d.type === 'text' && d.content)
          .map((d) => d.content);
        if (!texts.length) return null;
        return [item.label, ...texts].filter(Boolean).join('\n');
      })
      .filter(Boolean)
      .join('\n\n');

    const addr = estate.address;
    let address = listing.address;
    if (addr) {
      const street = [addr.street, addr.streetNumber].filter(Boolean).join(' ');
      const cityLine = [addr.zip, addr.city].filter(Boolean).join(' ');
      const full = [street, cityLine].filter(Boolean).join(', ');
      if (full) address = full;
    }

    return {
      ...listing,
      address,
      // Keep the list-page description when the detail page yields none.
      description: description || listing.description,
    };
  } catch (error) {
    logger.warn(`Could not fetch Sparkasse detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
|
||||
|
||||
function normalize(o) {
|
||||
const originalId = o.id.split('/').pop().replace('.html', '');
|
||||
const id = buildHash(originalId, o.price);
|
||||
const size = o.size?.replace(' Wohnfläche', '') ?? null;
|
||||
const size = o.size?.replace(' Wohnfläche', '').replace(' m²', 'm²') ?? null;
|
||||
const title = o.title || 'No title available';
|
||||
const link = o.link != null ? `https://immobilien.sparkasse.de${o.link}` : config.url;
|
||||
return Object.assign(o, { id, size, title, link });
|
||||
@@ -22,20 +70,21 @@ function applyBlacklist(o) {
|
||||
}
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '.estate-list-item-row',
|
||||
crawlContainer: 'div[data-testid="estate-link"]',
|
||||
sortByDateParam: 'sortBy=date_desc',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: 'div[data-testid="estate-link"] a@href',
|
||||
id: 'a@href',
|
||||
title: 'h3 | trim',
|
||||
price: '.estate-list-price | trim',
|
||||
size: '.estate-mainfact:first-child span | trim',
|
||||
size: '.estate-mainfact span | trim',
|
||||
address: 'h6 | trim',
|
||||
image: '.estate-list-item-image-container img@src',
|
||||
link: 'div[data-testid="estate-link"] a@href',
|
||||
image: 'img@src',
|
||||
link: 'a@href',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
fetchDetails,
|
||||
activeTester: (url) => checkIfListingIsActive(url, 'Angebot nicht gefunden'),
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
|
||||
@@ -5,9 +5,34 @@
|
||||
|
||||
import { isOneOf, buildHash } from '../utils.js';
|
||||
import checkIfListingIsActive from '../services/listings/listingActiveTester.js';
|
||||
import puppeteerExtractor from '../services/extractor/puppeteerExtractor.js';
|
||||
import * as cheerio from 'cheerio';
|
||||
import logger from '../services/logger.js';
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Enrich a wg-gesucht listing with the free-text description (`#freitext_0`)
 * and the address shown next to the map link on its detail page.
 *
 * Any failure is logged as a warning and the original listing is returned, so
 * this function never rejects.
 *
 * @param {Object} listing Listing to enrich; `listing.link` is the detail page URL.
 * @param {any} browser Shared browser instance handed to the extractor.
 * @returns {Promise<Object>} The enriched listing, or the original one on failure.
 */
async function fetchDetails(listing, browser) {
  try {
    const html = await puppeteerExtractor(listing.link, null, { browser });
    if (!html) {
      return listing;
    }

    const $ = cheerio.load(html);
    const singleLineText = (selector) => $(selector).text().replace(/\s+/g, ' ').trim();

    // Drop inline scripts first so their source does not end up in the description text.
    $('#freitext_0 script').remove();
    const description = singleLineText('#freitext_0');
    const address = singleLineText('a[href="#map_container"] .section_panel_detail');

    const enriched = { ...listing };
    enriched.address = address || listing.address;
    enriched.description = description || listing.description;
    return enriched;
  } catch (error) {
    logger.warn(`Could not fetch wgGesucht detail page for listing '${listing.id}'.`, error?.message || error);
    return listing;
  }
}
|
||||
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
const link = `https://www.wg-gesucht.de${o.link}`;
|
||||
@@ -37,6 +62,7 @@ const config = {
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
fetchDetails,
|
||||
activeTester: checkIfListingIsActive,
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
|
||||
@@ -94,12 +94,34 @@ export async function applyBotPreventionToPage(page, cfg) {
|
||||
// webdriver
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||
|
||||
// chrome runtime
|
||||
// chrome runtime — expose loadTimes, csi and app like real Chrome
|
||||
// @ts-ignore
|
||||
if (!window.chrome) {
|
||||
window.chrome = {
|
||||
runtime: {},
|
||||
// @ts-ignore
|
||||
window.chrome = { runtime: {} };
|
||||
}
|
||||
loadTimes: () => ({
|
||||
requestTime: performance.timeOrigin / 1000,
|
||||
startLoadTime: performance.timeOrigin / 1000,
|
||||
commitLoadTime: performance.timeOrigin / 1000 + 0.1,
|
||||
finishDocumentLoadTime: 0,
|
||||
finishLoadTime: 0,
|
||||
firstPaintTime: 0,
|
||||
firstPaintAfterLoadTime: 0,
|
||||
navigationType: 'Other',
|
||||
wasFetchedViaSpdy: false,
|
||||
wasNpnNegotiated: false,
|
||||
npnNegotiatedProtocol: '',
|
||||
wasAlternateProtocolAvailable: false,
|
||||
connectionInfo: 'http/1.1',
|
||||
}),
|
||||
// @ts-ignore
|
||||
csi: () => ({ startE: performance.timeOrigin, onloadT: Date.now(), pageT: performance.now(), tran: 15 }),
|
||||
app: {
|
||||
isInstalled: false,
|
||||
InstallState: { DISABLED: 'disabled', INSTALLED: 'installed', NOT_INSTALLED: 'not_installed' },
|
||||
RunningState: { CANNOT_RUN: 'cannot_run', READY_TO_RUN: 'ready_to_run', RUNNING: 'running' },
|
||||
},
|
||||
};
|
||||
|
||||
// languages
|
||||
// @ts-ignore
|
||||
@@ -107,23 +129,38 @@ export async function applyBotPreventionToPage(page, cfg) {
|
||||
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
|
||||
});
|
||||
|
||||
// plugins
|
||||
// plugins — mimic real Chrome's built-in PDF plugins
|
||||
const makePlugin = (name, filename, description, mimeType, mimeTypeSuffix) => {
|
||||
const mimeObj = { type: mimeType, suffixes: mimeTypeSuffix, description, enabledPlugin: null };
|
||||
const plugin = { name, filename, description, length: 1, 0: mimeObj };
|
||||
mimeObj.enabledPlugin = plugin;
|
||||
return plugin;
|
||||
};
|
||||
const fakePlugins = [
|
||||
makePlugin('PDF Viewer', 'internal-pdf-viewer', 'Portable Document Format', 'application/pdf', 'pdf'),
|
||||
makePlugin('Chrome PDF Viewer', 'internal-pdf-viewer', 'Portable Document Format', 'application/pdf', 'pdf'),
|
||||
makePlugin('Chromium PDF Viewer', 'internal-pdf-viewer', 'Portable Document Format', 'application/pdf', 'pdf'),
|
||||
makePlugin(
|
||||
'Microsoft Edge PDF Viewer',
|
||||
'internal-pdf-viewer',
|
||||
'Portable Document Format',
|
||||
'application/pdf',
|
||||
'pdf',
|
||||
),
|
||||
makePlugin('WebKit built-in PDF', 'internal-pdf-viewer', 'Portable Document Format', 'application/pdf', 'pdf'),
|
||||
];
|
||||
// @ts-ignore
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [{}, {}, {}],
|
||||
});
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => fakePlugins });
|
||||
// @ts-ignore
|
||||
Object.defineProperty(navigator, 'mimeTypes', { get: () => [fakePlugins[0][0]] });
|
||||
|
||||
// platform and concurrency hints
|
||||
// @ts-ignore
|
||||
Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
|
||||
// @ts-ignore
|
||||
if (typeof navigator.hardwareConcurrency === 'number' && navigator.hardwareConcurrency < 2) {
|
||||
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 });
|
||||
}
|
||||
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 8 });
|
||||
// @ts-ignore
|
||||
if (typeof navigator.deviceMemory === 'number' && navigator.deviceMemory < 2) {
|
||||
Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
|
||||
}
|
||||
Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
|
||||
|
||||
// userAgentData (Client Hints)
|
||||
try {
|
||||
@@ -236,6 +273,21 @@ export async function applyBotPreventionToPage(page, cfg) {
|
||||
} catch {
|
||||
//noop
|
||||
}
|
||||
|
||||
// document.hasFocus — headless returns false; real active tabs return true
|
||||
try {
|
||||
document.hasFocus = () => true;
|
||||
} catch {
|
||||
//noop
|
||||
}
|
||||
|
||||
// screen color depth — normalise in case headless reports 0
|
||||
try {
|
||||
Object.defineProperty(screen, 'colorDepth', { get: () => 24 });
|
||||
Object.defineProperty(screen, 'pixelDepth', { get: () => 24 });
|
||||
} catch {
|
||||
//noop
|
||||
}
|
||||
} catch {
|
||||
//noop
|
||||
}
|
||||
@@ -273,6 +325,8 @@ export async function applyPostNavigationHumanSignals(page, cfg) {
|
||||
const my = Math.floor(vh * (0.3 + Math.random() * 0.4));
|
||||
await page.mouse.move(mx, my, { steps: 10 + Math.floor(Math.random() * 10) });
|
||||
await page.mouse.wheel({ deltaY: 100 + Math.floor(Math.random() * 200) });
|
||||
await new Promise((res) => setTimeout(res, 150 + Math.floor(Math.random() * 200)));
|
||||
await page.mouse.wheel({ deltaY: -(30 + Math.floor(Math.random() * 60)) });
|
||||
} catch {
|
||||
// ignore if mouse is unavailable
|
||||
}
|
||||
|
||||
@@ -110,6 +110,7 @@ export default async function execute(url, waitForSelector, options) {
|
||||
// Navigation
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: options?.waitUntil || 'domcontentloaded',
|
||||
timeout: options?.puppeteerTimeout || 60000,
|
||||
});
|
||||
|
||||
// Optionally wait and add subtle human-like interactions
|
||||
|
||||
17
lib/services/storage/migrations/sql/13.provider-details.js
Normal file
17
lib/services/storage/migrations/sql/13.provider-details.js
Normal file
@@ -0,0 +1,17 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
// We have moved the previous immoscout_details setting to provider_details.
|
||||
// The migrated setting is reset to false by default, as fetching details greatly increases the chance of being detected as a bot.
|
||||
export function up(db) {
|
||||
db.exec(`
|
||||
UPDATE settings
|
||||
SET name = 'provider_details', value = false
|
||||
WHERE name = 'immoscout_details'
|
||||
AND NOT EXISTS (
|
||||
SELECT 1 FROM settings WHERE name = 'provider_details'
|
||||
);
|
||||
`);
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
// Convert provider_details from a boolean to an array of provider id strings.
|
||||
// Users will re-configure which providers they want to fetch details from.
|
||||
/**
 * Reset the `provider_details` setting to an empty JSON array.
 *
 * Existing rows named `provider_details` are overwritten; if none exists, a new
 * row is inserted.
 *
 * @param {Object} db Database handle exposing `prepare(sql)` with `get()` / `run()`.
 */
export function up(db) {
  const emptyList = JSON.stringify([]);
  const existing = db.prepare("SELECT value FROM settings WHERE name = 'provider_details'").get();

  const writeSql = existing
    ? "UPDATE settings SET value = ? WHERE name = 'provider_details'"
    : "INSERT INTO settings (name, value) VALUES ('provider_details', ?)";

  db.prepare(writeSql).run(emptyList);
}
|
||||
Reference in New Issue
Block a user