mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Compare commits
21 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3aae81ca19 | ||
|
|
f1effe941f | ||
|
|
cd3631f910 | ||
|
|
8f490f2426 | ||
|
|
48e2ca942f | ||
|
|
b9e4bca244 | ||
|
|
a138dafc31 | ||
|
|
c6bb3c44d4 | ||
|
|
a3471a091a | ||
|
|
b5a96afcc8 | ||
|
|
3903ab59cf | ||
|
|
8fe7cec2a1 | ||
|
|
97deea6f5b | ||
|
|
1ecbbdd774 | ||
|
|
e1db3840f6 | ||
|
|
26127eeac1 | ||
|
|
90a4ee5dcf | ||
|
|
2aaf63c253 | ||
|
|
f52e3e9fd8 | ||
|
|
0d69232395 | ||
|
|
b473cf7fb4 |
20
.github/workflows/test.yml
vendored
20
.github/workflows/test.yml
vendored
@@ -1,23 +1,23 @@
|
||||
name: Test
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
branches: [master]
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
branches: [master]
|
||||
schedule:
|
||||
- cron: '0 12 * * *'
|
||||
- cron: '0 12 * * *'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: Test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v1
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v2.5.1
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: 'yarn'
|
||||
|
||||
- run: yarn install
|
||||
- run: yarn run test
|
||||
- run: yarn test
|
||||
|
||||
@@ -4,6 +4,11 @@ WORKDIR /fredy
|
||||
|
||||
COPY . /fredy
|
||||
|
||||
RUN apt-get update && apt-get install -y chromium
|
||||
|
||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
|
||||
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
|
||||
|
||||
RUN yarn install
|
||||
|
||||
RUN yarn global add pm2
|
||||
|
||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Christian Kellner
|
||||
Copyright (c) 2025 Christian Kellner
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -11,7 +11,7 @@ If _Fredy_ finds matching results, it will send them to you via Slack, Email, Te
|
||||
# Sponsorship [](https://github.com/sponsors/orangecoding)
|
||||
If you like my work, consider becoming a sponsor. I'm not expecting anybody to pay for _Fredy_ or any other Open Source Project I'm maintaining, however keep in mind, I'm doing all of this in my spare time :) Thanks.
|
||||
|
||||
<img src="https://github.com/orangecoding/fredy/blob/master/doc/jetbrains.png" width="200">
|
||||
[](https://jb.gg/OpenSourceSupport)
|
||||
|
||||
_Fredy_ is supported by JetBrains under Open Source Support Program
|
||||
|
||||
@@ -82,7 +82,7 @@ yarn run test
|
||||

|
||||
|
||||
### Immoscout
|
||||
Immoscout has implemented advanced bot detection. I’m actively working on bypassing these measures, but until then, selecting Immoscout as a provider will not return any results. I apologize for the inconvenience. 😉
|
||||
Immoscout has implemented advanced bot detection. To work around this, we are using a reverse-engineered version of their mobile API. For now, only real estate rentals are supported. Purchases will be supported at a later point in time.
|
||||
|
||||
# Analytics
|
||||
Fredy is completely free (and will always remain free). However, it would be a huge help if you’d allow me to collect some analytical data.
|
||||
|
||||
@@ -1,118 +1,124 @@
|
||||
import {NoNewListingsWarning} from './errors.js';
|
||||
import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js';
|
||||
import { NoNewListingsWarning } from './errors.js';
|
||||
import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js';
|
||||
import * as notify from './notification/notify.js';
|
||||
import Extractor from './services/extractor/extractor.js';
|
||||
import urlModifier from './services/queryStringMutator.js';
|
||||
|
||||
/**
 * Runs a single provider for a single job: fetches the listings, normalizes and filters
 * them, stores the ones not seen before and hands them to the notification adapters.
 */
class FredyRuntime {
  /**
   *
   * @param providerConfig the config for the specific provider, we're going to query at the moment
   * @param notificationConfig the config for all notifications
   * @param providerId the id of the provider currently in use
   * @param jobKey key of the job that is currently running (from within the config)
   * @param similarityCache cache instance holding values to check for similarity of entries
   */
  constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
    this._providerConfig = providerConfig;
    this._notificationConfig = notificationConfig;
    this._providerId = providerId;
    this._jobKey = jobKey;
    this._similarityCache = similarityCache;
  }

  /**
   * Runs the whole pipeline for the configured provider.
   *
   * Any error thrown by one of the steps (including the NoNewListingsWarning used to
   * stop early) ends up in `_handleError`, so the returned promise does not reject
   * because of a failing step.
   *
   * @returns {Promise} resolves with the notified listings, or with undefined when a
   * step failed or there was nothing new
   */
  execute() {
    return (
      //modify the url to make sure search order is correctly set
      Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
        //scraping the site and try finding new listings; a provider may bring its own
        //getListings implementation, otherwise the generic extractor is used
        .then(this._providerConfig.getListings?.bind(this) ?? this._getListings.bind(this))
        //bring them in a proper form (dictated by the provider)
        .then(this._normalize.bind(this))
        //filter listings with stuff tagged by the blacklist of the provider
        .then(this._filter.bind(this))
        //check if new listings available. if so proceed
        .then(this._findNew.bind(this))
        //store everything in db
        .then(this._save.bind(this))
        //check for similar listings. if found, remove them before notifying
        .then(this._filterBySimilarListings.bind(this))
        //notify the user using the configured notification adapter
        .then(this._notify.bind(this))
        //if an error occurred on the way, handle it here.
        .catch(this._handleError.bind(this))
    );
  }

  /**
   * Default way of retrieving listings: load the url with the Extractor and parse the
   * response using the provider's crawlContainer / crawlFields.
   *
   * @param url the url to load
   * @returns {Promise<Array>} the parsed listings, an empty array if nothing was parsed
   */
  async _getListings(url) {
    const extractor = new Extractor();
    try {
      await extractor.execute(url, this._providerConfig.waitForSelector);
      const listings = extractor.parseResponseText(
        this._providerConfig.crawlContainer,
        this._providerConfig.crawlFields,
        url,
      );
      return listings == null ? [] : listings;
    } catch (err) {
      /* eslint-disable no-console */
      console.error(err);
      /* eslint-enable no-console */
      throw err;
    }
  }

  /**
   * Maps every listing through the provider's normalize function.
   */
  _normalize(listings) {
    return listings.map(this._providerConfig.normalize);
  }

  /**
   * Drops listings that lack one of the keys declared in crawlFields, then applies the
   * provider's own filter function.
   */
  _filter(listings) {
    //only return those where all the fields have been found
    const keys = Object.keys(this._providerConfig.crawlFields);
    const filteredListings = listings.filter((item) => keys.every((key) => key in item));
    return filteredListings.filter(this._providerConfig.filter);
  }

  /**
   * Returns the listings whose id is not stored yet for this job / provider.
   *
   * @throws {NoNewListingsWarning} if every listing is already known
   */
  _findNew(listings) {
    // Read the storage once instead of once per listing, and fall back to an empty
    // object the same way _save does when nothing has been stored yet.
    const knownListings = getKnownListings(this._jobKey, this._providerId) || {};
    const newListings = listings.filter((listing) => knownListings[listing.id] == null);
    if (newListings.length === 0) {
      throw new NoNewListingsWarning();
    }
    return newListings;
  }

  /**
   * Sends the listings to the notification adapters and waits for all of them.
   *
   * @returns {Promise<Array>} resolves with the listings that were passed in
   * @throws {NoNewListingsWarning} if there is nothing to notify about
   */
  _notify(newListings) {
    if (newListings.length === 0) {
      throw new NoNewListingsWarning();
    }
    const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
    return Promise.all(sendNotifications).then(() => newListings);
  }

  /**
   * Stores the ids of the given listings (with the current timestamp) for this job / provider.
   *
   * @returns the listings that were passed in
   */
  _save(newListings) {
    const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
    newListings.forEach((listing) => {
      currentListings[listing.id] = Date.now();
    });
    setKnownListings(this._jobKey, this._providerId, currentListings);
    return newListings;
  }

  /**
   * Removes listings for which the similarity cache reports a similar title for this
   * job, and adds the titles of the remaining listings to the cache.
   */
  _filterBySimilarListings(listings) {
    const filteredList = listings.filter((listing) => {
      const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
      if (similar) {
        /* eslint-disable no-console */
        console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
        /* eslint-enable no-console */
      }
      return !similar;
    });
    filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
    return filteredList;
  }

  /**
   * Logs every error except the NoNewListingsWarning, which only signals that the
   * pipeline stopped because there was nothing to do.
   */
  _handleError(err) {
    if (err.name !== 'NoNewListingsWarning') console.error(err);
  }
}
|
||||
|
||||
export default FredyRuntime;
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import restana from 'restana';
|
||||
import fetch from 'node-fetch';
|
||||
import * as jobStorage from '../../services/storage/jobStorage.js';
|
||||
import * as userStorage from '../../services/storage/userStorage.js';
|
||||
import * as immoscoutProvider from '../../provider/immoscout.js';
|
||||
import { config } from '../../utils.js';
|
||||
import { isAdmin } from '../security.js';
|
||||
import {trackDemoJobCreated} from '../../services/tracking/Tracker.js';
|
||||
import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
|
||||
const service = restana();
|
||||
const jobRouter = service.newRouter();
|
||||
function doesJobBelongsToUser(job, req) {
|
||||
@@ -28,7 +26,7 @@ jobRouter.get('/', async (req, res) => {
|
||||
jobRouter.get('/processingTimes', async (req, res) => {
|
||||
res.body = {
|
||||
interval: config.interval,
|
||||
lastRun: config.lastRun || null
|
||||
lastRun: config.lastRun || null,
|
||||
};
|
||||
res.send();
|
||||
});
|
||||
@@ -51,7 +49,7 @@ jobRouter.post('/', async (req, res) => {
|
||||
trackDemoJobCreated({
|
||||
name,
|
||||
provider,
|
||||
adapter: notificationAdapter
|
||||
adapter: notificationAdapter,
|
||||
});
|
||||
res.send();
|
||||
});
|
||||
|
||||
@@ -7,9 +7,11 @@ export const send = ({ serviceName, newListings, notificationConfig, jobKey }) =
|
||||
const job = getJob(jobKey);
|
||||
const jobName = job == null ? jobKey : job.name;
|
||||
const promises = newListings.map((newListing) => {
|
||||
const message = `Address: ${newListing.address} Size: ${newListing.size.replace(/2m/g, '$m^2$')} Price: ${
|
||||
newListing.price
|
||||
}`;
|
||||
const message = `
|
||||
Address: ${newListing.address}
|
||||
Size: ${newListing.size.replace(/2m/g, '$m^2$')}
|
||||
Price: ${newListing.price}
|
||||
Link: ${newListing.link}`;
|
||||
return fetch(server, {
|
||||
method: 'POST',
|
||||
body: JSON.stringify({
|
||||
|
||||
@@ -1,50 +1,73 @@
|
||||
import { markdown2Html } from '../../services/markdown.js';
|
||||
import { getJob } from '../../services/storage/jobStorage.js';
|
||||
import {markdown2Html} from '../../services/markdown.js';
|
||||
import {getJob} from '../../services/storage/jobStorage.js';
|
||||
import fetch from 'node-fetch';
|
||||
|
||||
export const send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
|
||||
const { token, user, device } = notificationConfig.find((adapter) => adapter.id === config.id).fields;
|
||||
const job = getJob(jobKey);
|
||||
const jobName = job == null ? jobKey : job.name;
|
||||
const promises = newListings.map((newListing) => {
|
||||
const title = `${jobName} at ${serviceName}: ${newListing.title}`;
|
||||
const message = `Address: ${newListing.address}\nSize: ${newListing.size}\nPrice: ${newListing.price}\nLink: ${newListing.link}`;
|
||||
return fetch('https://api.pushover.net/1/messages.json', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
token: token,
|
||||
user: user,
|
||||
message: message,
|
||||
device: device,
|
||||
title: title,
|
||||
}),
|
||||
export const send = ({serviceName, newListings, notificationConfig, jobKey}) => {
|
||||
const {token, user, device} = notificationConfig.find((adapter) => adapter.id === config.id).fields;
|
||||
const job = getJob(jobKey);
|
||||
const jobName = job == null ? jobKey : job.name;
|
||||
const promises = newListings.map((newListing) => {
|
||||
const title = `${jobName} at ${serviceName}: ${newListing.title}`;
|
||||
const message = `Address: ${newListing.address}\nSize: ${newListing.size}\nPrice: ${newListing.price}\nLink: ${newListing.link}`;
|
||||
return fetch('https://api.pushover.net/1/messages.json', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({
|
||||
token: token,
|
||||
user: user,
|
||||
message: message,
|
||||
device: device,
|
||||
title: title,
|
||||
}),
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
return Promise.all(promises);
|
||||
return Promise.all(promises)
|
||||
.then((responses) => {
|
||||
// Convert all responses to JSON
|
||||
return Promise.all(responses.map((response) => response.json()));
|
||||
})
|
||||
.then((data) => {
|
||||
// Check for errors in the data
|
||||
const error = data
|
||||
.map((item) => (item.errors != null && item.errors.length > 0 ? item.errors.join(', ') : null))
|
||||
.filter((err) => err !== null);
|
||||
|
||||
if (error.length > 0) {
|
||||
// Reject with the combined error messages
|
||||
return Promise.reject(error.join('; '));
|
||||
}
|
||||
|
||||
return data;
|
||||
})
|
||||
.then(() => {
|
||||
return Promise.resolve();
|
||||
})
|
||||
.catch((error) => {
|
||||
return Promise.reject(error);
|
||||
});
|
||||
};
|
||||
|
||||
export const config = {
|
||||
id: 'pushover',
|
||||
name: 'Pushover',
|
||||
readme: markdown2Html('lib/notification/adapter/pushover.md'),
|
||||
description: 'Fredy will send new listings to your mobile using Pushover.',
|
||||
fields: {
|
||||
token: {
|
||||
type: 'text',
|
||||
label: 'API token',
|
||||
description: 'Your application\'s API token.',
|
||||
id: 'pushover',
|
||||
name: 'Pushover',
|
||||
readme: markdown2Html('lib/notification/adapter/pushover.md'),
|
||||
description: 'Fredy will send new listings to your mobile using Pushover.',
|
||||
fields: {
|
||||
token: {
|
||||
type: 'text',
|
||||
label: 'API token',
|
||||
description: 'Your application\'s API token.',
|
||||
},
|
||||
user: {
|
||||
type: 'text',
|
||||
label: 'User key',
|
||||
description: 'Your user/group key.',
|
||||
},
|
||||
device: {
|
||||
type: 'text',
|
||||
label: 'Device name',
|
||||
description: 'The device name to send your notification to. Messages may be addressed to multiple specific devices by joining them with a comma.',
|
||||
},
|
||||
},
|
||||
user: {
|
||||
type: 'text',
|
||||
label: 'User key',
|
||||
description: 'Your user/group key.',
|
||||
},
|
||||
device: {
|
||||
type: 'text',
|
||||
label: 'Device name',
|
||||
description: 'The device name to send your notification to. Messages may be addressed to multiple specific devices by joining them with a comma.',
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import utils, {buildHash} from '../utils.js';
|
||||
import utils, { buildHash } from '../utils.js';
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
@@ -26,7 +26,7 @@ const config = {
|
||||
url: null,
|
||||
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||
sortByDateParam: 'sortby=19',
|
||||
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||
waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
|
||||
crawlFields: {
|
||||
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
|
||||
title: 'button@title |trim',
|
||||
|
||||
@@ -1,37 +1,114 @@
|
||||
import utils, {buildHash} from '../utils.js';
|
||||
/**
|
||||
* ImmoScout provider using the mobile API to retrieve listings.
|
||||
*
|
||||
* The mobile API provides the following endpoints:
|
||||
* - GET /search/total?{search parameters}: Returns the total number of listings for the given query
|
||||
* Example: `curl -H "User-Agent: ImmoScout24_1410_30_._" https://api.mobile.immobilienscout24.de/search/total?searchType=region&realestatetype=apartmentrent&pricetype=calculatedtotalrent&geocodes=%2Fde%2Fberlin%2Fberlin `
|
||||
*
|
||||
* - POST /search/list?{search parameters}: Actually retrieves the listings. Body is json encoded and contains
|
||||
* data specifying additional results (advertisements) to return. The format is as follows:
|
||||
* ```
|
||||
* {
|
||||
* "supportedResultListTypes": [],
|
||||
* "userData": {}
|
||||
* }
|
||||
* ```
|
||||
* It is not necessary to provide data for the specified keys.
|
||||
*
|
||||
* Example: `curl -X POST 'https://api.mobile.immobilienscout24.de/search/list?pricetype=calculatedtotalrent&realestatetype=apartmentrent&searchType=region&geocodes=%2Fde%2Fberlin%2Fberlin&pagenumber=1' -H "Connection: keep-alive" -H "User-Agent: ImmoScout24_1410_30_._" -H "Accept: application/json" -H "Content-Type: application/json" -d '{"supportedResultListTypes": [], "userData": {}}'`
|
||||
|
||||
* - GET /expose/{id} - Returns the details of a listing. The response contains additional details not included in the
|
||||
* listing response.
|
||||
*
|
||||
* Example: `curl -H "User-Agent: ImmoScout24_1410_30_._" "https://api.mobile.immobilienscout24.de/expose/158382494"`
|
||||
*
|
||||
*
|
||||
* It is necessary to set the correct User Agent (see `getListings`) in the request header.
|
||||
*
|
||||
* Note that the mobile API is not publicly documented. I've reverse-engineered
|
||||
* it by intercepting traffic from an android emulator running the immoscout app.
|
||||
* Moreover, the search parameters differ slightly from the web API. I've mapped them
|
||||
* to the web API parameters by comparing a search request with all parameters set between
|
||||
* the web and mobile API. The mobile API actually seems to be a superset of the web API,
|
||||
* but I have decided not to include new parameters as I wanted to keep the existing UX (i.e.,
|
||||
* users only have to provide a link to an existing search).
|
||||
*
|
||||
* Limitations:
|
||||
* - The current implementation of this provider *does not* support non-rental properties,
|
||||
* although the same approach can be used to implement support. It's just a matter of
|
||||
* mapping the web search URL to the corresponding mobile API URL.
|
||||
* - Pagination support is not implemented.
|
||||
*/
|
||||
|
||||
import utils, { buildHash } from '../utils.js';
|
||||
import queryString from 'query-string';
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Retrieves listings from the ImmoScout mobile API.
 *
 * Sends a POST request with the app's User-Agent and an empty
 * `supportedResultListTypes` / `userData` body, keeps the result entries of type
 * `EXPOSE_RESULT` and maps them to the fields declared in `config.crawlFields`.
 * The first two entries of an item's `attributes` are read as price and size.
 *
 * @param {string} url mobile API search url (see `convertWebToMobile`)
 * @returns {Promise<Array<object>>} the listings; an empty array if the request was not
 * successful or the response contains no result items
 */
async function getListings(url) {
  const response = await fetch(url, {
    method: 'POST',
    headers: {
      'User-Agent': 'ImmoScout24_1410_30_._',
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      supportedResultListTypes: [],
      userData: {},
    }),
  });
  if (!response.ok) {
    console.error('Error fetching data from ImmoScout Mobile API:', response.statusText);
    return [];
  }

  const responseBody = await response.json();
  // The API is undocumented, so do not rely on the list (or an entry's item) being present.
  const resultListItems = Array.isArray(responseBody?.resultListItems) ? responseBody.resultListItems : [];
  return resultListItems
    .filter((entry) => entry?.type === 'EXPOSE_RESULT' && entry.item != null)
    .map(({ item }) => {
      const [price, size] = item.attributes ?? [];
      return {
        id: item.id,
        price: price?.value,
        size: size?.value,
        title: item.title,
        link: `${metaInformation.baseUrl}expose/${item.id}`,
        address: item.address?.line,
      };
    });
}
|
||||
|
||||
function nullOrEmpty(val) {
|
||||
return val == null || val.length === 0;
|
||||
}
|
||||
function normalize(o) {
|
||||
const title = nullOrEmpty(o.title) ? 'NO TITLE FOUND' : o.title.replace('NEU', '');
|
||||
const address = nullOrEmpty(o.address) ? 'NO ADDRESS FOUND' : (o.address || '').replace(/\(.*\),.*$/, '').trim();
|
||||
const link = nullOrEmpty(o.link) ? 'NO LINK' : `https://www.immobilienscout24.de${o.link.substring(o.link.indexOf('/expose'))}`;
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, { id, title, address, link });
|
||||
return Object.assign(o, { id, title, address });
|
||||
}
|
||||
function applyBlacklist(o) {
|
||||
return !utils.isOneOf(o.title, appliedBlackList);
|
||||
}
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '#resultListItems li.result-list__listing',
|
||||
sortByDateParam: 'sorting=2',
|
||||
waitForSelector: 'body',
|
||||
sortByDateParam: 'sorting=-firstactivation',
|
||||
// Not actually required - used by filter to remove any listings that failed to parse
|
||||
crawlFields: {
|
||||
id: '.result-list-entry@data-obid | int',
|
||||
price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim',
|
||||
size: '.result-list-entry .result-list-entry__criteria .grid-item:nth-child(2) dd | removeNewline | trim',
|
||||
title: '.result-list-entry .result-list-entry__brand-title-container h2 | removeNewline | trim',
|
||||
link: '.result-list-entry .result-list-entry__brand-title-container@href',
|
||||
address: '.result-list-entry .result-list-entry__map-link',
|
||||
id: 'id',
|
||||
title: 'title',
|
||||
price: 'price',
|
||||
size: 'size',
|
||||
link: 'link',
|
||||
address: 'address',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
getListings: getListings,
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
config.enabled = sourceConfig.enabled;
|
||||
config.url = sourceConfig.url;
|
||||
config.url = convertWebToMobile(sourceConfig.url);
|
||||
appliedBlackList = blacklist || [];
|
||||
};
|
||||
export const metaInformation = {
|
||||
@@ -39,4 +116,90 @@ export const metaInformation = {
|
||||
baseUrl: 'https://www.immobilienscout24.de/',
|
||||
id: 'immoscout',
|
||||
};
|
||||
|
||||
/**
 * Translates an ImmoScout web search URL into the corresponding mobile API search URL.
 *
 * The path has to start with `/Suche/` and contain at least four more segments; segments
 * 2 to 4 are joined into the `geocodes` parameter (e.g. `/de/berlin/berlin`). Only the
 * query parameters listed in `paramNameMap` are carried over, everything else (such as
 * `enteredFrom`) is dropped. `equipment` values are mapped to the camelCase names of the
 * mobile API. `realestatetype` is always set to `apartmentrent`.
 *
 * @param {string} webUrl search url as copied from the ImmoScout website
 * @returns {string} url of the mobile API's `search/list` endpoint
 * @throws {Error} if webUrl is not a valid url, the path has an unexpected format or an
 * unknown equipment value is given
 */
export function convertWebToMobile(webUrl) {
  let url;
  try {
    url = new URL(webUrl);
  } catch (err) {
    // keep the original parse error around for debugging
    throw new Error(`Invalid URL: ${webUrl}`, { cause: err });
  }
  const segments = url.pathname.split('/');
  if (segments.length < 6 || segments[1] !== 'Suche') {
    throw new Error(`Unexpected path format: ${url.pathname}`);
  }
  const geocodes = `/${segments[2]}/${segments[3]}/${segments[4]}`;

  // web parameter name -> mobile parameter name
  const paramNameMap = {
    heatingtypes: 'heatingtypes',
    haspromotion: 'haspromotion',
    numberofrooms: 'numberofrooms',
    livingspace: 'livingspace',
    energyefficiencyclasses: 'energyefficiencyclasses',
    exclusioncriteria: 'exclusioncriteria',
    equipment: 'equipment',
    petsallowedtypes: 'petsallowedtypes',
    price: 'price',
    constructionyear: 'constructionyear',
    apartmenttypes: 'apartmenttypes',
    pricetype: 'pricetype',
    floor: 'floor',
  };

  // lower-cased web equipment value -> mobile equipment value
  const equipmentValueMap = {
    parking: 'parking',
    cellar: 'cellar',
    builtinkitchen: 'builtInKitchen',
    lift: 'lift',
    garden: 'garden',
    guesttoilet: 'guestToilet',
    balcony: 'balcony',
  };

  const { query: webParams } = queryString.parseUrl(webUrl, { arrayFormat: 'comma' });

  // Build mobile params
  const mobileParams = {
    searchType: 'region',
    geocodes,
    realestatetype: 'apartmentrent',
  };

  for (const [webKey, webVal] of Object.entries(webParams)) {
    // Skip unsupported parameters. Object.hasOwn (instead of a plain lookup) makes sure
    // that names like "constructor" or "toString" do not match inherited properties.
    if (!Object.hasOwn(paramNameMap, webKey)) {
      continue;
    }

    let value = webVal;
    if (webKey === 'equipment') {
      // Map equipment list to camelCase values
      const tokens = Array.isArray(value) ? value : String(value).split(',');
      value = tokens.map((token) => {
        const lower = token.toLowerCase();
        if (!Object.hasOwn(equipmentValueMap, lower)) {
          throw new Error(`Unknown equipment type: "${token}"`);
        }
        return equipmentValueMap[lower];
      });
    }

    mobileParams[paramNameMap[webKey]] = value;
  }

  const mobileQuery = queryString.stringify(mobileParams, {
    arrayFormat: 'comma',
    encode: true,
    skipEmptyString: true,
  });

  return `https://api.mobile.immobilienscout24.de/search/list?${mobileQuery}`;
}
|
||||
|
||||
export { config };
|
||||
|
||||
@@ -1,48 +1,48 @@
|
||||
import utils, {buildHash} from '../utils.js';
|
||||
import utils, { buildHash } from '../utils.js';
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
function normalize(o) {
|
||||
const size = o.size || 'N/A m²';
|
||||
const price = (o.price || '--- €').replace('Preis auf Anfrage', '--- €');
|
||||
const title = o.title || 'No title available';
|
||||
const immoId = o.id.substring(o.id.indexOf('-') + 1, o.id.length);
|
||||
const link = `https://immo.swp.de/immobilien/${immoId}`;
|
||||
const description = o.description;
|
||||
const id = buildHash(immoId, price);
|
||||
return Object.assign(o, {id, price, size, title, link, description});
|
||||
const size = o.size || 'N/A m²';
|
||||
const price = (o.price || '--- €').replace('Preis auf Anfrage', '--- €');
|
||||
const title = o.title || 'No title available';
|
||||
const immoId = o.id.substring(o.id.indexOf('-') + 1, o.id.length);
|
||||
const link = `https://immo.swp.de/immobilien/${immoId}`;
|
||||
const description = o.description;
|
||||
const id = buildHash(immoId, price);
|
||||
return Object.assign(o, { id, price, size, title, link, description });
|
||||
}
|
||||
|
||||
function applyBlacklist(o) {
|
||||
const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList);
|
||||
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
|
||||
return titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '.js-serp-item',
|
||||
sortByDateParam: 's=most_recently_updated_first',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.js-bookmark-btn@data-id',
|
||||
price: 'div.align-items-start div:first-child | trim',
|
||||
size: 'div.align-items-start div:nth-child(3) | trim',
|
||||
title: '.card-title h2 | trim',
|
||||
link: '.ci-search-result__link@href',
|
||||
description: '.js-show-more-item-sm | removeNewline | trim',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
url: null,
|
||||
crawlContainer: '.js-serp-item',
|
||||
sortByDateParam: 's=most_recently_updated_first',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.js-bookmark-btn@data-id',
|
||||
price: 'div.align-items-start div:first-child | trim',
|
||||
size: 'div.align-items-start div:nth-child(3) | trim',
|
||||
title: '.js-item-title-link@title | trim',
|
||||
link: '.ci-search-result__link@href',
|
||||
description: '.js-show-more-item-sm | removeNewline | trim',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
export const init = (sourceConfig, blacklist) => {
|
||||
config.enabled = sourceConfig.enabled;
|
||||
config.url = sourceConfig.url;
|
||||
appliedBlackList = blacklist || [];
|
||||
config.enabled = sourceConfig.enabled;
|
||||
config.url = sourceConfig.url;
|
||||
appliedBlackList = blacklist || [];
|
||||
};
|
||||
export const metaInformation = {
|
||||
name: 'Immo Südwest Presse',
|
||||
baseUrl: 'https://immo.swp.de/',
|
||||
id: 'immoswp',
|
||||
name: 'Immo Südwest Presse',
|
||||
baseUrl: 'https://immo.swp.de/',
|
||||
id: 'immoswp',
|
||||
};
|
||||
export {config};
|
||||
export { config };
|
||||
|
||||
@@ -18,12 +18,12 @@ const config = {
|
||||
crawlContainer:
|
||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
||||
sortByDateParam: 'order=DateDesc',
|
||||
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
|
||||
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
|
||||
crawlFields: {
|
||||
id: 'a@href',
|
||||
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] | removeNewline | trim',
|
||||
title: '.css-1cbj9xw',
|
||||
title: '.css-jv3zx6',
|
||||
link: 'a@href',
|
||||
address: 'div[data-testid="cardmfe-description-box-address"] | removeNewline | trim',
|
||||
},
|
||||
|
||||
@@ -15,7 +15,7 @@ function applyBlacklist(o) {
|
||||
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
|
||||
const isBlacklistedDistrict =
|
||||
appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts);
|
||||
return !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
|
||||
return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
|
||||
}
|
||||
|
||||
const config = {
|
||||
|
||||
@@ -4,7 +4,8 @@ let appliedBlackList = [];
|
||||
|
||||
function normalize(o) {
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, {id});
|
||||
const link = `https://www.wg-gesucht.de${o.link}`;
|
||||
return Object.assign(o, { id, link });
|
||||
}
|
||||
|
||||
function applyBlacklist(o) {
|
||||
|
||||
@@ -1,45 +1,43 @@
|
||||
import {setDebug} from './utils.js';
|
||||
import { setDebug } from './utils.js';
|
||||
import puppeteerExtractor from './puppeteerExtractor.js';
|
||||
import {loadParser, parse} from './parser/parser.js';
|
||||
import { loadParser, parse } from './parser/parser.js';
|
||||
|
||||
const DEFAULT_OPTIONS = {
|
||||
debug: false,
|
||||
puppeteerTimeout: 20_000,
|
||||
puppeteerHeadless: true
|
||||
|
||||
debug: false,
|
||||
puppeteerTimeout: 60_000,
|
||||
puppeteerHeadless: true,
|
||||
};
|
||||
|
||||
export default class Extractor {
|
||||
constructor(options) {
|
||||
this.options = {
|
||||
...DEFAULT_OPTIONS,
|
||||
...options
|
||||
};
|
||||
this.responseText = null;
|
||||
setDebug(this.options);
|
||||
constructor(options) {
|
||||
this.options = {
|
||||
...DEFAULT_OPTIONS,
|
||||
...options,
|
||||
};
|
||||
this.responseText = null;
|
||||
setDebug(this.options);
|
||||
}
|
||||
|
||||
/**
|
||||
* if you are extracting data from a SPA, you must provide a selector, otherwise
|
||||
* your response will never contain what you are really looking for
|
||||
* @param url
|
||||
* @param waitForSelector
|
||||
*/
|
||||
execute = async (url, waitForSelector = null) => {
|
||||
this.responseText = null;
|
||||
try {
|
||||
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
|
||||
if (this.responseText != null) {
|
||||
loadParser(this.responseText);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error trying to load page.', error);
|
||||
}
|
||||
return this;
|
||||
};
|
||||
|
||||
/**
|
||||
* if you are extracting data from a SPA, you must provide a selector, otherwise
|
||||
* your response will never contain what you are really looking for
|
||||
* @param url
|
||||
* @param waitForSelector
|
||||
*/
|
||||
execute = async (url, waitForSelector = null) => {
|
||||
this.responseText = null;
|
||||
try {
|
||||
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
|
||||
if(this.responseText != null) {
|
||||
loadParser(this.responseText);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error trying to load page.', error);
|
||||
}
|
||||
return this;
|
||||
};
|
||||
|
||||
|
||||
parseResponseText = (crawlContainer, crawlFields) => {
|
||||
return parse(crawlContainer, crawlFields, this.responseText);
|
||||
};
|
||||
parseResponseText = (crawlContainer, crawlFields, url) => {
|
||||
return parse(crawlContainer, crawlFields, this.responseText, url);
|
||||
};
|
||||
}
|
||||
|
||||
@@ -3,92 +3,95 @@ import * as cheerio from 'cheerio';
|
||||
let $ = null;
|
||||
|
||||
export function loadParser(text) {
|
||||
$ = cheerio.load(text);
|
||||
$ = cheerio.load(text);
|
||||
}
|
||||
|
||||
export function parse(crawlContainer, crawlFields, text) {
|
||||
if (!text) {
|
||||
console.warn('Cannot parse, text was empty.');
|
||||
return null;
|
||||
}
|
||||
export function parse(crawlContainer, crawlFields, text, url) {
|
||||
if (!text) {
|
||||
console.warn('Cannot parse, text was empty for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!crawlContainer || !crawlFields) {
|
||||
console.warn('Cannot parse, selector was empty.');
|
||||
return null;
|
||||
}
|
||||
if (!crawlContainer || !crawlFields) {
|
||||
console.warn('Cannot parse, selector was empty for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
const result = [];
|
||||
const result = [];
|
||||
|
||||
if ($(crawlContainer).length === 0) {
|
||||
console.error('No elements in crawl container found!');
|
||||
}
|
||||
if ($(crawlContainer).length === 0) {
|
||||
console.warn('No elements in crawl container found for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
$(crawlContainer).each((_, element) => {
|
||||
const container = $(element);
|
||||
const parsedObject = {};
|
||||
$(crawlContainer).each((_, element) => {
|
||||
const container = $(element);
|
||||
const parsedObject = {};
|
||||
|
||||
// Parse fields based on crawlFields
|
||||
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
|
||||
let value;
|
||||
// Parse fields based on crawlFields
|
||||
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
|
||||
let value;
|
||||
|
||||
try {
|
||||
try {
|
||||
const selector = fieldSelector.includes('|')
|
||||
? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
|
||||
: fieldSelector;
|
||||
|
||||
const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector;
|
||||
|
||||
if (selector.includes('@')) {
|
||||
const [sel, attr] = selector.split('@');
|
||||
if (sel.length === 0) {
|
||||
value = container.attr(attr.trim());
|
||||
} else {
|
||||
value = container.find(sel.trim()).attr(attr.trim());
|
||||
}
|
||||
} else {
|
||||
value = container.find(selector.trim()).text();
|
||||
}
|
||||
|
||||
// Apply modifiers if specified
|
||||
if (fieldSelector.includes('|')) {
|
||||
const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim());
|
||||
value = applyModifiers(value, modifiers);
|
||||
}
|
||||
|
||||
parsedObject[key] = value || null;
|
||||
} catch (error) {
|
||||
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
|
||||
parsedObject[key] = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (parsedObject.id != null) {
|
||||
result.push(parsedObject);
|
||||
if (selector.includes('@')) {
|
||||
const [sel, attr] = selector.split('@');
|
||||
if (sel.length === 0) {
|
||||
value = container.attr(attr.trim());
|
||||
} else {
|
||||
value = container.find(sel.trim()).attr(attr.trim());
|
||||
}
|
||||
} else {
|
||||
console.warn('ID not found. Not relaying object.');
|
||||
value = container.find(selector.trim()).text();
|
||||
}
|
||||
});
|
||||
|
||||
return result;
|
||||
// Apply modifiers if specified
|
||||
if (fieldSelector.includes('|')) {
|
||||
/* eslint-disable no-unused-vars */
|
||||
const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
|
||||
/* eslint-disable no-unused-vars */
|
||||
value = applyModifiers(value, modifiers);
|
||||
}
|
||||
|
||||
parsedObject[key] = value || null;
|
||||
} catch (error) {
|
||||
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
|
||||
parsedObject[key] = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (parsedObject.id != null) {
|
||||
result.push(parsedObject);
|
||||
} else {
|
||||
console.warn('ID not found. Not relaying object.');
|
||||
}
|
||||
});
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Helper function to apply modifiers
|
||||
function applyModifiers(value, modifiers) {
|
||||
if (!value) return value;
|
||||
if (!value) return value;
|
||||
|
||||
modifiers.forEach(modifier => {
|
||||
switch (modifier) {
|
||||
case 'int':
|
||||
value = parseInt(value, 10);
|
||||
break;
|
||||
case 'trim':
|
||||
value = value.replace(/\s+/g, ' ').trim();
|
||||
break;
|
||||
case 'removeNewline':
|
||||
value = value.replace(/\n/g, ' ');
|
||||
break;
|
||||
default:
|
||||
console.warn(`Unknown modifier: ${modifier}`);
|
||||
}
|
||||
});
|
||||
modifiers.forEach((modifier) => {
|
||||
switch (modifier) {
|
||||
case 'int':
|
||||
value = parseInt(value, 10);
|
||||
break;
|
||||
case 'trim':
|
||||
value = value.replace(/\s+/g, ' ').trim();
|
||||
break;
|
||||
case 'removeNewline':
|
||||
value = value.replace(/\n/g, ' ');
|
||||
break;
|
||||
default:
|
||||
console.warn(`Unknown modifier: ${modifier}`);
|
||||
}
|
||||
});
|
||||
|
||||
return value;
|
||||
return value;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,48 +1,49 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import {debug, DEFAULT_HEADER, botDetected} from './utils.js';
|
||||
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
export default async function execute(url, waitForSelector, options) {
|
||||
let browser;
|
||||
try {
|
||||
debug(`Sending request to ${url} using Puppeteer.`);
|
||||
let browser;
|
||||
try {
|
||||
debug(`Sending request to ${url} using Puppeteer.`);
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: options.puppeteerHeadless ?? true,
|
||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox']
|
||||
});
|
||||
let page = await browser.newPage();
|
||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: 'domcontentloaded'
|
||||
});
|
||||
let pageSource;
|
||||
//if we're extracting data from a spa, we must wait for the selector
|
||||
if (waitForSelector != null) {
|
||||
await page.waitForSelector(waitForSelector);
|
||||
pageSource = await page.evaluate(selector => {
|
||||
return document.querySelector(selector).innerHTML;
|
||||
}, waitForSelector);
|
||||
} else {
|
||||
pageSource = await page.content();
|
||||
}
|
||||
|
||||
const statusCode = response.status();
|
||||
|
||||
if (botDetected(pageSource, statusCode)) {
|
||||
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
return await page.content();
|
||||
} catch (error) {
|
||||
console.error('Error executing with puppeteer executor', error);
|
||||
return null;
|
||||
} finally {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
}
|
||||
browser = await puppeteer.launch({
|
||||
headless: options.puppeteerHeadless ?? true,
|
||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
|
||||
timeout: options.puppeteerTimeout || 30_000,
|
||||
});
|
||||
let page = await browser.newPage();
|
||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: 'domcontentloaded',
|
||||
});
|
||||
let pageSource;
|
||||
//if we're extracting data from a spa, we must wait for the selector
|
||||
if (waitForSelector != null) {
|
||||
await page.waitForSelector(waitForSelector);
|
||||
pageSource = await page.evaluate((selector) => {
|
||||
return document.querySelector(selector).innerHTML;
|
||||
}, waitForSelector);
|
||||
} else {
|
||||
pageSource = await page.content();
|
||||
}
|
||||
}
|
||||
|
||||
const statusCode = response.status();
|
||||
|
||||
if (botDetected(pageSource, statusCode)) {
|
||||
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
return await page.content();
|
||||
} catch (error) {
|
||||
console.error('Error executing with puppeteer executor', error);
|
||||
return null;
|
||||
} finally {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,35 +1,32 @@
|
||||
let debuggingOn = false;
|
||||
|
||||
export const DEFAULT_HEADER = {
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
|
||||
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
Connection: 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||
};
|
||||
|
||||
export const setDebug = options => {
|
||||
debuggingOn = !!options?.debug;
|
||||
export const setDebug = (options) => {
|
||||
debuggingOn = !!options?.debug;
|
||||
};
|
||||
|
||||
export const debug = (message) => {
|
||||
if(debuggingOn) {
|
||||
console.debug(message);
|
||||
}
|
||||
if (debuggingOn) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(message);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
};
|
||||
|
||||
export const botDetected = (pageSource, statusCode) => {
|
||||
const suspiciousStatusCodes = [
|
||||
403, 429
|
||||
];
|
||||
const botDetectionPatterns = [
|
||||
/verify you are human/i,
|
||||
/access denied/i,
|
||||
/x-amz-cf-id/i,
|
||||
];
|
||||
const suspiciousStatusCodes = [403, 429];
|
||||
const botDetectionPatterns = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i];
|
||||
|
||||
const detectedInSource = botDetectionPatterns.some(pattern => pattern.test(pageSource));
|
||||
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
|
||||
const detectedInSource = botDetectionPatterns.some((pattern) => pattern.test(pageSource));
|
||||
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
|
||||
|
||||
return detectedInSource || detectedByStatus;
|
||||
};
|
||||
return detectedInSource || detectedByStatus;
|
||||
};
|
||||
|
||||
30
package.json
30
package.json
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "fredy",
|
||||
"version": "11.0.0",
|
||||
"version": "11.1.0",
|
||||
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
||||
"scripts": {
|
||||
"start": "node prod.js",
|
||||
@@ -50,27 +50,27 @@
|
||||
"Firefox ESR"
|
||||
],
|
||||
"dependencies": {
|
||||
"@douyinfe/semi-ui": "2.71.3",
|
||||
"@douyinfe/semi-ui": "2.75.0",
|
||||
"@rematch/core": "2.2.0",
|
||||
"@rematch/loading": "2.1.2",
|
||||
"@sendgrid/mail": "8.1.4",
|
||||
"@vitejs/plugin-react": "4.3.4",
|
||||
"better-sqlite3": "^11.7.0",
|
||||
"better-sqlite3": "^11.8.1",
|
||||
"body-parser": "1.20.3",
|
||||
"cheerio": "^1.0.0",
|
||||
"cookie-session": "2.1.0",
|
||||
"handlebars": "4.7.8",
|
||||
"highcharts": "12.1.0",
|
||||
"highcharts": "12.1.2",
|
||||
"highcharts-react-official": "3.2.1",
|
||||
"lodash": "4.17.21",
|
||||
"lowdb": "6.0.1",
|
||||
"markdown": "^0.5.0",
|
||||
"mixpanel": "^0.18.0",
|
||||
"nanoid": "5.0.9",
|
||||
"nanoid": "5.1.2",
|
||||
"node-fetch": "3.3.2",
|
||||
"node-mailjet": "6.0.6",
|
||||
"package-up": "^5.0.0",
|
||||
"puppeteer": "^23.10.4",
|
||||
"puppeteer": "^24.2.1",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||
"query-string": "9.1.1",
|
||||
@@ -88,21 +88,21 @@
|
||||
"vite": "5.4.11"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/core": "7.26.0",
|
||||
"@babel/eslint-parser": "7.25.9",
|
||||
"@babel/preset-env": "7.26.0",
|
||||
"@babel/core": "7.26.9",
|
||||
"@babel/eslint-parser": "7.26.8",
|
||||
"@babel/preset-env": "7.26.9",
|
||||
"@babel/preset-react": "7.26.3",
|
||||
"chai": "5.1.2",
|
||||
"chai": "5.2.0",
|
||||
"eslint": "8.56.0",
|
||||
"eslint-config-prettier": "8.8.0",
|
||||
"eslint-plugin-react": "7.37.2",
|
||||
"esmock": "2.6.9",
|
||||
"eslint-plugin-react": "7.37.4",
|
||||
"esmock": "2.7.0",
|
||||
"history": "5.3.0",
|
||||
"husky": "9.1.7",
|
||||
"less": "4.2.1",
|
||||
"lint-staged": "15.2.11",
|
||||
"less": "4.2.2",
|
||||
"lint-staged": "15.4.3",
|
||||
"mocha": "10.8.2",
|
||||
"prettier": "3.4.2",
|
||||
"prettier": "3.5.2",
|
||||
"redux-logger": "3.0.6"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,40 +1,38 @@
|
||||
import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js';
|
||||
import { get } from '../mocks/mockNotification.js';
|
||||
import { mockFredy, providerConfig } from '../utils.js';
|
||||
import { expect } from 'chai';
|
||||
import {get} from '../mocks/mockNotification.js';
|
||||
import {mockFredy, providerConfig} from '../utils.js';
|
||||
import {expect} from 'chai';
|
||||
import * as provider from '../../lib/provider/immonet.js';
|
||||
|
||||
describe('#immonet testsuite()', () => {
|
||||
after(() => {
|
||||
similarityCache.stopCacheCleanup();
|
||||
});
|
||||
provider.init(providerConfig.immonet, [], []);
|
||||
it('should test immonet provider', async () => {
|
||||
const Fredy = await mockFredy();
|
||||
return await new Promise((resolve) => {
|
||||
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
|
||||
fredy.execute().then((listing) => {
|
||||
expect(listing).to.be.a('array');
|
||||
const notificationObj = get();
|
||||
expect(notificationObj).to.be.a('object');
|
||||
expect(notificationObj.serviceName).to.equal('immonet');
|
||||
notificationObj.payload.forEach((notify) => {
|
||||
/** check the actual structure **/
|
||||
expect(notify.id).to.be.a('string');
|
||||
expect(notify.price).to.be.a('string');
|
||||
expect(notify.size).to.be.a('string');
|
||||
expect(notify.title).to.be.a('string');
|
||||
expect(notify.link).to.be.a('string');
|
||||
expect(notify.address).to.be.a('string');
|
||||
|
||||
/** check the values if possible **/
|
||||
expect(notify.price).that.does.include('€');
|
||||
expect(notify.size).that.does.include('m²');
|
||||
expect(notify.title).to.be.not.empty;
|
||||
expect(notify.address).to.be.not.empty;
|
||||
});
|
||||
resolve();
|
||||
});
|
||||
after(() => {
|
||||
similarityCache.stopCacheCleanup();
|
||||
});
|
||||
provider.init(providerConfig.immonet, [], []);
|
||||
it('should test immonet provider', async () => {
|
||||
const Fredy = await mockFredy();
|
||||
return await new Promise((resolve) => {
|
||||
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
|
||||
fredy.execute().then((listing) => {
|
||||
expect(listing).to.be.a('array');
|
||||
const notificationObj = get();
|
||||
expect(notificationObj).to.be.a('object');
|
||||
expect(notificationObj.serviceName).to.equal('immonet');
|
||||
notificationObj.payload.forEach((notify) => {
|
||||
/** check the actual structure **/
|
||||
expect(notify.id).to.be.a('string');
|
||||
expect(notify.price).to.be.a('string');
|
||||
expect(notify.size).to.be.a('string');
|
||||
expect(notify.title).to.be.a('string');
|
||||
expect(notify.link).to.be.a('string');
|
||||
expect(notify.address).to.be.a('string');
|
||||
|
||||
expect(notify.size).that.does.include('m²');
|
||||
expect(notify.title).to.be.not.empty;
|
||||
expect(notify.address).to.be.not.empty;
|
||||
});
|
||||
resolve();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,43 +1,73 @@
|
||||
import { expect } from 'chai';
|
||||
import { convertWebToMobile } from '../../lib/provider/immoscout.js';
|
||||
import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js';
|
||||
//import {get} from '../mocks/mockNotification.js';
|
||||
import {/*mockFredy, */providerConfig} from '../utils.js';
|
||||
//import {expect} from 'chai';
|
||||
import { mockFredy, providerConfig } from '../utils.js';
|
||||
import { get } from '../mocks/mockNotification.js';
|
||||
import * as provider from '../../lib/provider/immoscout.js';
|
||||
|
||||
describe('#immoscout testsuite()', () => {
|
||||
after(() => {
|
||||
similarityCache.stopCacheCleanup();
|
||||
});
|
||||
provider.init(providerConfig.immoscout, [], []);
|
||||
it('should test immoscout provider', async () => {
|
||||
//const Fredy = await mockFredy();
|
||||
return await new Promise((resolve) => {
|
||||
/* eslint-disable no-console */
|
||||
console.info('Skipping Immoscout test for now until we figured out how to surpass bot detection.');
|
||||
/* eslint-enable no-console */
|
||||
resolve();
|
||||
/*
|
||||
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immoscout', similarityCache);
|
||||
fredy.execute().then((listing) => {
|
||||
expect(listing).to.be.a('array');
|
||||
const notificationObj = get();
|
||||
expect(notificationObj).to.be.a('object');
|
||||
expect(notificationObj.serviceName).to.equal('immoscout');
|
||||
notificationObj.payload.forEach((notify) => {
|
||||
expect(notify.id).to.be.a('number');
|
||||
expect(notify.price).to.be.a('string');
|
||||
expect(notify.size).to.be.a('string');
|
||||
expect(notify.title).to.be.a('string');
|
||||
expect(notify.link).to.be.a('string');
|
||||
expect(notify.address).to.be.a('string');
|
||||
expect(notify.price).that.does.include('€');
|
||||
expect(notify.size).that.does.include('m²');
|
||||
expect(notify.title).to.be.not.empty;
|
||||
expect(notify.link).that.does.include('https://www.immobilienscout24.de');
|
||||
expect(notify.address).to.be.not.empty;
|
||||
});
|
||||
resolve();
|
||||
});*/
|
||||
describe('#immoscout provider testsuite()', () => {
|
||||
after(() => {
|
||||
similarityCache.stopCacheCleanup();
|
||||
});
|
||||
|
||||
provider.init(providerConfig.immoscout, [], []);
|
||||
it('should test immoscout provider', async () => {
|
||||
const Fredy = await mockFredy();
|
||||
return await new Promise((resolve) => {
|
||||
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, '', similarityCache);
|
||||
fredy.execute().then((listings) => {
|
||||
expect(listings).to.be.a('array');
|
||||
const notificationObj = get();
|
||||
expect(notificationObj).to.be.a('object');
|
||||
expect(notificationObj.serviceName).to.equal('immoscout');
|
||||
notificationObj.payload.forEach((notify) => {
|
||||
/** check the actual structure **/
|
||||
expect(notify.id).to.be.a('string');
|
||||
expect(notify.price).to.be.a('string');
|
||||
expect(notify.size).to.be.a('string');
|
||||
expect(notify.title).to.be.a('string');
|
||||
expect(notify.link).to.be.a('string');
|
||||
expect(notify.address).to.be.a('string');
|
||||
/** check the values if possible **/
|
||||
expect(notify.size).to.be.not.empty;
|
||||
expect(notify.title).to.be.not.empty;
|
||||
expect(notify.link).that.does.include('https://www.immobilienscout24.de/');
|
||||
});
|
||||
resolve();
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('#immoscout-mobile URL conversion', () => {
|
||||
// Test URL conversion
|
||||
it('should convert a full web URL to mobile URL', () => {
|
||||
const webUrl =
|
||||
'https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-mieten?heatingtypes=central,selfcontainedcentral&haspromotion=false&numberofrooms=2.0-5.0&livingspace=10.0-25.0&energyefficiencyclasses=a,b,c,d,e,f,g,h,a_plus&exclusioncriteria=projectlisting,swapflat&equipment=parking,cellar,builtinkitchen,lift,garden,guesttoilet,balcony&petsallowedtypes=no,yes,negotiable&price=10.0-100.0&constructionyear=1920-2026&apartmenttypes=halfbasement,penthouse,other,loft,groundfloor,terracedflat,raisedgroundfloor,roofstorey,apartment,maisonette&pricetype=calculatedtotalrent&floor=2-7&enteredFrom=result_list';
|
||||
const expectedMobileUrl =
|
||||
'https://api.mobile.immobilienscout24.de/search/list?apartmenttypes=halfbasement,penthouse,other,loft,groundfloor,terracedflat,raisedgroundfloor,roofstorey,apartment,maisonette&constructionyear=1920-2026&energyefficiencyclasses=a,b,c,d,e,f,g,h,a_plus&equipment=parking,cellar,builtInKitchen,lift,garden,guestToilet,balcony&exclusioncriteria=projectlisting,swapflat&floor=2-7&geocodes=%2Fde%2Fberlin%2Fberlin&haspromotion=false&heatingtypes=central,selfcontainedcentral&livingspace=10.0-25.0&numberofrooms=2.0-5.0&petsallowedtypes=no,yes,negotiable&price=10.0-100.0&pricetype=calculatedtotalrent&realestatetype=apartmentrent&searchType=region';
|
||||
|
||||
const actualMobileUrl = convertWebToMobile(webUrl);
|
||||
expect(actualMobileUrl).to.equal(expectedMobileUrl);
|
||||
});
|
||||
|
||||
// Test URL conversion with unsupported query parameters
|
||||
it('should remove unsupported query parameters', () => {
|
||||
const webUrl = 'https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-mieten?minimuminternetspeed=100000';
|
||||
const converted = convertWebToMobile(webUrl);
|
||||
expect(converted).that.does.not.include('minimuminternetspeed');
|
||||
});
|
||||
|
||||
// Test URL conversion with invalid URL
|
||||
it('should throw an error for invalid URL', () => {
|
||||
const invalidUrl = 'invalid-url';
|
||||
|
||||
expect(() => convertWebToMobile(invalidUrl)).to.throw('Invalid URL: invalid-url');
|
||||
});
|
||||
|
||||
// Test URL conversion with unexpected path format
|
||||
it('should throw an error for unexpected path format', () => {
|
||||
const webUrl = 'https://www.immobilienscout24.de/invalid/path/format';
|
||||
expect(() => convertWebToMobile(webUrl)).to.throw('Unexpected path format: /invalid/path/format');
|
||||
});
|
||||
});
|
||||
|
||||
@@ -20,11 +20,6 @@
|
||||
"shouldBecome": "https://www.immonet.de/immobiliensuche/sel.do?sortby=19&suchart=1&objecttype=1&marketingtype=2&parentcat=1&locationname=d%C3%BCsseldorf",
|
||||
"id": "immonet"
|
||||
},
|
||||
{
|
||||
"url": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten",
|
||||
"shouldBecome": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten?sorting=2",
|
||||
"id": "immoscout"
|
||||
},
|
||||
{
|
||||
"url": "https://www.neubaukompass.de/neubau-immobilien/berlin-region/",
|
||||
"shouldBecome": "https://www.neubaukompass.de/neubau-immobilien/berlin-region/?Sortierung=Id&Richtung=DESC",
|
||||
|
||||
@@ -101,10 +101,7 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
|
||||
description={
|
||||
<div>
|
||||
<p>
|
||||
Immoscout will not work at the moment due to advanced bot detection. I'm currently working on a fix.
|
||||
</p>
|
||||
<p>
|
||||
Until a fix has been released, Immoscout won't yield any results.
|
||||
Currently, Immoscout only works for real estate rentals. Purchases are not yet supported.
|
||||
</p>
|
||||
</div>
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user