Bringing back immoscout (#21)

Bringing back immoscout support 🎉
This commit is contained in:
Christian Kellner
2021-05-11 11:25:14 +02:00
committed by GitHub
parent 9726079f62
commit 70cab66651
13 changed files with 225 additions and 23 deletions

View File

@@ -17,6 +17,15 @@ yarn run start
```
_Fredy_ will start with the default port, set to `9998`. You can access _Fredy_ by opening a browser `http://localhost:9998`. The default login is `admin` for username and password. (You should change the password asap when you plan to run Fredy on your server.)
## Immoscout
I have added **EXPERIMENTAL** support for Immoscout. Immoscout is somewhat special, coz they have decided to secure their service from bots using Re-Capture. Finding a way around this is barely possible. For _Fredy_ to be able to bypass the check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successful validated) re-send the cookies each time.
To be able to use Immoscout, you need to create an account and copy the apiKey into the config file under /conf/config.json.
The rest should be done by _Fredy_. Keep in mind, the support is experimental. There might be bugs and you might not always get pass the re-capture check, but most of the time it works pretty good :)
If you need more that the 1000 api calls you can do per month, I'd suggest opting for a paid account... (No I don't get any money for recommending good services)
## Understanding the fundamentals
There are 3 important parts in Fredy, that you need to understand to leverage the full power of _Fredy_.
@@ -57,9 +66,6 @@ yarn run test
# Architecture
![Architecture](/doc/architecture.jpg "Architecture")
## Why is Immoscout missing
Immoscout decided to add "robot protection" to their service. Meaning if Fredy tries to check for listings, it will be recognized as a bot. I haven't found a way around it (yet) ;)
#### Contribution guidelines
See [Contribution](https://github.com/orangecoding/fredy/blob/master/CONTRIBUTION.md)

View File

@@ -1,4 +1,7 @@
{
"interval": 30,
"port": 9998
"port": 9998,
"scrapingAnt": {
"apiKey": ""
}
}

View File

@@ -3,6 +3,7 @@ const { setKnownListings, getKnownListings } = require('./services/storage/listi
const notify = require('./notification/notify');
const xray = require('./services/scraper');
const scrapingAnt = require('./services/scrapingAnt');
class FredyRuntime {
/**
@@ -41,15 +42,24 @@ class FredyRuntime {
_getListings(url) {
return new Promise((resolve, reject) => {
let x = xray(url, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields]);
x((err, listings) => {
if (err) {
const id = this._providerId;
if (scrapingAnt.isImmoscout(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
const error = 'Immoscout can only be used with if you have set an apikey for scrapingAnt.';
/* eslint-disable no-console */
console.log(error);
/* eslint-enable no-console */
reject(error);
return;
}
const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
.then((listings) => {
resolve(listings == null ? [] : listings);
})
.catch((err) => {
reject(err);
} else {
resolve(listings);
}
});
console.error(err);
});
});
}

44
lib/provider/immoscout.js Normal file
View File

@@ -0,0 +1,44 @@
const utils = require('../utils');
let appliedBlackList = [];
function normalize(o) {
const title = o.title.replace('NEU', '');
const address = (o.address || '').replace(/\(.*\),.*$/, '').trim();
const link = `https://www.immobilienscout24.de${o.link.substring(o.link.indexOf('/expose'))}`;
return Object.assign(o, { title, address, link });
}
function applyBlacklist(o) {
return !utils.isOneOf(o.title, appliedBlackList);
}
const config = {
url: null,
crawlContainer: '#resultListItems li.result-list__listing',
crawlFields: {
id: '.result-list-entry@data-obid | int',
price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim',
size: '.result-list-entry .result-list-entry__criteria .grid-item:nth-child(2) dd | removeNewline | trim',
title: '.result-list-entry .result-list-entry__brand-title-container h5 | removeNewline | trim',
link: '.result-list-entry .result-list-entry__brand-title-container@href',
address: '.result-list-entry .result-list-entry__map-link',
},
paginate: '#pager .align-right a@href',
normalize: normalize,
filter: applyBlacklist,
};
exports.init = (sourceConfig, blacklist) => {
config.enabled = sourceConfig.enabled;
config.url = sourceConfig.url;
appliedBlackList = blacklist || [];
};
exports.metaInformation = {
name: 'Immoscout',
baseUrl: 'https://www.immobilienscout24.de/',
id: __filename.slice(__dirname.length + 1, -3),
};
exports.config = config;

View File

@@ -0,0 +1,33 @@
const axios = require('axios');
function makeDriver(headers = {}) {
let cookies = '';
return async function driver(context, callback) {
const url = context.url;
let result;
try {
result = await axios({
url,
headers: {
...headers,
Cookie: cookies,
},
});
} catch (exception) {
callback(exception, null);
}
if (typeof result.data === 'object' && url.toLowerCase().indexOf('scrapingant') !== -1) {
//assume we have gotten a response from scrapingAnt
if (cookies.length === 0) {
cookies = result.data.cookies;
}
callback(null, result.data.content);
} else {
callback(null, result.data);
}
};
}
module.exports = makeDriver;

View File

@@ -1,4 +1,5 @@
const makeDriver = require('request-x-ray');
const config = require('../../conf/config.json');
const makeDriver = require('./requestDriver');
const Xray = require('x-ray');
class Scraper {
@@ -9,14 +10,16 @@ class Scraper {
int: this._int,
};
const driver = makeDriver({
headers: {
'User-Agent':
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36',
cookie:
'longUnreliableState="dWlkcg==:YS1kZDViMzVhZWRhMTk0MDdmYWRjNDNkY2VmYTcxZmVkOQ=="; eveD=eyJldnRfZ2FfYWN0aW9uIjpbInNlYXJjaCJdLCJldnRfZ2FfY2F0ZWdvcnkiOlsicmVzdWx0bGlzdCJdLCJnZW9fYmxuIjpbIm5vcmRyaGVpbl93ZXN0ZmFsZW4iXSwiZXZ0X2dhX2xhYmVsIjpbImRpc3RyaWN0Il0sIm9ial9pdHlwIjpbIndvaG51bmdfa2F1ZiJdLCJnZW9fa3JzIjpbImTDvHNzZWxkb3JmIl0sImdlb19sYW5kIjpbImRldXRzY2hsYW5kIl0sIm9ial9yZXN1bHRsaXN0X2NvdW50IjpbIjI4NCJdLCJvYmpfY3Jvc3N0eXBlIjpbImxpdl9hcGFydG1lbnRfYnV5Il19; ABNTEST=9526230109; is24_experiment_visitor_id=d568590b-951b-45c3-b890-13feef6ee472; reese84=3:Xf3JwcTIC3yeubDXqWBTfg==:oqnDVs58wBxZRMfpzPnlzLzscVQhboRBffkM4caxNe+vLBdozdtdrCwpcTKyvIuhB9MOMCAinb2qnSTL4D9kLpqL72gl+jtl7QdiNAEn2erDKLqX4b9/K5wFU7j6qzxFWdfcMUm295qU3o3s7O8CM8HdghKYOVtoif+qTkeztphyYMfmAePYkfYRhZXZaFwHwxUfkRVUEX2VKoepkTf9TudCHsTYXWqvnpUt/CT+yrFHlUdTgdTWfD5tQJvn3inPqKERAB8TTKoHIvM4duBJV/5fZDax07CHNqHcKhrws0pq4y2ssKfdxLxCE0OIpnMSOtmn7O0koDoV6RzRjNUC+UZ7mhPFH+YSPHTb+6VJsZQDnRufEIz4B1WWIORV+jvHzfIli9OHsmOPnskA6mnCpFwEvQAfJu9R+jI9dccjFno=:Oc7c2wwYiNMBJnvZeDCIKLP0LuVVPWJ4kzd5MPlsoTg=',
},
});
const headers = {
'User-Agent':
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36',
useQueryString: true,
};
if (config.scrapingAnt != null && config.scrapingAnt.apiKey != null) {
headers['x-api-key'] = config.scrapingAnt.apiKey;
}
const driver = makeDriver(headers);
const xray = Xray({ filters });
xray.driver(driver);

View File

@@ -0,0 +1,24 @@
const { metaInformation } = require('../provider/immoscout');
//to better confure re-capture chose a random proxy each time we do a call
const proxies = ['ae', 'br', 'cn', 'de', 'es', 'fr', 'gb', 'hk', 'in', 'it', 'il', 'jp', 'nl', 'ru', 'sa', 'us', 'cz'];
const config = require('../../conf/config.json');
const isImmoscout = (id) => {
return id.toLowerCase() === metaInformation.id;
};
exports.transformUrlForScrapingAnt = (url, id) => {
const randomProxy = proxies[Math.floor(Math.random() * proxies.length)];
if (isImmoscout(id)) {
//only do calls to scrapingAnt when dealing with Immoscout
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_country=${randomProxy}`;
}
return url;
};
exports.isScrapingAntApiKeySet = () => {
return config.scrapingAnt != null && config.scrapingAnt.apiKey != null && config.scrapingAnt.apiKey.length > 0;
};
exports.isImmoscout = isImmoscout;

View File

@@ -1,6 +1,6 @@
{
"name": "fredy",
"version": "3.0.0",
"version": "4.0.0",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"start": "node index.js",
@@ -72,7 +72,6 @@
"react-switch": "^6.0.0",
"redux": "4.0.5",
"redux-thunk": "2.3.0",
"request-x-ray": "0.1.4",
"restana": "4.8.1",
"semantic-ui-react": "2.0.3",
"serve-static": "^1.14.1",

View File

@@ -0,0 +1,56 @@
const mockNotification = require('../mocks/mockNotification');
const providerConfig = require('./testProvider.json');
const mockStore = require('../mocks/mockStore');
const proxyquire = require('proxyquire').noCallThru();
const expect = require('chai').expect;
const provider = require('../../lib/provider/immoscout');
const scrapingAnt = require('../../lib/services/scrapingAnt');
describe('#immoscout testsuite()', () => {
provider.init(providerConfig.immoscout, [], []);
const Fredy = proxyquire('../../lib/FredyRuntime', {
'./services/storage/listingsStorage': {
...mockStore,
},
'./notification/notify': mockNotification,
});
it('should test immoscout provider', async () => {
return await new Promise((resolve) => {
if (!scrapingAnt.isScrapingAntApiKeySet()) {
/* eslint-disable no-console */
console.info('Skipping Immoscout test as ScrapingAnt Api Key is not set.');
/* eslint-enable no-console */
resolve();
return;
}
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1');
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
const notificationObj = mockNotification.get();
expect(notificationObj).to.be.a('object');
expect(notificationObj.serviceName).to.equal('immoscout');
notificationObj.payload.forEach((notify) => {
/** check the actual structure **/
expect(notify.id).to.be.a('number');
expect(notify.price).to.be.a('string');
expect(notify.size).to.be.a('string');
expect(notify.title).to.be.a('string');
expect(notify.link).to.be.a('string');
expect(notify.address).to.be.a('string');
/** check the values if possible **/
expect(notify.price).that.does.include('€');
expect(notify.size).that.does.include('m²');
expect(notify.title).to.be.not.empty;
expect(notify.link).that.does.include('https://www.immobilienscout24.de');
expect(notify.address).to.be.not.empty;
});
resolve();
});
});
});
});

View File

@@ -12,6 +12,10 @@
"url": "https://www.immowelt.de/liste/duesseldorf-benrath/wohnungen/kaufen?geoid=10805111000004%2C10805111000005%2C10805111000006%2C10805111000007%2C10805111000009%2C10805111000010%2C10805111000011%2C10805111000013%2C10805111000014%2C10805111000015%2C10805111000016%2C10805111000017%2C10805111000018%2C10805111000019%2C10805111000023%2C10805111000024%2C10805111000027%2C10805111000032%2C10805111000034%2C10805111000035%2C10805111000039%2C10805111000041%2C10805111000042%2C10805111000043%2C10805111000047%2C10805111000048%2C10805111000049%2C10805111000051%2C10805111000052%2C10805111000053&roomi=3&prima=420000&wflmi=90&sort=createdate%2Bdesc",
"enabled": true
},
"immoscout": {
"url": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten?enteredFrom=one_step_search",
"enabled": true
},
"kalaydo": {
"url": "https://www.kalaydo.de/immobilien/eigentumswohnung-kaufen/o/duesseldorf/4/?attr_gt_estate_size_living_area=90.0&attr_gt_no_of_rooms=3.5&maxPrice=420000.00&radius=5&resultsPerPage=50&sorting=-date",
"enabled": true

View File

@@ -84,6 +84,11 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
<br />
When the search results are shown on the website, copy the url and paste it into the textfield below.
<br />
<span style={{ color: '#ff0000' }}>
If you chose Immoscout as a provider, make sure to also add the scrapingAnt apiKey to the config.json.
(See readme)
</span>
<br />
<span style={{ color: '#ff0000' }}>
Do not forget to sort the results by date before copying the url to Fredy, so that Fredy always captures
the latest search results.

View File

@@ -18,6 +18,7 @@ module.exports = {
publicPath: '/',
filename: 'fredy.bundle.js',
},
performance: { hints: false },
module: {
rules: [
{

View File

@@ -1895,6 +1895,15 @@ bcrypt-pbkdf@^1.0.0:
dependencies:
tweetnacl "^0.14.3"
bent@^7.3.12:
version "7.3.12"
resolved "https://registry.yarnpkg.com/bent/-/bent-7.3.12.tgz#e0a2775d4425e7674c64b78b242af4f49da6b035"
integrity sha512-T3yrKnVGB63zRuoco/7Ybl7BwwGZR0lceoVG5XmQyMIH9s19SV5m+a8qam4if0zQuAmOQTyPTPmsQBdAorGK3w==
dependencies:
bytesish "^0.4.1"
caseless "~0.12.0"
is-stream "^2.0.0"
big.js@^5.2.2:
version "5.2.2"
resolved "https://registry.yarnpkg.com/big.js/-/big.js-5.2.2.tgz#65f0af382f578bcdc742bd9c281e9cb2d7768328"
@@ -2133,6 +2142,11 @@ bytes@3.1.0:
resolved "https://registry.yarnpkg.com/bytes/-/bytes-3.1.0.tgz#f6cf7933a360e0588fa9fde85651cdc7f805d1f6"
integrity sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==
bytesish@^0.4.1:
version "0.4.4"
resolved "https://registry.yarnpkg.com/bytesish/-/bytesish-0.4.4.tgz#f3b535a0f1153747427aee27256748cff92347e6"
integrity sha512-i4uu6M4zuMUiyfZN4RU2+i9+peJh//pXhd9x1oSe1LBkZ3LEbCoygu8W0bXTukU1Jme2txKuotpCZRaC3FLxcQ==
cacache@^12.0.2:
version "12.0.4"
resolved "https://registry.yarnpkg.com/cacache/-/cacache-12.0.4.tgz#668bcbd105aeb5f1d92fe25570ec9525c8faa40c"