From 70cab66651d3b36648a85ec9040e434c3b902d4e Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Tue, 11 May 2021 11:25:14 +0200 Subject: [PATCH] Bringing back immoscout (#21) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bringing back immoscout support 🎉 --- README.md | 12 +++- conf/config.json | 5 +- lib/FredyRuntime.js | 26 ++++++--- lib/provider/immoscout.js | 44 +++++++++++++++ lib/services/requestDriver.js | 33 +++++++++++ lib/services/scraper.js | 21 ++++--- lib/services/scrapingAnt.js | 24 ++++++++ package.json | 3 +- test/provider/immoscout.test.js | 56 +++++++++++++++++++ test/provider/testProvider.json | 4 ++ .../components/provider/ProviderMutator.js | 5 ++ webpack.common.js | 1 + yarn.lock | 14 +++++ 13 files changed, 225 insertions(+), 23 deletions(-) create mode 100644 lib/provider/immoscout.js create mode 100644 lib/services/requestDriver.js create mode 100644 lib/services/scrapingAnt.js create mode 100644 test/provider/immoscout.test.js diff --git a/README.md b/README.md index 968820d..c57164f 100755 --- a/README.md +++ b/README.md @@ -17,6 +17,15 @@ yarn run start ``` _Fredy_ will start with the default port, set to `9998`. You can access _Fredy_ by opening a browser `http://localhost:9998`. The default login is `admin` for username and password. (You should change the password asap when you plan to run Fredy on your server.) +## Immoscout +I have added **EXPERIMENTAL** support for Immoscout. Immoscout is somewhat special, because they have decided to secure their service from bots using reCAPTCHA. Finding a way around this is barely possible. For _Fredy_ to be able to bypass the check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) re-send the cookies each time. + +To be able to use Immoscout, you need to create a ScrapingAnt account and copy the apiKey into the config file under /conf/config.json. 
+The rest should be done by _Fredy_. Keep in mind, the support is experimental. There might be bugs and you might not always get past the reCAPTCHA check, but most of the time it works pretty well :) + +If you need more than the 1000 API calls you can do per month, I'd suggest opting for a paid account... (No I don't get any money for recommending good services) + + ## Understanding the fundamentals There are 3 important parts in Fredy, that you need to understand to leverage the full power of _Fredy_. @@ -57,9 +66,6 @@ yarn run test # Architecture ![Architecture](/doc/architecture.jpg "Architecture") -## Why is Immoscout missing -Immoscout decided to add "robot protection" to their service. Meaning if Fredy tries to check for listings, it will be recognized as a bot. I haven't found a way around it (yet) ;) - #### Contribution guidelines See [Contribution](https://github.com/orangecoding/fredy/blob/master/CONTRIBUTION.md) diff --git a/conf/config.json b/conf/config.json index 53d66d8..bf23b83 100755 --- a/conf/config.json +++ b/conf/config.json @@ -1,4 +1,7 @@ { "interval": 30, - "port": 9998 + "port": 9998, + "scrapingAnt": { + "apiKey": "" + } } diff --git a/lib/FredyRuntime.js b/lib/FredyRuntime.js index b7b45c1..89d1d7b 100755 --- a/lib/FredyRuntime.js +++ b/lib/FredyRuntime.js @@ -3,6 +3,7 @@ const { setKnownListings, getKnownListings } = require('./services/storage/listi const notify = require('./notification/notify'); const xray = require('./services/scraper'); +const scrapingAnt = require('./services/scrapingAnt'); class FredyRuntime { /** @@ -41,15 +42,24 @@ class FredyRuntime { _getListings(url) { return new Promise((resolve, reject) => { - let x = xray(url, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields]); - - x((err, listings) => { - if (err) { + const id = this._providerId; + if (scrapingAnt.isImmoscout(id) && !scrapingAnt.isScrapingAntApiKeySet()) { + const error = 'Immoscout can only be used with if you have set an apikey 
for scrapingAnt.'; + /* eslint-disable no-console */ + console.log(error); + /* eslint-enable no-console */ + reject(error); + return; + } + const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url; + xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields]) + .then((listings) => { + resolve(listings == null ? [] : listings); + }) + .catch((err) => { reject(err); - } else { - resolve(listings); - } - }); + console.error(err); + }); }); } diff --git a/lib/provider/immoscout.js b/lib/provider/immoscout.js new file mode 100644 index 0000000..ddf8364 --- /dev/null +++ b/lib/provider/immoscout.js @@ -0,0 +1,44 @@ +const utils = require('../utils'); + +let appliedBlackList = []; + +function normalize(o) { + const title = o.title.replace('NEU', ''); + const address = (o.address || '').replace(/\(.*\),.*$/, '').trim(); + const link = `https://www.immobilienscout24.de${o.link.substring(o.link.indexOf('/expose'))}`; + return Object.assign(o, { title, address, link }); +} + +function applyBlacklist(o) { + return !utils.isOneOf(o.title, appliedBlackList); +} + +const config = { + url: null, + crawlContainer: '#resultListItems li.result-list__listing', + crawlFields: { + id: '.result-list-entry@data-obid | int', + price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim', + size: '.result-list-entry .result-list-entry__criteria .grid-item:nth-child(2) dd | removeNewline | trim', + title: '.result-list-entry .result-list-entry__brand-title-container h5 | removeNewline | trim', + link: '.result-list-entry .result-list-entry__brand-title-container@href', + address: '.result-list-entry .result-list-entry__map-link', + }, + paginate: '#pager .align-right a@href', + normalize: normalize, + filter: applyBlacklist, +}; + +exports.init = (sourceConfig, blacklist) => { + config.enabled = sourceConfig.enabled; + config.url = sourceConfig.url; + appliedBlackList = blacklist || []; 
+}; + +exports.metaInformation = { + name: 'Immoscout', + baseUrl: 'https://www.immobilienscout24.de/', + id: __filename.slice(__dirname.length + 1, -3), +}; + +exports.config = config; diff --git a/lib/services/requestDriver.js b/lib/services/requestDriver.js new file mode 100644 index 0000000..39d8ebd --- /dev/null +++ b/lib/services/requestDriver.js @@ -0,0 +1,33 @@ +const axios = require('axios'); + +function makeDriver(headers = {}) { + let cookies = ''; + + return async function driver(context, callback) { + const url = context.url; + let result; + try { + result = await axios({ + url, + headers: { + ...headers, + Cookie: cookies, + }, + }); + } catch (exception) { + callback(exception, null); + } + + if (typeof result.data === 'object' && url.toLowerCase().indexOf('scrapingant') !== -1) { + //assume we have gotten a response from scrapingAnt + if (cookies.length === 0) { + cookies = result.data.cookies; + } + callback(null, result.data.content); + } else { + callback(null, result.data); + } + }; +} + +module.exports = makeDriver; diff --git a/lib/services/scraper.js b/lib/services/scraper.js index 9959111..a677699 100755 --- a/lib/services/scraper.js +++ b/lib/services/scraper.js @@ -1,4 +1,5 @@ -const makeDriver = require('request-x-ray'); +const config = require('../../conf/config.json'); +const makeDriver = require('./requestDriver'); const Xray = require('x-ray'); class Scraper { @@ -9,14 +10,16 @@ class Scraper { int: this._int, }; - const driver = makeDriver({ - headers: { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36', - cookie: - 'longUnreliableState="dWlkcg==:YS1kZDViMzVhZWRhMTk0MDdmYWRjNDNkY2VmYTcxZmVkOQ=="; 
eveD=eyJldnRfZ2FfYWN0aW9uIjpbInNlYXJjaCJdLCJldnRfZ2FfY2F0ZWdvcnkiOlsicmVzdWx0bGlzdCJdLCJnZW9fYmxuIjpbIm5vcmRyaGVpbl93ZXN0ZmFsZW4iXSwiZXZ0X2dhX2xhYmVsIjpbImRpc3RyaWN0Il0sIm9ial9pdHlwIjpbIndvaG51bmdfa2F1ZiJdLCJnZW9fa3JzIjpbImTDvHNzZWxkb3JmIl0sImdlb19sYW5kIjpbImRldXRzY2hsYW5kIl0sIm9ial9yZXN1bHRsaXN0X2NvdW50IjpbIjI4NCJdLCJvYmpfY3Jvc3N0eXBlIjpbImxpdl9hcGFydG1lbnRfYnV5Il19; ABNTEST=9526230109; is24_experiment_visitor_id=d568590b-951b-45c3-b890-13feef6ee472; reese84=3:Xf3JwcTIC3yeubDXqWBTfg==:oqnDVs58wBxZRMfpzPnlzLzscVQhboRBffkM4caxNe+vLBdozdtdrCwpcTKyvIuhB9MOMCAinb2qnSTL4D9kLpqL72gl+jtl7QdiNAEn2erDKLqX4b9/K5wFU7j6qzxFWdfcMUm295qU3o3s7O8CM8HdghKYOVtoif+qTkeztphyYMfmAePYkfYRhZXZaFwHwxUfkRVUEX2VKoepkTf9TudCHsTYXWqvnpUt/CT+yrFHlUdTgdTWfD5tQJvn3inPqKERAB8TTKoHIvM4duBJV/5fZDax07CHNqHcKhrws0pq4y2ssKfdxLxCE0OIpnMSOtmn7O0koDoV6RzRjNUC+UZ7mhPFH+YSPHTb+6VJsZQDnRufEIz4B1WWIORV+jvHzfIli9OHsmOPnskA6mnCpFwEvQAfJu9R+jI9dccjFno=:Oc7c2wwYiNMBJnvZeDCIKLP0LuVVPWJ4kzd5MPlsoTg=', - }, - }); + const headers = { + 'User-Agent': + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36', + useQueryString: true, + }; + + if (config.scrapingAnt != null && config.scrapingAnt.apiKey != null) { + headers['x-api-key'] = config.scrapingAnt.apiKey; + } + const driver = makeDriver(headers); const xray = Xray({ filters }); xray.driver(driver); diff --git a/lib/services/scrapingAnt.js b/lib/services/scrapingAnt.js new file mode 100644 index 0000000..e13a7e4 --- /dev/null +++ b/lib/services/scrapingAnt.js @@ -0,0 +1,24 @@ +const { metaInformation } = require('../provider/immoscout'); +//to better confure re-capture chose a random proxy each time we do a call +const proxies = ['ae', 'br', 'cn', 'de', 'es', 'fr', 'gb', 'hk', 'in', 'it', 'il', 'jp', 'nl', 'ru', 'sa', 'us', 'cz']; +const config = require('../../conf/config.json'); + +const isImmoscout = (id) => { + return id.toLowerCase() === metaInformation.id; +}; + +exports.transformUrlForScrapingAnt 
= (url, id) => { + const randomProxy = proxies[Math.floor(Math.random() * proxies.length)]; + + if (isImmoscout(id)) { + //only do calls to scrapingAnt when dealing with Immoscout + url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_country=${randomProxy}`; + } + return url; +}; + +exports.isScrapingAntApiKeySet = () => { + return config.scrapingAnt != null && config.scrapingAnt.apiKey != null && config.scrapingAnt.apiKey.length > 0; +}; + +exports.isImmoscout = isImmoscout; diff --git a/package.json b/package.json index 1732525..09c4579 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "3.0.0", + "version": "4.0.0", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "start": "node index.js", @@ -72,7 +72,6 @@ "react-switch": "^6.0.0", "redux": "4.0.5", "redux-thunk": "2.3.0", - "request-x-ray": "0.1.4", "restana": "4.8.1", "semantic-ui-react": "2.0.3", "serve-static": "^1.14.1", diff --git a/test/provider/immoscout.test.js b/test/provider/immoscout.test.js new file mode 100644 index 0000000..657f793 --- /dev/null +++ b/test/provider/immoscout.test.js @@ -0,0 +1,56 @@ +const mockNotification = require('../mocks/mockNotification'); +const providerConfig = require('./testProvider.json'); +const mockStore = require('../mocks/mockStore'); +const proxyquire = require('proxyquire').noCallThru(); +const expect = require('chai').expect; +const provider = require('../../lib/provider/immoscout'); +const scrapingAnt = require('../../lib/services/scrapingAnt'); + +describe('#immoscout testsuite()', () => { + provider.init(providerConfig.immoscout, [], []); + const Fredy = proxyquire('../../lib/FredyRuntime', { + './services/storage/listingsStorage': { + ...mockStore, + }, + './notification/notify': mockNotification, + }); + + it('should test immoscout provider', async () => { + return await new Promise((resolve) => { + if (!scrapingAnt.isScrapingAntApiKeySet()) { + /* eslint-disable 
no-console */ + console.info('Skipping Immoscout test as ScrapingAnt Api Key is not set.'); + /* eslint-enable no-console */ + resolve(); + return; + } + + const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1'); + fredy.execute().then((listing) => { + expect(listing).to.be.a('array'); + + const notificationObj = mockNotification.get(); + expect(notificationObj).to.be.a('object'); + expect(notificationObj.serviceName).to.equal('immoscout'); + + notificationObj.payload.forEach((notify) => { + /** check the actual structure **/ + expect(notify.id).to.be.a('number'); + expect(notify.price).to.be.a('string'); + expect(notify.size).to.be.a('string'); + expect(notify.title).to.be.a('string'); + expect(notify.link).to.be.a('string'); + expect(notify.address).to.be.a('string'); + + /** check the values if possible **/ + expect(notify.price).that.does.include('€'); + expect(notify.size).that.does.include('m²'); + expect(notify.title).to.be.not.empty; + expect(notify.link).that.does.include('https://www.immobilienscout24.de'); + expect(notify.address).to.be.not.empty; + }); + resolve(); + }); + }); + }); +}); diff --git a/test/provider/testProvider.json b/test/provider/testProvider.json index 84ee78c..4d4dbbe 100644 --- a/test/provider/testProvider.json +++ b/test/provider/testProvider.json @@ -12,6 +12,10 @@ "url": "https://www.immowelt.de/liste/duesseldorf-benrath/wohnungen/kaufen?geoid=10805111000004%2C10805111000005%2C10805111000006%2C10805111000007%2C10805111000009%2C10805111000010%2C10805111000011%2C10805111000013%2C10805111000014%2C10805111000015%2C10805111000016%2C10805111000017%2C10805111000018%2C10805111000019%2C10805111000023%2C10805111000024%2C10805111000027%2C10805111000032%2C10805111000034%2C10805111000035%2C10805111000039%2C10805111000041%2C10805111000042%2C10805111000043%2C10805111000047%2C10805111000048%2C10805111000049%2C10805111000051%2C10805111000052%2C10805111000053&roomi=3&prima=420000&wflmi=90&sort=createdate%2Bdesc", 
"enabled": true }, + "immoscout": { + "url": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten?enteredFrom=one_step_search", + "enabled": true + }, "kalaydo": { "url": "https://www.kalaydo.de/immobilien/eigentumswohnung-kaufen/o/duesseldorf/4/?attr_gt_estate_size_living_area=90.0&attr_gt_no_of_rooms=3.5&maxPrice=420000.00&radius=5&resultsPerPage=50&sorting=-date", "enabled": true diff --git a/ui/src/views/jobs/mutation/components/provider/ProviderMutator.js b/ui/src/views/jobs/mutation/components/provider/ProviderMutator.js index de93e52..66233de 100644 --- a/ui/src/views/jobs/mutation/components/provider/ProviderMutator.js +++ b/ui/src/views/jobs/mutation/components/provider/ProviderMutator.js @@ -84,6 +84,11 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
When the search results are shown on the website, copy the url and paste it into the textfield below.
+ + If you chose Immoscout as a provider, make sure to also add the scrapingAnt apiKey to the config.json. + (See readme) + +
Do not forget to sort the results by date before copying the url to Fredy, so that Fredy always captures the latest search results. diff --git a/webpack.common.js b/webpack.common.js index fcfc00b..0c17f64 100644 --- a/webpack.common.js +++ b/webpack.common.js @@ -18,6 +18,7 @@ module.exports = { publicPath: '/', filename: 'fredy.bundle.js', }, + performance: { hints: false }, module: { rules: [ { diff --git a/yarn.lock b/yarn.lock index ac17694..c1cac5b 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1895,6 +1895,15 @@ bcrypt-pbkdf@^1.0.0: dependencies: tweetnacl "^0.14.3" +bent@^7.3.12: + version "7.3.12" + resolved "https://registry.yarnpkg.com/bent/-/bent-7.3.12.tgz#e0a2775d4425e7674c64b78b242af4f49da6b035" + integrity sha512-T3yrKnVGB63zRuoco/7Ybl7BwwGZR0lceoVG5XmQyMIH9s19SV5m+a8qam4if0zQuAmOQTyPTPmsQBdAorGK3w== + dependencies: + bytesish "^0.4.1" + caseless "~0.12.0" + is-stream "^2.0.0" + big.js@^5.2.2: version "5.2.2" resolved "https://registry.yarnpkg.com/big.js/-/big.js-5.2.2.tgz#65f0af382f578bcdc742bd9c281e9cb2d7768328" @@ -2133,6 +2142,11 @@ bytes@3.1.0: resolved "https://registry.yarnpkg.com/bytes/-/bytes-3.1.0.tgz#f6cf7933a360e0588fa9fde85651cdc7f805d1f6" integrity sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg== +bytesish@^0.4.1: + version "0.4.4" + resolved "https://registry.yarnpkg.com/bytesish/-/bytesish-0.4.4.tgz#f3b535a0f1153747427aee27256748cff92347e6" + integrity sha512-i4uu6M4zuMUiyfZN4RU2+i9+peJh//pXhd9x1oSe1LBkZ3LEbCoygu8W0bXTukU1Jme2txKuotpCZRaC3FLxcQ== + cacache@^12.0.2: version "12.0.4" resolved "https://registry.yarnpkg.com/cacache/-/cacache-12.0.4.tgz#668bcbd105aeb5f1d92fe25570ec9525c8faa40c"