mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
committed by
GitHub
parent
9726079f62
commit
70cab66651
12
README.md
12
README.md
@@ -17,6 +17,15 @@ yarn run start
|
||||
```
|
||||
_Fredy_ will start with the default port, set to `9998`. You can access _Fredy_ by opening a browser `http://localhost:9998`. The default login is `admin` for username and password. (You should change the password asap when you plan to run Fredy on your server.)
|
||||
|
||||
## Immoscout
|
||||
I have added **EXPERIMENTAL** support for Immoscout. Immoscout is somewhat special, because they have decided to secure their service from bots using reCAPTCHA. Finding a way around this is barely possible. For _Fredy_ to be able to bypass the check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) re-send the cookies each time.
|
||||
|
||||
To be able to use Immoscout, you need to create a ScrapingAnt account and copy the apiKey into the config file under /conf/config.json.
|
||||
The rest should be done by _Fredy_. Keep in mind, the support is experimental. There might be bugs and you might not always get past the reCAPTCHA check, but most of the time it works pretty well :)
|
||||
|
||||
If you need more than the 1000 API calls you can do per month, I'd suggest opting for a paid account... (No, I don't get any money for recommending good services)
|
||||
|
||||
|
||||
## Understanding the fundamentals
|
||||
There are 3 important parts in Fredy, that you need to understand to leverage the full power of _Fredy_.
|
||||
|
||||
@@ -57,9 +66,6 @@ yarn run test
|
||||
# Architecture
|
||||

|
||||
|
||||
## Why is Immoscout missing
|
||||
Immoscout decided to add "robot protection" to their service. Meaning if Fredy tries to check for listings, it will be recognized as a bot. I haven't found a way around it (yet) ;)
|
||||
|
||||
#### Contribution guidelines
|
||||
|
||||
See [Contribution](https://github.com/orangecoding/fredy/blob/master/CONTRIBUTION.md)
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
{
|
||||
"interval": 30,
|
||||
"port": 9998
|
||||
"port": 9998,
|
||||
"scrapingAnt": {
|
||||
"apiKey": ""
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ const { setKnownListings, getKnownListings } = require('./services/storage/listi
|
||||
|
||||
const notify = require('./notification/notify');
|
||||
const xray = require('./services/scraper');
|
||||
const scrapingAnt = require('./services/scrapingAnt');
|
||||
|
||||
class FredyRuntime {
|
||||
/**
|
||||
@@ -41,15 +42,24 @@ class FredyRuntime {
|
||||
|
||||
_getListings(url) {
|
||||
return new Promise((resolve, reject) => {
|
||||
let x = xray(url, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields]);
|
||||
|
||||
x((err, listings) => {
|
||||
if (err) {
|
||||
const id = this._providerId;
|
||||
if (scrapingAnt.isImmoscout(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
|
||||
const error = 'Immoscout can only be used with if you have set an apikey for scrapingAnt.';
|
||||
/* eslint-disable no-console */
|
||||
console.log(error);
|
||||
/* eslint-enable no-console */
|
||||
reject(error);
|
||||
return;
|
||||
}
|
||||
const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
|
||||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
|
||||
.then((listings) => {
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
} else {
|
||||
resolve(listings);
|
||||
}
|
||||
});
|
||||
console.error(err);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
44
lib/provider/immoscout.js
Normal file
44
lib/provider/immoscout.js
Normal file
@@ -0,0 +1,44 @@
|
||||
const utils = require('../utils');
|
||||
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
 * Cleans up a raw Immoscout listing.
 *
 * - title: the first occurrence of the literal "NEU" is removed
 * - address: everything from a "(...)," group up to the end of the string is
 *   cut off and the remainder is trimmed
 * - link: rebuilt as an absolute immobilienscout24.de url, starting at the
 *   "/expose" segment of the scraped link (the whole scraped link is appended
 *   when it contains no "/expose" segment)
 *
 * Missing title / address / link values are treated as empty strings, so a
 * partially scraped entry does not throw.
 *
 * @param {Object} o raw listing; it is mutated in place
 * @returns {Object} the same object with title, address and link overwritten
 */
function normalize(o) {
  const title = (o.title || '').replace('NEU', '');
  const address = (o.address || '').replace(/\(.*\),.*$/, '').trim();

  const rawLink = o.link || '';
  const exposeStart = rawLink.indexOf('/expose');
  // no "/expose" segment: keep the complete scraped link as path
  const path = exposeStart === -1 ? rawLink : rawLink.substring(exposeStart);
  const link = `https://www.immobilienscout24.de${path}`;

  return Object.assign(o, { title, address, link });
}
|
||||
|
||||
/**
 * Listing filter used by the provider config.
 *
 * @param {Object} o listing whose title is checked
 * @returns {boolean} the negation of utils.isOneOf(title, appliedBlackList),
 * i.e. false for listings that hit the currently applied blacklist
 */
function applyBlacklist(o) {
  const isBlacklisted = utils.isOneOf(o.title, appliedBlackList);
  return !isBlacklisted;
}
|
||||
|
||||
// Provider configuration for Immoscout. `url` (and `enabled`) are filled in by init().
const config = {
  url: null,
  // selector of a single entry in the result list
  crawlContainer: '#resultListItems li.result-list__listing',
  // fields extracted per entry; values are "selector[@attribute] | filter" strings
  // NOTE(review): the filters (int, removeNewline, trim) are presumably the ones registered in the scraper service - confirm
  crawlFields: {
    id: '.result-list-entry@data-obid | int',
    price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim',
    size: '.result-list-entry .result-list-entry__criteria .grid-item:nth-child(2) dd | removeNewline | trim',
    title: '.result-list-entry .result-list-entry__brand-title-container h5 | removeNewline | trim',
    link: '.result-list-entry .result-list-entry__brand-title-container@href',
    address: '.result-list-entry .result-list-entry__map-link',
  },
  // selector of the link to the next result page
  paginate: '#pager .align-right a@href',
  normalize,
  filter: applyBlacklist,
};
|
||||
|
||||
exports.init = (sourceConfig, blacklist) => {
|
||||
config.enabled = sourceConfig.enabled;
|
||||
config.url = sourceConfig.url;
|
||||
appliedBlackList = blacklist || [];
|
||||
};
|
||||
|
||||
exports.metaInformation = {
|
||||
name: 'Immoscout',
|
||||
baseUrl: 'https://www.immobilienscout24.de/',
|
||||
id: __filename.slice(__dirname.length + 1, -3),
|
||||
};
|
||||
|
||||
exports.config = config;
|
||||
33
lib/services/requestDriver.js
Normal file
33
lib/services/requestDriver.js
Normal file
@@ -0,0 +1,33 @@
|
||||
const axios = require('axios');
|
||||
|
||||
function makeDriver(headers = {}) {
|
||||
let cookies = '';
|
||||
|
||||
return async function driver(context, callback) {
|
||||
const url = context.url;
|
||||
let result;
|
||||
try {
|
||||
result = await axios({
|
||||
url,
|
||||
headers: {
|
||||
...headers,
|
||||
Cookie: cookies,
|
||||
},
|
||||
});
|
||||
} catch (exception) {
|
||||
callback(exception, null);
|
||||
}
|
||||
|
||||
if (typeof result.data === 'object' && url.toLowerCase().indexOf('scrapingant') !== -1) {
|
||||
//assume we have gotten a response from scrapingAnt
|
||||
if (cookies.length === 0) {
|
||||
cookies = result.data.cookies;
|
||||
}
|
||||
callback(null, result.data.content);
|
||||
} else {
|
||||
callback(null, result.data);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
module.exports = makeDriver;
|
||||
@@ -1,4 +1,5 @@
|
||||
const makeDriver = require('request-x-ray');
|
||||
const config = require('../../conf/config.json');
|
||||
const makeDriver = require('./requestDriver');
|
||||
const Xray = require('x-ray');
|
||||
|
||||
class Scraper {
|
||||
@@ -9,14 +10,16 @@ class Scraper {
|
||||
int: this._int,
|
||||
};
|
||||
|
||||
const driver = makeDriver({
|
||||
headers: {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36',
|
||||
cookie:
|
||||
'longUnreliableState="dWlkcg==:YS1kZDViMzVhZWRhMTk0MDdmYWRjNDNkY2VmYTcxZmVkOQ=="; eveD=eyJldnRfZ2FfYWN0aW9uIjpbInNlYXJjaCJdLCJldnRfZ2FfY2F0ZWdvcnkiOlsicmVzdWx0bGlzdCJdLCJnZW9fYmxuIjpbIm5vcmRyaGVpbl93ZXN0ZmFsZW4iXSwiZXZ0X2dhX2xhYmVsIjpbImRpc3RyaWN0Il0sIm9ial9pdHlwIjpbIndvaG51bmdfa2F1ZiJdLCJnZW9fa3JzIjpbImTDvHNzZWxkb3JmIl0sImdlb19sYW5kIjpbImRldXRzY2hsYW5kIl0sIm9ial9yZXN1bHRsaXN0X2NvdW50IjpbIjI4NCJdLCJvYmpfY3Jvc3N0eXBlIjpbImxpdl9hcGFydG1lbnRfYnV5Il19; ABNTEST=9526230109; is24_experiment_visitor_id=d568590b-951b-45c3-b890-13feef6ee472; reese84=3:Xf3JwcTIC3yeubDXqWBTfg==:oqnDVs58wBxZRMfpzPnlzLzscVQhboRBffkM4caxNe+vLBdozdtdrCwpcTKyvIuhB9MOMCAinb2qnSTL4D9kLpqL72gl+jtl7QdiNAEn2erDKLqX4b9/K5wFU7j6qzxFWdfcMUm295qU3o3s7O8CM8HdghKYOVtoif+qTkeztphyYMfmAePYkfYRhZXZaFwHwxUfkRVUEX2VKoepkTf9TudCHsTYXWqvnpUt/CT+yrFHlUdTgdTWfD5tQJvn3inPqKERAB8TTKoHIvM4duBJV/5fZDax07CHNqHcKhrws0pq4y2ssKfdxLxCE0OIpnMSOtmn7O0koDoV6RzRjNUC+UZ7mhPFH+YSPHTb+6VJsZQDnRufEIz4B1WWIORV+jvHzfIli9OHsmOPnskA6mnCpFwEvQAfJu9R+jI9dccjFno=:Oc7c2wwYiNMBJnvZeDCIKLP0LuVVPWJ4kzd5MPlsoTg=',
|
||||
},
|
||||
});
|
||||
const headers = {
|
||||
'User-Agent':
|
||||
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36',
|
||||
useQueryString: true,
|
||||
};
|
||||
|
||||
if (config.scrapingAnt != null && config.scrapingAnt.apiKey != null) {
|
||||
headers['x-api-key'] = config.scrapingAnt.apiKey;
|
||||
}
|
||||
const driver = makeDriver(headers);
|
||||
|
||||
const xray = Xray({ filters });
|
||||
xray.driver(driver);
|
||||
|
||||
24
lib/services/scrapingAnt.js
Normal file
24
lib/services/scrapingAnt.js
Normal file
@@ -0,0 +1,24 @@
|
||||
const { metaInformation } = require('../provider/immoscout');
|
||||
//to better confuse reCAPTCHA, choose a random proxy each time we do a call
|
||||
const proxies = ['ae', 'br', 'cn', 'de', 'es', 'fr', 'gb', 'hk', 'in', 'it', 'il', 'jp', 'nl', 'ru', 'sa', 'us', 'cz'];
|
||||
const config = require('../../conf/config.json');
|
||||
|
||||
/**
 * Checks whether the given provider id is the id of the Immoscout provider.
 * The given id is lower-cased before it is compared.
 *
 * @param {string} id provider id
 * @returns {boolean} true if the id matches the Immoscout provider id;
 * false otherwise, including for missing or non-string ids
 */
const isImmoscout = (id) => {
  if (typeof id !== 'string') {
    return false;
  }
  return id.toLowerCase() === metaInformation.id;
};
|
||||
|
||||
exports.transformUrlForScrapingAnt = (url, id) => {
|
||||
const randomProxy = proxies[Math.floor(Math.random() * proxies.length)];
|
||||
|
||||
if (isImmoscout(id)) {
|
||||
//only do calls to scrapingAnt when dealing with Immoscout
|
||||
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_country=${randomProxy}`;
|
||||
}
|
||||
return url;
|
||||
};
|
||||
|
||||
exports.isScrapingAntApiKeySet = () => {
|
||||
return config.scrapingAnt != null && config.scrapingAnt.apiKey != null && config.scrapingAnt.apiKey.length > 0;
|
||||
};
|
||||
|
||||
exports.isImmoscout = isImmoscout;
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "fredy",
|
||||
"version": "3.0.0",
|
||||
"version": "4.0.0",
|
||||
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
||||
"scripts": {
|
||||
"start": "node index.js",
|
||||
@@ -72,7 +72,6 @@
|
||||
"react-switch": "^6.0.0",
|
||||
"redux": "4.0.5",
|
||||
"redux-thunk": "2.3.0",
|
||||
"request-x-ray": "0.1.4",
|
||||
"restana": "4.8.1",
|
||||
"semantic-ui-react": "2.0.3",
|
||||
"serve-static": "^1.14.1",
|
||||
|
||||
56
test/provider/immoscout.test.js
Normal file
56
test/provider/immoscout.test.js
Normal file
@@ -0,0 +1,56 @@
|
||||
const mockNotification = require('../mocks/mockNotification');
|
||||
const providerConfig = require('./testProvider.json');
|
||||
const mockStore = require('../mocks/mockStore');
|
||||
const proxyquire = require('proxyquire').noCallThru();
|
||||
const expect = require('chai').expect;
|
||||
const provider = require('../../lib/provider/immoscout');
|
||||
const scrapingAnt = require('../../lib/services/scrapingAnt');
|
||||
|
||||
/**
 * Integration test for the Immoscout provider. It runs a FredyRuntime with
 * mocked storage and notification and checks the shape of the notified
 * listings. The check is skipped when no ScrapingAnt api key is configured.
 */
describe('#immoscout testsuite()', () => {
  provider.init(providerConfig.immoscout, [], []);
  const Fredy = proxyquire('../../lib/FredyRuntime', {
    './services/storage/listingsStorage': {
      ...mockStore,
    },
    './notification/notify': mockNotification,
  });

  it('should test immoscout provider', async () => {
    if (!scrapingAnt.isScrapingAntApiKeySet()) {
      /* eslint-disable no-console */
      console.info('Skipping Immoscout test as ScrapingAnt Api Key is not set.');
      /* eslint-enable no-console */
      return;
    }

    const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1');
    // awaiting directly lets a rejected execute() or a failed expectation fail
    // the test instead of leaving a wrapper promise pending until the timeout
    const listing = await fredy.execute();
    expect(listing).to.be.a('array');

    const notificationObj = mockNotification.get();
    expect(notificationObj).to.be.a('object');
    expect(notificationObj.serviceName).to.equal('immoscout');

    notificationObj.payload.forEach((notify) => {
      /** check the actual structure **/
      expect(notify.id).to.be.a('number');
      expect(notify.price).to.be.a('string');
      expect(notify.size).to.be.a('string');
      expect(notify.title).to.be.a('string');
      expect(notify.link).to.be.a('string');
      expect(notify.address).to.be.a('string');

      /** check the values if possible **/
      expect(notify.price).that.does.include('€');
      expect(notify.size).that.does.include('m²');
      expect(notify.title).to.be.not.empty;
      expect(notify.link).that.does.include('https://www.immobilienscout24.de');
      expect(notify.address).to.be.not.empty;
    });
  });
});
|
||||
@@ -12,6 +12,10 @@
|
||||
"url": "https://www.immowelt.de/liste/duesseldorf-benrath/wohnungen/kaufen?geoid=10805111000004%2C10805111000005%2C10805111000006%2C10805111000007%2C10805111000009%2C10805111000010%2C10805111000011%2C10805111000013%2C10805111000014%2C10805111000015%2C10805111000016%2C10805111000017%2C10805111000018%2C10805111000019%2C10805111000023%2C10805111000024%2C10805111000027%2C10805111000032%2C10805111000034%2C10805111000035%2C10805111000039%2C10805111000041%2C10805111000042%2C10805111000043%2C10805111000047%2C10805111000048%2C10805111000049%2C10805111000051%2C10805111000052%2C10805111000053&roomi=3&prima=420000&wflmi=90&sort=createdate%2Bdesc",
|
||||
"enabled": true
|
||||
},
|
||||
"immoscout": {
|
||||
"url": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten?enteredFrom=one_step_search",
|
||||
"enabled": true
|
||||
},
|
||||
"kalaydo": {
|
||||
"url": "https://www.kalaydo.de/immobilien/eigentumswohnung-kaufen/o/duesseldorf/4/?attr_gt_estate_size_living_area=90.0&attr_gt_no_of_rooms=3.5&maxPrice=420000.00&radius=5&resultsPerPage=50&sorting=-date",
|
||||
"enabled": true
|
||||
|
||||
@@ -84,6 +84,11 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
|
||||
<br />
|
||||
When the search results are shown on the website, copy the url and paste it into the textfield below.
|
||||
<br />
|
||||
<span style={{ color: '#ff0000' }}>
|
||||
If you chose Immoscout as a provider, make sure to also add the scrapingAnt apiKey to the config.json.
|
||||
(See readme)
|
||||
</span>
|
||||
<br />
|
||||
<span style={{ color: '#ff0000' }}>
|
||||
Do not forget to sort the results by date before copying the url to Fredy, so that Fredy always captures
|
||||
the latest search results.
|
||||
|
||||
@@ -18,6 +18,7 @@ module.exports = {
|
||||
publicPath: '/',
|
||||
filename: 'fredy.bundle.js',
|
||||
},
|
||||
performance: { hints: false },
|
||||
module: {
|
||||
rules: [
|
||||
{
|
||||
|
||||
14
yarn.lock
14
yarn.lock
@@ -1895,6 +1895,15 @@ bcrypt-pbkdf@^1.0.0:
|
||||
dependencies:
|
||||
tweetnacl "^0.14.3"
|
||||
|
||||
bent@^7.3.12:
|
||||
version "7.3.12"
|
||||
resolved "https://registry.yarnpkg.com/bent/-/bent-7.3.12.tgz#e0a2775d4425e7674c64b78b242af4f49da6b035"
|
||||
integrity sha512-T3yrKnVGB63zRuoco/7Ybl7BwwGZR0lceoVG5XmQyMIH9s19SV5m+a8qam4if0zQuAmOQTyPTPmsQBdAorGK3w==
|
||||
dependencies:
|
||||
bytesish "^0.4.1"
|
||||
caseless "~0.12.0"
|
||||
is-stream "^2.0.0"
|
||||
|
||||
big.js@^5.2.2:
|
||||
version "5.2.2"
|
||||
resolved "https://registry.yarnpkg.com/big.js/-/big.js-5.2.2.tgz#65f0af382f578bcdc742bd9c281e9cb2d7768328"
|
||||
@@ -2133,6 +2142,11 @@ bytes@3.1.0:
|
||||
resolved "https://registry.yarnpkg.com/bytes/-/bytes-3.1.0.tgz#f6cf7933a360e0588fa9fde85651cdc7f805d1f6"
|
||||
integrity sha512-zauLjrfCG+xvoyaqLoV8bLVXXNGC4JqlxFCutSDWA6fJrTo2ZuvLYTqZ7aHBLZSMOopbzwv8f+wZcVzfVTI2Dg==
|
||||
|
||||
bytesish@^0.4.1:
|
||||
version "0.4.4"
|
||||
resolved "https://registry.yarnpkg.com/bytesish/-/bytesish-0.4.4.tgz#f3b535a0f1153747427aee27256748cff92347e6"
|
||||
integrity sha512-i4uu6M4zuMUiyfZN4RU2+i9+peJh//pXhd9x1oSe1LBkZ3LEbCoygu8W0bXTukU1Jme2txKuotpCZRaC3FLxcQ==
|
||||
|
||||
cacache@^12.0.2:
|
||||
version "12.0.4"
|
||||
resolved "https://registry.yarnpkg.com/cacache/-/cacache-12.0.4.tgz#668bcbd105aeb5f1d92fe25570ec9525c8faa40c"
|
||||
|
||||
Reference in New Issue
Block a user