From 4ba098e0b602979911e0e965646ab12a0af63eaa Mon Sep 17 00:00:00 2001
From: "weakmap@gmail.com"
Date: Sat, 15 Apr 2023 18:24:51 +0200
Subject: [PATCH] bringing back immonet by using scrapingant
---
README.md | 6 ++---
lib/FredyRuntime.js | 6 ++---
lib/api/routes/jobRouter.js | 2 +-
lib/provider/immonet.js | 18 +++++++-------
lib/services/requestDriver.js | 3 ++-
lib/services/scrapingAnt.js | 24 +++++++++++++------
test/provider/immonet.test.js | 12 ++++++++--
test/provider/testProvider.json | 2 +-
ui/src/views/jobs/ProcessingTimes.jsx | 2 +-
.../components/provider/ProviderMutator.jsx | 2 +-
10 files changed, 47 insertions(+), 30 deletions(-)
diff --git a/README.md b/README.md
index e113612..6804ad0 100755
--- a/README.md
+++ b/README.md
@@ -81,10 +81,10 @@ yarn run test
# Architecture

-### Immoscout
-I have added **experimental** support for Immoscout. Immoscout is somewhat special, because they have decided to secure their service from bots using Re-Capture. Finding a way around this is barely possible. For _Fredy_ to be able to bypass this check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) to re-send the cookies each time.
+### Immoscout / Immonet
+I have added **experimental** support for Immoscout and Immonet. Both are somewhat special because they protect their services from bots using reCAPTCHA, which is very hard to get around. For _Fredy_ to be able to bypass this check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) to re-send the cookies each time.
-To be able to use Immoscout, you need to create an account at ScrapingAnt. Configure the API key in the "General Settings" tab (visible when logged in as administrator).
+To be able to use Immoscout / Immonet, you need to create an account at ScrapingAnt. Configure the API key in the "General Settings" tab (visible when logged in as administrator).
The rest will be handled by _Fredy_. Keep in mind, the support is experimental. There might be bugs and you might not always pass the re-capture check, but most of the time it works rather well :)
If you need more than the 1000 API calls allowed per month, I'd suggest opting for a paid account... ScrapingAnt loves OpenSource, therefore they have decided to give all _Fredy_ users a 10% discount by using the code **FREDY10** (Disclaimer: I do not earn any money for recommending their service).
diff --git a/lib/FredyRuntime.js b/lib/FredyRuntime.js
index 1c82f3a..210731c 100755
--- a/lib/FredyRuntime.js
+++ b/lib/FredyRuntime.js
@@ -45,15 +45,15 @@ class FredyRuntime {
_getListings(url) {
return new Promise((resolve, reject) => {
const id = this._providerId;
- if (scrapingAnt.isImmoscout(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
- const error = 'Immoscout can only be used with if you have set an apikey for scrapingAnt.';
+ if (scrapingAnt.needScrapingAnt(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
+ const error = 'Immoscout or Immonet can only be used with if you have set an apikey for scrapingAnt.';
/* eslint-disable no-console */
console.log(error);
/* eslint-enable no-console */
reject(error);
return;
}
- const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
+ const u = scrapingAnt.needScrapingAnt(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
try {
if (this._providerConfig.paginate != null) {
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
diff --git a/lib/api/routes/jobRouter.js b/lib/api/routes/jobRouter.js
index c4690e3..09694b4 100644
--- a/lib/api/routes/jobRouter.js
+++ b/lib/api/routes/jobRouter.js
@@ -28,7 +28,7 @@ jobRouter.get('/processingTimes', async (req, res) => {
let scrapingAntData = null;
if (config.scrapingAnt.apiKey != null && config.scrapingAnt.apiKey.length > 0) {
try {
- const response = await fetch(`https://api.scrapingant.com/v1/usage?x-api-key=${config.scrapingAnt.apiKey}`);
+ const response = await fetch(`https://api.scrapingant.com/v2/usage?x-api-key=${config.scrapingAnt.apiKey}`);
scrapingAntData = await response.json();
} catch (Exception) {
console.error('Could not query plan data from scraping ant.', Exception);
diff --git a/lib/provider/immonet.js b/lib/provider/immonet.js
index 3c34593..aa39957 100755
--- a/lib/provider/immonet.js
+++ b/lib/provider/immonet.js
@@ -1,14 +1,12 @@
import utils from '../utils.js';
let appliedBlackList = [];
function normalize(o) {
- const id = parseInt(o.id.substring(o.id.indexOf('_') + 1, o.id.length));
+ const id = o.id.substring(o.id.lastIndexOf('/') + 1, o.id.length);
const size = o.size != null ? o.size.replace('Wohnfläche ', '') : 'N/A m²';
const price = o.price.replace('Kaufpreis ', '');
const address = o.address.split(' • ')[o.address.split(' • ').length - 1];
const title = o.title || 'No title available';
- //normally we would just read the link from the source, but immonet decided to trick user by adding a click listener instead of
- //a href to do some weird reporting. (Very user friendly for handicaped ppl... not)
- const link = `https://www.immonet.de/angebot/${id}`;
+ const link = o.id;
return Object.assign(o, { id, address, price, size, title, link });
}
function applyBlacklist(o) {
@@ -18,14 +16,14 @@ function applyBlacklist(o) {
}
const config = {
url: null,
- crawlContainer: '#result-list-stage .item',
+ crawlContainer: '.content-wrapper-tiles .ng-star-inserted',
sortByDateParam: 'sortby=19',
crawlFields: {
- id: '@id',
- price: 'div[id*="selPrice_"] | trim',
- size: 'div[id*="selArea_"] | trim',
- title: '.item a img@title',
- address: '.item .box-25 .ellipsis .text-100 | removeNewline | trim',
+ id: '.card a@href',
+ title: '.card h3 |trim',
+ price: '.card .has-font-300 .is-bold | trim',
+ size: '.card .has-font-300 .ml-100 | trim',
+ address: '.card span:nth-child(2) | trim',
},
paginate: '#idResultList .margin-bottom-6.margin-bottom-sm-12 .panel a.pull-right@href',
normalize: normalize,
diff --git a/lib/services/requestDriver.js b/lib/services/requestDriver.js
index 7848c97..7d42c5e 100644
--- a/lib/services/requestDriver.js
+++ b/lib/services/requestDriver.js
@@ -46,9 +46,10 @@ function makeDriver(headers = {}) {
/* eslint-enable no-console */
}
}
+
/**
* The regular request driver is taking care of everyting, that doesn't need to be scraped by ScrapingAnt (which is
- * everything != Immoscout as of writing this)
+ * everything except Immoscout and Immonet as of writing this)
*/
return async function driver(context, callback) {
if (context.url.toLowerCase().indexOf('scrapingant') !== -1) {
diff --git a/lib/services/scrapingAnt.js b/lib/services/scrapingAnt.js
index ee1df84..c934a9c 100644
--- a/lib/services/scrapingAnt.js
+++ b/lib/services/scrapingAnt.js
@@ -1,12 +1,22 @@
-import { metaInformation } from '../provider/immoscout.js';
+import { metaInformation as immoScoutInfo } from '../provider/immoscout.js';
+import { metaInformation as immoNetInfo } from '../provider/immonet.js';
import { config } from '../utils.js';
-const isImmoscout = (id) => {
- return id.toLowerCase() === metaInformation.id;
+
+const additionalImmonetUrlParams = `&wait_for_selector=.content-wrapper-tiles&js_snippet=${Buffer.from(
+ 'window.scrollTo(0,document.body.scrollHeight);'
+).toString('base64')}`; // the scroll-to-bottom snippet is passed base64-encoded in js_snippet
+
+const needScrapingAnt = (id) => {
+ return id.toLowerCase() === immoScoutInfo.id || id.toLowerCase() === immoNetInfo.id;
};
export const transformUrlForScrapingAnt = (url, id) => {
- if (isImmoscout(id)) {
- //only do calls to scrapingAnt when dealing with Immoscout
- url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=datacenter`;
+ let urlParams = '';
+ if (needScrapingAnt(id)) {
+ if (id.toLowerCase() === immoNetInfo.id) {
+ urlParams = additionalImmonetUrlParams;
+ }
+ //only do calls to scrapingAnt when dealing with Immoscout/Immonet
+ url = `https://api.scrapingant.com/v2/general?url=${encodeURIComponent(url)}&proxy_type=datacenter${urlParams}`;
}
return url;
};
@@ -16,4 +26,4 @@ export const isScrapingAntApiKeySet = () => {
export const makeUrlResidential = (url) => {
return url.replace('datacenter', 'residential');
};
-export { isImmoscout };
+export { needScrapingAnt };
diff --git a/test/provider/immonet.test.js b/test/provider/immonet.test.js
index a57be7b..d5dd89a 100644
--- a/test/provider/immonet.test.js
+++ b/test/provider/immonet.test.js
@@ -3,6 +3,7 @@ import { get } from '../mocks/mockNotification.js';
import { mockFredy, providerConfig } from '../utils.js';
import chai from 'chai';
import * as provider from '../../lib/provider/immonet.js';
+import * as scrapingAnt from '../../lib/services/scrapingAnt.js';
const expect = chai.expect;
describe('#immonet testsuite()', () => {
after(() => {
@@ -12,6 +13,13 @@ describe('#immonet testsuite()', () => {
it('should test immonet provider', async () => {
const Fredy = await mockFredy();
return await new Promise((resolve) => {
+ if (!scrapingAnt.isScrapingAntApiKeySet()) {
+ /* eslint-disable no-console */
+ console.info('Skipping Immonet test as ScrapingAnt Api Key is not set.');
+ /* eslint-enable no-console */
+ resolve();
+ return;
+ }
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
@@ -20,17 +28,17 @@ describe('#immonet testsuite()', () => {
expect(notificationObj.serviceName).to.equal('immonet');
notificationObj.payload.forEach((notify) => {
/** check the actual structure **/
- expect(notify.id).to.be.a('number');
+ expect(notify.id).to.be.a('string');
expect(notify.price).to.be.a('string');
expect(notify.size).to.be.a('string');
expect(notify.title).to.be.a('string');
expect(notify.link).to.be.a('string');
expect(notify.address).to.be.a('string');
+
/** check the values if possible **/
expect(notify.price).that.does.include('€');
expect(notify.size).that.does.include('m²');
expect(notify.title).to.be.not.empty;
- expect(notify.link).that.does.include('https://www.immonet.de');
expect(notify.address).to.be.not.empty;
});
resolve();
diff --git a/test/provider/testProvider.json b/test/provider/testProvider.json
index 14dd2ea..a7b63a7 100644
--- a/test/provider/testProvider.json
+++ b/test/provider/testProvider.json
@@ -9,7 +9,7 @@
"enabled": true
},
"immonet": {
- "url": "https://www.immonet.de/immobiliensuche/sel.do?pageoffset=1&listsize=100&objecttype=1&locationname=Düsseldorf&acid=&actype=&district=8717&district=8718&district=8719&district=8720&district=8721&district=8723&district=8724&district=8725&district=8727&district=8728&district=8729&district=8730&district=8731&district=8732&district=8733&district=8737&district=8738&district=8741&district=8745&district=8747&district=8750&district=8752&district=8754&district=8755&district=8756&district=8759&district=8760&district=8761&district=8763&district=8764&district=8765&ajaxIsRadiusActive=false&sortby=19&suchart=1&radius=0&pcatmtypes=1_1&pCatMTypeStoragefield=&parentcat=1&marketingtype=1&fromprice=&toprice=420000&fromarea=90&toarea=&fromplotarea=&toplotarea=&fromrooms=3&torooms=&objectcat=225&objectcat=18&objectcat=17&objectcat=12&objectcat=16&objectcat=181&objectcat=14&objectcat=15&objectcat=226&objectcat=13&wbs=-1&fromyear=&toyear=",
+ "url": "https://www.immonet.de/immobiliensuche/beta?pageoffset=1&listsize=100&objecttype=1&locationname=D%C3%BCsseldorf&acid=&actype=&district=8717&district=8718&district=8719&district=8720&district=8721&district=8723&district=8724&district=8725&district=8727&district=8728&district=8729&district=8730&district=8731&district=8732&district=8733&district=8737&district=8738&district=8741&district=8745&district=8747&district=8750&district=8752&district=8754&district=8755&district=8756&district=8759&district=8760&district=8761&district=8763&district=8764&district=8765&ajaxIsRadiusActive=false&sortby=19&suchart=1&radius=0&pcatmtypes=1_1&pCatMTypeStoragefield=&parentcat=1&marketingtype=1&fromprice=&toprice=420000&fromarea=90&toarea=&fromplotarea=&toplotarea=&fromrooms=3&torooms=&objectcat=225&objectcat=18&objectcat=17&objectcat=12&objectcat=16&objectcat=181&objectcat=14&objectcat=15&objectcat=226&objectcat=13&wbs=-1&fromyear=&toyear=",
"enabled": true
},
"immowelt": {
diff --git a/ui/src/views/jobs/ProcessingTimes.jsx b/ui/src/views/jobs/ProcessingTimes.jsx
index 6310794..7bf1cb7 100644
--- a/ui/src/views/jobs/ProcessingTimes.jsx
+++ b/ui/src/views/jobs/ProcessingTimes.jsx
@@ -47,7 +47,7 @@ export default function ProcessingTimes({ processingTimes }) {
Credits: {processingTimes.scrapingAntData.remained_credits}/
{processingTimes.scrapingAntData.plan_total_credits} (250 credits per call)
- If you want to scrape Immoscout more often, you have to purchase a premium account of{' '}
+ If you want to scrape Immoscout or Immonet more often, you have to purchase a premium account of{' '}
ScrapingAnt
diff --git a/ui/src/views/jobs/mutation/components/provider/ProviderMutator.jsx b/ui/src/views/jobs/mutation/components/provider/ProviderMutator.jsx
index 37c1ab0..482b831 100644
--- a/ui/src/views/jobs/mutation/components/provider/ProviderMutator.jsx
+++ b/ui/src/views/jobs/mutation/components/provider/ProviderMutator.jsx
@@ -101,7 +101,7 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
description={
- If you chose Immoscout as a provider, make sure to also add the scrapingAnt apiKey to the config.json.
+ If you chose Immoscout or Immonet as a provider, make sure to also add the scrapingAnt apiKey to the config.json.
(See readme)