bringing back immonet by using scrapingant

This commit is contained in:
weakmap@gmail.com
2023-04-15 18:24:51 +02:00
parent 6fbee3e7c6
commit 4ba098e0b6
10 changed files with 47 additions and 30 deletions

View File

@@ -81,10 +81,10 @@ yarn run test
# Architecture
![Architecture](/doc/architecture.jpg "Architecture")
### Immoscout
I have added **experimental** support for Immoscout. Immoscout is somewhat special, because they have decided to secure their service from bots using Re-Capture. Finding a way around this is barely possible. For _Fredy_ to be able to bypass this check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) to re-send the cookies each time.
### Immoscout / Immonet
I have added **experimental** support for Immoscout and Immonet. They both are somewhat special, because they have decided to secure their service from bots using Re-Capture. Finding a way around this is barely possible. For _Fredy_ to be able to bypass this check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) to re-send the cookies each time.
To be able to use Immoscout, you need to create an account at ScrapingAnt. Configure the API key in the "General Settings" tab (visible when logged in as administrator).
To be able to use Immoscout / Immonet, you need to create an account at ScrapingAnt. Configure the API key in the "General Settings" tab (visible when logged in as administrator).
The rest will be handled by _Fredy_. Keep in mind, the support is experimental. There might be bugs and you might not always pass the re-capture check, but most of the time it works rather well :)
If you need more than the 1000 API calls allowed per month, I'd suggest opting for a paid account... ScrapingAnt loves OpenSource, therefore they have decided to give all _Fredy_ users a 10% discount by using the code **FREDY10** (Disclaimer: I do not earn any money for recommending their service).

View File

@@ -45,15 +45,15 @@ class FredyRuntime {
_getListings(url) {
return new Promise((resolve, reject) => {
const id = this._providerId;
if (scrapingAnt.isImmoscout(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
const error = 'Immoscout can only be used with if you have set an apikey for scrapingAnt.';
if (scrapingAnt.needScrapingAnt(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
const error = 'Immoscout or Immonet can only be used with if you have set an apikey for scrapingAnt.';
/* eslint-disable no-console */
console.log(error);
/* eslint-enable no-console */
reject(error);
return;
}
const u = scrapingAnt.isImmoscout(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
const u = scrapingAnt.needScrapingAnt(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
try {
if (this._providerConfig.paginate != null) {
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])

View File

@@ -28,7 +28,7 @@ jobRouter.get('/processingTimes', async (req, res) => {
let scrapingAntData = null;
if (config.scrapingAnt.apiKey != null && config.scrapingAnt.apiKey.length > 0) {
try {
const response = await fetch(`https://api.scrapingant.com/v1/usage?x-api-key=${config.scrapingAnt.apiKey}`);
const response = await fetch(`https://api.scrapingant.com/v2/usage?x-api-key=${config.scrapingAnt.apiKey}`);
scrapingAntData = await response.json();
} catch (Exception) {
console.error('Could not query plan data from scraping ant.', Exception);

View File

@@ -1,14 +1,12 @@
import utils from '../utils.js';
let appliedBlackList = [];
function normalize(o) {
const id = parseInt(o.id.substring(o.id.indexOf('_') + 1, o.id.length));
const id = o.id.substring(o.id.lastIndexOf('/') + 1, o.id.length);
const size = o.size != null ? o.size.replace('Wohnfläche ', '') : 'N/A m²';
const price = o.price.replace('Kaufpreis ', '');
const address = o.address.split(' • ')[o.address.split(' • ').length - 1];
const title = o.title || 'No title available';
//normally we would just read the link from the source, but immonet decided to trick user by adding a click listener instead of
//a href to do some weird reporting. (Very user friendly for handicaped ppl... not)
const link = `https://www.immonet.de/angebot/${id}`;
const link = o.id;
return Object.assign(o, { id, address, price, size, title, link });
}
function applyBlacklist(o) {
@@ -18,14 +16,14 @@ function applyBlacklist(o) {
}
const config = {
url: null,
crawlContainer: '#result-list-stage .item',
crawlContainer: '.content-wrapper-tiles .ng-star-inserted',
sortByDateParam: 'sortby=19',
crawlFields: {
id: '@id',
price: 'div[id*="selPrice_"] | trim',
size: 'div[id*="selArea_"] | trim',
title: '.item a img@title',
address: '.item .box-25 .ellipsis .text-100 | removeNewline | trim',
id: '.card a@href',
title: '.card h3 |trim',
price: '.card .has-font-300 .is-bold | trim',
size: '.card .has-font-300 .ml-100 | trim',
address: '.card span:nth-child(2) | trim',
},
paginate: '#idResultList .margin-bottom-6.margin-bottom-sm-12 .panel a.pull-right@href',
normalize: normalize,

View File

@@ -46,9 +46,10 @@ function makeDriver(headers = {}) {
/* eslint-enable no-console */
}
}
/**
* The regular request driver is taking care of everyting, that doesn't need to be scraped by ScrapingAnt (which is
* everything != Immoscout as of writing this)
* everything != Immoscout & Immonet as of writing this)
*/
return async function driver(context, callback) {
if (context.url.toLowerCase().indexOf('scrapingant') !== -1) {

View File

@@ -1,12 +1,22 @@
import { metaInformation } from '../provider/immoscout.js';
import { metaInformation as immoScoutInfo } from '../provider/immoscout.js';
import { metaInformation as immoNetInfo } from '../provider/immonet.js';
import { config } from '../utils.js';
const isImmoscout = (id) => {
return id.toLowerCase() === metaInformation.id;
const additionalImmonetUrlParams = `&wait_for_selector=.content-wrapper-tiles&js_snippet=${new Buffer(
'window.scrollTo(0,document.body.scrollHeight);'
).toString('base64')}`;
const needScrapingAnt = (id) => {
return id.toLowerCase() === immoScoutInfo.id || id.toLowerCase() === immoNetInfo.id;
};
export const transformUrlForScrapingAnt = (url, id) => {
if (isImmoscout(id)) {
//only do calls to scrapingAnt when dealing with Immoscout
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=datacenter`;
let urlParams = '';
if (needScrapingAnt(id)) {
if (id.toLowerCase() === immoNetInfo.id) {
urlParams = additionalImmonetUrlParams;
}
//only do calls to scrapingAnt when dealing with Immoscout/Immonet
url = `https://api.scrapingant.com/v2/general?url=${encodeURIComponent(url)}&proxy_type=datacenter${urlParams}`;
}
return url;
};
@@ -16,4 +26,4 @@ export const isScrapingAntApiKeySet = () => {
export const makeUrlResidential = (url) => {
return url.replace('datacenter', 'residential');
};
export { isImmoscout };
export { needScrapingAnt };

View File

@@ -3,6 +3,7 @@ import { get } from '../mocks/mockNotification.js';
import { mockFredy, providerConfig } from '../utils.js';
import chai from 'chai';
import * as provider from '../../lib/provider/immonet.js';
import * as scrapingAnt from '../../lib/services/scrapingAnt.js';
const expect = chai.expect;
describe('#immonet testsuite()', () => {
after(() => {
@@ -12,6 +13,13 @@ describe('#immonet testsuite()', () => {
it('should test immonet provider', async () => {
const Fredy = await mockFredy();
return await new Promise((resolve) => {
if (!scrapingAnt.isScrapingAntApiKeySet()) {
/* eslint-disable no-console */
console.info('Skipping Immonet test as ScrapingAnt Api Key is not set.');
/* eslint-enable no-console */
resolve();
return;
}
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
@@ -20,17 +28,17 @@ describe('#immonet testsuite()', () => {
expect(notificationObj.serviceName).to.equal('immonet');
notificationObj.payload.forEach((notify) => {
/** check the actual structure **/
expect(notify.id).to.be.a('number');
expect(notify.id).to.be.a('string');
expect(notify.price).to.be.a('string');
expect(notify.size).to.be.a('string');
expect(notify.title).to.be.a('string');
expect(notify.link).to.be.a('string');
expect(notify.address).to.be.a('string');
/** check the values if possible **/
expect(notify.price).that.does.include('€');
expect(notify.size).that.does.include('m²');
expect(notify.title).to.be.not.empty;
expect(notify.link).that.does.include('https://www.immonet.de');
expect(notify.address).to.be.not.empty;
});
resolve();

View File

@@ -9,7 +9,7 @@
"enabled": true
},
"immonet": {
"url": "https://www.immonet.de/immobiliensuche/sel.do?pageoffset=1&listsize=100&objecttype=1&locationname=Düsseldorf&acid=&actype=&district=8717&district=8718&district=8719&district=8720&district=8721&district=8723&district=8724&district=8725&district=8727&district=8728&district=8729&district=8730&district=8731&district=8732&district=8733&district=8737&district=8738&district=8741&district=8745&district=8747&district=8750&district=8752&district=8754&district=8755&district=8756&district=8759&district=8760&district=8761&district=8763&district=8764&district=8765&ajaxIsRadiusActive=false&sortby=19&suchart=1&radius=0&pcatmtypes=1_1&pCatMTypeStoragefield=&parentcat=1&marketingtype=1&fromprice=&toprice=420000&fromarea=90&toarea=&fromplotarea=&toplotarea=&fromrooms=3&torooms=&objectcat=225&objectcat=18&objectcat=17&objectcat=12&objectcat=16&objectcat=181&objectcat=14&objectcat=15&objectcat=226&objectcat=13&wbs=-1&fromyear=&toyear=",
"url": "https://www.immonet.de/immobiliensuche/beta?pageoffset=1&listsize=100&objecttype=1&locationname=D%C3%BCsseldorf&acid=&actype=&district=8717&district=8718&district=8719&district=8720&district=8721&district=8723&district=8724&district=8725&district=8727&district=8728&district=8729&district=8730&district=8731&district=8732&district=8733&district=8737&district=8738&district=8741&district=8745&district=8747&district=8750&district=8752&district=8754&district=8755&district=8756&district=8759&district=8760&district=8761&district=8763&district=8764&district=8765&ajaxIsRadiusActive=false&sortby=19&suchart=1&radius=0&pcatmtypes=1_1&pCatMTypeStoragefield=&parentcat=1&marketingtype=1&fromprice=&toprice=420000&fromarea=90&toarea=&fromplotarea=&toplotarea=&fromrooms=3&torooms=&objectcat=225&objectcat=18&objectcat=17&objectcat=12&objectcat=16&objectcat=181&objectcat=14&objectcat=15&objectcat=226&objectcat=13&wbs=-1&fromyear=&toyear=",
"enabled": true
},
"immowelt": {

View File

@@ -47,7 +47,7 @@ export default function ProcessingTimes({ processingTimes }) {
Credits: {processingTimes.scrapingAntData.remained_credits}/
{processingTimes.scrapingAntData.plan_total_credits} (250 credits per call)
</p>
If you want to scrape Immoscout more often, you have to purchase a premium account of{' '}
If you want to scrape Immoscout or Immonet more often, you have to purchase a premium account of{' '}
<a href="https://scrapingant.com/" target="_blank" rel="noreferrer">
ScrapingAnt
</a>

View File

@@ -101,7 +101,7 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
description={
<div>
<p>
If you chose Immoscout as a provider, make sure to also add the scrapingAnt apiKey to the config.json.
If you chose Immoscout or Immonet as a provider, make sure to also add the scrapingAnt apiKey to the config.json.
(See readme)
</p>
<p>