From 2062aa11a3d63f4b8c4d0724bc149f68f581a5e3 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Mon, 13 Jun 2022 08:10:30 +0200 Subject: [PATCH] Scrapingant proxies (#59) * preparing scraping ant proxies * adding general settings for scraping ant proxy * retrying with new ui settings --- conf/config.json | 2 +- lib/services/requestDriver.js | 30 +++++------ lib/services/scrapingAnt.js | 2 +- package.json | 4 +- .../views/generalSettings/GeneralSettings.js | 51 +++++++++++++++++-- 5 files changed, 64 insertions(+), 25 deletions(-) diff --git a/conf/config.json b/conf/config.json index 79dac47..6df9d31 100755 --- a/conf/config.json +++ b/conf/config.json @@ -1 +1 @@ -{"interval":"60","port":9998,"scrapingAnt":{"apiKey":""},"workingHours":{"from":"","to":""}} +{"interval":"60","port":9998,"scrapingAnt":{"apiKey":"","proxy":"datacenter"},"workingHours":{"from":"","to":""}} \ No newline at end of file diff --git a/lib/services/requestDriver.js b/lib/services/requestDriver.js index f918901..dfff63c 100644 --- a/lib/services/requestDriver.js +++ b/lib/services/requestDriver.js @@ -1,16 +1,21 @@ const axios = require('axios'); +const config = require('../../conf/config.json'); const { makeUrlResidential } = require('./scrapingAnt'); //if ScrapingAnt got blocked, this http status is returned const BLOCKED_HTTP_STATUS = 423; -const MAX_RETRIES_SCRAPING_ANT = 3; +const NOT_FOUND_HTTP_STATUS = 404; +const MAX_RETRIES_SCRAPING_ANT = 10; +const EXPECTED_STATUS_CODES = [BLOCKED_HTTP_STATUS, NOT_FOUND_HTTP_STATUS]; function makeDriver(headers = {}) { let cookies = ''; - async function scrapingAntDriver(context, callback, tryResidentialProxy, retryCounter = 0) { + async function scrapingAntDriver(context, callback, retryCounter = 0) { + const proxyType = config.scrapingAnt?.proxy || 'datacenter'; + try { - const url = context.url; + const url = proxyType === 'residential' ? 
makeUrlResidential(context.url) : context.url; const result = await axios({ url, headers: { @@ -26,27 +31,16 @@ function makeDriver(headers = {}) { callback(null, result.data.content); } catch (exception) { /* eslint-disable no-console */ - if (exception.response?.status !== BLOCKED_HTTP_STATUS) { + if (!EXPECTED_STATUS_CODES.includes(exception.response?.status)) { console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`); callback(null, []); return; } - if (!tryResidentialProxy) { - console.debug('ScrapingAnt got blocked out. Retrying with residential Proxy...'); - await scrapingAntDriver({ ...context, url: makeUrlResidential(context.url) }, callback, true, 0); - } else if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) { + if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) { retryCounter++; - console.debug(`ScrapingAnt still got blocked retry ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`); - await scrapingAntDriver( - { - ...context, - url: makeUrlResidential(context.url), - }, - callback, - true, - retryCounter - ); + console.debug(`ScrapingAnt got blocked. Retrying ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`); + await scrapingAntDriver(context, callback, retryCounter); } else { console.error(`Error while trying to scrape data from scraping ant. 
Received error: ${exception.message}`); callback(null, []); diff --git a/lib/services/scrapingAnt.js b/lib/services/scrapingAnt.js index be776aa..bdd5a82 100644 --- a/lib/services/scrapingAnt.js +++ b/lib/services/scrapingAnt.js @@ -9,7 +9,7 @@ const isImmoscout = (id) => { exports.transformUrlForScrapingAnt = (url, id) => { if (isImmoscout(id)) { //only do calls to scrapingAnt when dealing with Immoscout - url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=residential`; + url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=datacenter`; } return url; }; diff --git a/package.json b/package.json index 57b691e..6aa5103 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "5.6.1", + "version": "5.7.0", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "start": "node index.js", @@ -11,7 +11,7 @@ "prod:win32": "set BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js", "prod:default": "export BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js", "format": "prettier --write lib/**/*.js ui/src/**/*.js test/**/*.js *.js --single-quote --print-width 120", - "test": "mocha --timeout 20000 test/**/*.test.js", + "test": "mocha --timeout 3000000 test/**/*.test.js", "lint": "eslint ./index.js ./lib/**/*.js ./test/**/*.js" }, "husky": { diff --git a/ui/src/views/generalSettings/GeneralSettings.js b/ui/src/views/generalSettings/GeneralSettings.js index 972a8ac..03dcb88 100644 --- a/ui/src/views/generalSettings/GeneralSettings.js +++ b/ui/src/views/generalSettings/GeneralSettings.js @@ -2,7 +2,7 @@ import React from 'react'; import { useDispatch, useSelector } from 'react-redux'; -import { Button, Form, Icon, Message, Segment } from 'semantic-ui-react'; +import { Button, Form, Icon, Message, Segment, Radio } from 'semantic-ui-react'; import ToastContext from '../../components/toasts/ToastContext'; 
import Headline from '../../components/headline/Headline'; import { xhrPost } from '../../services/xhr'; @@ -18,6 +18,7 @@ const GeneralSettings = function Users() { const [interval, setInterval] = React.useState(''); const [port, setPort] = React.useState(''); const [scrapingAntApiKey, setScrapingAntApiKey] = React.useState(''); + const [scrapingAntProxy, setScrapingAntProxy] = React.useState(''); const [workingHourFrom, setWorkingHourFrom] = React.useState(null); const [workingHourTo, setWorkingHourTo] = React.useState(null); const ctx = React.useContext(ToastContext); @@ -33,6 +34,7 @@ const GeneralSettings = function Users() { setScrapingAntApiKey(settings?.scrapingAnt?.apiKey); setWorkingHourFrom(settings?.workingHours?.from); setWorkingHourTo(settings?.workingHours?.to); + setScrapingAntProxy(settings?.scrapingAnt?.proxy || 'datacenter'); }, [settings]); const nullOrEmpty = (val) => val == null || val.length === 0; @@ -69,6 +71,7 @@ const GeneralSettings = function Users() { port, scrapingAnt: { apiKey: scrapingAntApiKey, + proxy: scrapingAntProxy, }, workingHours: { from: workingHourFrom, @@ -144,6 +147,48 @@ const GeneralSettings = function Users() { /> + + + ScrapingAnt is needed to scrape Immoscout. ScrapingAnt itself is using 2 different types of proxies.{' '} +
+

Datacenter-Proxy

+ Proxy server located in one of the datacenters across the world. Datacenter proxies are slower and more + likely to fail, but they are cheaper. A call with a datacenter proxy costs 10 credits. +

Residential-Proxy

+ High-quality proxy server located in a real residential home somewhere in the world. Residential proxies + are faster and more likely to succeed, but they are more expensive. A call with a residential proxy costs + 250 credits. +
+
+ + On the free tier, you have 10,000 credits, so choose your option wisely. Keep in mind, only successful + calls will be charged. + +
+ + setScrapingAntProxy(value)} + /> + + + setScrapingAntProxy(value)} + /> + +
+ until