Compare commits

..

1 Commits
5.6.1 ... 5.7.0

Author SHA1 Message Date
Christian Kellner
2062aa11a3 Scrapingant proxies (#59)
* preparing scraping ant proxies

* adding general settings for scraping ant proxy

* retrying with new ui settings
2022-06-13 08:10:30 +02:00
5 changed files with 64 additions and 25 deletions

View File

@@ -1 +1 @@
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":""},"workingHours":{"from":"","to":""}}
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":"","proxy":"datacenter"},"workingHours":{"from":"","to":""}}

View File

@@ -1,16 +1,21 @@
const axios = require('axios');
const config = require('../../conf/config.json');
const { makeUrlResidential } = require('./scrapingAnt');
//if ScrapingAnt got blocked, this http status is returned
const BLOCKED_HTTP_STATUS = 423;
const MAX_RETRIES_SCRAPING_ANT = 3;
const NOT_FOUND_HTTP_STATUS = 404;
const MAX_RETRIES_SCRAPING_ANT = 10;
const EXPECTED_STATUS_CODES = [BLOCKED_HTTP_STATUS, NOT_FOUND_HTTP_STATUS];
function makeDriver(headers = {}) {
let cookies = '';
async function scrapingAntDriver(context, callback, tryResidentialProxy, retryCounter = 0) {
async function scrapingAntDriver(context, callback, retryCounter = 0) {
const proxyType = config.scrapingAnt?.proxy || 'datacenter';
try {
const url = context.url;
const url = proxyType === 'residential' ? makeUrlResidential(context.url) : context.url;
const result = await axios({
url,
headers: {
@@ -26,27 +31,16 @@ function makeDriver(headers = {}) {
callback(null, result.data.content);
} catch (exception) {
/* eslint-disable no-console */
if (exception.response?.status !== BLOCKED_HTTP_STATUS) {
if (!EXPECTED_STATUS_CODES.includes(exception.response?.status)) {
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
callback(null, []);
return;
}
if (!tryResidentialProxy) {
console.debug('ScrapingAnt got blocked out. Retrying with residential Proxy...');
await scrapingAntDriver({ ...context, url: makeUrlResidential(context.url) }, callback, true, 0);
} else if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) {
if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) {
retryCounter++;
console.debug(`ScrapingAnt still got blocked retry ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`);
await scrapingAntDriver(
{
...context,
url: makeUrlResidential(context.url),
},
callback,
true,
retryCounter
);
console.debug(`ScrapingAnt got blocked. Retrying ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`);
await scrapingAntDriver(context, callback, retryCounter);
} else {
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
callback(null, []);

View File

@@ -9,7 +9,7 @@ const isImmoscout = (id) => {
exports.transformUrlForScrapingAnt = (url, id) => {
if (isImmoscout(id)) {
//only do calls to scrapingAnt when dealing with Immoscout
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=residential`;
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=datacenter`;
}
return url;
};

View File

@@ -1,6 +1,6 @@
{
"name": "fredy",
"version": "5.6.1",
"version": "5.7.0",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"start": "node index.js",
@@ -11,7 +11,7 @@
"prod:win32": "set BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
"prod:default": "export BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
"format": "prettier --write lib/**/*.js ui/src/**/*.js test/**/*.js *.js --single-quote --print-width 120",
"test": "mocha --timeout 20000 test/**/*.test.js",
"test": "mocha --timeout 3000000 test/**/*.test.js",
"lint": "eslint ./index.js ./lib/**/*.js ./test/**/*.js"
},
"husky": {

View File

@@ -2,7 +2,7 @@ import React from 'react';
import { useDispatch, useSelector } from 'react-redux';
import { Button, Form, Icon, Message, Segment } from 'semantic-ui-react';
import { Button, Form, Icon, Message, Segment, Radio } from 'semantic-ui-react';
import ToastContext from '../../components/toasts/ToastContext';
import Headline from '../../components/headline/Headline';
import { xhrPost } from '../../services/xhr';
@@ -18,6 +18,7 @@ const GeneralSettings = function Users() {
const [interval, setInterval] = React.useState('');
const [port, setPort] = React.useState('');
const [scrapingAntApiKey, setScrapingAntApiKey] = React.useState('');
const [scrapingAntProxy, setScrapingAntProxy] = React.useState('');
const [workingHourFrom, setWorkingHourFrom] = React.useState(null);
const [workingHourTo, setWorkingHourTo] = React.useState(null);
const ctx = React.useContext(ToastContext);
@@ -33,6 +34,7 @@ const GeneralSettings = function Users() {
setScrapingAntApiKey(settings?.scrapingAnt?.apiKey);
setWorkingHourFrom(settings?.workingHours?.from);
setWorkingHourTo(settings?.workingHours?.to);
setScrapingAntProxy(settings?.scrapingAnt?.proxy || 'datacenter');
}, [settings]);
const nullOrEmpty = (val) => val == null || val.length === 0;
@@ -69,6 +71,7 @@ const GeneralSettings = function Users() {
port,
scrapingAnt: {
apiKey: scrapingAntApiKey,
proxy: scrapingAntProxy,
},
workingHours: {
from: workingHourFrom,
@@ -144,6 +147,48 @@ const GeneralSettings = function Users() {
/>
</SegmentPart>
<SegmentPart
name="ScrapingAnt proxy settings"
helpText="Scraping ant provides different proxies."
icon="key"
>
<Message info>
ScrapingAnt is needed to scrape Immoscout. ScrapingAnt itself is using 2 different types of proxies.{' '}
<br />
<h4>Datacenter-Proxy</h4>
Proxy server located in one of the datacenters across the world. Datacenter proxies are slower and more
likely to fail, but they are cheaper. A call with a datacenter proxy cost 10 credits.
<h4>Residential-Proxy</h4>
High-quality proxy server located in one of the real people houses across the world. Datacenter proxies
are faster and more likely to success, but they are more expensive. A call with a datacenter proxy cost
250 credits.
<br />
<br />
<b>
On the free tier, you have 10.000 credits, so chose your option wisely. Keep in mind, only successful
calls will be charged.
</b>
</Message>
<Form.Field>
<Radio
label="Datacenter proxy"
name="scrapingAntProxy"
value="datacenter"
checked={scrapingAntProxy === 'datacenter'}
onChange={(e, { value }) => setScrapingAntProxy(value)}
/>
</Form.Field>
<Form.Field>
<Radio
label="Residential proxy"
name="scrapingAntProxy"
value="residential"
checked={scrapingAntProxy === 'residential'}
onChange={(e, { value }) => setScrapingAntProxy(value)}
/>
</Form.Field>
</SegmentPart>
<SegmentPart
name="Working hours"
helpText="During this hours, Fredy will search for new apartments. If nothing is configured, Fredy will search around the clock."
@@ -153,7 +198,7 @@ const GeneralSettings = function Users() {
<Form.Input
className="generalSettings__time"
type="time"
placeholder="ScrapingAnt Api Key"
placeholder="Working hours from"
inverted
size="mini"
width={2}
@@ -163,7 +208,7 @@ const GeneralSettings = function Users() {
<div className="generalSettings__until">until</div>
<Form.Input
type="time"
placeholder="ScrapingAnt Api Key"
placeholder="Working hours to"
inverted
size="mini"
width={2}