mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
scraping ant retries
This commit is contained in:
@@ -1,12 +1,14 @@
|
||||
const axios = require('axios');
|
||||
const axiosRetry = require('axios-retry');
|
||||
|
||||
axiosRetry(axios, { retryDelay: axiosRetry.exponentialDelay, retries: 3 });
|
||||
const { makeUrlResidential } = require('./scrapingAnt');
|
||||
//if ScrapingAnt got blocked, this http status is returned
|
||||
const BLOCKED_HTTP_STATUS = 423;
|
||||
const MAX_RETRIES_SCRAPING_ANT = 3;
|
||||
|
||||
function makeDriver(headers = {}) {
|
||||
let cookies = '';
|
||||
|
||||
return async function driver(context, callback) {
|
||||
async function scrapingAntDriver(context, callback, tryResidentialProxy, retryCounter = 0) {
|
||||
try {
|
||||
const url = context.url;
|
||||
const result = await axios({
|
||||
@@ -17,15 +19,61 @@ function makeDriver(headers = {}) {
|
||||
},
|
||||
});
|
||||
|
||||
if (typeof result.data === 'object' && url.toLowerCase().indexOf('scrapingant') !== -1) {
|
||||
//assume we have gotten a response from scrapingAnt
|
||||
if (cookies.length === 0) {
|
||||
cookies = result.data.cookies;
|
||||
}
|
||||
callback(null, result.data.content);
|
||||
} else {
|
||||
callback(null, result.data);
|
||||
if (cookies.length === 0) {
|
||||
cookies = result.data.cookies;
|
||||
}
|
||||
|
||||
callback(null, result.data.content);
|
||||
} catch (exception) {
|
||||
/* eslint-disable no-console */
|
||||
if (exception.response?.status !== BLOCKED_HTTP_STATUS) {
|
||||
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
|
||||
callback(null, []);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!tryResidentialProxy) {
|
||||
console.debug('ScrapingAnt got blocked out. Retrying with residential Proxy...');
|
||||
await scrapingAntDriver({ ...context, url: makeUrlResidential(context.url) }, callback, true, 0);
|
||||
} else if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) {
|
||||
retryCounter++;
|
||||
console.debug(`ScrapingAnt still got blocked retry ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`);
|
||||
await scrapingAntDriver(
|
||||
{
|
||||
...context,
|
||||
url: makeUrlResidential(context.url),
|
||||
},
|
||||
callback,
|
||||
true,
|
||||
retryCounter
|
||||
);
|
||||
} else {
|
||||
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
|
||||
callback(null, []);
|
||||
}
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The regular request driver is taking care of everyting, that doesn't need to be scraped by ScrapingAnt (which is
|
||||
* everything != Immoscout as of writing this)
|
||||
*/
|
||||
return async function driver(context, callback) {
|
||||
if (context.url.toLowerCase().indexOf('scrapingant') !== -1) {
|
||||
return scrapingAntDriver(context, callback);
|
||||
}
|
||||
|
||||
try {
|
||||
const result = await axios({
|
||||
url: context.url,
|
||||
headers: {
|
||||
...headers,
|
||||
Cookie: cookies,
|
||||
},
|
||||
});
|
||||
|
||||
callback(null, result.data);
|
||||
} catch (exception) {
|
||||
console.error(`Error while trying to scrape data. Received error: ${exception.message}`);
|
||||
callback(null, []);
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
const { metaInformation } = require('../provider/immoscout');
|
||||
//to better configure re-capture chose a random proxy each time we do a call
|
||||
const proxies = ['ae', 'br', 'cn', 'de', 'es', 'fr', 'gb', 'hk', 'in', 'it', 'il', 'jp', 'nl', 'ru', 'sa', 'us', 'cz'];
|
||||
const config = require('../../conf/config.json');
|
||||
|
||||
const isImmoscout = (id) => {
|
||||
@@ -8,13 +7,9 @@ const isImmoscout = (id) => {
|
||||
};
|
||||
|
||||
exports.transformUrlForScrapingAnt = (url, id) => {
|
||||
const randomProxy = proxies[Math.floor(Math.random() * proxies.length)];
|
||||
|
||||
if (isImmoscout(id)) {
|
||||
//only do calls to scrapingAnt when dealing with Immoscout
|
||||
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(
|
||||
url
|
||||
)}&proxy_country=${randomProxy}&proxy_type=residential`;
|
||||
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=residential`;
|
||||
}
|
||||
return url;
|
||||
};
|
||||
@@ -24,3 +19,7 @@ exports.isScrapingAntApiKeySet = () => {
|
||||
};
|
||||
|
||||
exports.isImmoscout = isImmoscout;
|
||||
|
||||
exports.makeUrlResidential = (url) => {
|
||||
return url.replace('datacenter', 'residential');
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user