mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2062aa11a3 | ||
|
|
a4501007ff | ||
|
|
bc01806421 |
@@ -1 +1 @@
|
|||||||
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":""},"workingHours":{"from":"","to":""}}
|
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":"","proxy":"datacenter"},"workingHours":{"from":"","to":""}}
|
||||||
@@ -2,6 +2,8 @@ const { markdown2Html } = require('../../services/markdown');
|
|||||||
const { getJob } = require('../../services/storage/jobStorage');
|
const { getJob } = require('../../services/storage/jobStorage');
|
||||||
const axios = require('axios');
|
const axios = require('axios');
|
||||||
|
|
||||||
|
const MAX_ENTITIES_PER_CHUNK = 8;
|
||||||
|
const RATE_LIMIT_INTERVAL = 1010;
|
||||||
/**
|
/**
|
||||||
* splitting an array into chunks because Telegram only allows for messages up to
|
* splitting an array into chunks because Telegram only allows for messages up to
|
||||||
* 4096 chars, thus we have to split messages into chunks
|
* 4096 chars, thus we have to split messages into chunks
|
||||||
@@ -29,7 +31,7 @@ exports.send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
|
|||||||
const jobName = job == null ? jobKey : job.name;
|
const jobName = job == null ? jobKey : job.name;
|
||||||
|
|
||||||
//we have to split messages into chunk, because otherwise messages are going to become too big and will fail
|
//we have to split messages into chunk, because otherwise messages are going to become too big and will fail
|
||||||
const chunks = arrayChunks(newListings, 3);
|
const chunks = arrayChunks(newListings, MAX_ENTITIES_PER_CHUNK);
|
||||||
|
|
||||||
const promises = chunks.map((chunk) => {
|
const promises = chunks.map((chunk) => {
|
||||||
let message = `<i>${jobName}</i> (${serviceName}) found <b>${newListings.length}</b> new listings:\n\n`;
|
let message = `<i>${jobName}</i> (${serviceName}) found <b>${newListings.length}</b> new listings:\n\n`;
|
||||||
@@ -40,11 +42,21 @@ exports.send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
|
|||||||
'\n\n'
|
'\n\n'
|
||||||
);
|
);
|
||||||
|
|
||||||
return axios.post(`https://api.telegram.org/bot${token}/sendMessage`, {
|
/**
|
||||||
chat_id: chatId,
|
* This is to not break the rate limit. It is to only send 1 message per second
|
||||||
text: message,
|
*/
|
||||||
parse_mode: 'HTML',
|
return new Promise((resolve, reject) => {
|
||||||
disable_web_page_preview: true,
|
setTimeout(() => {
|
||||||
|
axios
|
||||||
|
.post(`https://api.telegram.org/bot${token}/sendMessage`, {
|
||||||
|
chat_id: chatId,
|
||||||
|
text: message,
|
||||||
|
parse_mode: 'HTML',
|
||||||
|
disable_web_page_preview: true,
|
||||||
|
})
|
||||||
|
.then(() => resolve())
|
||||||
|
.catch(() => reject());
|
||||||
|
}, RATE_LIMIT_INTERVAL);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,21 @@
|
|||||||
const axios = require('axios');
|
const axios = require('axios');
|
||||||
|
const config = require('../../conf/config.json');
|
||||||
|
|
||||||
const { makeUrlResidential } = require('./scrapingAnt');
|
const { makeUrlResidential } = require('./scrapingAnt');
|
||||||
//if ScrapingAnt got blocked, this http status is returned
|
//if ScrapingAnt got blocked, this http status is returned
|
||||||
const BLOCKED_HTTP_STATUS = 423;
|
const BLOCKED_HTTP_STATUS = 423;
|
||||||
const MAX_RETRIES_SCRAPING_ANT = 3;
|
const NOT_FOUND_HTTP_STATUS = 404;
|
||||||
|
const MAX_RETRIES_SCRAPING_ANT = 10;
|
||||||
|
const EXPECTED_STATUS_CODES = [BLOCKED_HTTP_STATUS, NOT_FOUND_HTTP_STATUS];
|
||||||
|
|
||||||
function makeDriver(headers = {}) {
|
function makeDriver(headers = {}) {
|
||||||
let cookies = '';
|
let cookies = '';
|
||||||
|
|
||||||
async function scrapingAntDriver(context, callback, tryResidentialProxy, retryCounter = 0) {
|
async function scrapingAntDriver(context, callback, retryCounter = 0) {
|
||||||
|
const proxyType = config.scrapingAnt?.proxy || 'datacenter';
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const url = context.url;
|
const url = proxyType === 'residential' ? makeUrlResidential(context.url) : context.url;
|
||||||
const result = await axios({
|
const result = await axios({
|
||||||
url,
|
url,
|
||||||
headers: {
|
headers: {
|
||||||
@@ -26,27 +31,16 @@ function makeDriver(headers = {}) {
|
|||||||
callback(null, result.data.content);
|
callback(null, result.data.content);
|
||||||
} catch (exception) {
|
} catch (exception) {
|
||||||
/* eslint-disable no-console */
|
/* eslint-disable no-console */
|
||||||
if (exception.response?.status !== BLOCKED_HTTP_STATUS) {
|
if (!EXPECTED_STATUS_CODES.includes(exception.response?.status)) {
|
||||||
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
|
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
|
||||||
callback(null, []);
|
callback(null, []);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!tryResidentialProxy) {
|
if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) {
|
||||||
console.debug('ScrapingAnt got blocked out. Retrying with residential Proxy...');
|
|
||||||
await scrapingAntDriver({ ...context, url: makeUrlResidential(context.url) }, callback, true, 0);
|
|
||||||
} else if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) {
|
|
||||||
retryCounter++;
|
retryCounter++;
|
||||||
console.debug(`ScrapingAnt still got blocked retry ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`);
|
console.debug(`ScrapingAnt got blocked. Retrying ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`);
|
||||||
await scrapingAntDriver(
|
await scrapingAntDriver(context, callback, retryCounter);
|
||||||
{
|
|
||||||
...context,
|
|
||||||
url: makeUrlResidential(context.url),
|
|
||||||
},
|
|
||||||
callback,
|
|
||||||
true,
|
|
||||||
retryCounter
|
|
||||||
);
|
|
||||||
} else {
|
} else {
|
||||||
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
|
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
|
||||||
callback(null, []);
|
callback(null, []);
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ const isImmoscout = (id) => {
|
|||||||
exports.transformUrlForScrapingAnt = (url, id) => {
|
exports.transformUrlForScrapingAnt = (url, id) => {
|
||||||
if (isImmoscout(id)) {
|
if (isImmoscout(id)) {
|
||||||
//only do calls to scrapingAnt when dealing with Immoscout
|
//only do calls to scrapingAnt when dealing with Immoscout
|
||||||
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=residential`;
|
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=datacenter`;
|
||||||
}
|
}
|
||||||
return url;
|
return url;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "fredy",
|
"name": "fredy",
|
||||||
"version": "5.6.0",
|
"version": "5.7.0",
|
||||||
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node index.js",
|
"start": "node index.js",
|
||||||
@@ -11,7 +11,7 @@
|
|||||||
"prod:win32": "set BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
|
"prod:win32": "set BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
|
||||||
"prod:default": "export BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
|
"prod:default": "export BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
|
||||||
"format": "prettier --write lib/**/*.js ui/src/**/*.js test/**/*.js *.js --single-quote --print-width 120",
|
"format": "prettier --write lib/**/*.js ui/src/**/*.js test/**/*.js *.js --single-quote --print-width 120",
|
||||||
"test": "mocha --timeout 20000 test/**/*.test.js",
|
"test": "mocha --timeout 3000000 test/**/*.test.js",
|
||||||
"lint": "eslint ./index.js ./lib/**/*.js ./test/**/*.js"
|
"lint": "eslint ./index.js ./lib/**/*.js ./test/**/*.js"
|
||||||
},
|
},
|
||||||
"husky": {
|
"husky": {
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ describe('#einsAImmobilien testsuite()', () => {
|
|||||||
expect(notify.link).to.be.a('string');
|
expect(notify.link).to.be.a('string');
|
||||||
|
|
||||||
/** check the values if possible **/
|
/** check the values if possible **/
|
||||||
expect(notify.price).that.does.include('EUR');
|
|
||||||
expect(notify.size).to.be.not.empty;
|
expect(notify.size).to.be.not.empty;
|
||||||
expect(notify.title).to.be.not.empty;
|
expect(notify.title).to.be.not.empty;
|
||||||
expect(notify.link).that.does.include('https://www.1a-immobilienmarkt.de');
|
expect(notify.link).that.does.include('https://www.1a-immobilienmarkt.de');
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import React from 'react';
|
|||||||
|
|
||||||
import { useDispatch, useSelector } from 'react-redux';
|
import { useDispatch, useSelector } from 'react-redux';
|
||||||
|
|
||||||
import { Button, Form, Icon, Message, Segment } from 'semantic-ui-react';
|
import { Button, Form, Icon, Message, Segment, Radio } from 'semantic-ui-react';
|
||||||
import ToastContext from '../../components/toasts/ToastContext';
|
import ToastContext from '../../components/toasts/ToastContext';
|
||||||
import Headline from '../../components/headline/Headline';
|
import Headline from '../../components/headline/Headline';
|
||||||
import { xhrPost } from '../../services/xhr';
|
import { xhrPost } from '../../services/xhr';
|
||||||
@@ -18,6 +18,7 @@ const GeneralSettings = function Users() {
|
|||||||
const [interval, setInterval] = React.useState('');
|
const [interval, setInterval] = React.useState('');
|
||||||
const [port, setPort] = React.useState('');
|
const [port, setPort] = React.useState('');
|
||||||
const [scrapingAntApiKey, setScrapingAntApiKey] = React.useState('');
|
const [scrapingAntApiKey, setScrapingAntApiKey] = React.useState('');
|
||||||
|
const [scrapingAntProxy, setScrapingAntProxy] = React.useState('');
|
||||||
const [workingHourFrom, setWorkingHourFrom] = React.useState(null);
|
const [workingHourFrom, setWorkingHourFrom] = React.useState(null);
|
||||||
const [workingHourTo, setWorkingHourTo] = React.useState(null);
|
const [workingHourTo, setWorkingHourTo] = React.useState(null);
|
||||||
const ctx = React.useContext(ToastContext);
|
const ctx = React.useContext(ToastContext);
|
||||||
@@ -33,6 +34,7 @@ const GeneralSettings = function Users() {
|
|||||||
setScrapingAntApiKey(settings?.scrapingAnt?.apiKey);
|
setScrapingAntApiKey(settings?.scrapingAnt?.apiKey);
|
||||||
setWorkingHourFrom(settings?.workingHours?.from);
|
setWorkingHourFrom(settings?.workingHours?.from);
|
||||||
setWorkingHourTo(settings?.workingHours?.to);
|
setWorkingHourTo(settings?.workingHours?.to);
|
||||||
|
setScrapingAntProxy(settings?.scrapingAnt?.proxy || 'datacenter');
|
||||||
}, [settings]);
|
}, [settings]);
|
||||||
|
|
||||||
const nullOrEmpty = (val) => val == null || val.length === 0;
|
const nullOrEmpty = (val) => val == null || val.length === 0;
|
||||||
@@ -69,6 +71,7 @@ const GeneralSettings = function Users() {
|
|||||||
port,
|
port,
|
||||||
scrapingAnt: {
|
scrapingAnt: {
|
||||||
apiKey: scrapingAntApiKey,
|
apiKey: scrapingAntApiKey,
|
||||||
|
proxy: scrapingAntProxy,
|
||||||
},
|
},
|
||||||
workingHours: {
|
workingHours: {
|
||||||
from: workingHourFrom,
|
from: workingHourFrom,
|
||||||
@@ -144,6 +147,48 @@ const GeneralSettings = function Users() {
|
|||||||
/>
|
/>
|
||||||
</SegmentPart>
|
</SegmentPart>
|
||||||
|
|
||||||
|
<SegmentPart
|
||||||
|
name="ScrapingAnt proxy settings"
|
||||||
|
helpText="Scraping ant provides different proxies."
|
||||||
|
icon="key"
|
||||||
|
>
|
||||||
|
<Message info>
|
||||||
|
ScrapingAnt is needed to scrape Immoscout. ScrapingAnt itself is using 2 different types of proxies.{' '}
|
||||||
|
<br />
|
||||||
|
<h4>Datacenter-Proxy</h4>
|
||||||
|
Proxy server located in one of the datacenters across the world. Datacenter proxies are slower and more
|
||||||
|
likely to fail, but they are cheaper. A call with a datacenter proxy costs 10 credits.
|
||||||
|
<h4>Residential-Proxy</h4>
|
||||||
|
High-quality proxy server located in one of the real people's houses across the world. Residential proxies
|
||||||
|
are faster and more likely to succeed, but they are more expensive. A call with a residential proxy costs
|
||||||
|
250 credits.
|
||||||
|
<br />
|
||||||
|
<br />
|
||||||
|
<b>
|
||||||
|
On the free tier, you have 10.000 credits, so choose your option wisely. Keep in mind, only successful
|
||||||
|
calls will be charged.
|
||||||
|
</b>
|
||||||
|
</Message>
|
||||||
|
<Form.Field>
|
||||||
|
<Radio
|
||||||
|
label="Datacenter proxy"
|
||||||
|
name="scrapingAntProxy"
|
||||||
|
value="datacenter"
|
||||||
|
checked={scrapingAntProxy === 'datacenter'}
|
||||||
|
onChange={(e, { value }) => setScrapingAntProxy(value)}
|
||||||
|
/>
|
||||||
|
</Form.Field>
|
||||||
|
<Form.Field>
|
||||||
|
<Radio
|
||||||
|
label="Residential proxy"
|
||||||
|
name="scrapingAntProxy"
|
||||||
|
value="residential"
|
||||||
|
checked={scrapingAntProxy === 'residential'}
|
||||||
|
onChange={(e, { value }) => setScrapingAntProxy(value)}
|
||||||
|
/>
|
||||||
|
</Form.Field>
|
||||||
|
</SegmentPart>
|
||||||
|
|
||||||
<SegmentPart
|
<SegmentPart
|
||||||
name="Working hours"
|
name="Working hours"
|
||||||
helpText="During these hours, Fredy will search for new apartments. If nothing is configured, Fredy will search around the clock."
|
helpText="During these hours, Fredy will search for new apartments. If nothing is configured, Fredy will search around the clock."
|
||||||
@@ -153,7 +198,7 @@ const GeneralSettings = function Users() {
|
|||||||
<Form.Input
|
<Form.Input
|
||||||
className="generalSettings__time"
|
className="generalSettings__time"
|
||||||
type="time"
|
type="time"
|
||||||
placeholder="ScrapingAnt Api Key"
|
placeholder="Working hours from"
|
||||||
inverted
|
inverted
|
||||||
size="mini"
|
size="mini"
|
||||||
width={2}
|
width={2}
|
||||||
@@ -163,7 +208,7 @@ const GeneralSettings = function Users() {
|
|||||||
<div className="generalSettings__until">until</div>
|
<div className="generalSettings__until">until</div>
|
||||||
<Form.Input
|
<Form.Input
|
||||||
type="time"
|
type="time"
|
||||||
placeholder="ScrapingAnt Api Key"
|
placeholder="Working hours to"
|
||||||
inverted
|
inverted
|
||||||
size="mini"
|
size="mini"
|
||||||
width={2}
|
width={2}
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ export default function ProcessingTimes({ processingTimes }) {
|
|||||||
ScrapingAnt
|
ScrapingAnt
|
||||||
</a>
|
</a>
|
||||||
. You can use the code <b>FREDY10</b> to get 10% off. (No affiliation, we are <b>not</b> getting paid to
|
. You can use the code <b>FREDY10</b> to get 10% off. (No affiliation, we are <b>not</b> getting paid to
|
||||||
recommend ScrapingAnt.
|
recommend ScrapingAnt.)
|
||||||
</Segment>
|
</Segment>
|
||||||
)}
|
)}
|
||||||
</React.Fragment>
|
</React.Fragment>
|
||||||
|
|||||||
Reference in New Issue
Block a user