Compare commits

...

9 Commits
5.5.0 ... 5.8.0

Author SHA1 Message Date
weakmap@gmail.com
265ea58bab correct version 2022-12-11 20:08:40 +01:00
weakmap@gmail.com
ab5ee59d72 ugrading dependencies | fixing tests | supporting multiple provider of the same type 2022-12-11 20:07:18 +01:00
Christian Kellner
2062aa11a3 Scrapingant proxies (#59)
* preparing scraping ant proxies

* adding general settings for scraping ant proxy

* retrying with new ui settings
2022-06-13 08:10:30 +02:00
Christian Kellner
a4501007ff next release version 2022-06-10 14:19:41 +02:00
Christian Kellner
bc01806421 fixing telegram provider not respecting rate limits 2022-06-10 14:19:20 +02:00
Christian Kellner
bfba6d4bd9 next release version 2022-04-29 13:26:29 +02:00
Christian Kellner
676d48807a scraping ant retries 2022-04-29 13:22:39 +02:00
Christian Kellner
1a37773a40 Update package.json 2022-04-05 14:36:46 +02:00
Sven
67497d9828 added run-script-os to scripts to separate win use of set from unix use of export (#52)
Co-authored-by: Sven Simonsen <contact@svensimonsen.com>
2022-04-05 09:25:51 +02:00
20 changed files with 1558 additions and 1435 deletions

View File

@@ -2,6 +2,12 @@ Newer release changelog see https://github.com/orangecoding/fredy/releases
------------
###### [V5.5.0]
- Upgrading dependencies
- fixing provider
- allow multiple instances of 1 provider
- __BREAKING__: Minimum node version is now 16
###### [V5.4.6]
- Adding Instana node.js monitoring
-

View File

@@ -1 +1 @@
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":""},"workingHours":{"from":"","to":""}}
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":"","proxy":"datacenter"},"workingHours":{"from":"","to":""}}

View File

@@ -31,33 +31,20 @@ setInterval(
if (isDuringWorkingHoursOrNotSet) {
config.lastRun = Date.now();
const fetchedProvider = provider
.filter((provider) => provider.endsWith('.js'))
.map((pro) => require(`${path}/${pro}`));
jobStorage
.getJobs()
.filter((job) => job.enabled)
.forEach((job) => {
const providerIds = job.provider.map((provider) => provider.id);
provider
.filter((provider) => provider.endsWith('.js'))
.map((pro) => require(`${path}/${pro}`))
.filter((provider) => providerIds.indexOf(provider.metaInformation.id) !== -1)
.forEach(async (pro) => {
const providerId = pro.metaInformation.id;
if (providerId == null || providerId.length === 0) {
throw new Error('Provider id must not be empty. => ' + pro);
}
const providerConfig = job.provider.find((jobProvider) => jobProvider.id === providerId);
if (providerConfig == null) {
throw new Error(`Provider Config for provider with id ${providerId} not found.`);
}
pro.init(providerConfig, job.blacklist);
await new FredyRuntime(
pro.config,
job.notificationAdapter,
providerId,
job.id,
similarityCache
).execute();
job.provider
.filter((p) => fetchedProvider.find((fp) => fp.metaInformation.id === p.id) != null)
.forEach(async (prov) => {
const pro = fetchedProvider.find((fp) => fp.metaInformation.id === prov.id);
pro.init(prov, job.blacklist);
await new FredyRuntime(pro.config, job.notificationAdapter, prov.id, job.id, similarityCache).execute();
setLastJobExecution(job.id);
});
});

View File

@@ -2,6 +2,8 @@ const { markdown2Html } = require('../../services/markdown');
const { getJob } = require('../../services/storage/jobStorage');
const axios = require('axios');
const MAX_ENTITIES_PER_CHUNK = 8;
const RATE_LIMIT_INTERVAL = 1010;
/**
* splitting an array into chunks because Telegram only allows for messages up to
* 4096 chars, thus we have to split messages into chunks
@@ -29,7 +31,7 @@ exports.send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
const jobName = job == null ? jobKey : job.name;
//we have to split messages into chunk, because otherwise messages are going to become too big and will fail
const chunks = arrayChunks(newListings, 3);
const chunks = arrayChunks(newListings, MAX_ENTITIES_PER_CHUNK);
const promises = chunks.map((chunk) => {
let message = `<i>${jobName}</i> (${serviceName}) found <b>${newListings.length}</b> new listings:\n\n`;
@@ -40,11 +42,21 @@ exports.send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
'\n\n'
);
return axios.post(`https://api.telegram.org/bot${token}/sendMessage`, {
chat_id: chatId,
text: message,
parse_mode: 'HTML',
disable_web_page_preview: true,
/**
* This is to not break the rate limit. It is to only send 1 message per second
*/
return new Promise((resolve, reject) => {
setTimeout(() => {
axios
.post(`https://api.telegram.org/bot${token}/sendMessage`, {
chat_id: chatId,
text: message,
parse_mode: 'HTML',
disable_web_page_preview: true,
})
.then(() => resolve())
.catch(() => reject());
}, RATE_LIMIT_INTERVAL);
});
});

View File

@@ -1,14 +1,21 @@
const axios = require('axios');
const axiosRetry = require('axios-retry');
const config = require('../../conf/config.json');
axiosRetry(axios, { retryDelay: axiosRetry.exponentialDelay, retries: 3 });
const { makeUrlResidential } = require('./scrapingAnt');
//if ScrapingAnt got blocked, this http status is returned
const BLOCKED_HTTP_STATUS = 423;
const NOT_FOUND_HTTP_STATUS = 404;
const MAX_RETRIES_SCRAPING_ANT = 10;
const EXPECTED_STATUS_CODES = [BLOCKED_HTTP_STATUS, NOT_FOUND_HTTP_STATUS];
function makeDriver(headers = {}) {
let cookies = '';
return async function driver(context, callback) {
async function scrapingAntDriver(context, callback, retryCounter = 0) {
const proxyType = config.scrapingAnt?.proxy || 'datacenter';
try {
const url = context.url;
const url = proxyType === 'residential' ? makeUrlResidential(context.url) : context.url;
const result = await axios({
url,
headers: {
@@ -17,15 +24,50 @@ function makeDriver(headers = {}) {
},
});
if (typeof result.data === 'object' && url.toLowerCase().indexOf('scrapingant') !== -1) {
//assume we have gotten a response from scrapingAnt
if (cookies.length === 0) {
cookies = result.data.cookies;
}
callback(null, result.data.content);
} else {
callback(null, result.data);
if (cookies.length === 0) {
cookies = result.data.cookies;
}
callback(null, result.data.content);
} catch (exception) {
/* eslint-disable no-console */
if (!EXPECTED_STATUS_CODES.includes(exception.response?.status)) {
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
callback(null, []);
return;
}
if (retryCounter <= MAX_RETRIES_SCRAPING_ANT) {
retryCounter++;
console.debug(`ScrapingAnt got blocked. Retrying ${retryCounter} / ${MAX_RETRIES_SCRAPING_ANT}`);
await scrapingAntDriver(context, callback, retryCounter);
} else {
console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
callback(null, []);
}
/* eslint-enable no-console */
}
}
/**
* The regular request driver is taking care of everyting, that doesn't need to be scraped by ScrapingAnt (which is
* everything != Immoscout as of writing this)
*/
return async function driver(context, callback) {
if (context.url.toLowerCase().indexOf('scrapingant') !== -1) {
return scrapingAntDriver(context, callback);
}
try {
const result = await axios({
url: context.url,
headers: {
...headers,
Cookie: cookies,
},
});
callback(null, result.data);
} catch (exception) {
console.error(`Error while trying to scrape data. Received error: ${exception.message}`);
callback(null, []);

View File

@@ -1,6 +1,5 @@
const { metaInformation } = require('../provider/immoscout');
//to better configure re-capture chose a random proxy each time we do a call
const proxies = ['ae', 'br', 'cn', 'de', 'es', 'fr', 'gb', 'hk', 'in', 'it', 'il', 'jp', 'nl', 'ru', 'sa', 'us', 'cz'];
const config = require('../../conf/config.json');
const isImmoscout = (id) => {
@@ -8,13 +7,9 @@ const isImmoscout = (id) => {
};
exports.transformUrlForScrapingAnt = (url, id) => {
const randomProxy = proxies[Math.floor(Math.random() * proxies.length)];
if (isImmoscout(id)) {
//only do calls to scrapingAnt when dealing with Immoscout
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(
url
)}&proxy_country=${randomProxy}&proxy_type=residential`;
url = `https://api.scrapingant.com/v1/general?url=${encodeURIComponent(url)}&proxy_type=datacenter`;
}
return url;
};
@@ -24,3 +19,7 @@ exports.isScrapingAntApiKeySet = () => {
};
exports.isImmoscout = isImmoscout;
exports.makeUrlResidential = (url) => {
return url.replace('datacenter', 'residential');
};

View File

@@ -1,13 +1,17 @@
{
"name": "fredy",
"version": "5.5.0",
"version": "5.8.0",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"start": "node index.js",
"dev": "yarn && export BUILD_DEV='true' && export NODE_ENV='development' && webpack serve --progress --color --config ./webpack.dev.js",
"prod": "export BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
"dev": "run-script-os",
"dev:win32": "yarn && set BUILD_DEV='true' && set NODE_ENV='development' && webpack serve --progress --color --config ./webpack.dev.js",
"dev:default": "yarn && export BUILD_DEV='true' && export NODE_ENV='development' && webpack serve --progress --color --config ./webpack.dev.js",
"prod": "run-script-os",
"prod:win32": "set BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
"prod:default": "export BUILD_DEV='false' && webpack --node-env=production --config ./webpack.prod.js",
"format": "prettier --write lib/**/*.js ui/src/**/*.js test/**/*.js *.js --single-quote --print-width 120",
"test": "mocha --timeout 20000 test/**/*.test.js",
"test": "mocha --timeout 3000000 test/**/*.test.js",
"lint": "eslint ./index.js ./lib/**/*.js ./test/**/*.js"
},
"husky": {
@@ -43,7 +47,7 @@
},
"license": "MIT",
"engines": {
"node": ">=14.0.0",
"node": ">=16.0.0",
"npm": ">=7.0.0"
},
"browserslist": [
@@ -55,61 +59,61 @@
"dependencies": {
"@rematch/core": "2.2.0",
"@rematch/loading": "2.1.2",
"@sendgrid/mail": "7.6.2",
"axios": "0.26.1",
"axios-retry": "^3.2.4",
"better-sqlite3": "^7.5.0",
"body-parser": "1.19.2",
"@sendgrid/mail": "7.7.0",
"axios": "0.27.2",
"better-sqlite3": "8.0.1",
"body-parser": "1.20.1",
"cookie-session": "2.0.0",
"handlebars": "4.7.7",
"highcharts": "10.0.0",
"highcharts": "10.3.2",
"highcharts-react-official": "3.1.0",
"lowdb": "1.0.0",
"markdown": "^0.5.0",
"nanoid": "3.3.1",
"node-mailjet": "3.3.7",
"query-string": "7.1.1",
"nanoid": "3.3.3",
"node-mailjet": "3.3.13",
"query-string": "7.1.3",
"react": "17.0.2",
"react-dom": "17.0.2",
"react-redux": "7.2.6",
"react-redux": "8.0.5",
"react-router": "5.2.1",
"react-router-dom": "5.3.0",
"react-switch": "^6.0.0",
"redux": "4.1.2",
"redux-thunk": "2.4.1",
"restana": "4.9.3",
"semantic-ui-react": "2.1.2",
"react-switch": "6.0.0",
"redux": "4.2.0",
"redux-thunk": "2.4.2",
"restana": "4.9.7",
"semantic-ui-react": "2.1.4",
"serve-static": "1.15.0",
"slack": "11.0.2",
"string-similarity": "^4.0.4",
"x-ray": "2.3.4"
},
"devDependencies": {
"@babel/core": "7.17.8",
"@babel/preset-env": "7.16.11",
"@babel/preset-react": "7.16.7",
"@babel/core": "7.20.5",
"@babel/preset-env": "7.20.2",
"@babel/preset-react": "7.18.6",
"babel-eslint": "10.1.0",
"babel-loader": "8.2.4",
"chai": "4.3.6",
"babel-loader": "8.2.5",
"chai": "4.3.7",
"clean-webpack-plugin": "4.0.0",
"copy-webpack-plugin": "10.2.4",
"css-loader": "6.7.1",
"css-loader": "6.7.2",
"eslint": "7.32.0",
"eslint-config-prettier": "8.5.0",
"eslint-plugin-react": "7.29.4",
"eslint-plugin-react": "7.31.11",
"file-loader": "6.2.0",
"history": "5.3.0",
"husky": "4.3.8",
"less": "4.1.2",
"less": "4.1.3",
"less-loader": "10.2.0",
"lint-staged": "12.3.7",
"lint-staged": "12.4.1",
"mocha": "9.2.2",
"prettier": "2.6.1",
"prettier": "2.8.1",
"proxyquire": "2.1.3",
"redux-logger": "3.0.6",
"run-script-os": "^1.1.6",
"style-loader": "3.3.1",
"url-loader": "4.1.1",
"webpack": "5.70.0",
"webpack": "5.75.0",
"webpack-cli": "4.9.2",
"webpack-dev-server": "3.11.2",
"webpack-merge": "5.8.0"

View File

@@ -22,7 +22,7 @@ describe('#einsAImmobilien testsuite()', () => {
it('should test einsAImmobilien provider', async () => {
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'einsAImmobilien', similarityCache);
fredy.execute().then((listings) => {
expect(listings).to.be.a('array');
@@ -39,7 +39,6 @@ describe('#einsAImmobilien testsuite()', () => {
expect(notify.link).to.be.a('string');
/** check the values if possible **/
expect(notify.price).that.does.include('EUR');
expect(notify.size).to.be.not.empty;
expect(notify.title).to.be.not.empty;
expect(notify.link).that.does.include('https://www.1a-immobilienmarkt.de');

View File

@@ -21,7 +21,7 @@ describe('#immonet testsuite()', () => {
it('should test immonet provider', async () => {
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');

View File

@@ -29,7 +29,7 @@ describe('#immoscout testsuite()', () => {
return;
}
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immoscout', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');

View File

@@ -21,7 +21,7 @@ describe('#immoswp testsuite()', () => {
it('should test immoswp provider', async () => {
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immoswp', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');

View File

@@ -20,7 +20,7 @@ describe('#immowelt testsuite()', () => {
});
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immowelt', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');

View File

@@ -20,7 +20,7 @@ describe('#kleinanzeigen testsuite()', () => {
});
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'kleinanzeigen', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
@@ -31,10 +31,8 @@ describe('#kleinanzeigen testsuite()', () => {
notificationObj.payload.forEach((notify) => {
/** check the actual structure **/
expect(notify.id).to.be.a('number');
expect(notify.size).to.be.a('string');
expect(notify.title).to.be.a('string');
expect(notify.link).to.be.a('string');
expect(notify.price).to.be.a('string');
expect(notify.address).to.be.a('string');
/** check the values if possible **/

View File

@@ -20,7 +20,7 @@ describe('#neubauKompass testsuite()', () => {
it('should test neubauKompass provider', async () => {
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'neubauKompass', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');

View File

@@ -20,7 +20,7 @@ describe('#wgGesucht testsuite()', () => {
it('should test wgGesucht provider', async () => {
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'test1', similarityCache);
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'wgGesucht', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
const notificationObj = mockNotification.get();

View File

@@ -2,7 +2,7 @@ import React from 'react';
import { useDispatch, useSelector } from 'react-redux';
import { Button, Form, Icon, Message, Segment } from 'semantic-ui-react';
import { Button, Form, Icon, Message, Segment, Radio } from 'semantic-ui-react';
import ToastContext from '../../components/toasts/ToastContext';
import Headline from '../../components/headline/Headline';
import { xhrPost } from '../../services/xhr';
@@ -18,6 +18,7 @@ const GeneralSettings = function Users() {
const [interval, setInterval] = React.useState('');
const [port, setPort] = React.useState('');
const [scrapingAntApiKey, setScrapingAntApiKey] = React.useState('');
const [scrapingAntProxy, setScrapingAntProxy] = React.useState('');
const [workingHourFrom, setWorkingHourFrom] = React.useState(null);
const [workingHourTo, setWorkingHourTo] = React.useState(null);
const ctx = React.useContext(ToastContext);
@@ -33,6 +34,7 @@ const GeneralSettings = function Users() {
setScrapingAntApiKey(settings?.scrapingAnt?.apiKey);
setWorkingHourFrom(settings?.workingHours?.from);
setWorkingHourTo(settings?.workingHours?.to);
setScrapingAntProxy(settings?.scrapingAnt?.proxy || 'datacenter');
}, [settings]);
const nullOrEmpty = (val) => val == null || val.length === 0;
@@ -69,6 +71,7 @@ const GeneralSettings = function Users() {
port,
scrapingAnt: {
apiKey: scrapingAntApiKey,
proxy: scrapingAntProxy,
},
workingHours: {
from: workingHourFrom,
@@ -144,6 +147,48 @@ const GeneralSettings = function Users() {
/>
</SegmentPart>
<SegmentPart
name="ScrapingAnt proxy settings"
helpText="Scraping ant provides different proxies."
icon="key"
>
<Message info>
ScrapingAnt is needed to scrape Immoscout. ScrapingAnt itself is using 2 different types of proxies.{' '}
<br />
<h4>Datacenter-Proxy</h4>
Proxy server located in one of the datacenters across the world. Datacenter proxies are slower and more
likely to fail, but they are cheaper. A call with a datacenter proxy cost 10 credits.
<h4>Residential-Proxy</h4>
High-quality proxy server located in one of the real people houses across the world. Datacenter proxies
are faster and more likely to success, but they are more expensive. A call with a datacenter proxy cost
250 credits.
<br />
<br />
<b>
On the free tier, you have 10.000 credits, so chose your option wisely. Keep in mind, only successful
calls will be charged.
</b>
</Message>
<Form.Field>
<Radio
label="Datacenter proxy"
name="scrapingAntProxy"
value="datacenter"
checked={scrapingAntProxy === 'datacenter'}
onChange={(e, { value }) => setScrapingAntProxy(value)}
/>
</Form.Field>
<Form.Field>
<Radio
label="Residential proxy"
name="scrapingAntProxy"
value="residential"
checked={scrapingAntProxy === 'residential'}
onChange={(e, { value }) => setScrapingAntProxy(value)}
/>
</Form.Field>
</SegmentPart>
<SegmentPart
name="Working hours"
helpText="During this hours, Fredy will search for new apartments. If nothing is configured, Fredy will search around the clock."
@@ -153,7 +198,7 @@ const GeneralSettings = function Users() {
<Form.Input
className="generalSettings__time"
type="time"
placeholder="ScrapingAnt Api Key"
placeholder="Working hours from"
inverted
size="mini"
width={2}
@@ -163,7 +208,7 @@ const GeneralSettings = function Users() {
<div className="generalSettings__until">until</div>
<Form.Input
type="time"
placeholder="ScrapingAnt Api Key"
placeholder="Working hours to"
inverted
size="mini"
width={2}

View File

@@ -43,7 +43,7 @@ export default function ProcessingTimes({ processingTimes }) {
ScrapingAnt
</a>
. You can use the code <b>FREDY10</b> to get 10% off. (No affiliation, we are <b>not</b> getting paid to
recommend ScrapingAnt.
recommend ScrapingAnt.)
</Segment>
)}
</React.Fragment>

View File

@@ -81,7 +81,6 @@ export default function JobMutator() {
<ProviderMutator
visible={providerCreationVisible}
onVisibilityChanged={(visible) => setProviderCreationVisibility(visible)}
selected={providerData}
onData={(data) => {
setProviderData([...providerData, data]);
}}

View File

@@ -16,7 +16,7 @@ const sortProvider = (a, b) => {
return 0;
};
export default function ProviderMutator({ onVisibilityChanged, visible = false, selected = [], onData } = {}) {
export default function ProviderMutator({ onVisibilityChanged, visible = false, onData } = {}) {
const provider = useSelector((state) => state.provider);
const [selectedProvider, setSelectedProvider] = useState(null);
const [providerUrl, setProviderUrl] = useState(null);
@@ -107,8 +107,6 @@ export default function ProviderMutator({ onVisibilityChanged, visible = false,
text: pro.name,
};
})
//filter out those, that have already been selected
.filter((option) => selected.find((selectedOption) => selectedOption.id === option.key) == null)
.sort(sortProvider)}
onChange={(e, { value }) => {
const selectedProvider = provider.find((pro) => pro.id === value);

2708
yarn.lock

File diff suppressed because it is too large Load Diff