Compare commits

..

14 Commits

Author SHA1 Message Date
weakmap@gmail.com
c6bb3c44d4 upgrade dependencies, fixing tests 2025-02-23 17:14:39 +01:00
weakmap@gmail.com
a3471a091a upgrade dependencies, fixing tests 2025-02-23 17:13:08 +01:00
Christian Kellner
b5a96afcc8 upgrading dependencies 2025-01-17 22:08:04 +01:00
Stefan
3903ab59cf fix normalized wggesucht link (#123) 2025-01-17 22:05:34 +01:00
weakmap@gmail.com
8fe7cec2a1 improve pushover notification service 2025-01-10 19:51:14 +01:00
Christian Kellner
97deea6f5b Update README.md 2025-01-09 17:31:46 +01:00
Christian Kellner
1ecbbdd774 better logging 2025-01-07 13:34:43 +01:00
Christian Kellner
e1db3840f6 adding puppeteer timeout and fixing waitForSelector 2025-01-07 12:37:50 +01:00
Christian Kellner
26127eeac1 updating dependencies 2025-01-07 12:27:16 +01:00
Christian Kellner
90a4ee5dcf better logging, fixing code smells 2025-01-07 12:25:19 +01:00
Christian Kellner
2aaf63c253 Happy New Year 2025-01-05 06:53:07 +01:00
Christian Kellner
f52e3e9fd8 Update package.json 2025-01-04 21:52:06 +01:00
Fabian Pfaff
0d69232395 install chrome via apt instead of bundled (#122) 2025-01-04 21:50:59 +01:00
weakmap@gmail.com
b473cf7fb4 fixing kleinanzeigen test 2024-12-26 19:18:30 +01:00
18 changed files with 1446 additions and 805 deletions

View File

@@ -4,6 +4,11 @@ WORKDIR /fredy
COPY . /fredy
RUN apt-get update && apt-get install -y chromium
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
RUN yarn install
RUN yarn global add pm2

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2024 Christian Kellner
Copyright (c) 2025 Christian Kellner
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -11,7 +11,7 @@ If _Fredy_ finds matching results, it will send them to you via Slack, Email, Te
# Sponsorship [![](https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86)](https://github.com/sponsors/orangecoding)
If you like my work, consider becoming a sponsor. I'm not expecting anybody to pay for _Fredy_ or any other Open Source Project I'm maintaining, however keep in mind, I'm doing all of this in my spare time :) Thanks.
<img src="https://github.com/orangecoding/fredy/blob/master/doc/jetbrains.png" width="200">
[![JetBrains logo.](https://resources.jetbrains.com/storage/products/company/brand/logos/jetbrains.svg)](https://jb.gg/OpenSourceSupport)
_Fredy_ is supported by JetBrains under Open Source Support Program

View File

@@ -1,118 +1,124 @@
import {NoNewListingsWarning} from './errors.js';
import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js';
import { NoNewListingsWarning } from './errors.js';
import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js';
import * as notify from './notification/notify.js';
import Extractor from './services/extractor/extractor.js';
import urlModifier from './services/queryStringMutator.js';
class FredyRuntime {
/**
*
* @param providerConfig the config for the specific provider, we're going to query at the moment
* @param notificationConfig the config for all notifications
* @param providerId the id of the provider currently in use
* @param jobKey key of the job that is currently running (from within the config)
* @param similarityCache cache instance holding values to check for similarity of entries
*/
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
this._providerConfig = providerConfig;
this._notificationConfig = notificationConfig;
this._providerId = providerId;
this._jobKey = jobKey;
this._similarityCache = similarityCache;
}
/**
*
* @param providerConfig the config for the specific provider, we're going to query at the moment
* @param notificationConfig the config for all notifications
* @param providerId the id of the provider currently in use
* @param jobKey key of the job that is currently running (from within the config)
* @param similarityCache cache instance holding values to check for similarity of entries
*/
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
this._providerConfig = providerConfig;
this._notificationConfig = notificationConfig;
this._providerId = providerId;
this._jobKey = jobKey;
this._similarityCache = similarityCache;
}
execute() {
return (
//modify the url to make sure search order is correctly set
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
//scraping the site and try finding new listings
.then(this._getListings.bind(this))
//bring them in a proper form (dictated by the provider)
.then(this._normalize.bind(this))
//filter listings with stuff tagged by the blacklist of the provider
.then(this._filter.bind(this))
//check if new listings available. if so proceed
.then(this._findNew.bind(this))
//store everything in db
.then(this._save.bind(this))
//check for similar listings. if found, remove them before notifying
.then(this._filterBySimilarListings.bind(this))
//notify the user using the configured notification adapter
.then(this._notify.bind(this))
//if an error occurred on the way, handle it here.
.catch(this._handleError.bind(this))
);
}
execute() {
return (
//modify the url to make sure search order is correctly set
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
//scraping the site and try finding new listings
.then(this._getListings.bind(this))
//bring them in a proper form (dictated by the provider)
.then(this._normalize.bind(this))
//filter listings with stuff tagged by the blacklist of the provider
.then(this._filter.bind(this))
//check if new listings available. if so proceed
.then(this._findNew.bind(this))
//store everything in db
.then(this._save.bind(this))
//check for similar listings. if found, remove them before notifying
.then(this._filterBySimilarListings.bind(this))
//notify the user using the configured notification adapter
.then(this._notify.bind(this))
//if an error occurred on the way, handle it here.
.catch(this._handleError.bind(this))
);
}
_getListings(url) {
const extractor = new Extractor();
return new Promise((resolve, reject) => {
extractor.execute(url,this._providerConfig.waitForSelector)
.then(() => {
const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields);
resolve(listings == null ? [] : listings);
}).catch(err => {
reject(err);
/* eslint-disable no-console */
console.error(err);
/* eslint-enable no-console */
});
_getListings(url) {
const extractor = new Extractor();
return new Promise((resolve, reject) => {
extractor
.execute(url, this._providerConfig.waitForSelector)
.then(() => {
const listings = extractor.parseResponseText(
this._providerConfig.crawlContainer,
this._providerConfig.crawlFields,
url,
);
resolve(listings == null ? [] : listings);
})
.catch((err) => {
reject(err);
/* eslint-disable no-console */
console.error(err);
/* eslint-enable no-console */
});
}
});
}
_normalize(listings) {
return listings.map(this._providerConfig.normalize);
}
_normalize(listings) {
return listings.map(this._providerConfig.normalize);
}
_filter(listings) {
//only return those where all the fields have been found
const keys = Object.keys(this._providerConfig.crawlFields);
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
return filteredListings.filter(this._providerConfig.filter);
}
_filter(listings) {
//only return those where all the fields have been found
const keys = Object.keys(this._providerConfig.crawlFields);
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
return filteredListings.filter(this._providerConfig.filter);
}
_findNew(listings) {
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
if (newListings.length === 0) {
throw new NoNewListingsWarning();
}
return newListings;
_findNew(listings) {
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
if (newListings.length === 0) {
throw new NoNewListingsWarning();
}
return newListings;
}
_notify(newListings) {
if (newListings.length === 0) {
throw new NoNewListingsWarning();
}
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
return Promise.all(sendNotifications).then(() => newListings);
_notify(newListings) {
if (newListings.length === 0) {
throw new NoNewListingsWarning();
}
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
return Promise.all(sendNotifications).then(() => newListings);
}
_save(newListings) {
const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
newListings.forEach((listing) => {
currentListings[listing.id] = Date.now();
});
setKnownListings(this._jobKey, this._providerId, currentListings);
return newListings;
}
_save(newListings) {
const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
newListings.forEach((listing) => {
currentListings[listing.id] = Date.now();
});
setKnownListings(this._jobKey, this._providerId, currentListings);
return newListings;
}
_filterBySimilarListings(listings) {
const filteredList = listings.filter((listing) => {
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
if (similar) {
/* eslint-disable no-console */
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
/* eslint-enable no-console */
}
return !similar;
});
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
return filteredList;
}
_filterBySimilarListings(listings) {
const filteredList = listings.filter((listing) => {
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
if (similar) {
/* eslint-disable no-console */
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
/* eslint-enable no-console */
}
return !similar;
});
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
return filteredList;
}
_handleError(err) {
if (err.name !== 'NoNewListingsWarning') console.error(err);
}
_handleError(err) {
if (err.name !== 'NoNewListingsWarning') console.error(err);
}
}
export default FredyRuntime;

View File

@@ -1,11 +1,9 @@
import restana from 'restana';
import fetch from 'node-fetch';
import * as jobStorage from '../../services/storage/jobStorage.js';
import * as userStorage from '../../services/storage/userStorage.js';
import * as immoscoutProvider from '../../provider/immoscout.js';
import { config } from '../../utils.js';
import { isAdmin } from '../security.js';
import {trackDemoJobCreated} from '../../services/tracking/Tracker.js';
import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
const service = restana();
const jobRouter = service.newRouter();
function doesJobBelongsToUser(job, req) {
@@ -28,7 +26,7 @@ jobRouter.get('/', async (req, res) => {
jobRouter.get('/processingTimes', async (req, res) => {
res.body = {
interval: config.interval,
lastRun: config.lastRun || null
lastRun: config.lastRun || null,
};
res.send();
});
@@ -51,7 +49,7 @@ jobRouter.post('/', async (req, res) => {
trackDemoJobCreated({
name,
provider,
adapter: notificationAdapter
adapter: notificationAdapter,
});
res.send();
});

View File

@@ -7,9 +7,11 @@ export const send = ({ serviceName, newListings, notificationConfig, jobKey }) =
const job = getJob(jobKey);
const jobName = job == null ? jobKey : job.name;
const promises = newListings.map((newListing) => {
const message = `Address: ${newListing.address} Size: ${newListing.size.replace(/2m/g, '$m^2$')} Price: ${
newListing.price
}`;
const message = `
Address: ${newListing.address}
Size: ${newListing.size.replace(/2m/g, '$m^2$')}
Price: ${newListing.price}
Link: ${newListing.link}`;
return fetch(server, {
method: 'POST',
body: JSON.stringify({

View File

@@ -1,50 +1,73 @@
import { markdown2Html } from '../../services/markdown.js';
import { getJob } from '../../services/storage/jobStorage.js';
import {markdown2Html} from '../../services/markdown.js';
import {getJob} from '../../services/storage/jobStorage.js';
import fetch from 'node-fetch';
export const send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
const { token, user, device } = notificationConfig.find((adapter) => adapter.id === config.id).fields;
const job = getJob(jobKey);
const jobName = job == null ? jobKey : job.name;
const promises = newListings.map((newListing) => {
const title = `${jobName} at ${serviceName}: ${newListing.title}`;
const message = `Address: ${newListing.address}\nSize: ${newListing.size}\nPrice: ${newListing.price}\nLink: ${newListing.link}`;
return fetch('https://api.pushover.net/1/messages.json', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({
token: token,
user: user,
message: message,
device: device,
title: title,
}),
export const send = ({serviceName, newListings, notificationConfig, jobKey}) => {
const {token, user, device} = notificationConfig.find((adapter) => adapter.id === config.id).fields;
const job = getJob(jobKey);
const jobName = job == null ? jobKey : job.name;
const promises = newListings.map((newListing) => {
const title = `${jobName} at ${serviceName}: ${newListing.title}`;
const message = `Address: ${newListing.address}\nSize: ${newListing.size}\nPrice: ${newListing.price}\nLink: ${newListing.link}`;
return fetch('https://api.pushover.net/1/messages.json', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({
token: token,
user: user,
message: message,
device: device,
title: title,
}),
});
});
});
return Promise.all(promises);
return Promise.all(promises)
.then((responses) => {
// Convert all responses to JSON
return Promise.all(responses.map((response) => response.json()));
})
.then((data) => {
// Check for errors in the data
const error = data
.map((item) => (item.errors != null && item.errors.length > 0 ? item.errors.join(', ') : null))
.filter((err) => err !== null);
if (error.length > 0) {
// Reject with the combined error messages
return Promise.reject(error.join('; '));
}
return data;
})
.then(() => {
return Promise.resolve();
})
.catch((error) => {
return Promise.reject(error);
});
};
export const config = {
id: 'pushover',
name: 'Pushover',
readme: markdown2Html('lib/notification/adapter/pushover.md'),
description: 'Fredy will send new listings to your mobile using Pushover.',
fields: {
token: {
type: 'text',
label: 'API token',
description: 'Your application\'s API token.',
id: 'pushover',
name: 'Pushover',
readme: markdown2Html('lib/notification/adapter/pushover.md'),
description: 'Fredy will send new listings to your mobile using Pushover.',
fields: {
token: {
type: 'text',
label: 'API token',
description: 'Your application\'s API token.',
},
user: {
type: 'text',
label: 'User key',
description: 'Your user/group key.',
},
device: {
type: 'text',
label: 'Device name',
description: 'The device name to send your notification to. Messages may be addressed to multiple specific devices by joining them with a comma.',
},
},
user: {
type: 'text',
label: 'User key',
description: 'Your user/group key.',
},
device: {
type: 'text',
label: 'Device name',
description: 'The device name to send your notification to. Messages may be addressed to multiple specific devices by joining them with a comma.',
},
},
};

View File

@@ -1,4 +1,4 @@
import utils, {buildHash} from '../utils.js';
import utils, { buildHash } from '../utils.js';
let appliedBlackList = [];
/**
@@ -26,7 +26,7 @@ const config = {
url: null,
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'sortby=19',
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
crawlFields: {
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
title: 'button@title |trim',

View File

@@ -18,7 +18,7 @@ const config = {
crawlContainer:
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'order=DateDesc',
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
crawlFields: {
id: 'a@href',
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',

View File

@@ -15,7 +15,7 @@ function applyBlacklist(o) {
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
const isBlacklistedDistrict =
appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts);
return !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
}
const config = {

View File

@@ -4,7 +4,8 @@ let appliedBlackList = [];
function normalize(o) {
const id = buildHash(o.id, o.price);
return Object.assign(o, {id});
const link = `https://www.wg-gesucht.de${o.link}`;
return Object.assign(o, { id, link });
}
function applyBlacklist(o) {

View File

@@ -1,45 +1,43 @@
import {setDebug} from './utils.js';
import { setDebug } from './utils.js';
import puppeteerExtractor from './puppeteerExtractor.js';
import {loadParser, parse} from './parser/parser.js';
import { loadParser, parse } from './parser/parser.js';
const DEFAULT_OPTIONS = {
debug: false,
puppeteerTimeout: 20_000,
puppeteerHeadless: true
debug: false,
puppeteerTimeout: 60_000,
puppeteerHeadless: true,
};
export default class Extractor {
constructor(options) {
this.options = {
...DEFAULT_OPTIONS,
...options
};
this.responseText = null;
setDebug(this.options);
constructor(options) {
this.options = {
...DEFAULT_OPTIONS,
...options,
};
this.responseText = null;
setDebug(this.options);
}
/**
* if you are extracting data from a SPA, you must provide a selector, otherwise
* your response will never contain what you are really looking for
* @param url
* @param waitForSelector
*/
execute = async (url, waitForSelector = null) => {
this.responseText = null;
try {
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
if (this.responseText != null) {
loadParser(this.responseText);
}
} catch (error) {
console.error('Error trying to load page.', error);
}
return this;
};
/**
* if you are extracting data from a SPA, you must provide a selector, otherwise
* your response will never contain what you are really looking for
* @param url
* @param waitForSelector
*/
execute = async (url, waitForSelector = null) => {
this.responseText = null;
try {
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
if(this.responseText != null) {
loadParser(this.responseText);
}
} catch (error) {
console.error('Error trying to load page.', error);
}
return this;
};
parseResponseText = (crawlContainer, crawlFields) => {
return parse(crawlContainer, crawlFields, this.responseText);
};
parseResponseText = (crawlContainer, crawlFields, url) => {
return parse(crawlContainer, crawlFields, this.responseText, url);
};
}

View File

@@ -3,92 +3,95 @@ import * as cheerio from 'cheerio';
let $ = null;
export function loadParser(text) {
$ = cheerio.load(text);
$ = cheerio.load(text);
}
export function parse(crawlContainer, crawlFields, text) {
if (!text) {
console.warn('Cannot parse, text was empty.');
return null;
}
export function parse(crawlContainer, crawlFields, text, url) {
if (!text) {
console.warn('Cannot parse, text was empty for url ', url);
return null;
}
if (!crawlContainer || !crawlFields) {
console.warn('Cannot parse, selector was empty.');
return null;
}
if (!crawlContainer || !crawlFields) {
console.warn('Cannot parse, selector was empty for url ', url);
return null;
}
const result = [];
const result = [];
if ($(crawlContainer).length === 0) {
console.error('No elements in crawl container found!');
}
if ($(crawlContainer).length === 0) {
console.warn('No elements in crawl container found for url ', url);
return null;
}
$(crawlContainer).each((_, element) => {
const container = $(element);
const parsedObject = {};
$(crawlContainer).each((_, element) => {
const container = $(element);
const parsedObject = {};
// Parse fields based on crawlFields
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
let value;
// Parse fields based on crawlFields
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
let value;
try {
try {
const selector = fieldSelector.includes('|')
? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
: fieldSelector;
const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector;
if (selector.includes('@')) {
const [sel, attr] = selector.split('@');
if (sel.length === 0) {
value = container.attr(attr.trim());
} else {
value = container.find(sel.trim()).attr(attr.trim());
}
} else {
value = container.find(selector.trim()).text();
}
// Apply modifiers if specified
if (fieldSelector.includes('|')) {
const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim());
value = applyModifiers(value, modifiers);
}
parsedObject[key] = value || null;
} catch (error) {
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
parsedObject[key] = null;
}
}
if (parsedObject.id != null) {
result.push(parsedObject);
if (selector.includes('@')) {
const [sel, attr] = selector.split('@');
if (sel.length === 0) {
value = container.attr(attr.trim());
} else {
value = container.find(sel.trim()).attr(attr.trim());
}
} else {
console.warn('ID not found. Not relaying object.');
value = container.find(selector.trim()).text();
}
});
return result;
// Apply modifiers if specified
if (fieldSelector.includes('|')) {
/* eslint-disable no-unused-vars */
const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
/* eslint-disable no-unused-vars */
value = applyModifiers(value, modifiers);
}
parsedObject[key] = value || null;
} catch (error) {
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
parsedObject[key] = null;
}
}
if (parsedObject.id != null) {
result.push(parsedObject);
} else {
console.warn('ID not found. Not relaying object.');
}
});
return result;
}
// Helper function to apply modifiers
function applyModifiers(value, modifiers) {
if (!value) return value;
if (!value) return value;
modifiers.forEach(modifier => {
switch (modifier) {
case 'int':
value = parseInt(value, 10);
break;
case 'trim':
value = value.replace(/\s+/g, ' ').trim();
break;
case 'removeNewline':
value = value.replace(/\n/g, ' ');
break;
default:
console.warn(`Unknown modifier: ${modifier}`);
}
});
modifiers.forEach((modifier) => {
switch (modifier) {
case 'int':
value = parseInt(value, 10);
break;
case 'trim':
value = value.replace(/\s+/g, ' ').trim();
break;
case 'removeNewline':
value = value.replace(/\n/g, ' ');
break;
default:
console.warn(`Unknown modifier: ${modifier}`);
}
});
return value;
return value;
}

View File

@@ -1,48 +1,49 @@
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {debug, DEFAULT_HEADER, botDetected} from './utils.js';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
puppeteer.use(StealthPlugin());
export default async function execute(url, waitForSelector, options) {
let browser;
try {
debug(`Sending request to ${url} using Puppeteer.`);
let browser;
try {
debug(`Sending request to ${url} using Puppeteer.`);
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox']
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded'
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate(selector => {
return document.querySelector(selector).innerHTML;
}, waitForSelector);
} else {
pageSource = await page.content();
}
const statusCode = response.status();
if (botDetected(pageSource, statusCode)) {
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
}
return await page.content();
} catch (error) {
console.error('Error executing with puppeteer executor', error);
return null;
} finally {
if (browser != null) {
await browser.close();
}
browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
timeout: options.puppeteerTimeout || 30_000,
});
let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, {
waitUntil: 'domcontentloaded',
});
let pageSource;
//if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) {
await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate((selector) => {
return document.querySelector(selector).innerHTML;
}, waitForSelector);
} else {
pageSource = await page.content();
}
}
const statusCode = response.status();
if (botDetected(pageSource, statusCode)) {
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
return null;
}
return await page.content();
} catch (error) {
console.error('Error executing with puppeteer executor', error);
return null;
} finally {
if (browser != null) {
await browser.close();
}
}
}

View File

@@ -1,35 +1,32 @@
let debuggingOn = false;
export const DEFAULT_HEADER = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
Connection: 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
};
export const setDebug = options => {
debuggingOn = !!options?.debug;
export const setDebug = (options) => {
debuggingOn = !!options?.debug;
};
export const debug = (message) => {
if(debuggingOn) {
console.debug(message);
}
if (debuggingOn) {
/* eslint-disable no-console */
console.debug(message);
/* eslint-enable no-console */
}
};
export const botDetected = (pageSource, statusCode) => {
const suspiciousStatusCodes = [
403, 429
];
const botDetectionPatterns = [
/verify you are human/i,
/access denied/i,
/x-amz-cf-id/i,
];
const suspiciousStatusCodes = [403, 429];
const botDetectionPatterns = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i];
const detectedInSource = botDetectionPatterns.some(pattern => pattern.test(pageSource));
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
const detectedInSource = botDetectionPatterns.some((pattern) => pattern.test(pageSource));
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
return detectedInSource || detectedByStatus;
};
return detectedInSource || detectedByStatus;
};

View File

@@ -1,6 +1,6 @@
{
"name": "fredy",
"version": "11.0.0",
"version": "11.0.4",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"start": "node prod.js",
@@ -50,27 +50,27 @@
"Firefox ESR"
],
"dependencies": {
"@douyinfe/semi-ui": "2.71.3",
"@douyinfe/semi-ui": "2.75.0",
"@rematch/core": "2.2.0",
"@rematch/loading": "2.1.2",
"@sendgrid/mail": "8.1.4",
"@vitejs/plugin-react": "4.3.4",
"better-sqlite3": "^11.7.0",
"better-sqlite3": "^11.8.1",
"body-parser": "1.20.3",
"cheerio": "^1.0.0",
"cookie-session": "2.1.0",
"handlebars": "4.7.8",
"highcharts": "12.1.0",
"highcharts": "12.1.2",
"highcharts-react-official": "3.2.1",
"lodash": "4.17.21",
"lowdb": "6.0.1",
"markdown": "^0.5.0",
"mixpanel": "^0.18.0",
"nanoid": "5.0.9",
"nanoid": "5.1.2",
"node-fetch": "3.3.2",
"node-mailjet": "6.0.6",
"package-up": "^5.0.0",
"puppeteer": "^23.10.4",
"puppeteer": "^24.2.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"query-string": "9.1.1",
@@ -88,21 +88,21 @@
"vite": "5.4.11"
},
"devDependencies": {
"@babel/core": "7.26.0",
"@babel/eslint-parser": "7.25.9",
"@babel/preset-env": "7.26.0",
"@babel/core": "7.26.9",
"@babel/eslint-parser": "7.26.8",
"@babel/preset-env": "7.26.9",
"@babel/preset-react": "7.26.3",
"chai": "5.1.2",
"chai": "5.2.0",
"eslint": "8.56.0",
"eslint-config-prettier": "8.8.0",
"eslint-plugin-react": "7.37.2",
"esmock": "2.6.9",
"eslint-plugin-react": "7.37.4",
"esmock": "2.7.0",
"history": "5.3.0",
"husky": "9.1.7",
"less": "4.2.1",
"lint-staged": "15.2.11",
"less": "4.2.2",
"lint-staged": "15.4.3",
"mocha": "10.8.2",
"prettier": "3.4.2",
"prettier": "3.5.2",
"redux-logger": "3.0.6"
}
}

View File

@@ -1,40 +1,38 @@
import * as similarityCache from '../../lib/services/similarity-check/similarityCache.js';
import { get } from '../mocks/mockNotification.js';
import { mockFredy, providerConfig } from '../utils.js';
import { expect } from 'chai';
import {get} from '../mocks/mockNotification.js';
import {mockFredy, providerConfig} from '../utils.js';
import {expect} from 'chai';
import * as provider from '../../lib/provider/immonet.js';
describe('#immonet testsuite()', () => {
after(() => {
similarityCache.stopCacheCleanup();
});
provider.init(providerConfig.immonet, [], []);
it('should test immonet provider', async () => {
const Fredy = await mockFredy();
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
const notificationObj = get();
expect(notificationObj).to.be.a('object');
expect(notificationObj.serviceName).to.equal('immonet');
notificationObj.payload.forEach((notify) => {
/** check the actual structure **/
expect(notify.id).to.be.a('string');
expect(notify.price).to.be.a('string');
expect(notify.size).to.be.a('string');
expect(notify.title).to.be.a('string');
expect(notify.link).to.be.a('string');
expect(notify.address).to.be.a('string');
/** check the values if possible **/
expect(notify.price).that.does.include('€');
expect(notify.size).that.does.include('m²');
expect(notify.title).to.be.not.empty;
expect(notify.address).to.be.not.empty;
});
resolve();
});
after(() => {
similarityCache.stopCacheCleanup();
});
provider.init(providerConfig.immonet, [], []);
it('should test immonet provider', async () => {
const Fredy = await mockFredy();
return await new Promise((resolve) => {
const fredy = new Fredy(provider.config, null, provider.metaInformation.id, 'immonet', similarityCache);
fredy.execute().then((listing) => {
expect(listing).to.be.a('array');
const notificationObj = get();
expect(notificationObj).to.be.a('object');
expect(notificationObj.serviceName).to.equal('immonet');
notificationObj.payload.forEach((notify) => {
/** check the actual structure **/
expect(notify.id).to.be.a('string');
expect(notify.price).to.be.a('string');
expect(notify.size).to.be.a('string');
expect(notify.title).to.be.a('string');
expect(notify.link).to.be.a('string');
expect(notify.address).to.be.a('string');
expect(notify.size).that.does.include('m²');
expect(notify.title).to.be.not.empty;
expect(notify.address).to.be.not.empty;
});
resolve();
});
});
});
});
});

1477
yarn.lock

File diff suppressed because it is too large Load Diff