mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
996b841cfb | ||
|
|
b2e294e38c | ||
|
|
8afeaa05d9 | ||
|
|
ec47137b89 | ||
|
|
33161de087 | ||
|
|
acab23207e | ||
|
|
2896d531e4 |
34
README.md
34
README.md
@@ -167,6 +167,40 @@ For more information on how to set it up and use it, please refer to the [MCP Re
|
||||
|
||||
Immoscout has implemented advanced bot detection. To work around this, we use a reverse-engineered version of their mobile API. See the [Immoscout Reverse Engineering Documentation](https://github.com/orangecoding/fredy/blob/master/reverse-engineered-immoscout.md).
|
||||
|
||||
## 🛡️ Bot Detection & Proxies
|
||||
|
||||
Most browser-based providers (immowelt, immonet, kleinanzeigen, ...) are scraped through a hardened headless browser ([CloakBrowser](https://www.npmjs.com/package/cloakbrowser)). It makes the **browser fingerprint** indistinguishable from a real Chrome, which is enough when you run Fredy on a normal home connection.
|
||||
|
||||
On a **server / VPS the requests usually originate from a datacenter IP**, and providers behind anti-bot systems (e.g. AWS CloudFront/WAF) block those based on **IP reputation alone**, no matter how perfect the fingerprint is. The typical symptom: it works locally but you get `We have been detected as a bot :-/` on the server.
|
||||
|
||||
### The fix: a residential proxy
|
||||
|
||||
A **residential proxy** routes Fredy's browser through the internet connection of a real household, so the provider sees a "normal user" IP instead of a datacenter. For German portals, use a **German (DE) residential** (or mobile/4G) proxy. Plain VPNs and **datacenter proxies do not help** here, because they share the same bad reputation as your server.
|
||||
|
||||
**Configure it** under **Settings → Execution → Proxy URL**. Supported formats:
|
||||
|
||||
```
|
||||
http://user:pass@host:port
|
||||
socks5://user:pass@host:port
|
||||
```
|
||||
|
||||
Leave the field empty to disable. The proxy applies to all headless-browser providers and takes effect on the next job run (no restart needed). Immoscout uses a separate mobile API and is not affected.
|
||||
|
||||
### Where to get a residential proxy
|
||||
|
||||
Residential proxies are a paid service (usually billed per GB; Fredy's traffic is small). Well-known providers offering German residential IPs include:
|
||||
|
||||
| Provider | Notes |
|
||||
|---|---|
|
||||
| [IPRoyal](https://iproyal.com) | Pay-as-you-go, no monthly minimum, good for low volume |
|
||||
| [Webshare](https://www.webshare.io) | Cheap entry tier, has a small free plan to test with |
|
||||
| [Decodo (formerly Smartproxy)](https://decodo.com) | Easy setup, country/city targeting |
|
||||
| [SOAX](https://soax.com) | Residential + mobile, fine-grained geo-targeting |
|
||||
| [Bright Data](https://brightdata.com) | Largest pool, most features, higher complexity/price |
|
||||
| [Oxylabs](https://oxylabs.io) | Enterprise-grade, larger plans |
|
||||
|
||||
This is not an endorsement; pick whatever fits your budget. For low-volume use like Fredy, a pay-as-you-go plan (e.g. IPRoyal) or a cheap entry tier (e.g. Webshare) is usually plenty. Make sure to select **Germany** as the proxy location and keep the search interval reasonable (the higher the interval, the less you look like a bot).
|
||||
|
||||
## Analytics
|
||||
|
||||
Fredy is completely free (and will always remain free). However, it would be a huge help if you’d allow me to collect some analytical data.
|
||||
|
||||
@@ -227,7 +227,7 @@ class FredyPipelineExecutioner {
|
||||
const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser });
|
||||
return new Promise((resolve, reject) => {
|
||||
extractor
|
||||
.execute(url, this._providerConfig.waitForSelector, this._jobKey)
|
||||
.execute(url, this._providerConfig.waitForSelector, this._providerId)
|
||||
.then(() => {
|
||||
const listings = extractor.parseResponseText(
|
||||
this._providerConfig.crawlContainer,
|
||||
|
||||
@@ -196,8 +196,8 @@ const config = {
|
||||
id: '.aditem@data-adid',
|
||||
price: '.aditem-main--middle--price-shipping--price | removeNewline | trim',
|
||||
tags: '.aditem-main--middle--tags | removeNewline | trim',
|
||||
title: '.aditem-main .text-module-begin a | removeNewline | trim',
|
||||
link: '.aditem-main .text-module-begin a@href | removeNewline | trim',
|
||||
title: '.aditem-main .text-module-begin | removeNewline | trim',
|
||||
link: '.aditem@data-href',
|
||||
description: '.aditem-main .aditem-main--middle--description | removeNewline | trim',
|
||||
address: '.aditem-main--top--left | trim | removeNewline',
|
||||
image: 'img@src',
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
*/
|
||||
|
||||
import { launch } from 'cloakbrowser/puppeteer';
|
||||
import { debug, botDetected } from './utils.js';
|
||||
import { botDetected, debug } from './utils.js';
|
||||
import { getPreLaunchConfig } from './botPrevention.js';
|
||||
import logger from '../logger.js';
|
||||
import { trackPoi } from '../tracking/Tracker.js';
|
||||
@@ -50,7 +50,7 @@ export async function launchBrowser(url, options) {
|
||||
preCfg.windowSizeArg,
|
||||
];
|
||||
|
||||
const browser = await launch({
|
||||
return await launch({
|
||||
headless: options?.puppeteerHeadless ?? true,
|
||||
humanize: true,
|
||||
args,
|
||||
@@ -59,8 +59,6 @@ export async function launchBrowser(url, options) {
|
||||
...(options?.proxyUrl ? { proxy: options.proxyUrl } : {}),
|
||||
...(preCfg.timezone ? { timezone: preCfg.timezone } : {}),
|
||||
});
|
||||
|
||||
return browser;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -14,6 +14,7 @@ import * as similarityCache from '../similarity-check/similarityCache.js';
|
||||
import { isRunning, markFinished, markRunning } from './run-state.js';
|
||||
import { sendToUsers } from '../sse/sse-broker.js';
|
||||
import * as puppeteerExtractor from '../extractor/puppeteerExtractor.js';
|
||||
import { getSettings } from '../storage/settingsStorage.js';
|
||||
|
||||
/**
|
||||
* Initializes the job execution service.
|
||||
@@ -160,6 +161,14 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
|
||||
}
|
||||
let browser;
|
||||
try {
|
||||
// Read the proxy live (not from the startup snapshot) so changing it in the
|
||||
// UI takes effect on the next run without a backend restart. An empty value
|
||||
// disables the proxy. Routing the headless browser through a (German
|
||||
// residential) proxy avoids datacenter-IP based bot detection on the
|
||||
// Puppeteer-based providers (immowelt, immonet, kleinanzeigen, ...).
|
||||
const liveSettings = await getSettings();
|
||||
const proxyUrl = typeof liveSettings?.proxyUrl === 'string' ? liveSettings.proxyUrl.trim() : '';
|
||||
|
||||
const jobProviders = job.provider.filter(
|
||||
(p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null,
|
||||
);
|
||||
@@ -168,14 +177,14 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
|
||||
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
|
||||
matchedProvider.init({ ...prov, userId: job.userId }, job.blacklist);
|
||||
|
||||
if (browser && !browser.isConnected()) {
|
||||
if (browser && !browser.connected) {
|
||||
logger.debug('Browser is disconnected, nullifying to launch a new one.');
|
||||
await puppeteerExtractor.closeBrowser(browser);
|
||||
browser = null;
|
||||
}
|
||||
|
||||
if (!browser && matchedProvider.config.getListings == null) {
|
||||
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {});
|
||||
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, proxyUrl ? { proxyUrl } : {});
|
||||
}
|
||||
|
||||
await new FredyPipelineExecutioner(matchedProvider.config, job, prov.id, similarityCache, browser).execute();
|
||||
|
||||
28
package.json
28
package.json
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "fredy",
|
||||
"version": "22.0.6",
|
||||
"version": "22.1.0",
|
||||
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
||||
"scripts": {
|
||||
"prepare": "husky",
|
||||
@@ -62,9 +62,9 @@
|
||||
"Firefox ESR"
|
||||
],
|
||||
"dependencies": {
|
||||
"@douyinfe/semi-icons": "^2.97.0",
|
||||
"@douyinfe/semi-ui": "2.97.0",
|
||||
"@douyinfe/semi-ui-19": "^2.97.0",
|
||||
"@douyinfe/semi-icons": "^2.99.2",
|
||||
"@douyinfe/semi-ui": "2.99.2",
|
||||
"@douyinfe/semi-ui-19": "^2.99.2",
|
||||
"@fastify/cookie": "^11.0.2",
|
||||
"@fastify/helmet": "^13.0.2",
|
||||
"@fastify/session": "^11.1.1",
|
||||
@@ -73,12 +73,12 @@
|
||||
"@modelcontextprotocol/sdk": "^1.29.0",
|
||||
"@sendgrid/mail": "8.1.6",
|
||||
"@turf/boolean-point-in-polygon": "^7.3.5",
|
||||
"@vitejs/plugin-react": "6.0.1",
|
||||
"@vitejs/plugin-react": "6.0.2",
|
||||
"adm-zip": "^0.5.17",
|
||||
"better-sqlite3": "^12.10.0",
|
||||
"chart.js": "^4.5.1",
|
||||
"cheerio": "^1.2.0",
|
||||
"cloakbrowser": "^0.3.28",
|
||||
"cloakbrowser": "^0.3.30",
|
||||
"fastify": "^5.8.5",
|
||||
"handlebars": "4.7.9",
|
||||
"maplibre-gl": "^5.24.0",
|
||||
@@ -89,18 +89,18 @@
|
||||
"nodemailer": "^8.0.7",
|
||||
"p-throttle": "^8.1.0",
|
||||
"package-up": "^5.0.0",
|
||||
"puppeteer-core": "^24.43.1",
|
||||
"puppeteer-core": "^25.0.4",
|
||||
"query-string": "9.3.1",
|
||||
"react": "19.2.6",
|
||||
"react-chartjs-2": "^5.3.1",
|
||||
"react-dom": "19.2.6",
|
||||
"react-range-slider-input": "^3.3.5",
|
||||
"react-router": "7.15.0",
|
||||
"react-router-dom": "7.15.0",
|
||||
"react-router": "7.15.1",
|
||||
"react-router-dom": "7.15.1",
|
||||
"resend": "^6.12.3",
|
||||
"semver": "^7.8.0",
|
||||
"semver": "^7.8.1",
|
||||
"slack": "11.0.2",
|
||||
"vite": "8.0.12",
|
||||
"vite": "8.0.14",
|
||||
"x-var": "^3.0.1",
|
||||
"zustand": "^5.0.13"
|
||||
},
|
||||
@@ -111,16 +111,16 @@
|
||||
"@babel/preset-react": "7.28.5",
|
||||
"@eslint/js": "^10.0.1",
|
||||
"chalk": "^5.6.2",
|
||||
"eslint": "10.3.0",
|
||||
"eslint": "10.4.0",
|
||||
"eslint-config-prettier": "10.1.8",
|
||||
"eslint-plugin-react": "7.37.5",
|
||||
"globals": "^17.6.0",
|
||||
"history": "5.3.0",
|
||||
"husky": "9.1.7",
|
||||
"less": "4.6.4",
|
||||
"lint-staged": "17.0.4",
|
||||
"lint-staged": "17.0.5",
|
||||
"nodemon": "^3.1.14",
|
||||
"prettier": "3.8.3",
|
||||
"vitest": "^4.1.6"
|
||||
"vitest": "^4.1.7"
|
||||
}
|
||||
}
|
||||
|
||||
37
test/services/extractor/puppeteerExtractor.test.js
Normal file
37
test/services/extractor/puppeteerExtractor.test.js
Normal file
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2026 by Christian Kellner.
|
||||
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
|
||||
*/
|
||||
|
||||
import { vi, describe, it, expect, beforeEach } from 'vitest';
|
||||
|
||||
// Mock the CloakBrowser launcher so no real Chromium binary is needed and we can
|
||||
// assert which options get forwarded to it.
|
||||
const { launchMock } = vi.hoisted(() => ({ launchMock: vi.fn() }));
|
||||
|
||||
vi.mock('cloakbrowser/puppeteer', () => ({
|
||||
launch: launchMock,
|
||||
}));
|
||||
|
||||
const { launchBrowser } = await import('../../../lib/services/extractor/puppeteerExtractor.js');
|
||||
|
||||
describe('launchBrowser proxy forwarding', () => {
|
||||
beforeEach(() => {
|
||||
launchMock.mockReset();
|
||||
launchMock.mockResolvedValue({ close: async () => {} });
|
||||
});
|
||||
|
||||
it('forwards proxyUrl to CloakBrowser as the proxy option', async () => {
|
||||
await launchBrowser('https://www.immowelt.de/', { proxyUrl: 'http://user:pass@host:8080' });
|
||||
|
||||
expect(launchMock).toHaveBeenCalledTimes(1);
|
||||
expect(launchMock.mock.calls[0][0]).toMatchObject({ proxy: 'http://user:pass@host:8080' });
|
||||
});
|
||||
|
||||
it('does not set a proxy when no proxyUrl is given', async () => {
|
||||
await launchBrowser('https://www.immowelt.de/', {});
|
||||
|
||||
expect(launchMock).toHaveBeenCalledTimes(1);
|
||||
expect(launchMock.mock.calls[0][0].proxy).toBeUndefined();
|
||||
});
|
||||
});
|
||||
@@ -18,6 +18,7 @@ describe('services/jobs/jobExecutionService', () => {
|
||||
const busPath = root + '/lib/services/events/event-bus.js';
|
||||
const jobStoragePath = root + '/lib/services/storage/jobStorage.js';
|
||||
const userStoragePath = root + '/lib/services/storage/userStorage.js';
|
||||
const settingsStoragePath = root + '/lib/services/storage/settingsStorage.js';
|
||||
const brokerPath = root + '/lib/services/sse/sse-broker.js';
|
||||
const utilsPath = root + '/lib/utils.js';
|
||||
const loggerPath = root + '/lib/services/logger.js';
|
||||
@@ -33,6 +34,9 @@ describe('services/jobs/jobExecutionService', () => {
|
||||
getUsers: () => state.users.slice(),
|
||||
getUser: (id) => state.users.find((u) => u.id === id) || null,
|
||||
}));
|
||||
vi.doMock(settingsStoragePath, () => ({
|
||||
getSettings: async () => ({}),
|
||||
}));
|
||||
vi.doMock(brokerPath, () => ({
|
||||
sendToUsers: (...args) => calls.sent.push(args),
|
||||
}));
|
||||
|
||||
@@ -57,6 +57,7 @@ const GeneralSettings = function GeneralSettings() {
|
||||
const currentUser = useSelector((state) => state.user.currentUser);
|
||||
|
||||
const [interval, setInterval] = React.useState('');
|
||||
const [proxyUrl, setProxyUrl] = React.useState('');
|
||||
const [port, setPort] = React.useState('');
|
||||
const [workingHourFrom, setWorkingHourFrom] = React.useState(null);
|
||||
const [workingHourTo, setWorkingHourTo] = React.useState(null);
|
||||
@@ -91,6 +92,7 @@ const GeneralSettings = function GeneralSettings() {
|
||||
React.useEffect(() => {
|
||||
async function init() {
|
||||
setInterval(settings?.interval);
|
||||
setProxyUrl(settings?.proxyUrl ?? '');
|
||||
setPort(settings?.port);
|
||||
setWorkingHourFrom(settings?.workingHours?.from);
|
||||
setWorkingHourTo(settings?.workingHours?.to);
|
||||
@@ -133,6 +135,7 @@ const GeneralSettings = function GeneralSettings() {
|
||||
try {
|
||||
await xhrPost('/api/admin/generalSettings', {
|
||||
interval,
|
||||
proxyUrl: proxyUrl?.trim() ?? '',
|
||||
port,
|
||||
workingHours: {
|
||||
from: workingHourFrom,
|
||||
@@ -376,6 +379,18 @@ const GeneralSettings = function GeneralSettings() {
|
||||
</div>
|
||||
</SegmentPart>
|
||||
|
||||
<SegmentPart
|
||||
name="Proxy URL"
|
||||
helpText="Optional. Routes the scraping browser through a proxy. Server/datacenter IPs are frequently blocked by providers (e.g. immowelt) regardless of browser fingerprint, a German residential proxy makes requests look like a normal household and is the most effective fix. Format: http://user:pass@host:port or socks5://user:pass@host:port. Leave empty to disable."
|
||||
>
|
||||
<Input
|
||||
type="text"
|
||||
placeholder="http://user:pass@host:port"
|
||||
value={proxyUrl}
|
||||
onChange={(value) => setProxyUrl(value)}
|
||||
/>
|
||||
</SegmentPart>
|
||||
|
||||
<div className="generalSettings__save-row">
|
||||
<Button type="primary" theme="solid" onClick={handleStore} icon={<IconSave />}>
|
||||
Save
|
||||
|
||||
Reference in New Issue
Block a user