From 996b841cfb5e04f56e01400c65a685d7d01abc85 Mon Sep 17 00:00:00 2001 From: orangecoding Date: Sun, 24 May 2026 20:49:27 +0200 Subject: [PATCH] adding ability to add proxies for cloak --- README.md | 34 ++++++ lib/services/jobs/jobExecutionService.js | 11 +- package.json | 8 +- .../extractor/puppeteerExtractor.test.js | 37 ++++++ .../services/jobs/jobExecutionService.test.js | 4 + .../views/generalSettings/GeneralSettings.jsx | 15 +++ yarn.lock | 112 +++++++++--------- 7 files changed, 160 insertions(+), 61 deletions(-) create mode 100644 test/services/extractor/puppeteerExtractor.test.js diff --git a/README.md b/README.md index b0b473d..6d07e30 100755 --- a/README.md +++ b/README.md @@ -167,6 +167,40 @@ For more information on how to set it up and use it, please refer to the [MCP Re Immoscout has implemented advanced bot detection. In order to work around this, we are using a reversed engineered version of their mobile api. See [Immoscout Reverse Engineering Documentation](https://github.com/orangecoding/fredy/blob/master/reverse-engineered-immoscout.md) +## 🛡️ Bot Detection & Proxies + +Most browser-based providers (immowelt, immonet, kleinanzeigen, ...) are scraped through a hardened headless browser ([CloakBrowser](https://www.npmjs.com/package/cloakbrowser)). It makes the **browser fingerprint** indistinguishable from a real Chrome, which is enough when you run Fredy on a normal home connection. + +On a **server / VPS the requests usually originate from a datacenter IP**, and providers behind anti-bot systems (e.g. AWS CloudFront/WAF) block those based on **IP reputation alone**, no matter how perfect the fingerprint is. The typical symptom: it works locally but you get `We have been detected as a bot :-/` on the server. + +### The fix: a residential proxy + +A **residential proxy** routes Fredy's browser through the internet connection of a real household, so the provider sees a "normal user" IP instead of a datacenter. 
For German portals, use a **German (DE) residential** (or mobile/4G) proxy. Plain VPNs and **datacenter proxies do not help** here: they share the same bad reputation as your server. + +**Configure it** under **Settings → Execution → Proxy URL**. Supported formats: + +``` +http://user:pass@host:port +socks5://user:pass@host:port +``` + +Leave the field empty to disable. The proxy applies to all headless-browser providers and takes effect on the next job run (no restart needed). Immoscout uses a separate mobile API and is not affected. + +### Where to get a residential proxy + +Residential proxies are a paid service (usually billed per GB; Fredy's traffic is small). Well-known providers offering German residential IPs include: + +| Provider | Notes | +|---|---| +| [IPRoyal](https://iproyal.com) | Pay-as-you-go, no monthly minimum, good for low volume | +| [Webshare](https://www.webshare.io) | Cheap entry tier, has a small free plan to test with | +| [Decodo (formerly Smartproxy)](https://decodo.com) | Easy setup, country/city targeting | +| [SOAX](https://soax.com) | Residential + mobile, fine-grained geo-targeting | +| [Bright Data](https://brightdata.com) | Largest pool, most features, higher complexity/price | +| [Oxylabs](https://oxylabs.io) | Enterprise-grade, larger plans | + +This is not an endorsement; pick whatever fits your budget. For low-volume use like Fredy, a pay-as-you-go plan (e.g. IPRoyal) or a cheap entry tier (e.g. Webshare) is usually plenty. Make sure to select **Germany** as the proxy location and keep the search interval reasonable (the higher the interval, the less you look like a bot). + ## Analytics Fredy is completely free (and will always remain free). However, it would be a huge help if you’d allow me to collect some analytical data. 
diff --git a/lib/services/jobs/jobExecutionService.js b/lib/services/jobs/jobExecutionService.js index ca25dbf..ea7a02e 100644 --- a/lib/services/jobs/jobExecutionService.js +++ b/lib/services/jobs/jobExecutionService.js @@ -14,6 +14,7 @@ import * as similarityCache from '../similarity-check/similarityCache.js'; import { isRunning, markFinished, markRunning } from './run-state.js'; import { sendToUsers } from '../sse/sse-broker.js'; import * as puppeteerExtractor from '../extractor/puppeteerExtractor.js'; +import { getSettings } from '../storage/settingsStorage.js'; /** * Initializes the job execution service. @@ -160,6 +161,14 @@ export function initJobExecutionService({ providers, settings, intervalMs }) { } let browser; try { + // Read the proxy live (not from the startup snapshot) so changing it in the + // UI takes effect on the next run without a backend restart. An empty value + // disables the proxy. Routing the headless browser through a (German + // residential) proxy avoids datacenter-IP based bot detection on the + // Puppeteer-based providers (immowelt, immonet, kleinanzeigen, ...). + const liveSettings = await getSettings(); + const proxyUrl = typeof liveSettings?.proxyUrl === 'string' ? liveSettings.proxyUrl.trim() : ''; + const jobProviders = job.provider.filter( (p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null, ); @@ -175,7 +184,7 @@ export function initJobExecutionService({ providers, settings, intervalMs }) { } if (!browser && matchedProvider.config.getListings == null) { - browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {}); + browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, proxyUrl ? 
{ proxyUrl } : {}); } await new FredyPipelineExecutioner(matchedProvider.config, job, prov.id, similarityCache, browser).execute(); diff --git a/package.json b/package.json index 7ec524b..161d9bd 100755 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "fredy", - "version": "22.0.10", + "version": "22.1.0", "description": "[F]ind [R]eal [E]states [d]amn eas[y].", "scripts": { "prepare": "husky", @@ -62,9 +62,9 @@ "Firefox ESR" ], "dependencies": { - "@douyinfe/semi-icons": "^2.99.0", - "@douyinfe/semi-ui": "2.99.0", - "@douyinfe/semi-ui-19": "^2.99.0", + "@douyinfe/semi-icons": "^2.99.2", + "@douyinfe/semi-ui": "2.99.2", + "@douyinfe/semi-ui-19": "^2.99.2", "@fastify/cookie": "^11.0.2", "@fastify/helmet": "^13.0.2", "@fastify/session": "^11.1.1", diff --git a/test/services/extractor/puppeteerExtractor.test.js b/test/services/extractor/puppeteerExtractor.test.js new file mode 100644 index 0000000..966d6f9 --- /dev/null +++ b/test/services/extractor/puppeteerExtractor.test.js @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2026 by Christian Kellner. + * Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause + */ + +import { vi, describe, it, expect, beforeEach } from 'vitest'; + +// Mock the CloakBrowser launcher so no real Chromium binary is needed and we can +// assert which options get forwarded to it. 
+const { launchMock } = vi.hoisted(() => ({ launchMock: vi.fn() })); + +vi.mock('cloakbrowser/puppeteer', () => ({ + launch: launchMock, +})); + +const { launchBrowser } = await import('../../../lib/services/extractor/puppeteerExtractor.js'); + +describe('launchBrowser proxy forwarding', () => { + beforeEach(() => { + launchMock.mockReset(); + launchMock.mockResolvedValue({ close: async () => {} }); + }); + + it('forwards proxyUrl to CloakBrowser as the proxy option', async () => { + await launchBrowser('https://www.immowelt.de/', { proxyUrl: 'http://user:pass@host:8080' }); + + expect(launchMock).toHaveBeenCalledTimes(1); + expect(launchMock.mock.calls[0][0]).toMatchObject({ proxy: 'http://user:pass@host:8080' }); + }); + + it('does not set a proxy when no proxyUrl is given', async () => { + await launchBrowser('https://www.immowelt.de/', {}); + + expect(launchMock).toHaveBeenCalledTimes(1); + expect(launchMock.mock.calls[0][0].proxy).toBeUndefined(); + }); +}); diff --git a/test/services/jobs/jobExecutionService.test.js b/test/services/jobs/jobExecutionService.test.js index d61370d..ed48c65 100644 --- a/test/services/jobs/jobExecutionService.test.js +++ b/test/services/jobs/jobExecutionService.test.js @@ -18,6 +18,7 @@ describe('services/jobs/jobExecutionService', () => { const busPath = root + '/lib/services/events/event-bus.js'; const jobStoragePath = root + '/lib/services/storage/jobStorage.js'; const userStoragePath = root + '/lib/services/storage/userStorage.js'; + const settingsStoragePath = root + '/lib/services/storage/settingsStorage.js'; const brokerPath = root + '/lib/services/sse/sse-broker.js'; const utilsPath = root + '/lib/utils.js'; const loggerPath = root + '/lib/services/logger.js'; @@ -33,6 +34,9 @@ describe('services/jobs/jobExecutionService', () => { getUsers: () => state.users.slice(), getUser: (id) => state.users.find((u) => u.id === id) || null, })); + vi.doMock(settingsStoragePath, () => ({ + getSettings: async () => ({}), + })); 
vi.doMock(brokerPath, () => ({ sendToUsers: (...args) => calls.sent.push(args), })); diff --git a/ui/src/views/generalSettings/GeneralSettings.jsx b/ui/src/views/generalSettings/GeneralSettings.jsx index 65d35b7..b53fd21 100644 --- a/ui/src/views/generalSettings/GeneralSettings.jsx +++ b/ui/src/views/generalSettings/GeneralSettings.jsx @@ -57,6 +57,7 @@ const GeneralSettings = function GeneralSettings() { const currentUser = useSelector((state) => state.user.currentUser); const [interval, setInterval] = React.useState(''); + const [proxyUrl, setProxyUrl] = React.useState(''); const [port, setPort] = React.useState(''); const [workingHourFrom, setWorkingHourFrom] = React.useState(null); const [workingHourTo, setWorkingHourTo] = React.useState(null); @@ -91,6 +92,7 @@ const GeneralSettings = function GeneralSettings() { React.useEffect(() => { async function init() { setInterval(settings?.interval); + setProxyUrl(settings?.proxyUrl ?? ''); setPort(settings?.port); setWorkingHourFrom(settings?.workingHours?.from); setWorkingHourTo(settings?.workingHours?.to); @@ -133,6 +135,7 @@ const GeneralSettings = function GeneralSettings() { try { await xhrPost('/api/admin/generalSettings', { interval, + proxyUrl: proxyUrl?.trim() ?? '', port, workingHours: { from: workingHourFrom, @@ -376,6 +379,18 @@ const GeneralSettings = function GeneralSettings() { + + setProxyUrl(value)} + /> + +