merged master

This commit is contained in:
orangecoding
2025-11-27 15:58:12 +01:00
12 changed files with 855 additions and 324 deletions

View File

@@ -0,0 +1,57 @@
import { markdown2Html } from '../../services/markdown.js';
const mapListing = (listing) => ({
address: listing.address,
description: listing.description,
id: listing.id,
imageUrl: listing.image,
price: listing.price,
size: listing.size,
title: listing.title,
url: listing.link,
});
export const send = ({ serviceName, newListings, notificationConfig, jobKey }) => {
const { authToken, endpointUrl } = notificationConfig.find((a) => a.id === config.id).fields;
const listings = newListings.map(mapListing);
const body = {
jobId: jobKey,
timestamp: new Date().toISOString(),
provider: serviceName,
listings,
};
const headers = {
'Content-Type': 'application/json',
};
if (authToken != null) {
headers['Authorization'] = `Bearer ${authToken}`;
}
return fetch(endpointUrl, {
method: 'POST',
headers: headers,
body: JSON.stringify(body),
});
};
export const config = {
id: 'http',
name: 'HTTP',
readme: markdown2Html('lib/notification/adapter/http.md'),
description: 'Fredy will send a generic HTTP POST request.',
fields: {
endpointUrl: {
description: "Your application's endpoint URL.",
label: 'Endpoint URL',
type: 'text',
},
authToken: {
description: "Your application's auth token, if required by your endpoint.",
label: 'Auth token (optional)',
optional: true,
type: 'text',
},
},
};

View File

@@ -0,0 +1,43 @@
### HTTP Adapter
This is a generic adapter for sending notifications via HTTP requests.
You can leverage this adapter to integrate with various webhooks or APIs that accept HTTP requests. (e.g. Supabase
Functions, a Node.js server, etc.)
HTTP adapter supports a `authToken` field, which can be used to include an authorization token in the request headers.
Your token would be included as a Bearer token in the `Authorization` header, which is a common method for securing API requests.
Request Details:
<details>
Request Method: POST
Headers:
```
Content Type: `application/json`
Authorization: Bearer {your-optional-auth-token}
```
Body:
```json
{
"jobId": "mg1waX4RHmIzL5NDYtYp-",
"provider": "immoscout",
"timestamp": "2024-06-15T12:34:56Z",
"listings": [
{
"address": "Str. 123, Bielefeld, Germany",
"description": "Möbliert: Einziehen & wohlfühlen: Neu möbliert.",
"id": "123456789",
"imageUrl": "https://<target-url>.com/listings/123456789.jpg",
"price": "1.240 €",
"size": "38 m²",
"title": "Schöne 1-Zimmer-Wohnung in Bielefeld",
"url": "https://<target-url>.com/listings/123456789"
}
]
}
```
</details>

View File

@@ -0,0 +1,274 @@
import { DEFAULT_HEADER } from './utils.js';
// Helper to safely coerce numbers
const toInt = (v, d) => {
const n = parseInt(v, 10);
return Number.isFinite(n) ? n : d;
};
/**
* Compute pre-launch configuration and flags for Puppeteer with bot prevention in mind.
* Returns language, user agent, viewport (with optional jitter), and additional launch args.
*
* @param {string} url
* @param {object} [options]
*/
export function getPreLaunchConfig(url, options = {}) {
const { hostname } = new URL(url);
const acceptLanguage = options.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
const langForFlag = acceptLanguage.split(',')[0];
const baseViewport = { width: 1366, height: 768, deviceScaleFactor: 1 };
const jitter = options.viewportJitter !== false ? Math.floor(Math.random() * 6) : 0; // 0..5 px
const width = toInt(options?.viewport?.width, baseViewport.width) + jitter;
const height = toInt(options?.viewport?.height, baseViewport.height) + jitter;
const deviceScaleFactor = toInt(options?.viewport?.deviceScaleFactor, baseViewport.deviceScaleFactor);
const viewport = { width, height, deviceScaleFactor };
const userAgent =
options.userAgent ||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
const windowSizeArg = `--window-size=${viewport.width},${viewport.height}`;
const langArg = `--lang=${langForFlag}`;
const extraArgs = [
'--disable-blink-features=AutomationControlled',
'--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
'--webrtc-ip-handling-policy=default_public_interface_only',
'--proxy-bypass-list=<-loopback>',
];
const headers = {
...DEFAULT_HEADER,
'Accept-Language': acceptLanguage,
'User-Agent': userAgent,
Referer: options?.referer || `https://${hostname}/`,
Connection: 'keep-alive',
DNT: '1',
};
const timezone = options?.timezone || 'Europe/Berlin';
return {
acceptLanguage,
langForFlag,
userAgent,
viewport,
windowSizeArg,
langArg,
extraArgs,
headers,
timezone,
humanDelay: options?.humanDelay !== false,
};
}
/**
* Apply bot-prevention hardening to a Puppeteer page.
* Sets UA, viewport, JS enabled, headers, timezone and injects stealth-like patches.
*
* @param {import('puppeteer').Page} page
* @param {ReturnType<typeof getPreLaunchConfig>} cfg
*/
export async function applyBotPreventionToPage(page, cfg) {
await page.setUserAgent(cfg.userAgent);
await page.setViewport(cfg.viewport);
await page.setJavaScriptEnabled(true);
await page.setExtraHTTPHeaders(cfg.headers);
try {
if (cfg.timezone) await page.emulateTimezone(cfg.timezone);
} catch {
// ignore timezone failures
}
// Inject patches as early as possible
await page.evaluateOnNewDocument(() => {
try {
// webdriver
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
// chrome runtime
// @ts-ignore
if (!window.chrome) {
// @ts-ignore
window.chrome = { runtime: {} };
}
// languages
// @ts-ignore
Object.defineProperty(navigator, 'languages', {
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
});
// plugins
// @ts-ignore
Object.defineProperty(navigator, 'plugins', {
get: () => [{}, {}, {}],
});
// platform and concurrency hints
// @ts-ignore
Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
// @ts-ignore
if (typeof navigator.hardwareConcurrency === 'number' && navigator.hardwareConcurrency < 2) {
Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 });
}
// @ts-ignore
if (typeof navigator.deviceMemory === 'number' && navigator.deviceMemory < 2) {
Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
}
// userAgentData (Client Hints)
try {
// @ts-ignore
if ('userAgentData' in navigator) {
// @ts-ignore
Object.defineProperty(navigator, 'userAgentData', {
get: () => ({
brands: [
{ brand: 'Chromium', version: '126' },
{ brand: 'Google Chrome', version: '126' },
],
mobile: false,
platform: 'Windows',
getHighEntropyValues: async (hints) => {
const values = {
platform: 'Windows',
platformVersion: '15.0.0',
architecture: 'x86',
model: '',
uaFullVersion: '126.0.0.0',
bitness: '64',
};
const out = {};
for (const k of hints || []) if (k in values) out[k] = values[k];
return out;
},
}),
});
}
} catch {
//noop
}
// Permissions API
const origQuery = navigator.permissions && navigator.permissions.query;
if (origQuery) {
// @ts-ignore
navigator.permissions.query = (parameters) =>
origQuery.call(navigator.permissions, parameters).then((result) => {
if (parameters && parameters.name === 'notifications') {
Object.defineProperty(result, 'state', { get: () => Notification.permission });
}
return result;
});
}
// WebGL vendor/renderer
const patchWebGL = (proto) => {
if (!proto || !proto.getParameter) return;
const getParameter = proto.getParameter;
// @ts-ignore
proto.getParameter = function (param) {
const UNMASKED_VENDOR_WEBGL = 0x9245;
const UNMASKED_RENDERER_WEBGL = 0x9246;
if (param === UNMASKED_VENDOR_WEBGL) return 'Google Inc.';
if (param === UNMASKED_RENDERER_WEBGL)
return 'ANGLE (NVIDIA, NVIDIA GeForce GTX 1660 Ti Direct3D11 vs_5_0 ps_5_0)';
return getParameter.call(this, param);
};
};
// @ts-ignore
patchWebGL(WebGLRenderingContext?.prototype);
// @ts-ignore
patchWebGL(WebGL2RenderingContext?.prototype);
// AudioContext timestamp rounding consistency
const patchAudio = (Ctx) => {
try {
if (!Ctx) return;
const proto = Ctx.prototype;
const createOsc = proto.createOscillator;
proto.createOscillator = function () {
const osc = createOsc.call(this);
const start = osc.start;
osc.start = function (when) {
return start.call(this, when || 0);
};
return osc;
};
} catch {
//noop
}
};
// @ts-ignore
patchAudio(window.AudioContext);
// @ts-ignore
patchAudio(window.OfflineAudioContext);
// Navigator.connection
try {
// @ts-ignore
Object.defineProperty(navigator, 'connection', { get: () => undefined });
} catch {
//noop
}
// Consistent outer sizes
try {
const calcOuter = () => {
const w = window.innerWidth + 16;
const h = window.innerHeight + 88;
return { w, h };
};
const { w: outerW, h: outerH } = calcOuter();
// @ts-ignore
Object.defineProperty(window, 'outerWidth', { get: () => outerW });
// @ts-ignore
Object.defineProperty(window, 'outerHeight', { get: () => outerH });
} catch {
//noop
}
} catch {
//noop
}
});
}
/**
* Persist languages value before navigation via localStorage.
* @param {import('puppeteer').Page} page
* @param {ReturnType<typeof getPreLaunchConfig>} cfg
*/
export async function applyLanguagePersistence(page, cfg) {
await page.evaluateOnNewDocument((langs) => {
try {
window.localStorage.setItem('__LANGS__', langs);
} catch {
// noop
}
}, cfg.acceptLanguage.split(';')[0]);
}
/**
* Perform subtle human-like interactions post navigation.
* @param {import('puppeteer').Page} page
* @param {ReturnType<typeof getPreLaunchConfig>} cfg
*/
export async function applyPostNavigationHumanSignals(page, cfg) {
if (!cfg.humanDelay) return;
const delay = 200 + Math.floor(Math.random() * 400);
await new Promise((res) => setTimeout(res, delay));
try {
const vw = cfg.viewport.width;
const vh = cfg.viewport.height;
const mx = Math.floor(vw * (0.3 + Math.random() * 0.4));
const my = Math.floor(vh * (0.3 + Math.random() * 0.4));
await page.mouse.move(mx, my, { steps: 10 + Math.floor(Math.random() * 10) });
await page.mouse.wheel({ deltaY: 100 + Math.floor(Math.random() * 200) });
} catch {
// ignore if mouse is unavailable
}
}

View File

@@ -1,11 +1,16 @@
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
import { debug, botDetected } from './utils.js';
import {
getPreLaunchConfig,
applyBotPreventionToPage,
applyLanguagePersistence,
applyPostNavigationHumanSignals,
} from './botPrevention.js';
import logger from '../logger.js';
import fs from 'fs';
import os from 'os';
import path from 'path';
import { URL } from 'url';
puppeteer.use(StealthPlugin());
@@ -40,6 +45,11 @@ export default async function execute(url, waitForSelector, options) {
if (options?.proxyUrl) {
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
}
// Prepare bot prevention pre-launch config
const preCfg = getPreLaunchConfig(url, options || {});
launchArgs.push(preCfg.langArg);
launchArgs.push(preCfg.windowSizeArg);
launchArgs.push(...preCfg.extraArgs);
browser = await puppeteer.launch({
headless: options?.puppeteerHeadless ?? true,
@@ -50,58 +60,9 @@ export default async function execute(url, waitForSelector, options) {
});
page = await browser.newPage();
// Derive domain-specific defaults
const { hostname } = new URL(url);
// Set a realistic modern user agent unless provided
const userAgent =
options?.userAgent ||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
await page.setUserAgent(userAgent);
// Viewport and device scale for typical desktop
await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
// Extra HTTP headers with localized Accept-Language
const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
const headers = {
...DEFAULT_HEADER,
'Accept-Language': acceptLanguage,
'User-Agent': userAgent,
Referer: options?.referer || `https://${hostname}/`,
Connection: 'keep-alive',
DNT: '1',
};
await page.setExtraHTTPHeaders(headers);
// Timezone and locale tweaks to look German when needed
try {
const tz = options?.timezone || 'Europe/Berlin';
if (tz) await page.emulateTimezone(tz);
} catch {
//noop
}
// Harden navigator properties (stealth already covers many, but we ensure critical ones)
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
// Plugins and mimeTypes
// @ts-ignore
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
// @ts-ignore
Object.defineProperty(navigator, 'languages', {
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
});
});
await applyBotPreventionToPage(page, preCfg);
// Provide languages value before navigation
await page.evaluateOnNewDocument((langs) => {
try {
window.localStorage.setItem('__LANGS__', langs);
} catch {
//noop
}
}, acceptLanguage.split(';')[0]);
await applyLanguagePersistence(page, preCfg);
// Optional cookies
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
@@ -113,11 +74,8 @@ export default async function execute(url, waitForSelector, options) {
waitUntil: options?.waitUntil || 'domcontentloaded',
});
// Optionally wait a random small delay to mimic human rendering time
if (options?.humanDelay !== false) {
const delay = 200 + Math.floor(Math.random() * 400);
await new Promise((res) => setTimeout(res, delay));
}
// Optionally wait and add subtle human-like interactions
await applyPostNavigationHumanSignals(page, preCfg);
let pageSource;
// if we're extracting data from a SPA, we must wait for the selector