more efficient bot protection

This commit is contained in:
orangecoding
2025-11-27 10:30:47 +01:00
parent 4aab850b4f
commit 22df683969
3 changed files with 389 additions and 58 deletions

View File

@@ -0,0 +1,274 @@
import { DEFAULT_HEADER } from './utils.js';
/**
 * Coerce a value to a base-10 integer.
 *
 * @param {*} v - Value to parse (numbers, numeric strings, anything `parseInt` accepts).
 * @param {*} d - Fallback returned when `v` does not parse to a finite number.
 * @returns {*} The parsed integer, or `d` when parsing fails.
 */
const toInt = (v, d) => {
  const parsed = Number.parseInt(v, 10);
  if (!Number.isFinite(parsed)) {
    return d;
  }
  return parsed;
};
/**
 * Compute pre-launch configuration and flags for Puppeteer with bot prevention in mind.
 * Returns language, user agent, viewport (with optional jitter), and additional launch args.
 *
 * @param {string} url - Absolute URL of the target page; its hostname builds the default Referer.
 * @param {object} [options]
 * @param {string} [options.acceptLanguage] - Accept-Language header value.
 * @param {string} [options.userAgent] - User agent string to present.
 * @param {{width?: number, height?: number, deviceScaleFactor?: number}} [options.viewport]
 * @param {boolean} [options.viewportJitter] - Pass `false` to disable the random 0..5 px offset.
 * @param {string} [options.referer] - Referer header; defaults to the root of the target host.
 * @param {string} [options.timezone] - Timezone id; defaults to 'Europe/Berlin'.
 * @param {boolean} [options.humanDelay] - Pass `false` to disable post-navigation human signals.
 * @returns {{acceptLanguage: string, langForFlag: string, userAgent: string,
 *   viewport: {width: number, height: number, deviceScaleFactor: number},
 *   windowSizeArg: string, langArg: string, extraArgs: string[], headers: object,
 *   timezone: string, humanDelay: boolean}}
 * @throws {TypeError} When `url` cannot be parsed by the URL constructor.
 */
export function getPreLaunchConfig(url, options = {}) {
  // An explicit `null` bypasses the default parameter, so normalise it here.
  const opts = options ?? {};
  const { hostname } = new URL(url);

  const acceptLanguage = opts.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
  // --lang takes a bare locale tag: keep the first entry and drop any ";q=" weight on it.
  const langForFlag = acceptLanguage.split(',')[0].split(';')[0].trim();

  const baseViewport = { width: 1366, height: 768, deviceScaleFactor: 1 };
  // One shared offset is applied to both dimensions.
  const jitter = opts.viewportJitter !== false ? Math.floor(Math.random() * 6) : 0; // 0..5 px
  const width = toInt(opts.viewport?.width, baseViewport.width) + jitter;
  const height = toInt(opts.viewport?.height, baseViewport.height) + jitter;
  // Device pixel ratios are frequently fractional (1.25, 1.5), so do not truncate to an integer.
  const requestedScale = Number.parseFloat(opts.viewport?.deviceScaleFactor);
  const deviceScaleFactor =
    Number.isFinite(requestedScale) && requestedScale >= 0 ? requestedScale : baseViewport.deviceScaleFactor;
  const viewport = { width, height, deviceScaleFactor };

  const userAgent =
    opts.userAgent ||
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';

  const windowSizeArg = `--window-size=${viewport.width},${viewport.height}`;
  const langArg = `--lang=${langForFlag}`;
  const extraArgs = [
    '--disable-blink-features=AutomationControlled',
    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--webrtc-ip-handling-policy=default_public_interface_only',
    '--proxy-bypass-list=<-loopback>',
  ];

  const headers = {
    ...DEFAULT_HEADER,
    'Accept-Language': acceptLanguage,
    'User-Agent': userAgent,
    Referer: opts.referer || `https://${hostname}/`,
    Connection: 'keep-alive',
    DNT: '1',
  };

  const timezone = opts.timezone || 'Europe/Berlin';

  return {
    acceptLanguage,
    langForFlag,
    userAgent,
    viewport,
    windowSizeArg,
    langArg,
    extraArgs,
    headers,
    timezone,
    humanDelay: opts.humanDelay !== false,
  };
}
/**
 * Apply bot-prevention hardening to a Puppeteer page.
 * Sets UA, viewport, JS enabled, headers, timezone and injects stealth-like patches.
 *
 * The injected patches run in every new document. Each patch is isolated, so a
 * failure in one (frozen property, missing API) does not prevent the others
 * from being applied.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyBotPreventionToPage(page, cfg) {
  await page.setUserAgent(cfg.userAgent);
  await page.setViewport(cfg.viewport);
  await page.setJavaScriptEnabled(true);
  await page.setExtraHTTPHeaders(cfg.headers);
  try {
    if (cfg.timezone) await page.emulateTimezone(cfg.timezone);
  } catch {
    // Timezone emulation is best effort; keep going without it.
  }

  // Inject patches as early as possible.
  // NOTE: this callback is serialised and executed inside the page, so it must
  // be self-contained and cannot reference anything from the enclosing module.
  await page.evaluateOnNewDocument(() => {
    // Run one patch in isolation so a failure never blocks the patches after it.
    const attempt = (patch) => {
      try {
        patch();
      } catch {
        // noop
      }
    };

    // webdriver
    attempt(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    });

    // chrome runtime
    attempt(() => {
      // @ts-ignore
      if (!window.chrome) {
        // @ts-ignore
        window.chrome = { runtime: {} };
      }
    });

    // languages
    attempt(() => {
      Object.defineProperty(navigator, 'languages', {
        get: () => {
          let stored = null;
          try {
            stored = window.localStorage.getItem('__LANGS__');
          } catch {
            // Storage can be blocked for the document; the getter must
            // never throw into page code, so fall back to the default below.
          }
          return (stored || 'de-DE,de').split(',');
        },
      });
    });

    // plugins
    attempt(() => {
      Object.defineProperty(navigator, 'plugins', {
        get: () => [{}, {}, {}],
      });
    });

    // platform and concurrency hints
    attempt(() => {
      Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
    });
    attempt(() => {
      if (typeof navigator.hardwareConcurrency === 'number' && navigator.hardwareConcurrency < 2) {
        Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 });
      }
    });
    attempt(() => {
      // @ts-ignore
      if (typeof navigator.deviceMemory === 'number' && navigator.deviceMemory < 2) {
        Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
      }
    });

    // userAgentData (Client Hints)
    attempt(() => {
      if (!('userAgentData' in navigator)) return;
      Object.defineProperty(navigator, 'userAgentData', {
        get: () => ({
          brands: [
            { brand: 'Chromium', version: '126' },
            { brand: 'Google Chrome', version: '126' },
          ],
          mobile: false,
          platform: 'Windows',
          getHighEntropyValues: async (hints) => {
            const values = {
              platform: 'Windows',
              platformVersion: '15.0.0',
              architecture: 'x86',
              model: '',
              uaFullVersion: '126.0.0.0',
              bitness: '64',
            };
            const out = {};
            // Only the hints that were asked for (and that we know) are returned.
            for (const k of hints || []) if (k in values) out[k] = values[k];
            return out;
          },
        }),
      });
    });

    // Permissions API: report the notification state from Notification.permission.
    attempt(() => {
      const origQuery = navigator.permissions && navigator.permissions.query;
      if (!origQuery) return;
      // @ts-ignore
      navigator.permissions.query = (parameters) =>
        origQuery.call(navigator.permissions, parameters).then((result) => {
          if (parameters && parameters.name === 'notifications') {
            Object.defineProperty(result, 'state', { get: () => Notification.permission });
          }
          return result;
        });
    });

    // WebGL vendor/renderer
    const patchWebGL = (proto) => {
      if (!proto || !proto.getParameter) return;
      const getParameter = proto.getParameter;
      proto.getParameter = function (param) {
        const UNMASKED_VENDOR_WEBGL = 0x9245;
        const UNMASKED_RENDERER_WEBGL = 0x9246;
        if (param === UNMASKED_VENDOR_WEBGL) return 'Google Inc.';
        if (param === UNMASKED_RENDERER_WEBGL)
          return 'ANGLE (NVIDIA, NVIDIA GeForce GTX 1660 Ti Direct3D11 vs_5_0 ps_5_0)';
        return getParameter.call(this, param);
      };
    };
    // Look the constructors up on `window`: optional chaining on a bare,
    // undeclared identifier would still throw a ReferenceError.
    // @ts-ignore
    attempt(() => patchWebGL(window.WebGLRenderingContext?.prototype));
    // @ts-ignore
    attempt(() => patchWebGL(window.WebGL2RenderingContext?.prototype));

    // AudioContext: oscillators always receive an explicit start time (0 when omitted).
    const patchAudio = (Ctx) => {
      if (!Ctx) return;
      const proto = Ctx.prototype;
      const createOsc = proto.createOscillator;
      proto.createOscillator = function () {
        const osc = createOsc.call(this);
        const start = osc.start;
        osc.start = function (when) {
          return start.call(this, when || 0);
        };
        return osc;
      };
    };
    // @ts-ignore
    attempt(() => patchAudio(window.AudioContext));
    // @ts-ignore
    attempt(() => patchAudio(window.OfflineAudioContext));

    // Navigator.connection
    attempt(() => {
      Object.defineProperty(navigator, 'connection', { get: () => undefined });
    });

    // Consistent outer sizes: derived from the inner size on every read, so the
    // reported values stay coherent when the viewport changes after load.
    attempt(() => {
      Object.defineProperty(window, 'outerWidth', { get: () => window.innerWidth + 16 });
    });
    attempt(() => {
      Object.defineProperty(window, 'outerHeight', { get: () => window.innerHeight + 88 });
    });
  });
}
/**
 * Persist languages value before navigation via localStorage.
 *
 * The Accept-Language value is reduced to a comma-separated list of language
 * tags: every entry is kept, with its ";q=" weight removed. It is written to
 * the `__LANGS__` localStorage key in each new document, so the value reported
 * to scripts lists the same languages as the Accept-Language header.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyLanguagePersistence(page, cfg) {
  // Cutting the header at the first ';' would drop every language listed after
  // the first weighted entry, so strip the weight from each entry instead.
  const languages = cfg.acceptLanguage
    .split(',')
    .map((entry) => entry.split(';')[0].trim())
    // '*' is a header-only wildcard and not a language tag.
    .filter((tag) => tag !== '' && tag !== '*')
    .join(',');

  await page.evaluateOnNewDocument((langs) => {
    try {
      window.localStorage.setItem('__LANGS__', langs);
    } catch {
      // noop: storage may be unavailable for this document
    }
  }, languages);
}
/**
 * Mimic a human visitor right after navigation: pause briefly, then move the
 * mouse to a random point in the central area of the viewport and scroll down.
 * Does nothing when `cfg.humanDelay` is falsy.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyPostNavigationHumanSignals(page, cfg) {
  if (cfg.humanDelay) {
    // Random integer in [0, span).
    const randomBelow = (span) => Math.floor(Math.random() * span);

    // Pause for 200..599 ms.
    const pauseMs = 200 + randomBelow(400);
    await new Promise((resolve) => {
      setTimeout(resolve, pauseMs);
    });

    try {
      const { width, height } = cfg.viewport;
      // Pick a coordinate between 30% and 70% of the given extent.
      const centralPoint = (extent) => Math.floor(extent * (0.3 + Math.random() * 0.4));
      const targetX = centralPoint(width);
      const targetY = centralPoint(height);
      const steps = 10 + randomBelow(10);
      await page.mouse.move(targetX, targetY, { steps });

      const deltaY = 100 + randomBelow(200);
      await page.mouse.wheel({ deltaY });
    } catch {
      // ignore if mouse is unavailable
    }
  }
}

View File

@@ -1,11 +1,16 @@
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
import { debug, botDetected } from './utils.js';
import {
getPreLaunchConfig,
applyBotPreventionToPage,
applyLanguagePersistence,
applyPostNavigationHumanSignals,
} from './botPrevention.js';
import logger from '../logger.js';
import fs from 'fs';
import os from 'os';
import path from 'path';
import { URL } from 'url';
puppeteer.use(StealthPlugin());
@@ -40,6 +45,11 @@ export default async function execute(url, waitForSelector, options) {
if (options?.proxyUrl) {
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
}
// Prepare bot prevention pre-launch config
const preCfg = getPreLaunchConfig(url, options || {});
launchArgs.push(preCfg.langArg);
launchArgs.push(preCfg.windowSizeArg);
launchArgs.push(...preCfg.extraArgs);
browser = await puppeteer.launch({
headless: options?.puppeteerHeadless ?? true,
@@ -50,58 +60,9 @@ export default async function execute(url, waitForSelector, options) {
});
page = await browser.newPage();
// Derive domain-specific defaults
const { hostname } = new URL(url);
// Set a realistic modern user agent unless provided
const userAgent =
options?.userAgent ||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
await page.setUserAgent(userAgent);
// Viewport and device scale for typical desktop
await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
// Extra HTTP headers with localized Accept-Language
const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
const headers = {
...DEFAULT_HEADER,
'Accept-Language': acceptLanguage,
'User-Agent': userAgent,
Referer: options?.referer || `https://${hostname}/`,
Connection: 'keep-alive',
DNT: '1',
};
await page.setExtraHTTPHeaders(headers);
// Timezone and locale tweaks to look German when needed
try {
const tz = options?.timezone || 'Europe/Berlin';
if (tz) await page.emulateTimezone(tz);
} catch {
//noop
}
// Harden navigator properties (stealth already covers many, but we ensure critical ones)
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
// Plugins and mimeTypes
// @ts-ignore
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
// @ts-ignore
Object.defineProperty(navigator, 'languages', {
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
});
});
await applyBotPreventionToPage(page, preCfg);
// Provide languages value before navigation
await page.evaluateOnNewDocument((langs) => {
try {
window.localStorage.setItem('__LANGS__', langs);
} catch {
//noop
}
}, acceptLanguage.split(';')[0]);
await applyLanguagePersistence(page, preCfg);
// Optional cookies
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
@@ -113,11 +74,8 @@ export default async function execute(url, waitForSelector, options) {
waitUntil: options?.waitUntil || 'domcontentloaded',
});
// Optionally wait a random small delay to mimic human rendering time
if (options?.humanDelay !== false) {
const delay = 200 + Math.floor(Math.random() * 400);
await new Promise((res) => setTimeout(res, delay));
}
// Optionally wait and add subtle human-like interactions
await applyPostNavigationHumanSignals(page, preCfg);
let pageSource;
// if we're extracting data from a SPA, we must wait for the selector

View File

@@ -0,0 +1,99 @@
import { describe, it } from 'mocha';
import { expect } from 'chai';
import {
getPreLaunchConfig,
applyBotPreventionToPage,
applyLanguagePersistence,
applyPostNavigationHumanSignals,
} from '../../../lib/services/extractor/botPrevention.js';
// Unit tests for the bot-prevention helpers. Puppeteer pages are replaced by
// plain objects that record every call made against them.
describe('botPrevention helper', () => {
  // The pre-launch config must echo the supplied options when jitter is off.
  it('getPreLaunchConfig builds deterministic values when jitter disabled', () => {
    const cfg = getPreLaunchConfig('https://example.com/some/path', {
      acceptLanguage: 'de-DE,de;q=0.9',
      userAgent: 'TestAgent/1.0',
      viewport: { width: 1200, height: 700, deviceScaleFactor: 2 },
      viewportJitter: false,
      referer: 'https://example.com/ref',
      timezone: 'Europe/Berlin',
    });
    const { headers, extraArgs } = cfg;

    expect(cfg.acceptLanguage).to.equal('de-DE,de;q=0.9');
    expect(cfg.langArg).to.equal('--lang=de-DE');
    expect(cfg.windowSizeArg).to.equal('--window-size=1200,700');
    expect(cfg.viewport).to.deep.equal({ width: 1200, height: 700, deviceScaleFactor: 2 });
    expect(cfg.userAgent).to.equal('TestAgent/1.0');

    expect(headers['Accept-Language']).to.equal('de-DE,de;q=0.9');
    expect(headers['User-Agent']).to.equal('TestAgent/1.0');
    expect(headers.Referer).to.equal('https://example.com/ref');

    expect(extraArgs).to.include('--disable-blink-features=AutomationControlled');
    expect(extraArgs).to.include('--proxy-bypass-list=<-loopback>');
  });

  // Every page setter must be invoked with the values taken from the config.
  it('applyBotPreventionToPage sets UA, viewport, headers and injects patches', async () => {
    const recorded = [];
    const page = {};
    const singleArgMethods = [
      'setUserAgent',
      'setViewport',
      'setJavaScriptEnabled',
      'setExtraHTTPHeaders',
      'emulateTimezone',
    ];
    for (const method of singleArgMethods) {
      page[method] = async (arg) => recorded.push([method, arg]);
    }
    // For the injected script only the argument type is recorded.
    page.evaluateOnNewDocument = async (fn) => recorded.push(['evaluateOnNewDocument', typeof fn]);

    const cfg = getPreLaunchConfig('https://example.org/', {
      userAgent: 'Foo/Bar',
      acceptLanguage: 'en-US,en',
      viewport: { width: 1000, height: 600, deviceScaleFactor: 1 },
      viewportJitter: false,
      timezone: 'UTC',
    });
    await applyBotPreventionToPage(page, cfg);

    const wasCalled = (predicate) => recorded.some(predicate);

    expect(recorded[0]).to.deep.equal(['setUserAgent', 'Foo/Bar']);
    expect(
      wasCalled(([name, arg]) => name === 'setViewport' && arg.width === 1000 && arg.height === 600),
    ).to.equal(true);
    expect(wasCalled(([name, arg]) => name === 'setJavaScriptEnabled' && arg === true)).to.equal(true);

    const headerCall = recorded.find(([name]) => name === 'setExtraHTTPHeaders');
    expect(headerCall).to.exist;
    const sentHeaders = headerCall[1];
    expect(sentHeaders['Accept-Language']).to.equal('en-US,en');
    expect(sentHeaders['User-Agent']).to.equal('Foo/Bar');

    expect(wasCalled(([name, arg]) => name === 'emulateTimezone' && arg === 'UTC')).to.equal(true);
    expect(wasCalled(([name, arg]) => name === 'evaluateOnNewDocument' && arg === 'function')).to.equal(true);
  });

  // The language list handed to the injected script carries no q-values.
  it('applyLanguagePersistence stores languages early', async () => {
    const recorded = [];
    const page = {
      evaluateOnNewDocument: async (fn, arg) => recorded.push(['evaluateOnNewDocument', typeof fn, arg]),
    };
    const cfg = getPreLaunchConfig('https://example.org/', {
      acceptLanguage: 'de-DE,de;q=0.9',
      viewportJitter: false,
    });

    await applyLanguagePersistence(page, cfg);

    const [method, fnType, storedLangs] = recorded[0];
    expect(method).to.equal('evaluateOnNewDocument');
    expect(fnType).to.equal('function');
    expect(storedLangs).to.equal('de-DE,de');
  });

  // With humanDelay enabled both a mouse move and a wheel scroll are issued.
  it('applyPostNavigationHumanSignals moves mouse and scrolls when enabled', async () => {
    const mouseCalls = [];
    const mouse = {
      move: async (x, y, opts) => mouseCalls.push(['move', x, y, opts && typeof opts.steps === 'number']),
      wheel: async (opts) => mouseCalls.push(['wheel', typeof opts.deltaY === 'number']),
    };
    const cfg = { humanDelay: true, viewport: { width: 1200, height: 800 } };

    await applyPostNavigationHumanSignals({ mouse }, cfg);

    const kinds = mouseCalls.map(([kind]) => kind);
    expect(kinds.includes('move')).to.equal(true);
    expect(kinds.includes('wheel')).to.equal(true);
  });
});