mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
more efficient bot protection
This commit is contained in:
274
lib/services/extractor/botPrevention.js
Normal file
274
lib/services/extractor/botPrevention.js
Normal file
@@ -0,0 +1,274 @@
|
||||
import { DEFAULT_HEADER } from './utils.js';
|
||||
|
||||
/**
 * Coerce a value to a base-10 integer, falling back to a default.
 *
 * @param {*} value - Anything `parseInt` accepts (numbers are truncated, strings are parsed).
 * @param {*} fallback - Returned unchanged when parsing does not yield a finite number.
 * @returns {*} The parsed integer, or `fallback`.
 */
const toInt = (value, fallback) => {
  const parsed = Number.parseInt(value, 10);
  // parseInt can yield NaN (unparseable) or Infinity (huge digit strings); both fall back.
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  return parsed;
};
|
||||
|
||||
/**
 * Compute pre-launch configuration and flags for Puppeteer with bot prevention in mind.
 * Returns language, user agent, viewport (with optional jitter), request headers,
 * timezone and additional launch args. Nothing is applied here; the result is plain data.
 *
 * @param {string} url - Target URL; its hostname is used for the default Referer. Must be parseable by `new URL`.
 * @param {object} [options]
 * @param {string} [options.acceptLanguage] - Accept-Language header value.
 * @param {string} [options.userAgent] - User agent string.
 * @param {{width?: number, height?: number, deviceScaleFactor?: number}} [options.viewport]
 * @param {boolean} [options.viewportJitter] - Pass `false` to disable the random 0..5 px size offset.
 * @param {string} [options.referer] - Referer header; defaults to `https://<hostname>/`.
 * @param {string} [options.timezone] - Timezone id; defaults to `Europe/Berlin`.
 * @param {boolean} [options.humanDelay] - Pass `false` to disable post-navigation human signals.
 * @returns {{acceptLanguage: string, langForFlag: string, userAgent: string,
 *   viewport: {width: number, height: number, deviceScaleFactor: number},
 *   windowSizeArg: string, langArg: string, extraArgs: string[], headers: object,
 *   timezone: string, humanDelay: boolean}}
 * @throws {TypeError} When `url` is not a valid URL.
 */
export function getPreLaunchConfig(url, options = {}) {
  // The default parameter only covers `undefined`; tolerate an explicit `null` as well.
  const opts = options ?? {};
  const { hostname } = new URL(url);

  const acceptLanguage = opts.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
  // The --lang flag takes a bare locale: use the first entry without any ";q=" weight.
  const langForFlag = acceptLanguage.split(',')[0].split(';')[0].trim();

  const baseViewport = { width: 1366, height: 768, deviceScaleFactor: 1 };
  // The same offset is added to width and height.
  const jitter = opts.viewportJitter !== false ? Math.floor(Math.random() * 6) : 0; // 0..5 px
  const width = toInt(opts.viewport?.width, baseViewport.width) + jitter;
  const height = toInt(opts.viewport?.height, baseViewport.height) + jitter;
  // Device scale factors may be fractional (e.g. 1.25, 1.5), so do not truncate to an integer.
  const requestedScale = Number.parseFloat(opts.viewport?.deviceScaleFactor);
  const deviceScaleFactor = Number.isFinite(requestedScale) ? requestedScale : baseViewport.deviceScaleFactor;
  const viewport = { width, height, deviceScaleFactor };

  const userAgent =
    opts.userAgent ||
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';

  // Keep the real window size in sync with the (possibly jittered) viewport.
  const windowSizeArg = `--window-size=${viewport.width},${viewport.height}`;
  const langArg = `--lang=${langForFlag}`;

  const extraArgs = [
    '--disable-blink-features=AutomationControlled',
    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--webrtc-ip-handling-policy=default_public_interface_only',
    '--proxy-bypass-list=<-loopback>',
  ];

  // Entries listed after the spread override same-named keys from DEFAULT_HEADER.
  const headers = {
    ...DEFAULT_HEADER,
    'Accept-Language': acceptLanguage,
    'User-Agent': userAgent,
    Referer: opts.referer || `https://${hostname}/`,
    Connection: 'keep-alive',
    DNT: '1',
  };

  const timezone = opts.timezone || 'Europe/Berlin';

  return {
    acceptLanguage,
    langForFlag,
    userAgent,
    viewport,
    windowSizeArg,
    langArg,
    extraArgs,
    headers,
    timezone,
    humanDelay: opts.humanDelay !== false,
  };
}
|
||||
|
||||
/**
 * Apply bot-prevention hardening to a Puppeteer page.
 *
 * Sets the user agent, viewport, extra HTTP headers and (best effort) the emulated timezone
 * from `cfg`, enables JavaScript, and registers a script via `evaluateOnNewDocument` that
 * overrides several fingerprinting surfaces (navigator properties, Client Hints, the
 * Permissions API, WebGL vendor/renderer, AudioContext oscillators, window outer sizes).
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyBotPreventionToPage(page, cfg) {
  await page.setUserAgent(cfg.userAgent);
  await page.setViewport(cfg.viewport);
  await page.setJavaScriptEnabled(true);
  await page.setExtraHTTPHeaders(cfg.headers);
  try {
    if (cfg.timezone) await page.emulateTimezone(cfg.timezone);
  } catch {
    // ignore timezone failures
  }

  // Inject patches as early as possible.
  // The callback is executed in the page, not in Node, so it only uses page globals
  // and must not reference anything from this module's scope.
  await page.evaluateOnNewDocument(() => {
    try {
      // webdriver
      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });

      // chrome runtime
      // @ts-ignore
      if (!window.chrome) {
        // @ts-ignore
        window.chrome = { runtime: {} };
      }

      // languages: read from the value stored under '__LANGS__', with a German fallback
      // @ts-ignore
      Object.defineProperty(navigator, 'languages', {
        get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
      });

      // plugins
      // @ts-ignore
      Object.defineProperty(navigator, 'plugins', {
        get: () => [{}, {}, {}],
      });

      // platform and concurrency hints
      // @ts-ignore
      Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
      // Only raise the reported values when the real ones are below 2.
      // @ts-ignore
      if (typeof navigator.hardwareConcurrency === 'number' && navigator.hardwareConcurrency < 2) {
        Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 });
      }
      // @ts-ignore
      if (typeof navigator.deviceMemory === 'number' && navigator.deviceMemory < 2) {
        Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
      }

      // userAgentData (Client Hints)
      try {
        // @ts-ignore
        if ('userAgentData' in navigator) {
          // @ts-ignore
          Object.defineProperty(navigator, 'userAgentData', {
            get: () => ({
              brands: [
                { brand: 'Chromium', version: '126' },
                { brand: 'Google Chrome', version: '126' },
              ],
              mobile: false,
              platform: 'Windows',
              // Resolves with only the requested hints that are known here.
              getHighEntropyValues: async (hints) => {
                const values = {
                  platform: 'Windows',
                  platformVersion: '15.0.0',
                  architecture: 'x86',
                  model: '',
                  uaFullVersion: '126.0.0.0',
                  bitness: '64',
                };
                const out = {};
                for (const k of hints || []) if (k in values) out[k] = values[k];
                return out;
              },
            }),
          });
        }
      } catch {
        //noop
      }

      // Permissions API: make the 'notifications' state mirror Notification.permission
      const origQuery = navigator.permissions && navigator.permissions.query;
      if (origQuery) {
        // @ts-ignore
        navigator.permissions.query = (parameters) =>
          origQuery.call(navigator.permissions, parameters).then((result) => {
            if (parameters && parameters.name === 'notifications') {
              Object.defineProperty(result, 'state', { get: () => Notification.permission });
            }
            return result;
          });
      }

      // WebGL vendor/renderer
      const patchWebGL = (proto) => {
        if (!proto || !proto.getParameter) return;
        const getParameter = proto.getParameter;
        // @ts-ignore
        proto.getParameter = function (param) {
          const UNMASKED_VENDOR_WEBGL = 0x9245;
          const UNMASKED_RENDERER_WEBGL = 0x9246;
          if (param === UNMASKED_VENDOR_WEBGL) return 'Google Inc.';
          if (param === UNMASKED_RENDERER_WEBGL)
            return 'ANGLE (NVIDIA, NVIDIA GeForce GTX 1660 Ti Direct3D11 vs_5_0 ps_5_0)';
          // Every other parameter is answered by the original implementation.
          return getParameter.call(this, param);
        };
      };
      // Read the constructors through `window`: `?.` on a bare, undeclared identifier still
      // throws a ReferenceError, which would abort every patch below via the outer catch.
      // @ts-ignore
      patchWebGL(window.WebGLRenderingContext?.prototype);
      // @ts-ignore
      patchWebGL(window.WebGL2RenderingContext?.prototype);

      // AudioContext: oscillators created through the patched factory default `when` to 0 in start()
      const patchAudio = (Ctx) => {
        try {
          if (!Ctx) return;
          const proto = Ctx.prototype;
          const createOsc = proto.createOscillator;
          proto.createOscillator = function () {
            const osc = createOsc.call(this);
            const start = osc.start;
            osc.start = function (when) {
              return start.call(this, when || 0);
            };
            return osc;
          };
        } catch {
          //noop
        }
      };
      // @ts-ignore
      patchAudio(window.AudioContext);
      // @ts-ignore
      patchAudio(window.OfflineAudioContext);

      // Navigator.connection
      try {
        // @ts-ignore
        Object.defineProperty(navigator, 'connection', { get: () => undefined });
      } catch {
        //noop
      }

      // Consistent outer sizes: derived once from the inner size at injection time
      try {
        const calcOuter = () => {
          const w = window.innerWidth + 16;
          const h = window.innerHeight + 88;
          return { w, h };
        };
        const { w: outerW, h: outerH } = calcOuter();
        // @ts-ignore
        Object.defineProperty(window, 'outerWidth', { get: () => outerW });
        // @ts-ignore
        Object.defineProperty(window, 'outerHeight', { get: () => outerH });
      } catch {
        //noop
      }
    } catch {
      //noop
    }
  });
}
|
||||
|
||||
/**
 * Persist languages value before navigation via localStorage.
 *
 * Converts `cfg.acceptLanguage` into a comma-separated list of language tags (weights such
 * as `;q=0.9` and the `*` wildcard removed) and registers a new-document script that stores
 * it under the `__LANGS__` localStorage key.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyLanguagePersistence(page, cfg) {
  // Strip the weight from every entry rather than cutting the header at the first ';',
  // which dropped every language listed after the first weighted one.
  const languageList = cfg.acceptLanguage
    .split(',')
    .map((entry) => entry.split(';')[0].trim())
    .filter((tag) => tag !== '' && tag !== '*')
    .join(',');

  await page.evaluateOnNewDocument((langs) => {
    try {
      window.localStorage.setItem('__LANGS__', langs);
    } catch {
      // noop: localStorage may be unavailable for the current document
    }
  }, languageList);
}
|
||||
|
||||
/**
 * Perform subtle human-like interactions post navigation.
 *
 * When `cfg.humanDelay` is truthy: pauses for a random 200..599 ms, then moves the mouse
 * to a random point in the central region of the viewport and scrolls the wheel down by
 * a random 100..299 px. Any error from the viewport lookup or mouse calls is ignored.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyPostNavigationHumanSignals(page, cfg) {
  if (cfg.humanDelay) {
    const pauseMs = Math.floor(Math.random() * 400) + 200;
    await new Promise((resolve) => {
      setTimeout(resolve, pauseMs);
    });

    try {
      const { width, height } = cfg.viewport;
      // Target somewhere between 30% and 70% of each viewport dimension.
      const targetX = Math.floor(width * (Math.random() * 0.4 + 0.3));
      const targetY = Math.floor(height * (Math.random() * 0.4 + 0.3));
      const steps = Math.floor(Math.random() * 10) + 10;
      await page.mouse.move(targetX, targetY, { steps });

      const deltaY = Math.floor(Math.random() * 200) + 100;
      await page.mouse.wheel({ deltaY });
    } catch {
      // ignore if mouse is unavailable
    }
  }
}
|
||||
@@ -1,11 +1,16 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
|
||||
import { debug, botDetected } from './utils.js';
|
||||
import {
|
||||
getPreLaunchConfig,
|
||||
applyBotPreventionToPage,
|
||||
applyLanguagePersistence,
|
||||
applyPostNavigationHumanSignals,
|
||||
} from './botPrevention.js';
|
||||
import logger from '../logger.js';
|
||||
import fs from 'fs';
|
||||
import os from 'os';
|
||||
import path from 'path';
|
||||
import { URL } from 'url';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
@@ -40,6 +45,11 @@ export default async function execute(url, waitForSelector, options) {
|
||||
if (options?.proxyUrl) {
|
||||
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
|
||||
}
|
||||
// Prepare bot prevention pre-launch config
|
||||
const preCfg = getPreLaunchConfig(url, options || {});
|
||||
launchArgs.push(preCfg.langArg);
|
||||
launchArgs.push(preCfg.windowSizeArg);
|
||||
launchArgs.push(...preCfg.extraArgs);
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: options?.puppeteerHeadless ?? true,
|
||||
@@ -50,58 +60,9 @@ export default async function execute(url, waitForSelector, options) {
|
||||
});
|
||||
|
||||
page = await browser.newPage();
|
||||
|
||||
// Derive domain-specific defaults
|
||||
const { hostname } = new URL(url);
|
||||
|
||||
// Set a realistic modern user agent unless provided
|
||||
const userAgent =
|
||||
options?.userAgent ||
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
|
||||
await page.setUserAgent(userAgent);
|
||||
|
||||
// Viewport and device scale for typical desktop
|
||||
await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
|
||||
|
||||
// Extra HTTP headers with localized Accept-Language
|
||||
const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
|
||||
const headers = {
|
||||
...DEFAULT_HEADER,
|
||||
'Accept-Language': acceptLanguage,
|
||||
'User-Agent': userAgent,
|
||||
Referer: options?.referer || `https://${hostname}/`,
|
||||
Connection: 'keep-alive',
|
||||
DNT: '1',
|
||||
};
|
||||
await page.setExtraHTTPHeaders(headers);
|
||||
|
||||
// Timezone and locale tweaks to look German when needed
|
||||
try {
|
||||
const tz = options?.timezone || 'Europe/Berlin';
|
||||
if (tz) await page.emulateTimezone(tz);
|
||||
} catch {
|
||||
//noop
|
||||
}
|
||||
|
||||
// Harden navigator properties (stealth already covers many, but we ensure critical ones)
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
||||
// Plugins and mimeTypes
|
||||
// @ts-ignore
|
||||
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
|
||||
// @ts-ignore
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
|
||||
});
|
||||
});
|
||||
await applyBotPreventionToPage(page, preCfg);
|
||||
// Provide languages value before navigation
|
||||
await page.evaluateOnNewDocument((langs) => {
|
||||
try {
|
||||
window.localStorage.setItem('__LANGS__', langs);
|
||||
} catch {
|
||||
//noop
|
||||
}
|
||||
}, acceptLanguage.split(';')[0]);
|
||||
await applyLanguagePersistence(page, preCfg);
|
||||
|
||||
// Optional cookies
|
||||
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
|
||||
@@ -113,11 +74,8 @@ export default async function execute(url, waitForSelector, options) {
|
||||
waitUntil: options?.waitUntil || 'domcontentloaded',
|
||||
});
|
||||
|
||||
// Optionally wait a random small delay to mimic human rendering time
|
||||
if (options?.humanDelay !== false) {
|
||||
const delay = 200 + Math.floor(Math.random() * 400);
|
||||
await new Promise((res) => setTimeout(res, delay));
|
||||
}
|
||||
// Optionally wait and add subtle human-like interactions
|
||||
await applyPostNavigationHumanSignals(page, preCfg);
|
||||
|
||||
let pageSource;
|
||||
// if we're extracting data from a SPA, we must wait for the selector
|
||||
|
||||
99
test/services/extractor/botPrevention.test.js
Normal file
99
test/services/extractor/botPrevention.test.js
Normal file
@@ -0,0 +1,99 @@
|
||||
import { describe, it } from 'mocha';
|
||||
import { expect } from 'chai';
|
||||
|
||||
import {
|
||||
getPreLaunchConfig,
|
||||
applyBotPreventionToPage,
|
||||
applyLanguagePersistence,
|
||||
applyPostNavigationHumanSignals,
|
||||
} from '../../../lib/services/extractor/botPrevention.js';
|
||||
|
||||
// Unit tests for the bot-prevention helpers. Puppeteer pages are replaced by plain
// objects whose methods record how they were called.
describe('botPrevention helper', () => {
  it('getPreLaunchConfig builds deterministic values when jitter disabled', () => {
    const cfg = getPreLaunchConfig('https://example.com/some/path', {
      acceptLanguage: 'de-DE,de;q=0.9',
      userAgent: 'TestAgent/1.0',
      viewport: { width: 1200, height: 700, deviceScaleFactor: 2 },
      viewportJitter: false,
      referer: 'https://example.com/ref',
      timezone: 'Europe/Berlin',
    });
    const { headers, extraArgs } = cfg;

    // Top-level values
    expect(cfg.acceptLanguage).to.equal('de-DE,de;q=0.9');
    expect(cfg.langArg).to.equal('--lang=de-DE');
    expect(cfg.windowSizeArg).to.equal('--window-size=1200,700');
    expect(cfg.viewport).to.deep.equal({ width: 1200, height: 700, deviceScaleFactor: 2 });
    expect(cfg.userAgent).to.equal('TestAgent/1.0');

    // Request headers
    expect(headers['Accept-Language']).to.equal('de-DE,de;q=0.9');
    expect(headers['User-Agent']).to.equal('TestAgent/1.0');
    expect(headers.Referer).to.equal('https://example.com/ref');

    // Launch flags
    expect(extraArgs).to.include('--disable-blink-features=AutomationControlled');
    expect(extraArgs).to.include('--proxy-bypass-list=<-loopback>');
  });

  it('applyBotPreventionToPage sets UA, viewport, headers and injects patches', async () => {
    const log = [];
    const record = (name) => async (value) => log.push([name, value]);
    const wasCalled = (name, matches) => log.some(([method, value]) => method === name && matches(value));

    const page = {
      setUserAgent: record('setUserAgent'),
      setViewport: record('setViewport'),
      setJavaScriptEnabled: record('setJavaScriptEnabled'),
      setExtraHTTPHeaders: record('setExtraHTTPHeaders'),
      emulateTimezone: record('emulateTimezone'),
      // Only the kind of the injected argument is recorded, not the function itself.
      evaluateOnNewDocument: async (fn) => log.push(['evaluateOnNewDocument', typeof fn]),
    };
    const cfg = getPreLaunchConfig('https://example.org/', {
      userAgent: 'Foo/Bar',
      acceptLanguage: 'en-US,en',
      viewport: { width: 1000, height: 600, deviceScaleFactor: 1 },
      viewportJitter: false,
      timezone: 'UTC',
    });

    await applyBotPreventionToPage(page, cfg);

    expect(log[0]).to.deep.equal(['setUserAgent', 'Foo/Bar']);
    expect(wasCalled('setViewport', (vp) => vp.width === 1000 && vp.height === 600)).to.equal(true);
    expect(wasCalled('setJavaScriptEnabled', (on) => on === true)).to.equal(true);

    const headerCall = log.find(([method]) => method === 'setExtraHTTPHeaders');
    expect(headerCall).to.exist;
    const sentHeaders = headerCall[1];
    expect(sentHeaders['Accept-Language']).to.equal('en-US,en');
    expect(sentHeaders['User-Agent']).to.equal('Foo/Bar');

    expect(wasCalled('emulateTimezone', (tz) => tz === 'UTC')).to.equal(true);
    expect(wasCalled('evaluateOnNewDocument', (kind) => kind === 'function')).to.equal(true);
  });

  it('applyLanguagePersistence stores languages early', async () => {
    const log = [];
    const page = {
      evaluateOnNewDocument: async (fn, arg) => log.push(['evaluateOnNewDocument', typeof fn, arg]),
    };
    const cfg = getPreLaunchConfig('https://example.org/', {
      acceptLanguage: 'de-DE,de;q=0.9',
      viewportJitter: false,
    });

    await applyLanguagePersistence(page, cfg);

    const [method, fnKind, storedLangs] = log[0];
    expect(method).to.equal('evaluateOnNewDocument');
    expect(fnKind).to.equal('function');
    expect(storedLangs).to.equal('de-DE,de');
  });

  it('applyPostNavigationHumanSignals moves mouse and scrolls when enabled', async () => {
    const mouseLog = [];
    const page = {
      mouse: {
        move: async (x, y, opts) => mouseLog.push(['move', x, y, opts && typeof opts.steps === 'number']),
        wheel: async (opts) => mouseLog.push(['wheel', typeof opts.deltaY === 'number']),
      },
    };
    const cfg = {
      humanDelay: true,
      viewport: { width: 1200, height: 800 },
    };

    await applyPostNavigationHumanSignals(page, cfg);

    const actions = mouseLog.map(([action]) => action);
    expect(actions.includes('move')).to.equal(true);
    expect(actions.includes('wheel')).to.equal(true);
  });
});
|
||||
Reference in New Issue
Block a user