diff --git a/lib/services/extractor/botPrevention.js b/lib/services/extractor/botPrevention.js
new file mode 100644
index 0000000..3990405
--- /dev/null
+++ b/lib/services/extractor/botPrevention.js
@@ -0,0 +1,325 @@
+import { DEFAULT_HEADER } from './utils.js';
+
+/**
+ * Coerce a value to an integer.
+ *
+ * @param {unknown} v value to parse (base 10)
+ * @param {number} d fallback used when `v` does not parse to a finite number
+ * @returns {number}
+ */
+const toInt = (v, d) => {
+  const n = Number.parseInt(v, 10);
+  return Number.isFinite(n) ? n : d;
+};
+
+/**
+ * Coerce a value to a positive finite number, keeping fractions.
+ * Used for `deviceScaleFactor`, where values such as 1.25 or 1.5 are valid and
+ * would be truncated by `toInt`.
+ *
+ * @param {unknown} v value to parse
+ * @param {number} d fallback used when `v` is not a positive finite number
+ * @returns {number}
+ */
+const toPositiveNumber = (v, d) => {
+  const n = Number.parseFloat(v);
+  return Number.isFinite(n) && n > 0 ? n : d;
+};
+
+/**
+ * Extract the bare language tags from an Accept-Language value, dropping the
+ * `;q=` weights: `de-DE,de;q=0.9,en;q=0.5` -> `['de-DE', 'de', 'en']`.
+ *
+ * @param {string} [acceptLanguage]
+ * @returns {string[]}
+ */
+const parseLanguageTags = (acceptLanguage) =>
+  String(acceptLanguage ?? '')
+    .split(',')
+    .map((part) => part.split(';')[0].trim())
+    .filter(Boolean);
+
+/**
+ * Compute pre-launch configuration and flags for Puppeteer with bot prevention in mind.
+ * Returns language, user agent, viewport (with optional jitter), and additional launch args.
+ *
+ * @param {string} url target URL; its hostname is used for the default Referer
+ * @param {object} [options]
+ * @param {string} [options.acceptLanguage] Accept-Language header value
+ * @param {string} [options.userAgent] user agent override
+ * @param {{width?: number, height?: number, deviceScaleFactor?: number}} [options.viewport]
+ * @param {boolean} [options.viewportJitter] pass `false` to disable the 0..5 px jitter
+ * @param {string} [options.referer] Referer override
+ * @param {string} [options.timezone] timezone id, defaults to Europe/Berlin
+ * @param {boolean} [options.humanDelay] pass `false` to disable post-navigation signals
+ * @returns {object} configuration consumed by the `apply*` helpers of this module
+ * @throws {TypeError} when `url` cannot be parsed by `new URL`
+ */
+export function getPreLaunchConfig(url, options = {}) {
+  // The default parameter only covers `undefined`; tolerate an explicit `null` as well.
+  const opts = options ?? {};
+  const { hostname } = new URL(url);
+
+  const acceptLanguage = opts.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
+  // --lang takes one bare tag, so a `;q=` weight on the first entry must not leak into it.
+  const langForFlag = parseLanguageTags(acceptLanguage)[0] || 'de-DE';
+
+  const baseViewport = { width: 1366, height: 768, deviceScaleFactor: 1 };
+  const requested = opts.viewport ?? {};
+  const jitter = opts.viewportJitter !== false ? Math.floor(Math.random() * 6) : 0; // 0..5 px
+  const viewport = {
+    width: toInt(requested.width, baseViewport.width) + jitter,
+    height: toInt(requested.height, baseViewport.height) + jitter,
+    deviceScaleFactor: toPositiveNumber(requested.deviceScaleFactor, baseViewport.deviceScaleFactor),
+  };
+
+  const userAgent =
+    opts.userAgent ||
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
+
+  const windowSizeArg = `--window-size=${viewport.width},${viewport.height}`;
+  const langArg = `--lang=${langForFlag}`;
+
+  const extraArgs = [
+    '--disable-blink-features=AutomationControlled',
+    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
+    '--webrtc-ip-handling-policy=default_public_interface_only',
+    '--proxy-bypass-list=<-loopback>',
+  ];
+
+  const headers = {
+    ...DEFAULT_HEADER,
+    'Accept-Language': acceptLanguage,
+    'User-Agent': userAgent,
+    Referer: opts.referer || `https://${hostname}/`,
+    Connection: 'keep-alive',
+    DNT: '1',
+  };
+
+  const timezone = opts.timezone || 'Europe/Berlin';
+
+  return {
+    acceptLanguage,
+    langForFlag,
+    userAgent,
+    viewport,
+    windowSizeArg,
+    langArg,
+    extraArgs,
+    headers,
+    timezone,
+    humanDelay: opts.humanDelay !== false,
+  };
+}
+
+/**
+ * Apply bot-prevention hardening to a Puppeteer page.
+ * Sets UA, viewport, JS enabled, headers, timezone and injects stealth-like patches.
+ *
+ * @param {import('puppeteer').Page} page
+ * @param {ReturnType<typeof getPreLaunchConfig>} cfg
+ * @returns {Promise<void>}
+ */
+export async function applyBotPreventionToPage(page, cfg) {
+  await page.setUserAgent(cfg.userAgent);
+  await page.setViewport(cfg.viewport);
+  await page.setJavaScriptEnabled(true);
+  await page.setExtraHTTPHeaders(cfg.headers);
+  try {
+    if (cfg.timezone) await page.emulateTimezone(cfg.timezone);
+  } catch {
+    // ignore timezone failures
+  }
+
+  // Used by the navigator.languages patch when nothing was persisted for the origin.
+  const fallbackLanguages = parseLanguageTags(cfg.acceptLanguage).join(',') || 'de-DE,de';
+
+  // Inject patches as early as possible. The callback is serialized and executed
+  // inside the page, so it can only use its own arguments, not module scope.
+  await page.evaluateOnNewDocument((defaultLangs) => {
+    // Run every patch in isolation so that one failing patch does not skip the rest.
+    const safely = (patch) => {
+      try {
+        patch();
+      } catch {
+        //noop
+      }
+    };
+
+    // webdriver
+    safely(() => Object.defineProperty(navigator, 'webdriver', { get: () => undefined }));
+
+    // chrome runtime
+    safely(() => {
+      // @ts-ignore
+      if (!window.chrome) {
+        // @ts-ignore
+        window.chrome = { runtime: {} };
+      }
+    });
+
+    // languages
+    safely(() => {
+      Object.defineProperty(navigator, 'languages', {
+        get: () => {
+          let stored = null;
+          try {
+            // Reading localStorage throws a SecurityError on opaque origins
+            // (data: URLs, sandboxed frames); the getter itself must never throw.
+            stored = window.localStorage.getItem('__LANGS__');
+          } catch {
+            //noop
+          }
+          return (stored || defaultLangs).split(',');
+        },
+      });
+    });
+
+    // plugins
+    safely(() => Object.defineProperty(navigator, 'plugins', { get: () => [{}, {}, {}] }));
+
+    // platform and concurrency hints
+    safely(() => Object.defineProperty(navigator, 'platform', { get: () => 'Win32' }));
+    safely(() => {
+      if (typeof navigator.hardwareConcurrency === 'number' && navigator.hardwareConcurrency < 2) {
+        Object.defineProperty(navigator, 'hardwareConcurrency', { get: () => 4 });
+      }
+    });
+    safely(() => {
+      // @ts-ignore
+      if (typeof navigator.deviceMemory === 'number' && navigator.deviceMemory < 2) {
+        Object.defineProperty(navigator, 'deviceMemory', { get: () => 8 });
+      }
+    });
+
+    // userAgentData (Client Hints)
+    safely(() => {
+      if (!('userAgentData' in navigator)) return;
+      const highEntropyValues = {
+        platform: 'Windows',
+        platformVersion: '15.0.0',
+        architecture: 'x86',
+        model: '',
+        uaFullVersion: '126.0.0.0',
+        bitness: '64',
+      };
+      Object.defineProperty(navigator, 'userAgentData', {
+        get: () => ({
+          brands: [
+            { brand: 'Chromium', version: '126' },
+            { brand: 'Google Chrome', version: '126' },
+          ],
+          mobile: false,
+          platform: 'Windows',
+          getHighEntropyValues: async (hints) => {
+            const out = {};
+            for (const k of hints || []) if (k in highEntropyValues) out[k] = highEntropyValues[k];
+            return out;
+          },
+        }),
+      });
+    });
+
+    // Permissions API
+    safely(() => {
+      const origQuery = navigator.permissions && navigator.permissions.query;
+      if (!origQuery) return;
+      // @ts-ignore
+      navigator.permissions.query = (parameters) =>
+        origQuery.call(navigator.permissions, parameters).then((result) => {
+          if (parameters && parameters.name === 'notifications') {
+            Object.defineProperty(result, 'state', { get: () => Notification.permission });
+          }
+          return result;
+        });
+    });
+
+    // WebGL vendor/renderer
+    const patchWebGL = (proto) => {
+      if (!proto || !proto.getParameter) return;
+      const getParameter = proto.getParameter;
+      // @ts-ignore
+      proto.getParameter = function (param) {
+        const UNMASKED_VENDOR_WEBGL = 0x9245;
+        const UNMASKED_RENDERER_WEBGL = 0x9246;
+        if (param === UNMASKED_VENDOR_WEBGL) return 'Google Inc.';
+        if (param === UNMASKED_RENDERER_WEBGL)
+          return 'ANGLE (NVIDIA, NVIDIA GeForce GTX 1660 Ti Direct3D11 vs_5_0 ps_5_0)';
+        return getParameter.call(this, param);
+      };
+    };
+    // Read the constructors through `window`: a bare `WebGLRenderingContext?.prototype`
+    // throws a ReferenceError when the global is not declared at all.
+    safely(() => patchWebGL(window.WebGLRenderingContext?.prototype));
+    safely(() => patchWebGL(window.WebGL2RenderingContext?.prototype));
+
+    // AudioContext: oscillators always get an explicit start time (0 when omitted)
+    const patchAudio = (Ctx) => {
+      if (!Ctx) return;
+      const proto = Ctx.prototype;
+      const createOsc = proto.createOscillator;
+      proto.createOscillator = function () {
+        const osc = createOsc.call(this);
+        const start = osc.start;
+        osc.start = function (when) {
+          return start.call(this, when || 0);
+        };
+        return osc;
+      };
+    };
+    safely(() => patchAudio(window.AudioContext));
+    safely(() => patchAudio(window.OfflineAudioContext));
+
+    // Navigator.connection
+    safely(() => Object.defineProperty(navigator, 'connection', { get: () => undefined }));
+
+    // Consistent outer sizes, derived from the inner size on every access so the
+    // values follow window resizes instead of being frozen at injection time.
+    safely(() => {
+      Object.defineProperty(window, 'outerWidth', { get: () => window.innerWidth + 16 });
+      Object.defineProperty(window, 'outerHeight', { get: () => window.innerHeight + 88 });
+    });
+  }, fallbackLanguages);
+}
+
+/**
+ * Persist languages value before navigation via localStorage.
+ * Stores every tag of `cfg.acceptLanguage` (without q-weights) under `__LANGS__`.
+ *
+ * @param {import('puppeteer').Page} page
+ * @param {ReturnType<typeof getPreLaunchConfig>} cfg
+ * @returns {Promise<void>}
+ */
+export async function applyLanguagePersistence(page, cfg) {
+  await page.evaluateOnNewDocument((langs) => {
+    try {
+      window.localStorage.setItem('__LANGS__', langs);
+    } catch {
+      // noop
+    }
+  }, parseLanguageTags(cfg.acceptLanguage).join(','));
+}
+
+/**
+ * Perform subtle human-like interactions post navigation.
+ * Waits 200..599 ms, then moves the mouse into the middle of the viewport and scrolls.
+ *
+ * @param {import('puppeteer').Page} page
+ * @param {ReturnType<typeof getPreLaunchConfig>} cfg
+ * @returns {Promise<void>}
+ */
+export async function applyPostNavigationHumanSignals(page, cfg) {
+  if (!cfg.humanDelay) return;
+  const delay = 200 + Math.floor(Math.random() * 400);
+  await new Promise((res) => setTimeout(res, delay));
+  try {
+    const vw = cfg.viewport.width;
+    const vh = cfg.viewport.height;
+    const mx = Math.floor(vw * (0.3 + Math.random() * 0.4));
+    const my = Math.floor(vh * (0.3 + Math.random() * 0.4));
+    await page.mouse.move(mx, my, { steps: 10 + Math.floor(Math.random() * 10) });
+    await page.mouse.wheel({ deltaY: 100 + Math.floor(Math.random() * 200) });
+  } catch {
+    // ignore if mouse is unavailable
+  }
+}
diff --git a/lib/services/extractor/puppeteerExtractor.js b/lib/services/extractor/puppeteerExtractor.js
index 5b93232..1d8e1a0 100644
--- a/lib/services/extractor/puppeteerExtractor.js
+++ b/lib/services/extractor/puppeteerExtractor.js
@@ -1,11 +1,16 @@
 import puppeteer from 'puppeteer-extra';
 import StealthPlugin from 'puppeteer-extra-plugin-stealth';
-import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
+import { debug, botDetected } from './utils.js';
+import {
+  getPreLaunchConfig,
+  applyBotPreventionToPage,
+  applyLanguagePersistence,
+  applyPostNavigationHumanSignals,
+} from './botPrevention.js';
 import logger from '../logger.js';
 import fs from 'fs';
 import os from 'os';
 import path from 'path';
-import { URL } from 'url';
 
 puppeteer.use(StealthPlugin());
 
@@ -40,6 +45,11 @@ export default async function execute(url, waitForSelector, options) {
     if (options?.proxyUrl) {
       launchArgs.push(`--proxy-server=${options.proxyUrl}`);
     }
+    // Prepare bot prevention pre-launch config
+    const preCfg = getPreLaunchConfig(url, options || {});
+    launchArgs.push(preCfg.langArg);
+    launchArgs.push(preCfg.windowSizeArg);
+    launchArgs.push(...preCfg.extraArgs);
 
     browser = await puppeteer.launch({
       headless: options?.puppeteerHeadless ?? true,
@@ -50,58 +60,9 @@ export default async function execute(url, waitForSelector, options) {
     });
 
     page = await browser.newPage();
-
-    // Derive domain-specific defaults
-    const { hostname } = new URL(url);
-
-    // Set a realistic modern user agent unless provided
-    const userAgent =
-      options?.userAgent ||
-      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
-    await page.setUserAgent(userAgent);
-
-    // Viewport and device scale for typical desktop
-    await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
-
-    // Extra HTTP headers with localized Accept-Language
-    const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
-    const headers = {
-      ...DEFAULT_HEADER,
-      'Accept-Language': acceptLanguage,
-      'User-Agent': userAgent,
-      Referer: options?.referer || `https://${hostname}/`,
-      Connection: 'keep-alive',
-      DNT: '1',
-    };
-    await page.setExtraHTTPHeaders(headers);
-
-    // Timezone and locale tweaks to look German when needed
-    try {
-      const tz = options?.timezone || 'Europe/Berlin';
-      if (tz) await page.emulateTimezone(tz);
-    } catch {
-      //noop
-    }
-
-    // Harden navigator properties (stealth already covers many, but we ensure critical ones)
-    await page.evaluateOnNewDocument(() => {
-      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
-      // Plugins and mimeTypes
-      // @ts-ignore
-      Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
-      // @ts-ignore
-      Object.defineProperty(navigator, 'languages', {
-        get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
-      });
-    });
+    await applyBotPreventionToPage(page, preCfg);
     // Provide languages value before navigation
-    await page.evaluateOnNewDocument((langs) => {
-      try {
-        window.localStorage.setItem('__LANGS__', langs);
-      } catch {
-        //noop
-      }
-    }, acceptLanguage.split(';')[0]);
+    await applyLanguagePersistence(page, preCfg);
 
     // Optional cookies
     if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
@@ -113,11 +74,8 @@ export default async function execute(url, waitForSelector, options) {
       waitUntil: options?.waitUntil || 'domcontentloaded',
     });
 
-    // Optionally wait a random small delay to mimic human rendering time
-    if (options?.humanDelay !== false) {
-      const delay = 200 + Math.floor(Math.random() * 400);
-      await new Promise((res) => setTimeout(res, delay));
-    }
+    // Optionally wait and add subtle human-like interactions
+    await applyPostNavigationHumanSignals(page, preCfg);
 
     let pageSource;
     // if we're extracting data from a SPA, we must wait for the selector
diff --git a/test/services/extractor/botPrevention.test.js b/test/services/extractor/botPrevention.test.js
new file mode 100644
index 0000000..6ff70d7
--- /dev/null
+++ b/test/services/extractor/botPrevention.test.js
@@ -0,0 +1,121 @@
+import { describe, it } from 'mocha';
+import { expect } from 'chai';
+
+import {
+  getPreLaunchConfig,
+  applyBotPreventionToPage,
+  applyLanguagePersistence,
+  applyPostNavigationHumanSignals,
+} from '../../../lib/services/extractor/botPrevention.js';
+
+describe('botPrevention helper', () => {
+  it('getPreLaunchConfig builds deterministic values when jitter disabled', () => {
+    const url = 'https://example.com/some/path';
+    const options = {
+      acceptLanguage: 'de-DE,de;q=0.9',
+      userAgent: 'TestAgent/1.0',
+      viewport: { width: 1200, height: 700, deviceScaleFactor: 2 },
+      viewportJitter: false,
+      referer: 'https://example.com/ref',
+      timezone: 'Europe/Berlin',
+    };
+    const cfg = getPreLaunchConfig(url, options);
+
+    expect(cfg.acceptLanguage).to.equal('de-DE,de;q=0.9');
+    expect(cfg.langArg).to.equal('--lang=de-DE');
+    expect(cfg.windowSizeArg).to.equal('--window-size=1200,700');
+    expect(cfg.viewport).to.deep.equal({ width: 1200, height: 700, deviceScaleFactor: 2 });
+    expect(cfg.userAgent).to.equal('TestAgent/1.0');
+    expect(cfg.headers['Accept-Language']).to.equal('de-DE,de;q=0.9');
+    expect(cfg.headers['User-Agent']).to.equal('TestAgent/1.0');
+    expect(cfg.headers.Referer).to.equal('https://example.com/ref');
+    expect(cfg.extraArgs).to.include('--disable-blink-features=AutomationControlled');
+    expect(cfg.extraArgs).to.include('--proxy-bypass-list=<-loopback>');
+  });
+
+  it('getPreLaunchConfig keeps a fractional deviceScaleFactor and strips q-values from --lang', () => {
+    const cfg = getPreLaunchConfig('https://example.com/', {
+      acceptLanguage: 'en-US;q=0.9,de;q=0.5',
+      viewport: { deviceScaleFactor: 1.5 },
+      viewportJitter: false,
+    });
+
+    expect(cfg.viewport).to.deep.equal({ width: 1366, height: 768, deviceScaleFactor: 1.5 });
+    expect(cfg.langArg).to.equal('--lang=en-US');
+  });
+
+  it('applyBotPreventionToPage sets UA, viewport, headers and injects patches', async () => {
+    const calls = [];
+    const page = {
+      setUserAgent: async (ua) => calls.push(['setUserAgent', ua]),
+      setViewport: async (vp) => calls.push(['setViewport', vp]),
+      setJavaScriptEnabled: async (on) => calls.push(['setJavaScriptEnabled', on]),
+      setExtraHTTPHeaders: async (h) => calls.push(['setExtraHTTPHeaders', h]),
+      emulateTimezone: async (tz) => calls.push(['emulateTimezone', tz]),
+      evaluateOnNewDocument: async (fn, arg) => calls.push(['evaluateOnNewDocument', typeof fn, arg]),
+    };
+    const cfg = getPreLaunchConfig('https://example.org/', {
+      userAgent: 'Foo/Bar',
+      acceptLanguage: 'en-US,en',
+      viewport: { width: 1000, height: 600, deviceScaleFactor: 1 },
+      viewportJitter: false,
+      timezone: 'UTC',
+    });
+
+    await applyBotPreventionToPage(page, cfg);
+
+    expect(calls[0]).to.deep.equal(['setUserAgent', 'Foo/Bar']);
+    expect(calls.some((c) => c[0] === 'setViewport' && c[1].width === 1000 && c[1].height === 600)).to.equal(true);
+    expect(calls.some((c) => c[0] === 'setJavaScriptEnabled' && c[1] === true)).to.equal(true);
+    const headerCall = calls.find((c) => c[0] === 'setExtraHTTPHeaders');
+    expect(headerCall).to.exist;
+    expect(headerCall[1]['Accept-Language']).to.equal('en-US,en');
+    expect(headerCall[1]['User-Agent']).to.equal('Foo/Bar');
+    expect(calls.some((c) => c[0] === 'emulateTimezone' && c[1] === 'UTC')).to.equal(true);
+    expect(calls.some((c) => c[0] === 'evaluateOnNewDocument' && c[1] === 'function')).to.equal(true);
+    expect(calls.find((c) => c[0] === 'evaluateOnNewDocument')[2]).to.equal('en-US,en');
+  });
+
+  it('applyLanguagePersistence stores languages early', async () => {
+    const calls = [];
+    const page = {
+      evaluateOnNewDocument: async (fn, arg) => calls.push(['evaluateOnNewDocument', typeof fn, arg]),
+    };
+    const cfg = getPreLaunchConfig('https://example.org/', {
+      acceptLanguage: 'de-DE,de;q=0.9',
+      viewportJitter: false,
+    });
+    await applyLanguagePersistence(page, cfg);
+    const call = calls[0];
+    expect(call[0]).to.equal('evaluateOnNewDocument');
+    expect(call[1]).to.equal('function');
+    expect(call[2]).to.equal('de-DE,de');
+  });
+
+  it('applyLanguagePersistence keeps every language of a weighted Accept-Language list', async () => {
+    const calls = [];
+    const page = {
+      evaluateOnNewDocument: async (fn, arg) => calls.push(['evaluateOnNewDocument', typeof fn, arg]),
+    };
+    const cfg = getPreLaunchConfig('https://example.org/', { viewportJitter: false });
+    await applyLanguagePersistence(page, cfg);
+    expect(calls[0][2]).to.equal('de-DE,de,en-US,en');
+  });
+
+  it('applyPostNavigationHumanSignals moves mouse and scrolls when enabled', async () => {
+    const mouseCalls = [];
+    const page = {
+      mouse: {
+        move: async (x, y, opts) => mouseCalls.push(['move', x, y, opts && typeof opts.steps === 'number']),
+        wheel: async (opts) => mouseCalls.push(['wheel', typeof opts.deltaY === 'number']),
+      },
+    };
+    const cfg = {
+      humanDelay: true,
+      viewport: { width: 1200, height: 800 },
+    };
+    await applyPostNavigationHumanSignals(page, cfg);
+    expect(mouseCalls.some((c) => c[0] === 'move')).to.equal(true);
+    expect(mouseCalls.some((c) => c[0] === 'wheel')).to.equal(true);
+  });
+});