Compare commits

...

2 Commits

Author SHA1 Message Date
orangecoding
0606122736 improving bot detection prevention 2025-11-16 19:59:08 +01:00
orangecoding
53d5098cec fixing wrong number extraction 2025-11-03 20:01:55 +01:00
5 changed files with 664 additions and 271 deletions

View File

@@ -8,7 +8,7 @@ function normalize(o) {
const title = o.title || 'No title available'; const title = o.title || 'No title available';
const link = o.link != null ? decodeURIComponent(o.link) : config.url; const link = o.link != null ? decodeURIComponent(o.link) : config.url;
var urlReg = new RegExp(/url\((.*?)\)/gim); const urlReg = new RegExp(/url\((.*?)\)/gim);
const image = o.image != null ? urlReg.exec(o.image)[1] : null; const image = o.image != null ? urlReg.exec(o.image)[1] : null;
return Object.assign(o, { id, address, title, link, image }); return Object.assign(o, { id, address, title, link, image });
} }

View File

@@ -5,6 +5,7 @@ import logger from '../logger.js';
import fs from 'fs'; import fs from 'fs';
import os from 'os'; import os from 'os';
import path from 'path'; import path from 'path';
import { URL } from 'url';
puppeteer.use(StealthPlugin()); puppeteer.use(StealthPlugin());
@@ -27,23 +28,97 @@ export default async function execute(url, waitForSelector, options) {
removeUserDataDir = true; removeUserDataDir = true;
} }
const launchArgs = [
'--no-sandbox',
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
'--no-first-run',
'--no-default-browser-check',
];
if (options?.proxyUrl) {
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
}
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true, headless: options?.puppeteerHeadless ?? true,
args: [ args: launchArgs,
'--no-sandbox', timeout: options?.puppeteerTimeout || 30_000,
'--disable-gpu',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-crash-reporter',
],
timeout: options.puppeteerTimeout || 30_000,
userDataDir, userDataDir,
executablePath: options?.executablePath, // allow using system Chrome
}); });
page = await browser.newPage(); page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, { // Derive domain-specific defaults
waitUntil: 'domcontentloaded', const { hostname } = new URL(url);
// Set a realistic modern user agent unless provided
const userAgent =
options?.userAgent ||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
await page.setUserAgent(userAgent);
// Viewport and device scale for typical desktop
await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
// Extra HTTP headers with localized Accept-Language
const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
const headers = {
...DEFAULT_HEADER,
'Accept-Language': acceptLanguage,
'User-Agent': userAgent,
Referer: options?.referer || `https://${hostname}/`,
Connection: 'keep-alive',
DNT: '1',
};
await page.setExtraHTTPHeaders(headers);
// Timezone and locale tweaks to look German when needed
try {
const tz = options?.timezone || 'Europe/Berlin';
if (tz) await page.emulateTimezone(tz);
} catch {
//noop
}
// Harden navigator properties (stealth already covers many, but we ensure critical ones)
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
// Plugins and mimeTypes
// @ts-ignore
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
// @ts-ignore
Object.defineProperty(navigator, 'languages', {
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
});
}); });
// Provide languages value before navigation
await page.evaluateOnNewDocument((langs) => {
try {
window.localStorage.setItem('__LANGS__', langs);
} catch {
//noop
}
}, acceptLanguage.split(';')[0]);
// Optional cookies
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
await page.setCookie(...options.cookies);
}
// Navigation
const response = await page.goto(url, {
waitUntil: options?.waitUntil || 'domcontentloaded',
});
// Optionally wait a random small delay to mimic human rendering time
if (options?.humanDelay !== false) {
const delay = 200 + Math.floor(Math.random() * 400);
await new Promise((res) => setTimeout(res, delay));
}
let pageSource; let pageSource;
// if we're extracting data from a SPA, we must wait for the selector // if we're extracting data from a SPA, we must wait for the selector
if (waitForSelector != null) { if (waitForSelector != null) {
@@ -57,7 +132,7 @@ export default async function execute(url, waitForSelector, options) {
pageSource = await page.content(); pageSource = await page.content();
} }
const statusCode = response.status(); const statusCode = response?.status?.() ?? 200;
if (botDetected(pageSource, statusCode)) { if (botDetected(pageSource, statusCode)) {
logger.warn('We have been detected as a bot :-/ Tried url: => ', url); logger.warn('We have been detected as a bot :-/ Tried url: => ', url);

View File

@@ -152,8 +152,9 @@ export const storeListings = (jobId, providerId, listings) => {
*/ */
function extractNumber(str) {
  // Purpose: pull the first number out of a German-formatted text such as
  // "1.234,56 €" ('.' groups thousands, ',' is the decimal separator).
  // Returns the parsed number, or null when the input is empty or holds no digits.
  if (!str) return null;

  // Already numeric input needs no parsing; only reject non-finite values.
  if (typeof str === 'number') return Number.isFinite(str) ? str : null;

  // Locate the numeric token BEFORE normalising separators. Running parseFloat
  // on the whole string fails for any leading text ("ca. 85 m²", "€ 1.200"),
  // and blindly converting the first comma breaks inputs whose prefix contains
  // one ("Berlin, 1.200,50 €"). A sign is honoured only at the very start of
  // the string so that hyphens used as separators are not read as a minus.
  const numberToken = /(?:^\s*[-+])?(?:\d[\d.]*(?:,\d+)?|,\d+)/;
  const found = String(str).match(numberToken);
  if (found === null) return null;

  // Drop thousands separators, then turn the decimal comma into a dot.
  const normalized = found[0].replace(/\./g, '').replace(',', '.');
  const value = Number.parseFloat(normalized);
  return Number.isNaN(value) ? null : value;
}
/** /**

View File

@@ -1,6 +1,6 @@
{ {
"name": "fredy", "name": "fredy",
"version": "14.3.2", "version": "14.3.4",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].", "description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": { "scripts": {
"prepare": "husky", "prepare": "husky",
@@ -56,13 +56,13 @@
"Firefox ESR" "Firefox ESR"
], ],
"dependencies": { "dependencies": {
"@douyinfe/semi-icons": "^2.87.1", "@douyinfe/semi-icons": "^2.88.0",
"@douyinfe/semi-ui": "2.87.1", "@douyinfe/semi-ui": "2.88.0",
"@sendgrid/mail": "8.1.6", "@sendgrid/mail": "8.1.6",
"@visactor/react-vchart": "^2.0.5", "@visactor/react-vchart": "^2.0.8",
"@visactor/vchart": "^2.0.5", "@visactor/vchart": "^2.0.8",
"@visactor/vchart-semi-theme": "^1.12.2", "@visactor/vchart-semi-theme": "^1.12.2",
"@vitejs/plugin-react": "5.1.0", "@vitejs/plugin-react": "5.1.1",
"better-sqlite3": "^12.4.1", "better-sqlite3": "^12.4.1",
"body-parser": "2.2.0", "body-parser": "2.2.0",
"cheerio": "^1.1.2", "cheerio": "^1.1.2",
@@ -73,21 +73,21 @@
"node-cron": "^4.2.1", "node-cron": "^4.2.1",
"node-fetch": "3.3.2", "node-fetch": "3.3.2",
"node-mailjet": "6.0.11", "node-mailjet": "6.0.11",
"p-throttle": "^8.0.0", "p-throttle": "^8.1.0",
"package-up": "^5.0.0", "package-up": "^5.0.0",
"puppeteer": "^24.27.0", "puppeteer": "^24.30.0",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"query-string": "9.3.1", "query-string": "9.3.1",
"react": "18.3.1", "react": "18.3.1",
"react-dom": "18.3.1", "react-dom": "18.3.1",
"react-router": "7.9.5", "react-router": "7.9.6",
"react-router-dom": "7.9.5", "react-router-dom": "7.9.6",
"restana": "5.1.0", "restana": "5.1.0",
"semver": "^7.7.3", "semver": "^7.7.3",
"serve-static": "2.2.0", "serve-static": "2.2.0",
"slack": "11.0.2", "slack": "11.0.2",
"vite": "7.1.12", "vite": "7.2.2",
"x-var": "^3.0.1", "x-var": "^3.0.1",
"zustand": "^5.0.8" "zustand": "^5.0.8"
}, },
@@ -96,8 +96,8 @@
"@babel/eslint-parser": "7.28.5", "@babel/eslint-parser": "7.28.5",
"@babel/preset-env": "7.28.5", "@babel/preset-env": "7.28.5",
"@babel/preset-react": "7.28.5", "@babel/preset-react": "7.28.5",
"chai": "6.2.0", "chai": "6.2.1",
"eslint": "9.39.0", "eslint": "9.39.1",
"eslint-config-prettier": "10.1.8", "eslint-config-prettier": "10.1.8",
"eslint-plugin-react": "7.37.5", "eslint-plugin-react": "7.37.5",
"esmock": "2.7.3", "esmock": "2.7.3",
@@ -105,8 +105,8 @@
"husky": "9.1.7", "husky": "9.1.7",
"less": "4.4.2", "less": "4.4.2",
"lint-staged": "16.2.6", "lint-staged": "16.2.6",
"mocha": "11.7.4", "mocha": "11.7.5",
"nodemon": "^3.1.10", "nodemon": "^3.1.11",
"prettier": "3.6.2" "prettier": "3.6.2"
} }
} }

797
yarn.lock

File diff suppressed because it is too large Load Diff