Compare commits

..

8 Commits

Author SHA1 Message Date
Christian Kellner
1ecbbdd774 better logging 2025-01-07 13:34:43 +01:00
Christian Kellner
e1db3840f6 adding puppeteer timeout and fixing waitForSelector 2025-01-07 12:37:50 +01:00
Christian Kellner
26127eeac1 updating dependencies 2025-01-07 12:27:16 +01:00
Christian Kellner
90a4ee5dcf better logging, fixing code smells 2025-01-07 12:25:19 +01:00
Christian Kellner
2aaf63c253 Happy New Year 2025-01-05 06:53:07 +01:00
Christian Kellner
f52e3e9fd8 Update package.json 2025-01-04 21:52:06 +01:00
Fabian Pfaff
0d69232395 install chrome via apt instead of bundled (#122) 2025-01-04 21:50:59 +01:00
weakmap@gmail.com
b473cf7fb4 fixing kleinanzeigen test 2024-12-26 19:18:30 +01:00
13 changed files with 1036 additions and 481 deletions

View File

@@ -4,6 +4,11 @@ WORKDIR /fredy
COPY . /fredy COPY . /fredy
RUN apt-get update && apt-get install -y chromium
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
RUN yarn install RUN yarn install
RUN yarn global add pm2 RUN yarn global add pm2

View File

@@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2024 Christian Kellner Copyright (c) 2025 Christian Kellner
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@@ -47,11 +47,17 @@ class FredyRuntime {
_getListings(url) { _getListings(url) {
const extractor = new Extractor(); const extractor = new Extractor();
return new Promise((resolve, reject) => { return new Promise((resolve, reject) => {
extractor.execute(url,this._providerConfig.waitForSelector) extractor
.execute(url, this._providerConfig.waitForSelector)
.then(() => { .then(() => {
const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields); const listings = extractor.parseResponseText(
this._providerConfig.crawlContainer,
this._providerConfig.crawlFields,
url,
);
resolve(listings == null ? [] : listings); resolve(listings == null ? [] : listings);
}).catch(err => { })
.catch((err) => {
reject(err); reject(err);
/* eslint-disable no-console */ /* eslint-disable no-console */
console.error(err); console.error(err);

View File

@@ -1,8 +1,6 @@
import restana from 'restana'; import restana from 'restana';
import fetch from 'node-fetch';
import * as jobStorage from '../../services/storage/jobStorage.js'; import * as jobStorage from '../../services/storage/jobStorage.js';
import * as userStorage from '../../services/storage/userStorage.js'; import * as userStorage from '../../services/storage/userStorage.js';
import * as immoscoutProvider from '../../provider/immoscout.js';
import { config } from '../../utils.js'; import { config } from '../../utils.js';
import { isAdmin } from '../security.js'; import { isAdmin } from '../security.js';
import { trackDemoJobCreated } from '../../services/tracking/Tracker.js'; import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
@@ -28,7 +26,7 @@ jobRouter.get('/', async (req, res) => {
jobRouter.get('/processingTimes', async (req, res) => { jobRouter.get('/processingTimes', async (req, res) => {
res.body = { res.body = {
interval: config.interval, interval: config.interval,
lastRun: config.lastRun || null lastRun: config.lastRun || null,
}; };
res.send(); res.send();
}); });
@@ -51,7 +49,7 @@ jobRouter.post('/', async (req, res) => {
trackDemoJobCreated({ trackDemoJobCreated({
name, name,
provider, provider,
adapter: notificationAdapter adapter: notificationAdapter,
}); });
res.send(); res.send();
}); });

View File

@@ -26,7 +26,7 @@ const config = {
url: null, url: null,
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]', crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'sortby=19', sortByDateParam: 'sortby=19',
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]', waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
crawlFields: { crawlFields: {
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
title: 'button@title |trim', title: 'button@title |trim',

View File

@@ -18,7 +18,7 @@ const config = {
crawlContainer: crawlContainer:
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]', 'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
sortByDateParam: 'order=DateDesc', sortByDateParam: 'order=DateDesc',
waitForSelector: 'div[data-testid="cardmfe-price-testid"]', waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
crawlFields: { crawlFields: {
id: 'a@href', id: 'a@href',
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim', price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',

View File

@@ -15,7 +15,7 @@ function applyBlacklist(o) {
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
const isBlacklistedDistrict = const isBlacklistedDistrict =
appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts); appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts);
return !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted; return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
} }
const config = { const config = {

View File

@@ -4,16 +4,15 @@ import {loadParser, parse} from './parser/parser.js';
const DEFAULT_OPTIONS = { const DEFAULT_OPTIONS = {
debug: false, debug: false,
puppeteerTimeout: 20_000, puppeteerTimeout: 60_000,
puppeteerHeadless: true puppeteerHeadless: true,
}; };
export default class Extractor { export default class Extractor {
constructor(options) { constructor(options) {
this.options = { this.options = {
...DEFAULT_OPTIONS, ...DEFAULT_OPTIONS,
...options ...options,
}; };
this.responseText = null; this.responseText = null;
setDebug(this.options); setDebug(this.options);
@@ -38,8 +37,7 @@ export default class Extractor {
return this; return this;
}; };
parseResponseText = (crawlContainer, crawlFields, url) => {
parseResponseText = (crawlContainer, crawlFields) => { return parse(crawlContainer, crawlFields, this.responseText, url);
return parse(crawlContainer, crawlFields, this.responseText);
}; };
} }

View File

@@ -6,21 +6,22 @@ export function loadParser(text) {
$ = cheerio.load(text); $ = cheerio.load(text);
} }
export function parse(crawlContainer, crawlFields, text) { export function parse(crawlContainer, crawlFields, text, url) {
if (!text) { if (!text) {
console.warn('Cannot parse, text was empty.'); console.warn('Cannot parse, text was empty for url ', url);
return null; return null;
} }
if (!crawlContainer || !crawlFields) { if (!crawlContainer || !crawlFields) {
console.warn('Cannot parse, selector was empty.'); console.warn('Cannot parse, selector was empty for url ', url);
return null; return null;
} }
const result = []; const result = [];
if ($(crawlContainer).length === 0) { if ($(crawlContainer).length === 0) {
console.error('No elements in crawl container found!'); console.warn('No elements in crawl container found for url ', url);
return null;
} }
$(crawlContainer).each((_, element) => { $(crawlContainer).each((_, element) => {
@@ -32,8 +33,9 @@ export function parse(crawlContainer, crawlFields, text) {
let value; let value;
try { try {
const selector = fieldSelector.includes('|')
const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector; ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
: fieldSelector;
if (selector.includes('@')) { if (selector.includes('@')) {
const [sel, attr] = selector.split('@'); const [sel, attr] = selector.split('@');
@@ -48,7 +50,9 @@ export function parse(crawlContainer, crawlFields, text) {
// Apply modifiers if specified // Apply modifiers if specified
if (fieldSelector.includes('|')) { if (fieldSelector.includes('|')) {
const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim()); /* eslint-disable no-unused-vars */
const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
/* eslint-disable no-unused-vars */
value = applyModifiers(value, modifiers); value = applyModifiers(value, modifiers);
} }
@@ -73,7 +77,7 @@ export function parse(crawlContainer, crawlFields, text) {
function applyModifiers(value, modifiers) { function applyModifiers(value, modifiers) {
if (!value) return value; if (!value) return value;
modifiers.forEach(modifier => { modifiers.forEach((modifier) => {
switch (modifier) { switch (modifier) {
case 'int': case 'int':
value = parseInt(value, 10); value = parseInt(value, 10);
@@ -91,4 +95,3 @@ function applyModifiers(value, modifiers) {
return value; return value;
} }

View File

@@ -11,18 +11,19 @@ export default async function execute(url, waitForSelector, options) {
browser = await puppeteer.launch({ browser = await puppeteer.launch({
headless: options.puppeteerHeadless ?? true, headless: options.puppeteerHeadless ?? true,
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'] args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
timeout: options.puppeteerTimeout || 30_000,
}); });
let page = await browser.newPage(); let page = await browser.newPage();
await page.setExtraHTTPHeaders(DEFAULT_HEADER); await page.setExtraHTTPHeaders(DEFAULT_HEADER);
const response = await page.goto(url, { const response = await page.goto(url, {
waitUntil: 'domcontentloaded' waitUntil: 'domcontentloaded',
}); });
let pageSource; let pageSource;
//if we're extracting data from a spa, we must wait for the selector //if we're extracting data from a spa, we must wait for the selector
if (waitForSelector != null) { if (waitForSelector != null) {
await page.waitForSelector(waitForSelector); await page.waitForSelector(waitForSelector);
pageSource = await page.evaluate(selector => { pageSource = await page.evaluate((selector) => {
return document.querySelector(selector).innerHTML; return document.querySelector(selector).innerHTML;
}, waitForSelector); }, waitForSelector);
} else { } else {

View File

@@ -1,34 +1,31 @@
let debuggingOn = false; let debuggingOn = false;
export const DEFAULT_HEADER = { export const DEFAULT_HEADER = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5', 'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive', Connection: 'keep-alive',
'Upgrade-Insecure-Requests': '1', 'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36' 'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
}; };
export const setDebug = options => { export const setDebug = (options) => {
debuggingOn = !!options?.debug; debuggingOn = !!options?.debug;
}; };
export const debug = (message) => { export const debug = (message) => {
if (debuggingOn) { if (debuggingOn) {
/* eslint-disable no-console */
console.debug(message); console.debug(message);
/* eslint-enable no-console */
} }
}; };
export const botDetected = (pageSource, statusCode) => { export const botDetected = (pageSource, statusCode) => {
const suspiciousStatusCodes = [ const suspiciousStatusCodes = [403, 429];
403, 429 const botDetectionPatterns = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i];
];
const botDetectionPatterns = [
/verify you are human/i,
/access denied/i,
/x-amz-cf-id/i,
];
const detectedInSource = botDetectionPatterns.some(pattern => pattern.test(pageSource)); const detectedInSource = botDetectionPatterns.some((pattern) => pattern.test(pageSource));
const detectedByStatus = suspiciousStatusCodes.includes(statusCode); const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
return detectedInSource || detectedByStatus; return detectedInSource || detectedByStatus;

View File

@@ -1,6 +1,6 @@
{ {
"name": "fredy", "name": "fredy",
"version": "11.0.0", "version": "11.0.1",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].", "description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": { "scripts": {
"start": "node prod.js", "start": "node prod.js",
@@ -50,17 +50,17 @@
"Firefox ESR" "Firefox ESR"
], ],
"dependencies": { "dependencies": {
"@douyinfe/semi-ui": "2.71.3", "@douyinfe/semi-ui": "2.72.3",
"@rematch/core": "2.2.0", "@rematch/core": "2.2.0",
"@rematch/loading": "2.1.2", "@rematch/loading": "2.1.2",
"@sendgrid/mail": "8.1.4", "@sendgrid/mail": "8.1.4",
"@vitejs/plugin-react": "4.3.4", "@vitejs/plugin-react": "4.3.4",
"better-sqlite3": "^11.7.0", "better-sqlite3": "^11.7.2",
"body-parser": "1.20.3", "body-parser": "1.20.3",
"cheerio": "^1.0.0", "cheerio": "^1.0.0",
"cookie-session": "2.1.0", "cookie-session": "2.1.0",
"handlebars": "4.7.8", "handlebars": "4.7.8",
"highcharts": "12.1.0", "highcharts": "12.1.2",
"highcharts-react-official": "3.2.1", "highcharts-react-official": "3.2.1",
"lodash": "4.17.21", "lodash": "4.17.21",
"lowdb": "6.0.1", "lowdb": "6.0.1",
@@ -70,7 +70,7 @@
"node-fetch": "3.3.2", "node-fetch": "3.3.2",
"node-mailjet": "6.0.6", "node-mailjet": "6.0.6",
"package-up": "^5.0.0", "package-up": "^5.0.0",
"puppeteer": "^23.10.4", "puppeteer": "^23.11.1",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"query-string": "9.1.1", "query-string": "9.1.1",
@@ -95,12 +95,12 @@
"chai": "5.1.2", "chai": "5.1.2",
"eslint": "8.56.0", "eslint": "8.56.0",
"eslint-config-prettier": "8.8.0", "eslint-config-prettier": "8.8.0",
"eslint-plugin-react": "7.37.2", "eslint-plugin-react": "7.37.3",
"esmock": "2.6.9", "esmock": "2.6.9",
"history": "5.3.0", "history": "5.3.0",
"husky": "9.1.7", "husky": "9.1.7",
"less": "4.2.1", "less": "4.2.1",
"lint-staged": "15.2.11", "lint-staged": "15.3.0",
"mocha": "10.8.2", "mocha": "10.8.2",
"prettier": "3.4.2", "prettier": "3.4.2",
"redux-logger": "3.0.6" "redux-logger": "3.0.6"

943
yarn.lock

File diff suppressed because it is too large Load Diff