mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1ecbbdd774 | ||
|
|
e1db3840f6 | ||
|
|
26127eeac1 | ||
|
|
90a4ee5dcf | ||
|
|
2aaf63c253 | ||
|
|
f52e3e9fd8 | ||
|
|
0d69232395 | ||
|
|
b473cf7fb4 |
@@ -4,6 +4,11 @@ WORKDIR /fredy
|
|||||||
|
|
||||||
COPY . /fredy
|
COPY . /fredy
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y chromium
|
||||||
|
|
||||||
|
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
|
||||||
|
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
|
||||||
|
|
||||||
RUN yarn install
|
RUN yarn install
|
||||||
|
|
||||||
RUN yarn global add pm2
|
RUN yarn global add pm2
|
||||||
|
|||||||
2
LICENSE
2
LICENSE
@@ -1,6 +1,6 @@
|
|||||||
MIT License
|
MIT License
|
||||||
|
|
||||||
Copyright (c) 2024 Christian Kellner
|
Copyright (c) 2025 Christian Kellner
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|||||||
@@ -47,11 +47,17 @@ class FredyRuntime {
|
|||||||
_getListings(url) {
|
_getListings(url) {
|
||||||
const extractor = new Extractor();
|
const extractor = new Extractor();
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
extractor.execute(url,this._providerConfig.waitForSelector)
|
extractor
|
||||||
|
.execute(url, this._providerConfig.waitForSelector)
|
||||||
.then(() => {
|
.then(() => {
|
||||||
const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields);
|
const listings = extractor.parseResponseText(
|
||||||
|
this._providerConfig.crawlContainer,
|
||||||
|
this._providerConfig.crawlFields,
|
||||||
|
url,
|
||||||
|
);
|
||||||
resolve(listings == null ? [] : listings);
|
resolve(listings == null ? [] : listings);
|
||||||
}).catch(err => {
|
})
|
||||||
|
.catch((err) => {
|
||||||
reject(err);
|
reject(err);
|
||||||
/* eslint-disable no-console */
|
/* eslint-disable no-console */
|
||||||
console.error(err);
|
console.error(err);
|
||||||
|
|||||||
@@ -1,8 +1,6 @@
|
|||||||
import restana from 'restana';
|
import restana from 'restana';
|
||||||
import fetch from 'node-fetch';
|
|
||||||
import * as jobStorage from '../../services/storage/jobStorage.js';
|
import * as jobStorage from '../../services/storage/jobStorage.js';
|
||||||
import * as userStorage from '../../services/storage/userStorage.js';
|
import * as userStorage from '../../services/storage/userStorage.js';
|
||||||
import * as immoscoutProvider from '../../provider/immoscout.js';
|
|
||||||
import { config } from '../../utils.js';
|
import { config } from '../../utils.js';
|
||||||
import { isAdmin } from '../security.js';
|
import { isAdmin } from '../security.js';
|
||||||
import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
|
import { trackDemoJobCreated } from '../../services/tracking/Tracker.js';
|
||||||
@@ -28,7 +26,7 @@ jobRouter.get('/', async (req, res) => {
|
|||||||
jobRouter.get('/processingTimes', async (req, res) => {
|
jobRouter.get('/processingTimes', async (req, res) => {
|
||||||
res.body = {
|
res.body = {
|
||||||
interval: config.interval,
|
interval: config.interval,
|
||||||
lastRun: config.lastRun || null
|
lastRun: config.lastRun || null,
|
||||||
};
|
};
|
||||||
res.send();
|
res.send();
|
||||||
});
|
});
|
||||||
@@ -51,7 +49,7 @@ jobRouter.post('/', async (req, res) => {
|
|||||||
trackDemoJobCreated({
|
trackDemoJobCreated({
|
||||||
name,
|
name,
|
||||||
provider,
|
provider,
|
||||||
adapter: notificationAdapter
|
adapter: notificationAdapter,
|
||||||
});
|
});
|
||||||
res.send();
|
res.send();
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ const config = {
|
|||||||
url: null,
|
url: null,
|
||||||
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
|
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||||
sortByDateParam: 'sortby=19',
|
sortByDateParam: 'sortby=19',
|
||||||
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
|
waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
|
||||||
crawlFields: {
|
crawlFields: {
|
||||||
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
|
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
|
||||||
title: 'button@title |trim',
|
title: 'button@title |trim',
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ const config = {
|
|||||||
crawlContainer:
|
crawlContainer:
|
||||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
||||||
sortByDateParam: 'order=DateDesc',
|
sortByDateParam: 'order=DateDesc',
|
||||||
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
|
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
|
||||||
crawlFields: {
|
crawlFields: {
|
||||||
id: 'a@href',
|
id: 'a@href',
|
||||||
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ function applyBlacklist(o) {
|
|||||||
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
|
const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList);
|
||||||
const isBlacklistedDistrict =
|
const isBlacklistedDistrict =
|
||||||
appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts);
|
appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts);
|
||||||
return !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
|
return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted;
|
||||||
}
|
}
|
||||||
|
|
||||||
const config = {
|
const config = {
|
||||||
|
|||||||
@@ -4,16 +4,15 @@ import {loadParser, parse} from './parser/parser.js';
|
|||||||
|
|
||||||
const DEFAULT_OPTIONS = {
|
const DEFAULT_OPTIONS = {
|
||||||
debug: false,
|
debug: false,
|
||||||
puppeteerTimeout: 20_000,
|
puppeteerTimeout: 60_000,
|
||||||
puppeteerHeadless: true
|
puppeteerHeadless: true,
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
export default class Extractor {
|
export default class Extractor {
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
this.options = {
|
this.options = {
|
||||||
...DEFAULT_OPTIONS,
|
...DEFAULT_OPTIONS,
|
||||||
...options
|
...options,
|
||||||
};
|
};
|
||||||
this.responseText = null;
|
this.responseText = null;
|
||||||
setDebug(this.options);
|
setDebug(this.options);
|
||||||
@@ -38,8 +37,7 @@ export default class Extractor {
|
|||||||
return this;
|
return this;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
parseResponseText = (crawlContainer, crawlFields, url) => {
|
||||||
parseResponseText = (crawlContainer, crawlFields) => {
|
return parse(crawlContainer, crawlFields, this.responseText, url);
|
||||||
return parse(crawlContainer, crawlFields, this.responseText);
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,21 +6,22 @@ export function loadParser(text) {
|
|||||||
$ = cheerio.load(text);
|
$ = cheerio.load(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
export function parse(crawlContainer, crawlFields, text) {
|
export function parse(crawlContainer, crawlFields, text, url) {
|
||||||
if (!text) {
|
if (!text) {
|
||||||
console.warn('Cannot parse, text was empty.');
|
console.warn('Cannot parse, text was empty for url ', url);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!crawlContainer || !crawlFields) {
|
if (!crawlContainer || !crawlFields) {
|
||||||
console.warn('Cannot parse, selector was empty.');
|
console.warn('Cannot parse, selector was empty for url ', url);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const result = [];
|
const result = [];
|
||||||
|
|
||||||
if ($(crawlContainer).length === 0) {
|
if ($(crawlContainer).length === 0) {
|
||||||
console.error('No elements in crawl container found!');
|
console.warn('No elements in crawl container found for url ', url);
|
||||||
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
$(crawlContainer).each((_, element) => {
|
$(crawlContainer).each((_, element) => {
|
||||||
@@ -32,8 +33,9 @@ export function parse(crawlContainer, crawlFields, text) {
|
|||||||
let value;
|
let value;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
const selector = fieldSelector.includes('|')
|
||||||
const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector;
|
? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
|
||||||
|
: fieldSelector;
|
||||||
|
|
||||||
if (selector.includes('@')) {
|
if (selector.includes('@')) {
|
||||||
const [sel, attr] = selector.split('@');
|
const [sel, attr] = selector.split('@');
|
||||||
@@ -48,7 +50,9 @@ export function parse(crawlContainer, crawlFields, text) {
|
|||||||
|
|
||||||
// Apply modifiers if specified
|
// Apply modifiers if specified
|
||||||
if (fieldSelector.includes('|')) {
|
if (fieldSelector.includes('|')) {
|
||||||
const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim());
|
/* eslint-disable no-unused-vars */
|
||||||
|
const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
|
||||||
|
/* eslint-disable no-unused-vars */
|
||||||
value = applyModifiers(value, modifiers);
|
value = applyModifiers(value, modifiers);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -73,7 +77,7 @@ export function parse(crawlContainer, crawlFields, text) {
|
|||||||
function applyModifiers(value, modifiers) {
|
function applyModifiers(value, modifiers) {
|
||||||
if (!value) return value;
|
if (!value) return value;
|
||||||
|
|
||||||
modifiers.forEach(modifier => {
|
modifiers.forEach((modifier) => {
|
||||||
switch (modifier) {
|
switch (modifier) {
|
||||||
case 'int':
|
case 'int':
|
||||||
value = parseInt(value, 10);
|
value = parseInt(value, 10);
|
||||||
@@ -91,4 +95,3 @@ function applyModifiers(value, modifiers) {
|
|||||||
|
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -11,18 +11,19 @@ export default async function execute(url, waitForSelector, options) {
|
|||||||
|
|
||||||
browser = await puppeteer.launch({
|
browser = await puppeteer.launch({
|
||||||
headless: options.puppeteerHeadless ?? true,
|
headless: options.puppeteerHeadless ?? true,
|
||||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox']
|
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
|
||||||
|
timeout: options.puppeteerTimeout || 30_000,
|
||||||
});
|
});
|
||||||
let page = await browser.newPage();
|
let page = await browser.newPage();
|
||||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||||
const response = await page.goto(url, {
|
const response = await page.goto(url, {
|
||||||
waitUntil: 'domcontentloaded'
|
waitUntil: 'domcontentloaded',
|
||||||
});
|
});
|
||||||
let pageSource;
|
let pageSource;
|
||||||
//if we're extracting data from a spa, we must wait for the selector
|
//if we're extracting data from a spa, we must wait for the selector
|
||||||
if (waitForSelector != null) {
|
if (waitForSelector != null) {
|
||||||
await page.waitForSelector(waitForSelector);
|
await page.waitForSelector(waitForSelector);
|
||||||
pageSource = await page.evaluate(selector => {
|
pageSource = await page.evaluate((selector) => {
|
||||||
return document.querySelector(selector).innerHTML;
|
return document.querySelector(selector).innerHTML;
|
||||||
}, waitForSelector);
|
}, waitForSelector);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -1,34 +1,31 @@
|
|||||||
let debuggingOn = false;
|
let debuggingOn = false;
|
||||||
|
|
||||||
export const DEFAULT_HEADER = {
|
export const DEFAULT_HEADER = {
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
||||||
'Accept-Language': 'en-US,en;q=0.5',
|
'Accept-Language': 'en-US,en;q=0.5',
|
||||||
'Connection': 'keep-alive',
|
Connection: 'keep-alive',
|
||||||
'Upgrade-Insecure-Requests': '1',
|
'Upgrade-Insecure-Requests': '1',
|
||||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
|
'User-Agent':
|
||||||
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||||
};
|
};
|
||||||
|
|
||||||
export const setDebug = options => {
|
export const setDebug = (options) => {
|
||||||
debuggingOn = !!options?.debug;
|
debuggingOn = !!options?.debug;
|
||||||
};
|
};
|
||||||
|
|
||||||
export const debug = (message) => {
|
export const debug = (message) => {
|
||||||
if (debuggingOn) {
|
if (debuggingOn) {
|
||||||
|
/* eslint-disable no-console */
|
||||||
console.debug(message);
|
console.debug(message);
|
||||||
|
/* eslint-enable no-console */
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
export const botDetected = (pageSource, statusCode) => {
|
export const botDetected = (pageSource, statusCode) => {
|
||||||
const suspiciousStatusCodes = [
|
const suspiciousStatusCodes = [403, 429];
|
||||||
403, 429
|
const botDetectionPatterns = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i];
|
||||||
];
|
|
||||||
const botDetectionPatterns = [
|
|
||||||
/verify you are human/i,
|
|
||||||
/access denied/i,
|
|
||||||
/x-amz-cf-id/i,
|
|
||||||
];
|
|
||||||
|
|
||||||
const detectedInSource = botDetectionPatterns.some(pattern => pattern.test(pageSource));
|
const detectedInSource = botDetectionPatterns.some((pattern) => pattern.test(pageSource));
|
||||||
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
|
const detectedByStatus = suspiciousStatusCodes.includes(statusCode);
|
||||||
|
|
||||||
return detectedInSource || detectedByStatus;
|
return detectedInSource || detectedByStatus;
|
||||||
|
|||||||
14
package.json
14
package.json
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "fredy",
|
"name": "fredy",
|
||||||
"version": "11.0.0",
|
"version": "11.0.1",
|
||||||
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node prod.js",
|
"start": "node prod.js",
|
||||||
@@ -50,17 +50,17 @@
|
|||||||
"Firefox ESR"
|
"Firefox ESR"
|
||||||
],
|
],
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@douyinfe/semi-ui": "2.71.3",
|
"@douyinfe/semi-ui": "2.72.3",
|
||||||
"@rematch/core": "2.2.0",
|
"@rematch/core": "2.2.0",
|
||||||
"@rematch/loading": "2.1.2",
|
"@rematch/loading": "2.1.2",
|
||||||
"@sendgrid/mail": "8.1.4",
|
"@sendgrid/mail": "8.1.4",
|
||||||
"@vitejs/plugin-react": "4.3.4",
|
"@vitejs/plugin-react": "4.3.4",
|
||||||
"better-sqlite3": "^11.7.0",
|
"better-sqlite3": "^11.7.2",
|
||||||
"body-parser": "1.20.3",
|
"body-parser": "1.20.3",
|
||||||
"cheerio": "^1.0.0",
|
"cheerio": "^1.0.0",
|
||||||
"cookie-session": "2.1.0",
|
"cookie-session": "2.1.0",
|
||||||
"handlebars": "4.7.8",
|
"handlebars": "4.7.8",
|
||||||
"highcharts": "12.1.0",
|
"highcharts": "12.1.2",
|
||||||
"highcharts-react-official": "3.2.1",
|
"highcharts-react-official": "3.2.1",
|
||||||
"lodash": "4.17.21",
|
"lodash": "4.17.21",
|
||||||
"lowdb": "6.0.1",
|
"lowdb": "6.0.1",
|
||||||
@@ -70,7 +70,7 @@
|
|||||||
"node-fetch": "3.3.2",
|
"node-fetch": "3.3.2",
|
||||||
"node-mailjet": "6.0.6",
|
"node-mailjet": "6.0.6",
|
||||||
"package-up": "^5.0.0",
|
"package-up": "^5.0.0",
|
||||||
"puppeteer": "^23.10.4",
|
"puppeteer": "^23.11.1",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||||
"query-string": "9.1.1",
|
"query-string": "9.1.1",
|
||||||
@@ -95,12 +95,12 @@
|
|||||||
"chai": "5.1.2",
|
"chai": "5.1.2",
|
||||||
"eslint": "8.56.0",
|
"eslint": "8.56.0",
|
||||||
"eslint-config-prettier": "8.8.0",
|
"eslint-config-prettier": "8.8.0",
|
||||||
"eslint-plugin-react": "7.37.2",
|
"eslint-plugin-react": "7.37.3",
|
||||||
"esmock": "2.6.9",
|
"esmock": "2.6.9",
|
||||||
"history": "5.3.0",
|
"history": "5.3.0",
|
||||||
"husky": "9.1.7",
|
"husky": "9.1.7",
|
||||||
"less": "4.2.1",
|
"less": "4.2.1",
|
||||||
"lint-staged": "15.2.11",
|
"lint-staged": "15.3.0",
|
||||||
"mocha": "10.8.2",
|
"mocha": "10.8.2",
|
||||||
"prettier": "3.4.2",
|
"prettier": "3.4.2",
|
||||||
"redux-logger": "3.0.6"
|
"redux-logger": "3.0.6"
|
||||||
|
|||||||
Reference in New Issue
Block a user