mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3f294b8099 | ||
|
|
11fd18e76a |
@@ -23,7 +23,7 @@ const config = {
|
|||||||
url: null,
|
url: null,
|
||||||
crawlContainer: '.col-12.mb-4',
|
crawlContainer: '.col-12.mb-4',
|
||||||
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
|
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
|
||||||
waitForSelector: '.nbk-section',
|
waitForSelector: 'div[data-live-name-value="SearchList"]',
|
||||||
crawlFields: {
|
crawlFields: {
|
||||||
id: 'a@href',
|
id: 'a@href',
|
||||||
title: 'a@title | removeNewline | trim',
|
title: 'a@title | removeNewline | trim',
|
||||||
|
|||||||
@@ -9,12 +9,12 @@ export function loadParser(text) {
|
|||||||
|
|
||||||
export function parse(crawlContainer, crawlFields, text, url) {
|
export function parse(crawlContainer, crawlFields, text, url) {
|
||||||
if (!text) {
|
if (!text) {
|
||||||
logger.warn('No content found for ', url);
|
logger.debug('No content found for ', url);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!crawlContainer || !crawlFields) {
|
if (!crawlContainer || !crawlFields) {
|
||||||
logger.warn('Cannot parse, selector was empty for url ', url);
|
logger.debug('Cannot parse, selector was empty for url ', url);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,30 +2,56 @@ import puppeteer from 'puppeteer-extra';
|
|||||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||||
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
|
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
|
||||||
import logger from '../logger.js';
|
import logger from '../logger.js';
|
||||||
|
import fs from 'fs';
|
||||||
|
import os from 'os';
|
||||||
|
import path from 'path';
|
||||||
|
|
||||||
puppeteer.use(StealthPlugin());
|
puppeteer.use(StealthPlugin());
|
||||||
|
|
||||||
export default async function execute(url, waitForSelector, options) {
|
export default async function execute(url, waitForSelector, options) {
|
||||||
let browser;
|
let browser;
|
||||||
|
let page;
|
||||||
|
let result = null;
|
||||||
|
let userDataDir;
|
||||||
|
let removeUserDataDir = false;
|
||||||
try {
|
try {
|
||||||
debug(`Sending request to ${url} using Puppeteer.`);
|
debug(`Sending request to ${url} using Puppeteer.`);
|
||||||
|
|
||||||
|
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
|
||||||
|
if (options && options.userDataDir) {
|
||||||
|
userDataDir = options.userDataDir;
|
||||||
|
removeUserDataDir = !!options.cleanupUserDataDir;
|
||||||
|
} else {
|
||||||
|
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
|
||||||
|
userDataDir = fs.mkdtempSync(prefix);
|
||||||
|
removeUserDataDir = true;
|
||||||
|
}
|
||||||
|
|
||||||
browser = await puppeteer.launch({
|
browser = await puppeteer.launch({
|
||||||
headless: options.puppeteerHeadless ?? true,
|
headless: options.puppeteerHeadless ?? true,
|
||||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
|
args: [
|
||||||
|
'--no-sandbox',
|
||||||
|
'--disable-gpu',
|
||||||
|
'--disable-setuid-sandbox',
|
||||||
|
'--disable-dev-shm-usage',
|
||||||
|
'--disable-crash-reporter',
|
||||||
|
],
|
||||||
timeout: options.puppeteerTimeout || 30_000,
|
timeout: options.puppeteerTimeout || 30_000,
|
||||||
|
userDataDir,
|
||||||
});
|
});
|
||||||
let page = await browser.newPage();
|
page = await browser.newPage();
|
||||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||||
const response = await page.goto(url, {
|
const response = await page.goto(url, {
|
||||||
waitUntil: 'domcontentloaded',
|
waitUntil: 'domcontentloaded',
|
||||||
});
|
});
|
||||||
let pageSource;
|
let pageSource;
|
||||||
//if we're extracting data from a spa, we must wait for the selector
|
// if we're extracting data from a SPA, we must wait for the selector
|
||||||
if (waitForSelector != null) {
|
if (waitForSelector != null) {
|
||||||
await page.waitForSelector(waitForSelector);
|
const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000;
|
||||||
|
await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
|
||||||
pageSource = await page.evaluate((selector) => {
|
pageSource = await page.evaluate((selector) => {
|
||||||
return document.querySelector(selector).innerHTML;
|
const el = document.querySelector(selector);
|
||||||
|
return el ? el.innerHTML : '';
|
||||||
}, waitForSelector);
|
}, waitForSelector);
|
||||||
} else {
|
} else {
|
||||||
pageSource = await page.content();
|
pageSource = await page.content();
|
||||||
@@ -35,16 +61,35 @@ export default async function execute(url, waitForSelector, options) {
|
|||||||
|
|
||||||
if (botDetected(pageSource, statusCode)) {
|
if (botDetected(pageSource, statusCode)) {
|
||||||
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
||||||
return null;
|
result = null;
|
||||||
|
} else {
|
||||||
|
result = pageSource || (await page.content());
|
||||||
}
|
}
|
||||||
|
|
||||||
return await page.content();
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error('Error executing with puppeteer executor', error);
|
logger.error('Error executing with puppeteer executor', error);
|
||||||
return null;
|
result = null;
|
||||||
} finally {
|
} finally {
|
||||||
if (browser != null) {
|
try {
|
||||||
await browser.close();
|
if (page) {
|
||||||
|
await page.close();
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
if (browser != null) {
|
||||||
|
await browser.close();
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
if (removeUserDataDir && userDataDir) {
|
||||||
|
await fs.promises.rm(userDataDir, { recursive: true, force: true });
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// ignore
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "fredy",
|
"name": "fredy",
|
||||||
"version": "12.1.5",
|
"version": "12.1.7",
|
||||||
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"prepare": "husky",
|
"prepare": "husky",
|
||||||
@@ -70,7 +70,7 @@
|
|||||||
"handlebars": "4.7.8",
|
"handlebars": "4.7.8",
|
||||||
"lodash": "4.17.21",
|
"lodash": "4.17.21",
|
||||||
"markdown": "^0.5.0",
|
"markdown": "^0.5.0",
|
||||||
"nanoid": "5.1.5",
|
"nanoid": "5.1.6",
|
||||||
"node-cron": "^4.2.1",
|
"node-cron": "^4.2.1",
|
||||||
"node-fetch": "3.3.2",
|
"node-fetch": "3.3.2",
|
||||||
"node-mailjet": "6.0.9",
|
"node-mailjet": "6.0.9",
|
||||||
|
|||||||
@@ -5406,10 +5406,10 @@ nano-spawn@1.0.3:
|
|||||||
resolved "https://registry.yarnpkg.com/nano-spawn/-/nano-spawn-1.0.3.tgz#ef8d89a275eebc8657e67b95fc312a6527a05b8d"
|
resolved "https://registry.yarnpkg.com/nano-spawn/-/nano-spawn-1.0.3.tgz#ef8d89a275eebc8657e67b95fc312a6527a05b8d"
|
||||||
integrity sha512-jtpsQDetTnvS2Ts1fiRdci5rx0VYws5jGyC+4IYOTnIQ/wwdf6JdomlHBwqC3bJYOvaKu0C2GSZ1A60anrYpaA==
|
integrity sha512-jtpsQDetTnvS2Ts1fiRdci5rx0VYws5jGyC+4IYOTnIQ/wwdf6JdomlHBwqC3bJYOvaKu0C2GSZ1A60anrYpaA==
|
||||||
|
|
||||||
nanoid@5.1.5:
|
nanoid@5.1.6:
|
||||||
version "5.1.5"
|
version "5.1.6"
|
||||||
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.5.tgz#f7597f9d9054eb4da9548cdd53ca70f1790e87de"
|
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.6.tgz#30363f664797e7d40429f6c16946d6bd7a3f26c9"
|
||||||
integrity sha512-Ir/+ZpE9fDsNH0hQ3C68uyThDXzYcim2EqcZ8zn8Chtt1iylPT9xXJB0kPCnqzgcEGikO9RxSrh63MsmVCU7Fw==
|
integrity sha512-c7+7RQ+dMB5dPwwCp4ee1/iV/q2P6aK1mTZcfr1BTuVlyW9hJYiMPybJCcnBlQtuSmTIWNeazm/zqNoZSSElBg==
|
||||||
|
|
||||||
nanoid@^3.3.11:
|
nanoid@^3.3.11:
|
||||||
version "3.3.11"
|
version "3.3.11"
|
||||||
|
|||||||
Reference in New Issue
Block a user