mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
adding puppeteer timeout and fixing waitForSelector
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
import utils, {buildHash} from '../utils.js';
|
||||
import utils, { buildHash } from '../utils.js';
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
@@ -26,7 +26,7 @@ const config = {
|
||||
url: null,
|
||||
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||
sortByDateParam: 'sortby=19',
|
||||
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||
waitForSelector: 'div[data-testid="serp-resultscount-testid"]',
|
||||
crawlFields: {
|
||||
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
|
||||
title: 'button@title |trim',
|
||||
|
||||
@@ -18,7 +18,7 @@ const config = {
|
||||
crawlContainer:
|
||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
||||
sortByDateParam: 'order=DateDesc',
|
||||
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
|
||||
waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]',
|
||||
crawlFields: {
|
||||
id: 'a@href',
|
||||
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
||||
|
||||
@@ -21,6 +21,7 @@ export function parse(crawlContainer, crawlFields, text, url) {
|
||||
|
||||
if ($(crawlContainer).length === 0) {
|
||||
console.error('No elements in crawl container found for url ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
$(crawlContainer).each((_, element) => {
|
||||
|
||||
@@ -1,48 +1,49 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import {debug, DEFAULT_HEADER, botDetected} from './utils.js';
|
||||
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
export default async function execute(url, waitForSelector, options) {
|
||||
let browser;
|
||||
try {
|
||||
debug(`Sending request to ${url} using Puppeteer.`);
|
||||
let browser;
|
||||
try {
|
||||
debug(`Sending request to ${url} using Puppeteer.`);
|
||||
|
||||
browser = await puppeteer.launch({
|
||||
headless: options.puppeteerHeadless ?? true,
|
||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox']
|
||||
});
|
||||
let page = await browser.newPage();
|
||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: 'domcontentloaded'
|
||||
});
|
||||
let pageSource;
|
||||
//if we're extracting data from a spa, we must wait for the selector
|
||||
if (waitForSelector != null) {
|
||||
await page.waitForSelector(waitForSelector);
|
||||
pageSource = await page.evaluate(selector => {
|
||||
return document.querySelector(selector).innerHTML;
|
||||
}, waitForSelector);
|
||||
} else {
|
||||
pageSource = await page.content();
|
||||
}
|
||||
|
||||
const statusCode = response.status();
|
||||
|
||||
if (botDetected(pageSource, statusCode)) {
|
||||
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
return await page.content();
|
||||
} catch (error) {
|
||||
console.error('Error executing with puppeteer executor', error);
|
||||
return null;
|
||||
} finally {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
}
|
||||
browser = await puppeteer.launch({
|
||||
headless: options.puppeteerHeadless ?? true,
|
||||
args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
|
||||
timeout: options.puppeteerTimeout || 30_000,
|
||||
});
|
||||
let page = await browser.newPage();
|
||||
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
|
||||
const response = await page.goto(url, {
|
||||
waitUntil: 'domcontentloaded',
|
||||
});
|
||||
let pageSource;
|
||||
//if we're extracting data from a spa, we must wait for the selector
|
||||
if (waitForSelector != null) {
|
||||
await page.waitForSelector(waitForSelector);
|
||||
pageSource = await page.evaluate((selector) => {
|
||||
return document.querySelector(selector).innerHTML;
|
||||
}, waitForSelector);
|
||||
} else {
|
||||
pageSource = await page.content();
|
||||
}
|
||||
}
|
||||
|
||||
const statusCode = response.status();
|
||||
|
||||
if (botDetected(pageSource, statusCode)) {
|
||||
console.warn('We have been detected as a bot :-/ Tried url: => ', url);
|
||||
return null;
|
||||
}
|
||||
|
||||
return await page.content();
|
||||
} catch (error) {
|
||||
console.error('Error executing with puppeteer executor', error);
|
||||
return null;
|
||||
} finally {
|
||||
if (browser != null) {
|
||||
await browser.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user