better logging, fixing code smells

This commit is contained in:
Christian Kellner
2025-01-07 12:25:19 +01:00
parent 2aaf63c253
commit 90a4ee5dcf
5 changed files with 232 additions and 231 deletions

View File

@@ -1,45 +1,43 @@
import {setDebug} from './utils.js';
import { setDebug } from './utils.js';
import puppeteerExtractor from './puppeteerExtractor.js';
import {loadParser, parse} from './parser/parser.js';
import { loadParser, parse } from './parser/parser.js';
const DEFAULT_OPTIONS = {
debug: false,
puppeteerTimeout: 20_000,
puppeteerHeadless: true
debug: false,
puppeteerTimeout: 60_000,
puppeteerHeadless: true,
};
export default class Extractor {
constructor(options) {
this.options = {
...DEFAULT_OPTIONS,
...options
};
this.responseText = null;
setDebug(this.options);
constructor(options) {
this.options = {
...DEFAULT_OPTIONS,
...options,
};
this.responseText = null;
setDebug(this.options);
}
/**
* if you are extracting data from a SPA, you must provide a selector, otherwise
* your response will never contain what you are really looking for
* @param url
* @param waitForSelector
*/
execute = async (url, waitForSelector = null) => {
this.responseText = null;
try {
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
if (this.responseText != null) {
loadParser(this.responseText);
}
} catch (error) {
console.error('Error trying to load page.', error);
}
return this;
};
/**
* if you are extracting data from a SPA, you must provide a selector, otherwise
* your response will never contain what you are really looking for
* @param url
* @param waitForSelector
*/
execute = async (url, waitForSelector = null) => {
this.responseText = null;
try {
this.responseText = await puppeteerExtractor(url, waitForSelector, this.options);
if(this.responseText != null) {
loadParser(this.responseText);
}
} catch (error) {
console.error('Error trying to load page.', error);
}
return this;
};
parseResponseText = (crawlContainer, crawlFields) => {
return parse(crawlContainer, crawlFields, this.responseText);
};
parseResponseText = (crawlContainer, crawlFields, url) => {
return parse(crawlContainer, crawlFields, this.responseText, url);
};
}