better logging, fixing code smells

This commit is contained in:
Christian Kellner
2025-01-07 12:25:19 +01:00
parent 2aaf63c253
commit 90a4ee5dcf
5 changed files with 232 additions and 231 deletions

View File

@@ -3,92 +3,94 @@ import * as cheerio from 'cheerio';
let $ = null;
export function loadParser(text) {
$ = cheerio.load(text);
$ = cheerio.load(text);
}
export function parse(crawlContainer, crawlFields, text) {
if (!text) {
console.warn('Cannot parse, text was empty.');
return null;
}
export function parse(crawlContainer, crawlFields, text, url) {
if (!text) {
console.warn('Cannot parse, text was empty for url ', url);
return null;
}
if (!crawlContainer || !crawlFields) {
console.warn('Cannot parse, selector was empty.');
return null;
}
if (!crawlContainer || !crawlFields) {
console.warn('Cannot parse, selector was empty for url ', url);
return null;
}
const result = [];
const result = [];
if ($(crawlContainer).length === 0) {
console.error('No elements in crawl container found!');
}
if ($(crawlContainer).length === 0) {
console.error('No elements in crawl container found for url ', url);
}
$(crawlContainer).each((_, element) => {
const container = $(element);
const parsedObject = {};
$(crawlContainer).each((_, element) => {
const container = $(element);
const parsedObject = {};
// Parse fields based on crawlFields
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
let value;
// Parse fields based on crawlFields
for (const [key, fieldSelector] of Object.entries(crawlFields)) {
let value;
try {
try {
const selector = fieldSelector.includes('|')
? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim()
: fieldSelector;
const selector = fieldSelector.includes('|') ? fieldSelector.substring(0, fieldSelector.indexOf('|')).trim() : fieldSelector;
if (selector.includes('@')) {
const [sel, attr] = selector.split('@');
if (sel.length === 0) {
value = container.attr(attr.trim());
} else {
value = container.find(sel.trim()).attr(attr.trim());
}
} else {
value = container.find(selector.trim()).text();
}
// Apply modifiers if specified
if (fieldSelector.includes('|')) {
const [_, ...modifiers] = fieldSelector.split('|').map(s => s.trim());
value = applyModifiers(value, modifiers);
}
parsedObject[key] = value || null;
} catch (error) {
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
parsedObject[key] = null;
}
}
if (parsedObject.id != null) {
result.push(parsedObject);
if (selector.includes('@')) {
const [sel, attr] = selector.split('@');
if (sel.length === 0) {
value = container.attr(attr.trim());
} else {
value = container.find(sel.trim()).attr(attr.trim());
}
} else {
console.warn('ID not found. Not relaying object.');
value = container.find(selector.trim()).text();
}
});
return result;
// Apply modifiers if specified
if (fieldSelector.includes('|')) {
/* eslint-disable no-unused-vars */
const [_, ...modifiers] = fieldSelector.split('|').map((s) => s.trim());
/* eslint-disable no-unused-vars */
value = applyModifiers(value, modifiers);
}
parsedObject[key] = value || null;
} catch (error) {
console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
parsedObject[key] = null;
}
}
if (parsedObject.id != null) {
result.push(parsedObject);
} else {
console.warn('ID not found. Not relaying object.');
}
});
return result;
}
// Helper function to apply modifiers
function applyModifiers(value, modifiers) {
if (!value) return value;
if (!value) return value;
modifiers.forEach(modifier => {
switch (modifier) {
case 'int':
value = parseInt(value, 10);
break;
case 'trim':
value = value.replace(/\s+/g, ' ').trim();
break;
case 'removeNewline':
value = value.replace(/\n/g, ' ');
break;
default:
console.warn(`Unknown modifier: ${modifier}`);
}
});
modifiers.forEach((modifier) => {
switch (modifier) {
case 'int':
value = parseInt(value, 10);
break;
case 'trim':
value = value.replace(/\s+/g, ' ').trim();
break;
case 'removeNewline':
value = value.replace(/\n/g, ' ');
break;
default:
console.warn(`Unknown modifier: ${modifier}`);
}
});
return value;
return value;
}