mirror of
https://github.com/orangecoding/fredy.git
synced 2026-06-16 12:31:07 +00:00
committed by
GitHub
parent
58965a6f1b
commit
214e714c03
@@ -1,134 +1,118 @@
|
||||
import { NoNewListingsWarning } from './errors.js';
|
||||
import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js';
|
||||
import {NoNewListingsWarning} from './errors.js';
|
||||
import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js';
|
||||
import * as notify from './notification/notify.js';
|
||||
import xray from './services/scraper.js';
|
||||
import * as scrapingAnt from './services/scrapingAnt.js';
|
||||
import Extractor from './services/extractor/extractor.js';
|
||||
import urlModifier from './services/queryStringMutator.js';
|
||||
|
||||
class FredyRuntime {
|
||||
/**
|
||||
*
|
||||
* @param providerConfig the config for the specific provider, we're going to query at the moment
|
||||
* @param notificationConfig the config for all notifications
|
||||
* @param providerId the id of the provider currently in use
|
||||
* @param jobKey key of the job that is currently running (from within the config)
|
||||
* @param similarityCache cache instance holding values to check for similarity of entries
|
||||
*/
|
||||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
|
||||
this._providerConfig = providerConfig;
|
||||
this._notificationConfig = notificationConfig;
|
||||
this._providerId = providerId;
|
||||
this._jobKey = jobKey;
|
||||
this._similarityCache = similarityCache;
|
||||
}
|
||||
execute() {
|
||||
return (
|
||||
//modify the url to make sure search order is correctly set
|
||||
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
|
||||
//scraping the site and try finding new listings
|
||||
.then(this._getListings.bind(this))
|
||||
//bring them in a proper form (dictated by the provider)
|
||||
.then(this._normalize.bind(this))
|
||||
//filter listings with stuff tagged by the blacklist of the provider
|
||||
.then(this._filter.bind(this))
|
||||
//check if new listings available. if so proceed
|
||||
.then(this._findNew.bind(this))
|
||||
//store everything in db
|
||||
.then(this._save.bind(this))
|
||||
//check for similar listings. if found, remove them before notifying
|
||||
.then(this._filterBySimilarListings.bind(this))
|
||||
//notify the user using the configured notification adapter
|
||||
.then(this._notify.bind(this))
|
||||
//if an error occurred on the way, handle it here.
|
||||
.catch(this._handleError.bind(this))
|
||||
);
|
||||
}
|
||||
_getListings(url) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const id = this._providerId;
|
||||
if (scrapingAnt.needScrapingAnt(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
|
||||
const error = 'Immoscout or Immonet can only be used with if you have set an apikey for scrapingAnt.';
|
||||
/* eslint-disable no-console */
|
||||
console.log(error);
|
||||
/* eslint-enable no-console */
|
||||
reject(error);
|
||||
return;
|
||||
}
|
||||
const u = scrapingAnt.needScrapingAnt(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
|
||||
try {
|
||||
if (this._providerConfig.paginate != null) {
|
||||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
|
||||
//the first 2 pages should be enough here
|
||||
.limit(2)
|
||||
.paginate(this._providerConfig.paginate)
|
||||
.then((listings) => {
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
console.error(err);
|
||||
});
|
||||
} else {
|
||||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
|
||||
.then((listings) => {
|
||||
resolve(listings == null ? [] : listings);
|
||||
})
|
||||
.catch((err) => {
|
||||
reject(err);
|
||||
console.error(err);
|
||||
/**
|
||||
*
|
||||
* @param providerConfig the config for the specific provider, we're going to query at the moment
|
||||
* @param notificationConfig the config for all notifications
|
||||
* @param providerId the id of the provider currently in use
|
||||
* @param jobKey key of the job that is currently running (from within the config)
|
||||
* @param similarityCache cache instance holding values to check for similarity of entries
|
||||
*/
|
||||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
|
||||
this._providerConfig = providerConfig;
|
||||
this._notificationConfig = notificationConfig;
|
||||
this._providerId = providerId;
|
||||
this._jobKey = jobKey;
|
||||
this._similarityCache = similarityCache;
|
||||
}
|
||||
|
||||
execute() {
|
||||
return (
|
||||
//modify the url to make sure search order is correctly set
|
||||
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
|
||||
//scraping the site and try finding new listings
|
||||
.then(this._getListings.bind(this))
|
||||
//bring them in a proper form (dictated by the provider)
|
||||
.then(this._normalize.bind(this))
|
||||
//filter listings with stuff tagged by the blacklist of the provider
|
||||
.then(this._filter.bind(this))
|
||||
//check if new listings available. if so proceed
|
||||
.then(this._findNew.bind(this))
|
||||
//store everything in db
|
||||
.then(this._save.bind(this))
|
||||
//check for similar listings. if found, remove them before notifying
|
||||
.then(this._filterBySimilarListings.bind(this))
|
||||
//notify the user using the configured notification adapter
|
||||
.then(this._notify.bind(this))
|
||||
//if an error occurred on the way, handle it here.
|
||||
.catch(this._handleError.bind(this))
|
||||
);
|
||||
}
|
||||
|
||||
_getListings(url) {
|
||||
const extractor = new Extractor();
|
||||
return new Promise((resolve, reject) => {
|
||||
extractor.execute(url,this._providerConfig.waitForSelector)
|
||||
.then(() => {
|
||||
const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields);
|
||||
resolve(listings == null ? [] : listings);
|
||||
}).catch(err => {
|
||||
reject(err);
|
||||
/* eslint-disable no-console */
|
||||
console.error(err);
|
||||
/* eslint-enable no-console */
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
_normalize(listings) {
|
||||
return listings.map(this._providerConfig.normalize);
|
||||
}
|
||||
|
||||
_filter(listings) {
|
||||
//only return those where all the fields have been found
|
||||
const keys = Object.keys(this._providerConfig.crawlFields);
|
||||
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
|
||||
return filteredListings.filter(this._providerConfig.filter);
|
||||
}
|
||||
|
||||
_findNew(listings) {
|
||||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
} catch (error) {
|
||||
reject(error);
|
||||
console.error(error);
|
||||
}
|
||||
});
|
||||
}
|
||||
_normalize(listings) {
|
||||
return listings.map(this._providerConfig.normalize);
|
||||
}
|
||||
_filter(listings) {
|
||||
//only return those where all the fields have been found
|
||||
const keys = Object.keys(this._providerConfig.crawlFields);
|
||||
const filteredListings = listings.filter((item) => keys.every((key) => key in item));
|
||||
return filteredListings.filter(this._providerConfig.filter);
|
||||
}
|
||||
_findNew(listings) {
|
||||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
return newListings;
|
||||
}
|
||||
return newListings;
|
||||
}
|
||||
_notify(newListings) {
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
|
||||
_notify(newListings) {
|
||||
if (newListings.length === 0) {
|
||||
throw new NoNewListingsWarning();
|
||||
}
|
||||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
|
||||
return Promise.all(sendNotifications).then(() => newListings);
|
||||
}
|
||||
|
||||
_save(newListings) {
|
||||
const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
|
||||
newListings.forEach((listing) => {
|
||||
currentListings[listing.id] = Date.now();
|
||||
});
|
||||
setKnownListings(this._jobKey, this._providerId, currentListings);
|
||||
return newListings;
|
||||
}
|
||||
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredList = listings.filter((listing) => {
|
||||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
|
||||
if (similar) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
return !similar;
|
||||
});
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
|
||||
return filteredList;
|
||||
}
|
||||
|
||||
_handleError(err) {
|
||||
if (err.name !== 'NoNewListingsWarning') console.error(err);
|
||||
}
|
||||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
|
||||
return Promise.all(sendNotifications).then(() => newListings);
|
||||
}
|
||||
_save(newListings) {
|
||||
const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
|
||||
newListings.forEach((listing) => {
|
||||
currentListings[listing.id] = Date.now();
|
||||
});
|
||||
setKnownListings(this._jobKey, this._providerId, currentListings);
|
||||
return newListings;
|
||||
}
|
||||
_filterBySimilarListings(listings) {
|
||||
const filteredList = listings.filter((listing) => {
|
||||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
|
||||
if (similar) {
|
||||
/* eslint-disable no-console */
|
||||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
|
||||
/* eslint-enable no-console */
|
||||
}
|
||||
return !similar;
|
||||
});
|
||||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
|
||||
return filteredList;
|
||||
}
|
||||
_handleError(err) {
|
||||
if (err.name !== 'NoNewListingsWarning') console.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
export default FredyRuntime;
|
||||
|
||||
@@ -5,7 +5,6 @@ import * as userStorage from '../../services/storage/userStorage.js';
|
||||
import * as immoscoutProvider from '../../provider/immoscout.js';
|
||||
import { config } from '../../utils.js';
|
||||
import { isAdmin } from '../security.js';
|
||||
import {isScrapingAntApiKeySet} from '../../services/scrapingAnt.js';
|
||||
import {trackDemoJobCreated} from '../../services/tracking/Tracker.js';
|
||||
const service = restana();
|
||||
const jobRouter = service.newRouter();
|
||||
@@ -27,34 +26,14 @@ jobRouter.get('/', async (req, res) => {
|
||||
res.send();
|
||||
});
|
||||
jobRouter.get('/processingTimes', async (req, res) => {
|
||||
let scrapingAntData = {};
|
||||
if (isScrapingAntApiKeySet()) {
|
||||
try {
|
||||
const response = await fetch(`https://api.scrapingant.com/v2/usage?x-api-key=${config.scrapingAnt.apiKey}`);
|
||||
scrapingAntData = await response.json();
|
||||
} catch (Exception) {
|
||||
console.error('Could not query plan data from scraping ant.', Exception);
|
||||
}
|
||||
}
|
||||
res.body = {
|
||||
interval: config.interval,
|
||||
lastRun: config.lastRun || null,
|
||||
scrapingAntData,
|
||||
error: scrapingAntData?.detail == null ? null : scrapingAntData?.detail
|
||||
lastRun: config.lastRun || null
|
||||
};
|
||||
res.send();
|
||||
});
|
||||
jobRouter.post('/', async (req, res) => {
|
||||
const { provider, notificationAdapter, name, blacklist = [], jobId, enabled } = req.body;
|
||||
if (
|
||||
provider.find((p) => p.id === immoscoutProvider.metaInformation.id) != null &&
|
||||
(config.scrapingAnt.apiKey == null || config.scrapingAnt.apiKey.length === 0)
|
||||
) {
|
||||
res.send(
|
||||
new Error('To use Immoscout as provider, you need to configure ScrapingAnt first. Please check the readme.')
|
||||
);
|
||||
return;
|
||||
}
|
||||
try {
|
||||
jobStorage.upsertJob({
|
||||
userId: req.session.currentUser,
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
export const DEFAULT_CONFIG = {
|
||||
'interval': '60',
|
||||
'port': 9998,
|
||||
'scrapingAnt': {'apiKey': '', 'proxy': 'datacenter'},
|
||||
'workingHours': {'from': '', 'to': ''},
|
||||
'demoMode': false,
|
||||
'analyticsEnabled': null
|
||||
|
||||
@@ -2,14 +2,10 @@ import utils, { buildHash } from '../utils.js';
|
||||
let appliedBlackList = [];
|
||||
|
||||
function normalize(o) {
|
||||
let size = `${o.size.replace(' Wohnfläche ', '').trim()}`;
|
||||
if (o.rooms != null) {
|
||||
size += ` / / ${o.rooms.trim()}`;
|
||||
}
|
||||
const link = `https://www.1a-immobilienmarkt.de/expose/${o.id}.html`;
|
||||
const price = normalizePrice(o.price);
|
||||
const id = buildHash(o.id, price);
|
||||
return Object.assign(o, { id, price, size, link });
|
||||
return Object.assign(o, { id, price, link });
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -39,12 +35,12 @@ const config = {
|
||||
url: null,
|
||||
crawlContainer: '.tabelle',
|
||||
sortByDateParam: 'sort_type=newest',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.inner_object_data input[name="marker_objekt_id"]@value | int',
|
||||
price: '.tabelle .inner_object_data .single_data_price | removeNewline | trim',
|
||||
size: '.tabelle .inner_object_data .data_boxes div:nth-child(1)',
|
||||
rooms: '.tabelle .inner_object_data .data_boxes div:nth-child(2)',
|
||||
title: '.tabelle .inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
|
||||
price: '.inner_object_data .single_data_price | removeNewline | trim',
|
||||
size: '.tabelle .tabelle_inhalt_infos .single_data_box | removeNewline | trim',
|
||||
title: '.inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
|
||||
},
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
|
||||
@@ -11,8 +11,9 @@ function normalize(o) {
|
||||
const price = o.price || 'N/A €';
|
||||
const title = o.title || 'No title available';
|
||||
const address = o.address || 'No address available';
|
||||
const link = shortenLink(o.link);
|
||||
const id = buildHash(parseId(shortenLink(o.link)), o.price);
|
||||
const shortLink = shortenLink(o.link);
|
||||
const link = `https://www.immobilien.de/${shortLink}`;
|
||||
const id = buildHash(parseId(shortLink), o.price);
|
||||
return Object.assign(o, { id, price, size, title, address, link });
|
||||
}
|
||||
function applyBlacklist(o) {
|
||||
@@ -22,9 +23,11 @@ function applyBlacklist(o) {
|
||||
}
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '.estates_list .list_immo a._ref',
|
||||
crawlContainer: '._ref',
|
||||
sortByDateParam: 'sort_col=*created_ts&sort_dir=desc',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '@href', //will be transformed later
|
||||
price: '.list_entry .immo_preis .label_info',
|
||||
size: '.list_entry .flaeche .label_info | removeNewline | trim',
|
||||
title: '.list_entry .part_text h3 span',
|
||||
@@ -32,7 +35,6 @@ const config = {
|
||||
link: '@href',
|
||||
address: '.list_entry .place',
|
||||
},
|
||||
paginate: '.list_immo .blocknav .blocknav_list li.next a@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -1,12 +1,20 @@
|
||||
import utils, {buildHash} from '../utils.js';
|
||||
let appliedBlackList = [];
|
||||
|
||||
/**
|
||||
* Note, Immonet is really a piece of sh*t. It is using a weird combination of React and some buttons (instead of links),
|
||||
* so that if somebody clicks the listing, a new page will open with the actual link to the listing. Of course, a scraper
|
||||
* cannot do this (which is why I always just return the link to the whole list of listings).
|
||||
* This is not only bad for us, but also bad for people with disabilities...
|
||||
*/
|
||||
|
||||
function normalize(o) {
|
||||
const size = o.size != null ? o.size.replace('Wohnfläche ', '') : 'N/A m²';
|
||||
const price = o.price.replace('Kaufpreis ', '');
|
||||
const address = o.address.split(' • ')[o.address.split(' • ').length - 1];
|
||||
const title = o.title || 'No title available';
|
||||
const link = o.id;
|
||||
const id = buildHash(o.id.substring(o.id.lastIndexOf('/') + 1, o.id.length), price);
|
||||
const link = config.url;
|
||||
const id = buildHash(title, price);
|
||||
return Object.assign(o, { id, address, price, size, title, link });
|
||||
}
|
||||
function applyBlacklist(o) {
|
||||
@@ -16,16 +24,16 @@ function applyBlacklist(o) {
|
||||
}
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '.content-wrapper-tiles .ng-star-inserted',
|
||||
crawlContainer: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||
sortByDateParam: 'sortby=19',
|
||||
waitForSelector: 'div[data-testid="serp-core-classified-card-testid"]',
|
||||
crawlFields: {
|
||||
id: '.card a@href',
|
||||
title: '.card h3 |trim',
|
||||
price: '.card .has-font-300 .is-bold | trim',
|
||||
size: '.card .has-font-300 .ml-100 | trim',
|
||||
address: '.card span:nth-child(2) | trim',
|
||||
id: 'button@title |trim', // immonet is a piece of sh*t. See comment above
|
||||
title: 'button@title |trim',
|
||||
price: 'div[data-testid="cardmfe-price-testid"] | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] | trim',
|
||||
address: 'div[data-testid="cardmfe-description-box-address"] | trim',
|
||||
},
|
||||
paginate: '#idResultList .margin-bottom-6.margin-bottom-sm-12 .panel a.pull-right@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -17,6 +17,7 @@ const config = {
|
||||
url: null,
|
||||
crawlContainer: '#resultListItems li.result-list__listing',
|
||||
sortByDateParam: 'sorting=2',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.result-list-entry@data-obid | int',
|
||||
price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim',
|
||||
@@ -25,7 +26,6 @@ const config = {
|
||||
link: '.result-list-entry .result-list-entry__brand-title-container@href',
|
||||
address: '.result-list-entry .result-list-entry__map-link',
|
||||
},
|
||||
paginate: '#pager .align-right a@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -23,6 +23,7 @@ const config = {
|
||||
url: null,
|
||||
crawlContainer: '.js-serp-item',
|
||||
sortByDateParam: 's=most_recently_updated_first',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.js-bookmark-btn@data-id',
|
||||
price: 'div.align-items-start div:first-child | trim',
|
||||
@@ -31,7 +32,6 @@ const config = {
|
||||
link: '.ci-search-result__link@href',
|
||||
description: '.js-show-more-item-sm | removeNewline | trim',
|
||||
},
|
||||
paginate: 'li.page-item.pagination__item a.page-link@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -16,17 +16,17 @@ function applyBlacklist(o) {
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer:
|
||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"])',
|
||||
'div[data-testid="serp-core-scrollablelistview-testid"]:not(div[data-testid="serp-enlargementlist-testid"] div[data-testid="serp-card-testid"]) div[data-testid="serp-core-classified-card-testid"]',
|
||||
sortByDateParam: 'order=DateDesc',
|
||||
waitForSelector: 'div[data-testid="cardmfe-price-testid"]',
|
||||
crawlFields: {
|
||||
id: 'a@id',
|
||||
id: 'a@href',
|
||||
price: 'div[data-testid="cardmfe-price-testid"] | removeNewline | trim',
|
||||
size: 'div[data-testid="cardmfe-keyfacts-testid"] | removeNewline | trim',
|
||||
title: '.css-1cbj9xw',
|
||||
link: 'a@href',
|
||||
address: 'div[data-testid="cardmfe-description-box-address"] | removeNewline | trim',
|
||||
},
|
||||
paginate: '#pnlPaging #nlbPlus@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -6,7 +6,8 @@ let appliedBlacklistedDistricts = [];
|
||||
function normalize(o) {
|
||||
const size = o.size || '--- m²';
|
||||
const id = buildHash(o.id, o.price);
|
||||
return Object.assign(o, {id, size});
|
||||
const link = `https://www.kleinanzeigen.de${o.link}`;
|
||||
return Object.assign(o, {id, size, link});
|
||||
}
|
||||
|
||||
function applyBlacklist(o) {
|
||||
@@ -22,16 +23,16 @@ const config = {
|
||||
crawlContainer: '#srchrslt-adtable .ad-listitem ',
|
||||
//sort by date is standard oO
|
||||
sortByDateParam: null,
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '.aditem@data-adid | int',
|
||||
price: '.aditem-main--middle--price-shipping--price | removeNewline | trim',
|
||||
size: '.aditem-main .text-module-end span:nth-child(2) | removeNewline | trim',
|
||||
size: '.aditem-main .text-module-end | removeNewline | trim',
|
||||
title: '.aditem-main .text-module-begin a | removeNewline | trim',
|
||||
link: '.aditem-main .text-module-begin a@href | removeNewline | trim',
|
||||
description: '.aditem-main p:not(.text-module-end) | removeNewline | trim',
|
||||
description: '.aditem-main .aditem-main--middle--description | removeNewline | trim',
|
||||
address: '.aditem-main--top--left | trim | removeNewline',
|
||||
},
|
||||
paginate: '#srchrslt-pagination .pagination-next@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -8,7 +8,7 @@ function nullOrEmpty(val) {
|
||||
|
||||
function normalize(o) {
|
||||
const link = nullOrEmpty(o.link) ? 'NO LINK' : `https://www.neubaukompass.de${o.link.substring(o.link.indexOf('/neubau'))}`;
|
||||
const id = buildHash(o.id, o.price);
|
||||
const id = buildHash(o.link, o.price);
|
||||
return Object.assign(o, {id, link});
|
||||
}
|
||||
|
||||
@@ -18,16 +18,16 @@ function applyBlacklist(o) {
|
||||
|
||||
const config = {
|
||||
url: null,
|
||||
crawlContainer: '.nbk-container >div article',
|
||||
crawlContainer: '.col-12.mb-4',
|
||||
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
|
||||
waitForSelector: '.nbk-section',
|
||||
crawlFields: {
|
||||
id: '@id',
|
||||
title: 'a.nbk-truncate@title | removeNewline | trim',
|
||||
link: 'a.nbk-truncate@href',
|
||||
address: 'p.nbk-truncate | removeNewline | trim',
|
||||
price: 'p.nbk-mb-0 | removeNewline | trim',
|
||||
id: 'a@href',
|
||||
title: 'a@title | removeNewline | trim',
|
||||
link: 'a@href',
|
||||
address: '.nbk-project-card__description | removeNewline | trim',
|
||||
price: '.nbk-project-card__spec-item .nbk-project-card__spec-value | removeNewline | trim',
|
||||
},
|
||||
paginate: '.numbered-pager__bottom .numbered-pager--info li:nth-child(2) a@href',
|
||||
normalize: normalize,
|
||||
filter: applyBlacklist,
|
||||
};
|
||||
|
||||
@@ -17,6 +17,7 @@ const config = {
|
||||
url: null,
|
||||
crawlContainer: '#main_column .wgg_card',
|
||||
sortByDateParam: 'sort_column=0&sort_order=0',
|
||||
waitForSelector: 'body',
|
||||
crawlFields: {
|
||||
id: '@data-id',
|
||||
details: '.row .noprint .col-xs-11 |removeNewline |trim',
|
||||
|
||||
45
lib/services/extractor/extractor.js
Normal file
45
lib/services/extractor/extractor.js
Normal file
@@ -0,0 +1,45 @@
|
||||
import {setDebug} from './utils.js';
|
||||
import puppeteerExtractor from './puppeteerExtractor.js';
|
||||
import {loadParser, parse} from './parser/parser.js';
|
||||
|
||||
/**
 * Baseline settings for an Extractor. Anything passed to the Extractor
 * constructor is merged on top of these values.
 */
const DEFAULT_OPTIONS = {
  // forwarded to setDebug(); switches the extractor's debug logging on or off
  debug: false,
  // presumably the Puppeteer timeout in milliseconds - TODO confirm the consumer
  puppeteerTimeout: 20000,
  // handed to the Puppeteer extractor together with the other options
  puppeteerHeadless: true,
};
|
||||
|
||||
/**
 * Fetches a page through the Puppeteer extractor, remembers the returned markup
 * and exposes it to the parser.
 */
export default class Extractor {
  /**
   * @param options optional overrides, merged on top of DEFAULT_OPTIONS
   */
  constructor(options) {
    const merged = { ...DEFAULT_OPTIONS, ...options };
    this.options = merged;
    this.responseText = null;
    setDebug(merged);
  }

  /**
   * Loads the given url and stores the resulting markup on this instance.
   * When extracting data from a SPA you must provide a selector to wait for,
   * otherwise the response will never contain what you are really looking for.
   *
   * Failures are logged and not rethrown; `responseText` then stays null.
   *
   * @param url the page to load
   * @param waitForSelector css selector to wait for before the markup is read (optional)
   * @returns this instance, so calls can be chained
   */
  execute = async (url, waitForSelector = null) => {
    this.responseText = null;
    try {
      const markup = await puppeteerExtractor(url, waitForSelector, this.options);
      this.responseText = markup;
      if (markup == null) {
        return this;
      }
      loadParser(markup);
    } catch (error) {
      console.error('Error trying to load page.', error);
    }
    return this;
  };

  /**
   * Runs the parser over the markup stored by the last execute() call.
   *
   * @param crawlContainer selector of the element wrapping a single result
   * @param crawlFields mapping of result keys to field selectors
   * @returns whatever parse() returns for the stored markup
   */
  parseResponseText = (crawlContainer, crawlFields) => parse(crawlContainer, crawlFields, this.responseText);
}
|
||||
94
lib/services/extractor/parser/parser.js
Normal file
94
lib/services/extractor/parser/parser.js
Normal file
@@ -0,0 +1,94 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
|
||||
let $ = null;
|
||||
|
||||
/**
 * Parses the given markup with cheerio and keeps the resulting query handle in
 * the module-level `$` variable.
 *
 * @param text markup to load
 */
export function loadParser(text) {
  const loaded = cheerio.load(text);
  $ = loaded;
}
|
||||
|
||||
/**
 * Extracts one object per element matching `crawlContainer` from the given markup.
 *
 * Every entry of `crawlFields` maps a result key to a field selector of the form
 * `<css selector>[@<attribute>] [| modifier [| modifier ...]]`:
 *  - without `@`, the text of the matching descendants of the container is used,
 *  - with `@`, the named attribute is read; an empty css part (e.g. `@href`)
 *    reads the attribute from the container element itself,
 *  - modifiers are handed to applyModifiers in the order they are written.
 *
 * The markup is parsed from `text` on every call rather than read from the
 * module-level cheerio handle. That handle is shared by every caller of
 * loadParser, so it may belong to a different page (or still be null) by the
 * time this function runs.
 *
 * @param crawlContainer css selector matching the element that wraps a single result
 * @param crawlFields object mapping result keys to field selectors
 * @param text markup to parse
 * @returns {Array<Object>|null} the parsed objects that have a non-null `id`;
 *          null when the text or one of the selectors is missing
 */
export function parse(crawlContainer, crawlFields, text) {
  if (!text) {
    console.warn('Cannot parse, text was empty.');
    return null;
  }

  if (!crawlContainer || !crawlFields) {
    console.warn('Cannot parse, selector was empty.');
    return null;
  }

  const $doc = cheerio.load(text);
  const containers = $doc(crawlContainer);

  if (containers.length === 0) {
    console.error('No elements in crawl container found!');
  }

  const result = [];

  containers.each((_, element) => {
    const container = $doc(element);
    const parsedObject = {};

    for (const [key, fieldSelector] of Object.entries(crawlFields)) {
      try {
        // everything before the first pipe is the selector, the rest are modifiers
        const [selector, ...modifiers] = fieldSelector.split('|').map((part) => part.trim());
        let value;

        if (selector.includes('@')) {
          const [sel, attr] = selector.split('@');
          // "@attr" without a css part addresses the container element itself
          const target = sel.trim().length === 0 ? container : container.find(sel.trim());
          value = target.attr(attr.trim());
        } else {
          value = container.find(selector).text();
        }

        if (modifiers.length > 0) {
          value = applyModifiers(value, modifiers);
        }

        // empty strings, NaN and missing attributes are all reported as null
        parsedObject[key] = value || null;
      } catch (error) {
        console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error);
        parsedObject[key] = null;
      }
    }

    // objects without an id are dropped
    if (parsedObject.id != null) {
      result.push(parsedObject);
    } else {
      console.warn('ID not found. Not relaying object.');
    }
  });

  return result;
}
|
||||
|
||||
/**
 * Applies the given modifiers to a scraped value, in the order they are listed.
 *
 * Supported modifiers:
 *  - `int`: parses the value as a base-10 integer (may yield NaN),
 *  - `trim`: collapses every run of whitespace to a single space and trims both ends,
 *  - `removeNewline`: replaces each newline with a space.
 *
 * `trim` and `removeNewline` only make sense for strings. They are skipped for
 * any other value, so a chain such as `int | trim` no longer throws once `int`
 * has turned the value into a number. Unknown modifiers are reported with a
 * warning and otherwise ignored.
 *
 * @param value the raw value; falsy values are returned untouched
 * @param {string[]} modifiers modifier names to apply
 * @returns the modified value
 */
function applyModifiers(value, modifiers) {
  if (!value) return value;

  let result = value;

  for (const modifier of modifiers) {
    switch (modifier) {
      case 'int':
        result = Number.parseInt(result, 10);
        break;
      case 'trim':
        if (typeof result === 'string') {
          result = result.replace(/\s+/g, ' ').trim();
        }
        break;
      case 'removeNewline':
        if (typeof result === 'string') {
          result = result.replace(/\n/g, ' ');
        }
        break;
      default:
        console.warn(`Unknown modifier: ${modifier}`);
    }
  }

  return result;
}
|
||||
|
||||
48
lib/services/extractor/puppeteerExtractor.js
Normal file
48
lib/services/extractor/puppeteerExtractor.js
Normal file
@@ -0,0 +1,48 @@
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import {debug, DEFAULT_HEADER, botDetected} from './utils.js';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Loads the given url in a fresh Puppeteer browser and returns the page markup.
 *
 * @param url the page to load
 * @param waitForSelector css selector to wait for before reading the page; needed for
 *        SPAs whose content is rendered after the initial document has loaded. When given,
 *        only the markup inside the first matching element is checked for bot-detection
 *        patterns; otherwise the whole page is checked.
 * @param options extractor options; `puppeteerHeadless` (defaults to true) is passed to the
 *        browser launch and `puppeteerTimeout`, when set, becomes the page's default timeout
 * @returns {Promise<string|null>} the full page markup, or null if the page could not be
 *          loaded or the response looks like a bot-detection page
 */
export default async function execute(url, waitForSelector, options) {
  // tolerate a missing options object instead of failing on property access
  const { puppeteerHeadless, puppeteerTimeout } = options ?? {};
  let browser;
  try {
    debug(`Sending request to ${url} using Puppeteer.`);

    browser = await puppeteer.launch({
      headless: puppeteerHeadless ?? true,
      args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();
    if (puppeteerTimeout != null) {
      // applies to the navigation as well as to waitForSelector below
      page.setDefaultTimeout(puppeteerTimeout);
    }
    await page.setExtraHTTPHeaders(DEFAULT_HEADER);
    const response = await page.goto(url, {
      waitUntil: 'domcontentloaded',
    });

    // if we're extracting data from a spa, we must wait for the selector
    let fragment = null;
    if (waitForSelector != null) {
      await page.waitForSelector(waitForSelector);
      fragment = await page.evaluate((selector) => {
        const element = document.querySelector(selector);
        return element == null ? null : element.innerHTML;
      }, waitForSelector);
    }

    // read the page once and reuse it for both the bot check and the result
    const content = await page.content();

    // goto() can resolve without a response object; treat that as "no status"
    const statusCode = response?.status();

    if (botDetected(fragment ?? content, statusCode)) {
      console.warn('We have been detected as a bot :-/ Tried url: => ', url);
      return null;
    }

    return content;
  } catch (error) {
    console.error('Error executing with puppeteer executor', error);
    return null;
  } finally {
    // always release the browser, even when loading failed
    if (browser != null) {
      await browser.close();
    }
  }
}
|
||||
35
lib/services/extractor/utils.js
Normal file
35
lib/services/extractor/utils.js
Normal file
@@ -0,0 +1,35 @@
|
||||
let debuggingOn = false;
|
||||
|
||||
/**
 * Request headers used by the extractor; they describe a desktop Chrome browser.
 */
export const DEFAULT_HEADER = {
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.5',
  Connection: 'keep-alive',
  'Upgrade-Insecure-Requests': '1',
  'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
};
|
||||
|
||||
/**
 * Turns debug logging on or off, depending on the truthiness of `options.debug`.
 * A missing options object switches it off.
 *
 * @param options object that may carry a `debug` flag
 */
export function setDebug(options) {
  debuggingOn = Boolean(options?.debug);
}
|
||||
|
||||
/**
 * Writes the message via console.debug, but only while debug logging is switched on.
 *
 * @param message the value to log
 */
export function debug(message) {
  if (!debuggingOn) {
    return;
  }
  console.debug(message);
}
|
||||
|
||||
/**
 * Decides whether a response looks like a bot-detection page.
 *
 * @param pageSource markup to search for known bot-detection phrases (case-insensitive)
 * @param statusCode http status code of the response
 * @returns {boolean} true if the markup matches one of the phrases or the status is 403 or 429
 */
export function botDetected(pageSource, statusCode) {
  const matchesKnownPhrase = [/verify you are human/i, /access denied/i, /x-amz-cf-id/i].some((phrase) =>
    phrase.test(pageSource)
  );
  const hasBlockingStatus = [403, 429].includes(statusCode);

  if (matchesKnownPhrase) {
    return true;
  }
  return hasBlockingStatus;
}
|
||||
@@ -1,77 +0,0 @@
|
||||
import fetch from 'node-fetch';
|
||||
import { config } from '../utils.js';
|
||||
import { makeUrlResidential } from './scrapingAnt.js';
|
||||
import https from 'https';
|
||||
// If ScrapingAnt got blocked, this http status is returned.
const BLOCKED_HTTP_STATUS = 423;
const NOT_FOUND_HTTP_STATUS = 404;
// Maximum number of retries after a response with one of the expected status codes.
const MAX_RETRIES_SCRAPING_ANT = 10;
// Status codes that lead to a retry instead of giving up right away.
const EXPECTED_STATUS_CODES = [BLOCKED_HTTP_STATUS, NOT_FOUND_HTTP_STATUS];
// NOTE(review): certificate validation is switched off for every request made by the
// regular driver. Presumably this is deliberate (sites with broken certificate chains),
// but confirm, as it disables protection against man-in-the-middle attacks.
const agent = new https.Agent({
  rejectUnauthorized: false,
});

/**
 * Creates the request driver that is plugged into x-ray.
 *
 * Urls containing "scrapingant" are fetched through the ScrapingAnt driver (with retry
 * handling), everything else is fetched directly.
 *
 * @param {Object} [headers] headers sent with every request
 * @returns {function(Object, function): Promise<void>} driver taking the x-ray context
 *   (only `context.url` is read) and a node-style callback. The callback always receives
 *   `null` as error; on failure the result is an empty array.
 */
function makeDriver(headers = {}) {
  // Shared by both drivers created here. Starts as an empty string and is replaced by the
  // raw `set-cookie` array of the first successful ScrapingAnt response.
  // NOTE(review): that array is later sent as-is in the cookie header - confirm the
  // http client serializes it the way the remote side expects.
  let cookies = '';

  /**
   * Fetches a url through ScrapingAnt. Responses with an expected status code (blocked /
   * not found) are retried up to MAX_RETRIES_SCRAPING_ANT times, any other error ends
   * the request immediately with an empty result.
   */
  async function scrapingAntDriver(context, callback, retryCounter = 0) {
    const proxyType = config.scrapingAnt?.proxy || 'datacenter';
    try {
      const url = proxyType === 'residential' ? makeUrlResidential(context.url) : context.url;
      const response = await fetch(url, {
        headers: {
          ...headers,
          cookie: cookies,
        },
      });
      const result = await response.text();
      if (EXPECTED_STATUS_CODES.includes(response.status)) {
        // The status travels as the error message so the catch block can recognize it.
        throw new Error(`${response.status}`);
      }
      if (cookies.length === 0) {
        cookies = response.headers.raw()['set-cookie'] || [];
      }
      callback(null, result);
    } catch (exception) {
      /* eslint-disable no-console */
      const isExpectedStatus =
        EXPECTED_STATUS_CODES.includes(exception.response?.status) ||
        EXPECTED_STATUS_CODES.includes(Number(exception.message));
      // Strictly "less than": with "<=" one retry too many was made ("Retrying 11 / 10").
      const canRetry = isExpectedStatus && retryCounter < MAX_RETRIES_SCRAPING_ANT;
      if (!canRetry) {
        console.error(`Error while trying to scrape data from scraping ant. Received error: ${exception.message}`);
        callback(null, []);
        return;
      }
      const attempt = retryCounter + 1;
      console.debug(`ScrapingAnt got blocked. Retrying ${attempt} / ${MAX_RETRIES_SCRAPING_ANT}`);
      await scrapingAntDriver(context, callback, attempt);
      /* eslint-enable no-console */
    }
  }

  /**
   * The regular request driver is taking care of everything that doesn't need to be scraped by ScrapingAnt (which is
   * everything != Immoscout & Immonet as of writing this)
   */
  return async function driver(context, callback) {
    if (context.url.toLowerCase().includes('scrapingant')) {
      return scrapingAntDriver(context, callback);
    }
    try {
      const response = await fetch(context.url, {
        headers: {
          ...headers,
          Cookie: cookies,
        },
        agent,
      });
      const result = await response.text();
      callback(null, result);
    } catch (exception) {
      // eslint-disable-next-line no-console
      console.error(`Error while trying to scrape data. Received error: ${exception.message}`);
      callback(null, []);
    }
  };
}
|
||||
export default makeDriver;
|
||||
@@ -1,36 +0,0 @@
|
||||
import { config } from '../utils.js';
|
||||
import makeDriver from './requestDriver.js';
|
||||
import Xray from 'x-ray';
|
||||
/**
 * Builds an x-ray instance that uses the custom request driver and offers the
 * `removeNewline`, `trim` and `int` filters.
 */
class Scraper {
  constructor() {
    const requestHeaders = {
      'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.85 Safari/537.36',
    };
    // Only add the api key header when a key is configured.
    const apiKey = config.scrapingAnt?.apiKey;
    if (apiKey != null) {
      requestHeaders['x-api-key'] = apiKey;
    }

    const requestDriver = makeDriver(requestHeaders);
    const instance = Xray({
      filters: {
        removeNewline: this._removeNewline,
        trim: this._trim,
        int: this._int,
      },
    });
    instance.driver(requestDriver);
    this.xray = instance;
  }

  /** The configured x-ray instance. */
  get x() {
    return this.xray;
  }

  /**
   * Removes every backslash + "n" character pair from a string; other values pass through.
   * NOTE(review): the pattern matches the two literal characters, real line breaks stay
   * untouched - confirm this is intended.
   */
  _removeNewline(value) {
    if (typeof value !== 'string') {
      return value;
    }
    return value.replace(/\\n/g, '');
  }

  /** Collapses whitespace runs into a single space and trims the ends; non-strings pass through. */
  _trim(value) {
    if (typeof value !== 'string') {
      return value;
    }
    return value.replace(/\s+/g, ' ').trim();
  }

  /** Parses a string as a base-10 integer; non-strings pass through. */
  _int(value) {
    if (typeof value !== 'string') {
      return value;
    }
    return parseInt(value, 10);
  }
}
|
||||
export default new Scraper().x;
|
||||
@@ -1,30 +0,0 @@
|
||||
import { metaInformation as immoScoutInfo } from '../provider/immoscout.js';
|
||||
import { metaInformation as immoNetInfo } from '../provider/immonet.js';
|
||||
import { metaInformation as neuBauCompassInfo } from '../provider/neubauKompass.js';
|
||||
import { config } from '../utils.js';
|
||||
|
||||
// Extra query parameters appended to the ScrapingAnt url for Immonet: a selector to wait
// for and a base64 encoded js snippet that scrolls to the bottom of the page.
const additionalImmonetUrlParams = [
  '&wait_for_selector=.content-wrapper-tiles',
  '&js_snippet=',
  Buffer.from('window.scrollTo(0,document.body.scrollHeight);').toString('base64'),
].join('');
|
||||
|
||||
/**
 * Tells whether requests for the given provider have to go through ScrapingAnt
 * (Immoscout, Immonet and NeubauKompass).
 *
 * The comparison ignores case on both sides, so a provider id containing uppercase
 * characters is still recognized.
 *
 * @param {string} id id of the provider
 * @returns {boolean} true if the provider needs ScrapingAnt
 */
const needScrapingAnt = (id) => {
  const wantedId = id.toLowerCase();
  return [immoScoutInfo, immoNetInfo, neuBauCompassInfo].some((info) => info.id.toLowerCase() === wantedId);
};
|
||||
/**
 * Wraps a provider url into a ScrapingAnt api call when the provider requires it.
 *
 * @param {string} url the search url of the provider
 * @param {string} id id of the provider
 * @returns {string} the ScrapingAnt url (datacenter proxy, plus the additional
 *   parameters for Immonet) or the untouched url for all other providers
 */
export const transformUrlForScrapingAnt = (url, id) => {
  //only do calls to scrapingAnt when dealing with providers that need it
  if (!needScrapingAnt(id)) {
    return url;
  }
  const extraParams = id.toLowerCase() === immoNetInfo.id ? additionalImmonetUrlParams : '';
  return `https://api.scrapingant.com/v2/general?url=${encodeURIComponent(url)}&proxy_type=datacenter${extraParams}`;
};
|
||||
/**
 * Checks whether a ScrapingAnt api key is configured.
 *
 * @returns {boolean} true if `config.scrapingAnt.apiKey` exists and is longer than 8 characters
 */
export const isScrapingAntApiKeySet = () => {
  const apiKey = config.scrapingAnt?.apiKey;
  if (apiKey == null) {
    return false;
  }
  return apiKey.length > 8;
};
|
||||
/**
 * Switches a ScrapingAnt url from the datacenter proxy to the residential proxy.
 *
 * Only the value of the `proxy_type` query parameter is changed. A plain text
 * replacement of "datacenter" would hit the first occurrence anywhere in the url,
 * e.g. inside the encoded target url, and leave the proxy type untouched.
 *
 * @param {string} url ScrapingAnt url containing `proxy_type=datacenter`
 * @returns {string} the url with `proxy_type=residential`; unchanged if the parameter is absent
 */
export const makeUrlResidential = (url) => {
  return url.replace(/([?&]proxy_type=)datacenter(?=&|$)/, '$1residential');
};
|
||||
export { needScrapingAnt };
|
||||
Reference in New Issue
Block a user