adding or replacing sort params for provider urls when necessary

This commit is contained in:
orangecoding
2021-11-26 21:02:09 +01:00
parent aad0884976
commit c1c4d55ede
14 changed files with 135 additions and 2 deletions

View File

@@ -4,6 +4,7 @@ const { setKnownListings, getKnownListings } = require('./services/storage/listi
const notify = require('./notification/notify');
const xray = require('./services/scraper');
const scrapingAnt = require('./services/scrapingAnt');
const urlModifier = require('./services/queryStringMutator');
class FredyRuntime {
/**
@@ -24,7 +25,8 @@ class FredyRuntime {
execute() {
return (
Promise.resolve(this._providerConfig.url)
//modify the url to make sure search order is correctly set
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
//scraping the site and try finding new listings
.then(this._getListings.bind(this))
//bring them in a proper form (dictated by the provider)

View File

@@ -22,6 +22,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '.tabelle',
sortByDateParam: 'sort_type=newest',
crawlFields: {
id: '.inner_object_data input[name="marker_objekt_id"]@value | int',
price: '.tabelle .inner_object_data .single_data_price | removeNewline | trim',

View File

@@ -24,6 +24,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#result-list-stage .item',
sortByDateParam: 'sortby=19',
crawlFields: {
id: '@id',
price: 'div[id*="selPrice_"] | trim',

View File

@@ -20,6 +20,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#resultListItems li.result-list__listing',
sortByDateParam: 'sorting=2',
crawlFields: {
id: '.result-list-entry@data-obid | int',
price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim',

View File

@@ -16,6 +16,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: "div[class^='EstateItem-']",
sortByDateParam: 'sd=DESC&sf=TIMESTAMP',
crawlFields: {
id: 'a@id',
price: "div[class^='KeyFacts-'] [data-test='price'] | removeNewline | trim",

View File

@@ -21,6 +21,8 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#srchrslt-adtable .ad-listitem ',
//sorting by date is already this provider's default, so no sort param is needed
sortByDateParam: null,
crawlFields: {
id: '.aditem@data-adid | int',
price: '.aditem-main--middle--price | removeNewline | trim',

View File

@@ -13,6 +13,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '.nbk-container >div article',
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
crawlFields: {
id: '@id',
title: 'a.nbk-truncate@title | removeNewline | trim',

View File

@@ -16,6 +16,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#main_column .wgg_card',
sortByDateParam: 'sort_column=0&sort_order=0',
crawlFields: {
id: '@data-id',
details: '.row .noprint .col-xs-11 |removeNewline |trim',

View File

@@ -0,0 +1,22 @@
const queryString = require('query-string');
/**
* for Fredy, it is important to sort search results by date, starting with the latest listing. if it is not sorted, we
* might never actually find the newest results, no matter how many pages we crawl.
* It has been written in the documentation, but obviously nobody reads docu theses days which is why it's been done
* automagically now.
*
* @param _url actual provider url containing the searchParams
* @param sortByDateParam param(s) indicating the correct sort order
* @returns {`${string}?${string}`} correctly formatted url
*/
module.exports = (_url, sortByDateParam) => {
//if no mutation is necessary, just return the original url
if (sortByDateParam == null) {
return _url;
}
const original = queryString.parseUrl(_url);
const mutate = queryString.parse(sortByDateParam);
return `${original.url}?${queryString.stringify({ ...original.query, ...mutate })}`;
};