adding or replacing sort params for provider urls when necessary

This commit is contained in:
orangecoding
2021-11-26 21:02:09 +01:00
parent aad0884976
commit c1c4d55ede
14 changed files with 135 additions and 2 deletions

View File

@@ -1,3 +1,13 @@
###### [V5.4.0]
- Upgrading dependencies
- Provider urls are now automagically changed to include the correct sort order for search results
```
Note: It has been a point of confusion since the very beginning of Fredy that people simply copied the url, but
did not take care of sorting the search results by date. If this is not done, Fredy will most likely not see the latest
results and thus cannot report them. This release fixes it by adding the necessary params (or replacing existing ones).
```
###### [V5.3.0]
- Upgrading dependencies
- It's now possible to send mails to multiple receiver using comma separation for MailJet & Sendgrid

View File

@@ -4,6 +4,7 @@ const { setKnownListings, getKnownListings } = require('./services/storage/listi
const notify = require('./notification/notify');
const xray = require('./services/scraper');
const scrapingAnt = require('./services/scrapingAnt');
const urlModifier = require('./services/queryStringMutator');
class FredyRuntime {
/**
@@ -24,7 +25,8 @@ class FredyRuntime {
execute() {
return (
Promise.resolve(this._providerConfig.url)
//modify the url to make sure search order is correctly set
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
//scraping the site and try finding new listings
.then(this._getListings.bind(this))
//bring them in a proper form (dictated by the provider)

View File

@@ -22,6 +22,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '.tabelle',
sortByDateParam: 'sort_type=newest',
crawlFields: {
id: '.inner_object_data input[name="marker_objekt_id"]@value | int',
price: '.tabelle .inner_object_data .single_data_price | removeNewline | trim',

View File

@@ -24,6 +24,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#result-list-stage .item',
sortByDateParam: 'sortby=19',
crawlFields: {
id: '@id',
price: 'div[id*="selPrice_"] | trim',

View File

@@ -20,6 +20,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#resultListItems li.result-list__listing',
sortByDateParam: 'sorting=2',
crawlFields: {
id: '.result-list-entry@data-obid | int',
price: '.result-list-entry .result-list-entry__criteria .grid-item:first-child dd | removeNewline | trim',

View File

@@ -16,6 +16,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: "div[class^='EstateItem-']",
sortByDateParam: 'sd=DESC&sf=TIMESTAMP',
crawlFields: {
id: 'a@id',
price: "div[class^='KeyFacts-'] [data-test='price'] | removeNewline | trim",

View File

@@ -21,6 +21,8 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#srchrslt-adtable .ad-listitem ',
//this provider already sorts by date by default, so no sort param has to be added
sortByDateParam: null,
crawlFields: {
id: '.aditem@data-adid | int',
price: '.aditem-main--middle--price | removeNewline | trim',

View File

@@ -13,6 +13,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '.nbk-container >div article',
sortByDateParam: 'Sortierung=Id&Richtung=DESC',
crawlFields: {
id: '@id',
title: 'a.nbk-truncate@title | removeNewline | trim',

View File

@@ -16,6 +16,7 @@ function applyBlacklist(o) {
const config = {
url: null,
crawlContainer: '#main_column .wgg_card',
sortByDateParam: 'sort_column=0&sort_order=0',
crawlFields: {
id: '@data-id',
details: '.row .noprint .col-xs-11 |removeNewline |trim',

View File

@@ -0,0 +1,22 @@
const queryString = require('query-string');
/**
* for Fredy, it is important to sort search results by date, starting with the latest listing. if it is not sorted, we
* might never actually find the newest results, no matter how many pages we crawl.
* It has been written in the documentation, but obviously nobody reads docu theses days which is why it's been done
* automagically now.
*
* @param _url actual provider url containing the searchParams
* @param sortByDateParam param(s) indicating the correct sort order
* @returns {`${string}?${string}`} correctly formatted url
*/
module.exports = (_url, sortByDateParam) => {
//if no mutation is necessary, just return the original url
if (sortByDateParam == null) {
return _url;
}
const original = queryString.parseUrl(_url);
const mutate = queryString.parse(sortByDateParam);
return `${original.url}?${queryString.stringify({ ...original.query, ...mutate })}`;
};

View File

@@ -1,6 +1,6 @@
{
"name": "fredy",
"version": "5.3.2",
"version": "5.4.0",
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
"scripts": {
"start": "node index.js",
@@ -67,6 +67,7 @@
"markdown": "^0.5.0",
"nanoid": "3.1.30",
"node-mailjet": "3.3.4",
"query-string": "^7.0.1",
"react": "17.0.2",
"react-dom": "17.0.2",
"react-redux": "7.2.6",

View File

@@ -0,0 +1,32 @@
const testData = require('./testData.json');
const expect = require('chai').expect;
const fs = require('fs');
const mutator = require('../../lib/services/queryStringMutator.js');
const queryString = require('query-string');
/**
 * This test might look a bit weird at first, but listen stranger...
 * It's not wise to compare 2 urls as plain strings, as this means all url params must be in the expected order.
 * This is however not guaranteed, as params (and their order) are totally variable. Instead, both urls are parsed
 * and their base url and query params are compared.
 */
describe('queryStringMutator', () => {
  it('should fix all urls', () => {
    //note: readdirSync resolves against the current working directory, while require resolves against this file
    const providers = fs.readdirSync('./lib/provider/').map((integPath) => require(`../../lib/provider/${integPath}`));
    for (const test of testData) {
      const provider = providers.find((p) => p.metaInformation.id === test.id);
      if (provider == null) {
        throw new Error(`Cannot find provider for given id: ${test.id}`);
      }
      const fixedUrl = mutator(test.url, provider.config.sortByDateParam);
      const expected = queryString.parseUrl(test.shouldBecome);
      const actual = queryString.parseUrl(fixedUrl);
      //the part in front of the query string must not be touched
      expect(actual.url).to.equal(expected.url);
      //exactly the expected params with exactly the expected values must be present. A subset check would also
      //pass if the mutator dropped params, and checking keys and values separately would miss swapped values.
      expect(actual.query).to.deep.equal(expected.query);
    }
  });
});

View File

@@ -0,0 +1,33 @@
[
{
"url": "https://www.immowelt.de/liste/40589/wohnungen/mieten?d=true&sd=DESC&sf=PRIMARY_PRICE_AMOUNT&sp=1",
"shouldBecome": "https://www.immowelt.de/liste/40589/wohnungen/mieten?d=true&sd=DESC&sf=TIMESTAMP&sp=1",
"id": "immowelt"
},
{
"url": "https://www.1a-immobilienmarkt.de/suchen/duesseldorf/wohnung-mieten.html?search=yes",
"shouldBecome": "https://www.1a-immobilienmarkt.de/suchen/duesseldorf/wohnung-mieten.html?search=yes&sort_type=newest",
"id": "einsAImmobilien"
},
{
"url": "https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Dusseldorf.30.1.1.0.html?sort_column=1&sort_order=0",
"shouldBecome": "https://www.wg-gesucht.de/1-zimmer-wohnungen-in-Dusseldorf.30.1.1.0.html?sort_column=0&sort_order=0",
"id": "wgGesucht"
},
{
"url": "https://www.immonet.de/immobiliensuche/sel.do?sortby=0&suchart=1&objecttype=1&marketingtype=2&parentcat=1&locationname=d%C3%BCsseldorf",
"shouldBecome": "https://www.immonet.de/immobiliensuche/sel.do?sortby=19&suchart=1&objecttype=1&marketingtype=2&parentcat=1&locationname=d%C3%BCsseldorf",
"id": "immonet"
},
{
"url": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten",
"shouldBecome": "https://www.immobilienscout24.de/Suche/de/nordrhein-westfalen/duesseldorf/wohnung-mieten?sorting=2",
"id": "immoscout"
},
{
"url": "https://www.neubaukompass.de/neubau-immobilien/berlin-region/",
"shouldBecome": "https://www.neubaukompass.de/neubau-immobilien/berlin-region/?Sortierung=Id&Richtung=DESC",
"id": "neubauKompass"
}
]

View File

@@ -3631,6 +3631,11 @@ fill-range@^7.0.1:
dependencies:
to-regex-range "^5.0.1"
filter-obj@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/filter-obj/-/filter-obj-1.1.0.tgz#9b311112bc6c6127a16e016c6c5d7f19e0805c5b"
integrity sha1-mzERErxsYSehbgFsbF1/GeCAXFs=
finalhandler@~1.1.2:
version "1.1.2"
resolved "https://registry.yarnpkg.com/finalhandler/-/finalhandler-1.1.2.tgz#b7e7d000ffd11938d0fdb053506f6ebabe9f587d"
@@ -6335,6 +6340,16 @@ qs@^6.9.4:
dependencies:
side-channel "^1.0.4"
query-string@^7.0.1:
version "7.0.1"
resolved "https://registry.yarnpkg.com/query-string/-/query-string-7.0.1.tgz#45bd149cf586aaa582dffc7ec7a8ad97dd02f75d"
integrity sha512-uIw3iRvHnk9to1blJCG3BTc+Ro56CBowJXKmNNAm3RulvPBzWLRqKSiiDk+IplJhsydwtuNMHi8UGQFcCLVfkA==
dependencies:
decode-uri-component "^0.2.0"
filter-obj "^1.1.0"
split-on-first "^1.0.0"
strict-uri-encode "^2.0.0"
querystring@0.2.0, querystring@^0.2.0:
version "0.2.0"
resolved "https://registry.yarnpkg.com/querystring/-/querystring-0.2.0.tgz#b209849203bb25df820da756e747005878521620"
@@ -7276,6 +7291,11 @@ spdy@^4.0.2:
select-hose "^2.0.0"
spdy-transport "^3.0.0"
split-on-first@^1.0.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/split-on-first/-/split-on-first-1.1.0.tgz#f610afeee3b12bce1d0c30425e76398b78249a5f"
integrity sha512-43ZssAJaMusuKWL8sKUBQXHWOpq8d6CfN/u1p4gUzfJkM05C8rxTmYrkIPTXapZpORA6LkkzcUulJ8FqA7Uudw==
split-string@^3.0.1, split-string@^3.0.2:
version "3.1.0"
resolved "https://registry.yarnpkg.com/split-string/-/split-string-3.1.0.tgz#7cb09dda3a86585705c64b39a6466038682e8fe2"
@@ -7315,6 +7335,11 @@ stream-to-string@^1.1.0:
dependencies:
promise-polyfill "^1.1.6"
strict-uri-encode@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/strict-uri-encode/-/strict-uri-encode-2.0.0.tgz#b9c7330c7042862f6b142dc274bbcc5866ce3546"
integrity sha1-ucczDHBChi9rFC3CdLvMWGbONUY=
string-argv@^0.3.1:
version "0.3.1"
resolved "https://registry.yarnpkg.com/string-argv/-/string-argv-0.3.1.tgz#95e2fbec0427ae19184935f816d74aaa4c5c19da"