diff --git a/.github/workflows/check_source.yml b/.github/workflows/check_source.yml index a3d1b92..f0c34fa 100644 --- a/.github/workflows/check_source.yml +++ b/.github/workflows/check_source.yml @@ -13,7 +13,7 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: 20 + node-version: 22 cache: 'yarn' - name: Install dependencies diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 731a08c..7ea905f 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -57,3 +57,41 @@ jobs: labels: ${{ steps.meta.outputs.labels }} cache-from: type=gha cache-to: type=gha,mode=max + + # Test container health with docker compose + - name: Test container with docker compose + run: | + echo "Starting container with docker compose..." + docker compose up --build -d + echo "Waiting for container to be ready (60 seconds for start_period)..." + sleep 60 + + echo "Monitoring container health for 30 seconds..." + SECONDS_ELAPSED=0 + HEALTH_CHECK_INTERVAL=5 + TOTAL_DURATION=30 + + while [ $SECONDS_ELAPSED -lt $TOTAL_DURATION ]; do + HEALTH_STATUS=$(docker inspect --format='{{.State.Health.Status}}' fredy 2>/dev/null || echo "not_found") + CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' fredy 2>/dev/null || echo "not_found") + echo "[$SECONDS_ELAPSED/$TOTAL_DURATION sec] Container: $CONTAINER_STATUS, Health: $HEALTH_STATUS" + + # Check if container is not running or unhealthy + if [ "$CONTAINER_STATUS" != "running" ]; then + echo "Container stopped running! Status: $CONTAINER_STATUS" + docker compose logs fredy + exit 1 + fi + + if [ "$HEALTH_STATUS" = "unhealthy" ]; then + echo "Container is unhealthy!" + docker compose logs fredy + docker inspect --format='{{json .State.Health}}' fredy | jq + exit 1 + fi + + sleep $HEALTH_CHECK_INTERVAL + SECONDS_ELAPSED=$((SECONDS_ELAPSED + HEALTH_CHECK_INTERVAL)) + done + + docker compose down diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d5ce6ee..236ddf3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -15,7 +15,7 @@ jobs: - uses: actions/setup-node@v4 with: - node-version: 20 + node-version: 22 cache: 'yarn' - run: yarn install diff --git a/.gitignore b/.gitignore index 45b5ac8..f59f123 100755 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ node_modules/ ui/public/ -db/ +db/*.json +db/*.db* npm-debug.log .DS_Store .idea diff --git a/Dockerfile b/Dockerfile index 153f921..19ba420 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,9 +2,10 @@ FROM node:22-slim WORKDIR /fredy -# Install Chromium without extra recommended packages and clean apt cache +# Install Chromium and curl without extra recommended packages and clean apt cache +# curl is needed for the health check RUN apt-get update \ - && apt-get install -y --no-install-recommends chromium \ + && apt-get install -y --no-install-recommends chromium curl \ && rm -rf /var/lib/apt/lists/* ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ diff --git a/README.md b/README.md index 17a37da..9e47046 100755 --- a/README.md +++ b/README.md @@ -1,3 +1,20 @@ +

+ + + + + + Jetbrains Open Source + + +

+ +![Tests](https://github.com/orangecoding/fredy/actions/workflows/test.yml/badge.svg) +[![Docker](https://github.com/orangecoding/fredy/actions/workflows/docker.yml/badge.svg)](https://github.com/orangecoding/fredy/actions/workflows/docker.yml) +![Source](https://github.com/orangecoding/fredy/actions/workflows/check_source.yml/badge.svg) +![Docker Pulls](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fghcr-badge.elias.eu.org%2Fapi%2Forangecoding%2Ffredy%2Ffredy&query=%24.downloadCount&label=Docker%20Pulls) + + # Fredy 🏡 – Your Self-Hosted Real Estate Finder for Germany Finding an apartment or house in Germany can be stressful and @@ -11,11 +28,7 @@ With a modern architecture, Fredy provides a **clean Web UI**, removes duplicates across platforms, and stores results so you never see the same listing twice. - -![Tests](https://github.com/orangecoding/fredy/actions/workflows/test.yml/badge.svg) -[![Docker](https://github.com/orangecoding/fredy/actions/workflows/docker.yml/badge.svg)](https://github.com/orangecoding/fredy/actions/workflows/docker.yml) -![Source](https://github.com/orangecoding/fredy/actions/workflows/check_source.yml/badge.svg) ------------------------------------------------------------------------ @@ -39,10 +52,18 @@ same listing twice. I maintain Fredy and other open-source projects in my free time.\ If you find it useful, consider supporting the project 💙 -[JetBrains](https://jb.gg/OpenSourceSupport) +Fredy is proudly backed by the **JetBrains Open Source Support Program**. -Fredy is proudly supported by the **JetBrains Open Source Support -Program**. + + + + Jetbrains Open Source + + +------------------------------------------------------------------------ + +## 👨‍🏫 Demo +You can try out Fredy here: [Fredy Demo](https://fredy-demo.orange-coding.net/) ------------------------------------------------------------------------ @@ -50,10 +71,15 @@ Program**. ### With Docker +> [!NOTE] +> In order to start Fredy, you must provide a config.json. As a start, use the one in this repo: https://github.com/orangecoding/fredy/blob/master/conf/config.json + ``` bash -docker pull ghcr.io/orangecoding/fredy:master -docker create --name fredy -v /path/to/your/conf/:/conf -p 9998:9998 fredy/fredy -docker start fredy +docker run -d --name fredy \ + -v fredy_conf:/conf \ + -v fredy_db:/db \ + -p 9998:9998 \ + ghcr.io/orangecoding/fredy:master ``` Logs: @@ -64,7 +90,7 @@ docker logs fredy -f ### Manual (Node.js) -- Requirement: **Node.js 20 or higher** +- Requirement: **Node.js 22 or higher** - Install dependencies and start: ``` bash @@ -128,7 +154,7 @@ Immoscout has implemented advanced bot detection. In order to work around this, Fredy is completely free (and will always remain free). However, it would be a huge help if you’d allow me to collect some analytical data. Before you freak out, let me explain... -If you agree, Fredy will send a ping to my Mixpanel project each time it runs. +If you agree, Fredy will send a ping once every 6 hours to my internal tracking project (Will be open sourced soon). The data includes: names of active adapters/providers, OS, architecture, Node version, and language. The information is entirely anonymous and helps me understand which adapters/providers are most frequently used.

**Thanks**🤘 @@ -188,9 +214,7 @@ flowchart TD Thanks to everyone who has contributed! -``{=html} -``{=html} -``{=html} + See the [Contributing Guide](https://github.com/orangecoding/fredy/blob/master/CONTRIBUTING.md). diff --git a/conf/config.json b/conf/config.json old mode 100755 new mode 100644 index 3b48654..3f8d054 --- a/conf/config.json +++ b/conf/config.json @@ -1 +1 @@ -{"interval":"60","port":9998,"workingHours":{"from":"","to":""},"demoMode":false,"analyticsEnabled":false} \ No newline at end of file +{"interval":"60","port":9998,"workingHours":{"from":"","to":""},"demoMode":false,"analyticsEnabled":true,"sqlitepath":"/db"} \ No newline at end of file diff --git a/db/.gitkeep b/db/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docker-compose.yml b/docker-compose.yml index 0061893..38b5fdf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,5 +11,12 @@ services: - ./conf:/conf - ./db:/db ports: - - 9998:9998 + - "9998:9998" restart: unless-stopped + healthcheck: + # The container will immediately stop when health check fails after retries + test: ["CMD-SHELL", "curl --fail --silent --show-error --max-time 5 http://localhost:9998/ || exit 1"] + interval: 120s + timeout: 10s + retries: 1 + start_period: 10s diff --git a/index.js b/index.js index c08652f..16179ab 100755 --- a/index.js +++ b/index.js @@ -1,62 +1,88 @@ import fs from 'fs'; -import { config } from './lib/utils.js'; +import path from 'path'; +import { config, getProviders, refreshConfig } from './lib/utils.js'; import * as similarityCache from './lib/services/similarity-check/similarityCache.js'; -import { setLastJobExecution } from './lib/services/storage/listingsStorage.js'; import * as jobStorage from './lib/services/storage/jobStorage.js'; import FredyRuntime from './lib/FredyRuntime.js'; import { duringWorkingHoursOrNotSet } from './lib/utils.js'; -import './lib/api/api.js'; -import { track } from './lib/services/tracking/Tracker.js'; -import { handleDemoUser } from './lib/services/storage/userStorage.js'; -import { cleanupDemoAtMidnight } from './lib/services/demoCleanup.js'; -//if db folder does not exist, ensure to create it before loading anything else -if (!fs.existsSync('./db')) { - fs.mkdirSync('./db'); +import { runMigrations } from './lib/services/storage/migrations/migrate.js'; +import { ensureDemoUserExists, ensureAdminUserExists } from './lib/services/storage/userStorage.js'; +import { cleanupDemoAtMidnight } from './lib/services/crons/demoCleanup-cron.js'; +import { initTrackerCron } from './lib/services/crons/tracker-cron.js'; +import logger from './lib/services/logger.js'; +import { bus } from './lib/services/events/event-bus.js'; +import { initActiveCheckerCron } from './lib/services/crons/listing-alive-cron.js'; + +// Load configuration before any other startup steps +await refreshConfig(); + +// Ensure sqlite directory exists before loading anything else (based on config.sqlitepath) +const rawDir = config.sqlitepath || '/db'; +const relDir = rawDir.startsWith('/') ? rawDir.slice(1) : rawDir; +const absDir = path.isAbsolute(relDir) ? relDir : path.join(process.cwd(), relDir); +if (!fs.existsSync(absDir)) { + fs.mkdirSync(absDir, { recursive: true }); } -const path = './lib/provider'; -const provider = fs.readdirSync(path).filter((file) => file.endsWith('.js')); + +// Run DB migrations once at startup and block until finished +await runMigrations(); + +// Load provider modules once at startup +const providers = await getProviders(); + //assuming interval is always in minutes const INTERVAL = config.interval * 60 * 1000; -/* eslint-disable no-console */ -console.log(`Started Fredy successfully. Ui can be accessed via http://localhost:${config.port}`); + +// Initialize API only after migrations completed +await import('./lib/api/api.js'); + if (config.demoMode) { - console.info('Running in demo mode'); + logger.info('Running in demo mode'); cleanupDemoAtMidnight(); } -/* eslint-enable no-console */ -const fetchedProvider = await Promise.all( - provider.filter((provider) => provider.endsWith('.js')).map(async (pro) => import(`${path}/${pro}`)), -); -handleDemoUser(); +logger.info(`Started Fredy successfully. Ui can be accessed via http://localhost:${config.port}`); -setInterval( - (function exec() { - const isDuringWorkingHoursOrNotSet = duringWorkingHoursOrNotSet(config, Date.now()); - if (!config.demoMode) { - if (isDuringWorkingHoursOrNotSet) { - track(); - config.lastRun = Date.now(); - jobStorage - .getJobs() - .filter((job) => job.enabled) - .forEach((job) => { - job.provider - .filter((p) => fetchedProvider.find((fp) => fp.metaInformation.id === p.id) != null) - .forEach(async (prov) => { - const pro = fetchedProvider.find((fp) => fp.metaInformation.id === prov.id); - pro.init(prov, job.blacklist); - await new FredyRuntime(pro.config, job.notificationAdapter, prov.id, job.id, similarityCache).execute(); - setLastJobExecution(job.id); - }); - }); - } else { - /* eslint-disable no-console */ - console.debug('Working hours set. Skipping as outside of working hours.'); - /* eslint-enable no-console */ - } +ensureAdminUserExists(); +ensureDemoUserExists(); +await initTrackerCron(); +//do not wait for this to finish, let it run in the background +initActiveCheckerCron(); + +bus.on('jobs:runAll', () => { + logger.debug('Running Fredy Job manually'); + execute(); +}); + +const execute = () => { + const isDuringWorkingHoursOrNotSet = duringWorkingHoursOrNotSet(config, Date.now()); + if (!config.demoMode) { + if (isDuringWorkingHoursOrNotSet) { + config.lastRun = Date.now(); + jobStorage + .getJobs() + .filter((job) => job.enabled) + .forEach((job) => { + job.provider + .filter((p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null) + .forEach(async (prov) => { + const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id); + matchedProvider.init(prov, job.blacklist); + await new FredyRuntime( + matchedProvider.config, + job.notificationAdapter, + prov.id, + job.id, + similarityCache, + ).execute(); + }); + }); + } else { + logger.debug('Working hours set. Skipping as outside of working hours.'); } - return exec; - })(), - INTERVAL, -); + } +}; + +setInterval(execute, INTERVAL); +//start once at startup +execute(); diff --git a/lib/FredyRuntime.js b/lib/FredyRuntime.js index 8037f89..7693916 100755 --- a/lib/FredyRuntime.js +++ b/lib/FredyRuntime.js @@ -1,8 +1,9 @@ import { NoNewListingsWarning } from './errors.js'; -import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js'; +import { storeListings, getKnownListingHashesForJobAndProvider } from './services/storage/listingsStorage.js'; import * as notify from './notification/notify.js'; import Extractor from './services/extractor/extractor.js'; import urlModifier from './services/queryStringMutator.js'; +import logger from './services/logger.js'; class FredyRuntime { /** @@ -59,7 +60,7 @@ class FredyRuntime { }) .catch((err) => { reject(err); - console.error(err); + logger.error(err); }); }); } @@ -76,7 +77,9 @@ class FredyRuntime { } _findNew(listings) { - const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null); + const hashes = getKnownListingHashesForJobAndProvider(this._jobKey, this._providerId) || []; + + const newListings = listings.filter((o) => !hashes.includes(o.id)); if (newListings.length === 0) { throw new NoNewListingsWarning(); } @@ -92,30 +95,24 @@ class FredyRuntime { } _save(newListings) { - const currentListings = getKnownListings(this._jobKey, this._providerId) || {}; - newListings.forEach((listing) => { - currentListings[listing.id] = Date.now(); - }); - setKnownListings(this._jobKey, this._providerId, currentListings); + storeListings(this._jobKey, this._providerId, newListings); return newListings; } _filterBySimilarListings(listings) { const filteredList = listings.filter((listing) => { - const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title); + const similar = this._similarityCache.hasSimilarEntries(listing.title, listing.address); if (similar) { - /* eslint-disable no-console */ - console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title); - /* eslint-enable no-console */ + logger.debug(`Filtering similar entry for title: ${listing.title} and address ${listing.address}`); } return !similar; }); - filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title)); + filteredList.forEach((filter) => this._similarityCache.addCacheEntry(filter.title, listings.address)); return filteredList; } _handleError(err) { - if (err.name !== 'NoNewListingsWarning') console.error(err); + if (err.name !== 'NoNewListingsWarning') logger.error(err); } } diff --git a/lib/api/api.js b/lib/api/api.js index ca2cb8f..14dc00e 100644 --- a/lib/api/api.js +++ b/lib/api/api.js @@ -7,12 +7,14 @@ import { loginRouter } from './routes/loginRoute.js'; import { config } from '../utils.js'; import { userRouter } from './routes/userRoute.js'; import { jobRouter } from './routes/jobRouter.js'; +import { versionRouter } from './routes/versionRouter.js'; import bodyParser from 'body-parser'; import restana from 'restana'; import files from 'serve-static'; import path from 'path'; import { getDirName } from '../utils.js'; import { demoRouter } from './routes/demoRouter.js'; +import logger from '../services/logger.js'; const service = restana(); const staticService = files(path.join(getDirName(), '../ui/public')); const PORT = config.port || 9998; @@ -22,6 +24,7 @@ service.use(cookieSession()); service.use(staticService); service.use('/api/admin', authInterceptor()); service.use('/api/jobs', authInterceptor()); +service.use('/api/version', authInterceptor()); // /admin can only be accessed when user is having admin permissions service.use('/api/admin', adminInterceptor()); service.use('/api/jobs/notificationAdapter', notificationAdapterRouter); @@ -29,12 +32,12 @@ service.use('/api/admin/generalSettings', generalSettingsRouter); service.use('/api/jobs/provider', providerRouter); service.use('/api/jobs/insights', analyticsRouter); service.use('/api/admin/users', userRouter); +service.use('/api/version', versionRouter); service.use('/api/jobs', jobRouter); service.use('/api/login', loginRouter); //this route is unsecured intentionally as it is being queried from the login page service.use('/api/demo', demoRouter); -/* eslint-disable no-console */ service.start(PORT).then(() => { - console.info(`Started API service on port ${PORT}`); + logger.debug(`Started API service on port ${PORT}`); }); diff --git a/lib/api/routes/generalSettingsRoute.js b/lib/api/routes/generalSettingsRoute.js index f6acbde..1462d64 100644 --- a/lib/api/routes/generalSettingsRoute.js +++ b/lib/api/routes/generalSettingsRoute.js @@ -1,7 +1,8 @@ import restana from 'restana'; import { config, getDirName, readConfigFromStorage, refreshConfig } from '../../utils.js'; import fs from 'fs'; -import { handleDemoUser } from '../../services/storage/userStorage.js'; +import { ensureDemoUserExists } from '../../services/storage/userStorage.js'; +import logger from '../../services/logger.js'; const service = restana(); const generalSettingsRouter = service.newRouter(); generalSettingsRouter.get('/', async (req, res) => { @@ -18,9 +19,9 @@ generalSettingsRouter.post('/', async (req, res) => { const currentConfig = await readConfigFromStorage(); fs.writeFileSync(`${getDirName()}/../conf/config.json`, JSON.stringify({ ...currentConfig, ...settings })); await refreshConfig(); - handleDemoUser(); + ensureDemoUserExists(); } catch (err) { - console.error(err); + logger.error(err); res.send(new Error('Error while trying to write settings.')); return; } diff --git a/lib/api/routes/jobRouter.js b/lib/api/routes/jobRouter.js index 077aa48..9cc4a2c 100644 --- a/lib/api/routes/jobRouter.js +++ b/lib/api/routes/jobRouter.js @@ -3,9 +3,12 @@ import * as jobStorage from '../../services/storage/jobStorage.js'; import * as userStorage from '../../services/storage/userStorage.js'; import { config } from '../../utils.js'; import { isAdmin } from '../security.js'; -import { trackDemoJobCreated } from '../../services/tracking/Tracker.js'; +import logger from '../../services/logger.js'; +import { bus } from '../../services/events/event-bus.js'; + const service = restana(); const jobRouter = service.newRouter(); + function doesJobBelongsToUser(job, req) { const userId = req.session.currentUser; if (userId == null) { @@ -17,6 +20,7 @@ function doesJobBelongsToUser(job, req) { } return user.isAdmin || job.userId === user.id; } + jobRouter.get('/', async (req, res) => { const isUserAdmin = isAdmin(req); //show only the jobs which belongs to the user (or all of the user is an admin) @@ -30,6 +34,12 @@ jobRouter.get('/processingTimes', async (req, res) => { }; res.send(); }); + +jobRouter.post('/startAll', async (req, res) => { + bus.emit('jobs:runAll'); + res.send(); +}); + jobRouter.post('/', async (req, res) => { const { provider, notificationAdapter, name, blacklist = [], jobId, enabled } = req.body; try { @@ -44,13 +54,8 @@ jobRouter.post('/', async (req, res) => { }); } catch (error) { res.send(new Error(error)); - console.error(error); + logger.error(error); } - trackDemoJobCreated({ - name, - provider, - adapter: notificationAdapter, - }); res.send(); }); jobRouter.delete('', async (req, res) => { @@ -64,7 +69,7 @@ jobRouter.delete('', async (req, res) => { } } catch (error) { res.send(new Error(error)); - console.error(error); + logger.error(error); } res.send(); }); @@ -83,7 +88,7 @@ jobRouter.put('/:jobId/status', async (req, res) => { } } catch (error) { res.send(new Error(error)); - console.error(error); + logger.error(error); } res.send(); }); diff --git a/lib/api/routes/loginRoute.js b/lib/api/routes/loginRoute.js index 0a2626d..1eb28cc 100644 --- a/lib/api/routes/loginRoute.js +++ b/lib/api/routes/loginRoute.js @@ -3,6 +3,7 @@ import * as userStorage from '../../services/storage/userStorage.js'; import * as hasher from '../../services/security/hash.js'; import { config } from '../../utils.js'; import { trackDemoAccessed } from '../../services/tracking/Tracker.js'; +import logger from '../../services/logger.js'; const service = restana(); const loginRouter = service.newRouter(); loginRouter.get('/user', async (req, res) => { @@ -27,7 +28,7 @@ loginRouter.post('/', async (req, res) => { } if (user.password === hasher.hash(password)) { if (config.demoMode) { - trackDemoAccessed(); + await trackDemoAccessed(); } req.session.currentUser = user.id; @@ -35,7 +36,7 @@ loginRouter.post('/', async (req, res) => { res.send(200); return; } else { - console.error(`User ${username} tried to login, but password was wrong.`); + logger.error(`User ${username} tried to login, but password was wrong.`); } res.send(401); }); diff --git a/lib/api/routes/versionRouter.js b/lib/api/routes/versionRouter.js new file mode 100644 index 0000000..1db98a3 --- /dev/null +++ b/lib/api/routes/versionRouter.js @@ -0,0 +1,30 @@ +import restana from 'restana'; +import fetch from 'node-fetch'; +import { getPackageVersion } from '../../utils.js'; + +const service = restana(); +const versionRouter = service.newRouter(); + +versionRouter.get('/', async (req, res) => { + const versionPayload = await getCurrentVersionFromGithub(); + res.body = versionPayload == null ? { newVersion: false } : versionPayload; + res.send(); +}); + +async function getCurrentVersionFromGithub() { + const raw = await fetch('https://api.github.com/repos/orangecoding/fredy/releases/latest'); + const data = await raw.json(); + const localFredyVersion = await getPackageVersion(); + if (localFredyVersion === data.tag_name) { + return null; + } + return { + newVersion: true, + version: data.tag_name, + url: data.html_url, + body: data.body, + localFredyVersion, + }; +} + +export { versionRouter }; diff --git a/lib/defaultConfig.js b/lib/defaultConfig.js index 56744f9..4e31e85 100644 --- a/lib/defaultConfig.js +++ b/lib/defaultConfig.js @@ -4,4 +4,6 @@ export const DEFAULT_CONFIG = { workingHours: { from: '', to: '' }, demoMode: false, analyticsEnabled: null, + // Default path for sqlite storage directory. Interpreted relative to project root. + sqlitepath: '/db', }; diff --git a/lib/notification/adapter/mailJet.js b/lib/notification/adapter/mailJet.js index 826e913..7c57845 100755 --- a/lib/notification/adapter/mailJet.js +++ b/lib/notification/adapter/mailJet.js @@ -5,6 +5,7 @@ import Handlebars from 'handlebars'; import fetch from 'node-fetch'; import { markdown2Html } from '../../services/markdown.js'; import { getDirName, normalizeImageUrl } from '../../utils.js'; +import logger from '../../services/logger.js'; const __dirname = getDirName(); const template = fs.readFileSync(path.resolve(__dirname + '/notification/emailTemplate/template.hbs'), 'utf8'); @@ -24,7 +25,7 @@ const toBase64 = async (url) => { const ab = await res.arrayBuffer(); return Buffer.from(ab).toString('base64'); } catch (error) { - console.error(`Error fetching image from ${url}:`, error.message); + logger.error(`Error fetching image from ${url}:`, error.message); throw error; } }; @@ -62,7 +63,7 @@ const mapListingsWithCid = async (serviceName, jobKey, listings) => { item.hasImage = true; item.imageCid = cid; } catch (error) { - console.warn(`Skipping image for listing ${i} due to error: ${error.message}`); + logger.warn(`Skipping image for listing ${i} due to error: ${error.message}`); } } diff --git a/lib/notification/adapter/sqlite.js b/lib/notification/adapter/sqlite.js index 4291643..12c40a2 100644 --- a/lib/notification/adapter/sqlite.js +++ b/lib/notification/adapter/sqlite.js @@ -1,7 +1,18 @@ import { markdown2Html } from '../../services/markdown.js'; import Database from 'better-sqlite3'; -export const send = ({ serviceName, newListings, jobKey }) => { - const db = new Database('db/listings.db'); +import path from 'path'; +import fs from 'fs'; + +export const send = ({ serviceName, newListings, jobKey, notificationConfig }) => { + const sqliteConfig = notificationConfig.find((adapter) => adapter.id === config.id); + const dbPath = sqliteConfig?.fields?.dbPath || 'db/listings.db'; + + const dbDir = path.dirname(dbPath); + if (!fs.existsSync(dbDir)) { + fs.mkdirSync(dbDir, { recursive: true }); + } + + const db = new Database(dbPath); const fields = [ 'serviceName', 'jobKey', @@ -30,8 +41,16 @@ export const send = ({ serviceName, newListings, jobKey }) => { }; export const config = { id: 'sqlite', - name: 'Sqlite', - description: 'This adapter stores listings in a local sqlite3 database.', - config: {}, + name: 'SQLite', + description: 'This adapter stores listings in a local SQLite 3 database.', + fields: { + dbPath: { + type: 'text', + label: 'Database Path', + description: + 'Path to the SQLite database file (e.g., db/listings.db). If not specified, defaults to db/listings.db', + placeholder: 'db/listings.db', + }, + }, readme: markdown2Html('lib/notification/adapter/sqlite.md'), }; diff --git a/lib/notification/adapter/sqlite.md b/lib/notification/adapter/sqlite.md index 6b455b5..bc9592f 100644 --- a/lib/notification/adapter/sqlite.md +++ b/lib/notification/adapter/sqlite.md @@ -1,9 +1,9 @@ -### Sqlite Adapter +### SQLite Adapter -This adapter stores search results in a sqlite database located in db/listings.db. This file can be used for further analysis later on. +This adapter stores search results in an SQLite database. By default, the database is located at `db/listings.db`, but you can configure a custom location. This file can be used for further analysis later. -Fields are: +The database table contains the following columns (all stored as `TEXT` type): ``` -['serviceName', 'jobKey', 'id', 'size', 'rooms', 'price', 'address', 'title', 'link', 'description'] +['serviceName', 'jobKey', 'id', 'size', 'rooms', 'price', 'address', 'title', 'link', 'description', 'image'] ``` diff --git a/lib/notification/adapter/telegram.js b/lib/notification/adapter/telegram.js index 15b5289..a33b1ce 100644 --- a/lib/notification/adapter/telegram.js +++ b/lib/notification/adapter/telegram.js @@ -63,31 +63,41 @@ export const send = ({ serviceName, newListings, notificationConfig, jobKey }) = const jobName = job == null ? jobKey : job.name; const throttledCall = getThrottled(chatId, async function (endpoint, body) { - await fetch(`https://api.telegram.org/bot${token}/${endpoint}`, { + const res = await fetch(`https://api.telegram.org/bot${token}/${endpoint}`, { method: 'post', body: JSON.stringify(body), headers: { 'Content-Type': 'application/json' }, }); + return res; }); const promises = newListings.map(async (o) => { const img = normalizeImageUrl(o.image); + const textPayload = { + chat_id: chatId, + text: buildText(jobName, serviceName, o), + parse_mode: 'HTML', + disable_web_page_preview: true, + }; - if (img) { - return throttledCall('sendPhoto', { + if (!img) { + return throttledCall('sendMessage', textPayload); + } + + try { + return await throttledCall('sendPhoto', { chat_id: chatId, photo: img, caption: buildCaption(jobName, serviceName, o), parse_mode: 'HTML', }); + } catch (e) { + // If we see a timeout due to sending an image, try sending it without + if (e && (e.code === 'ETIMEDOUT' || e.errno === 'ETIMEDOUT')) { + return throttledCall('sendMessage', textPayload); + } + throw e; } - - return throttledCall('sendMessage', { - chat_id: chatId, - text: buildText(jobName, serviceName, o), - parse_mode: 'HTML', - disable_web_page_preview: true, - }); }); return Promise.all(promises); diff --git a/lib/provider/einsAImmobilien.js b/lib/provider/einsAImmobilien.js index 5e41cd0..eabe299 100755 --- a/lib/provider/einsAImmobilien.js +++ b/lib/provider/einsAImmobilien.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { buildHash, isOneOf } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; function normalize(o) { @@ -7,7 +8,8 @@ function normalize(o) { const price = normalizePrice(o.price); const id = buildHash(o.id, price); const image = baseUrl + o.image; - return Object.assign(o, { id, price, link, image }); + const address = o.address == null ? null : o.address.trim().replaceAll('/', ','); + return Object.assign(o, { id, price, link, image, address }); } /** @@ -28,8 +30,8 @@ function normalizePrice(price) { return result[0]; } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); return titleNotBlacklisted && descNotBlacklisted; } @@ -44,9 +46,11 @@ const config = { size: '.tabelle .tabelle_inhalt_infos .single_data_box | removeNewline | trim', title: '.inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim', image: '.inner_object_pic img@src', + address: '.tabelle .tabelle_inhalt_infos .left_information > div:nth-child(2) | removeNewline | trim', }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/immobilienDe.js b/lib/provider/immobilienDe.js index 8692fae..ba39412 100644 --- a/lib/provider/immobilienDe.js +++ b/lib/provider/immobilienDe.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { buildHash, isOneOf } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; @@ -12,10 +13,10 @@ function parseId(shortenedLink) { function normalize(o) { const baseUrl = 'https://www.immobilien.de'; - const size = o.size || 'N/A m²'; - const price = o.price || 'N/A €'; + const size = o.size || null; + const price = o.price || null; const title = o.title || 'No title available'; - const address = o.address || 'No address available'; + const address = o.address || null; const shortLink = shortenLink(o.link); const link = `${baseUrl}/${shortLink}`; const image = baseUrl + o.image; @@ -24,8 +25,8 @@ function normalize(o) { } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); return titleNotBlacklisted && descNotBlacklisted; } @@ -46,6 +47,7 @@ const config = { }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/immonet.js b/lib/provider/immonet.js index 0cac585..2499e56 100755 --- a/lib/provider/immonet.js +++ b/lib/provider/immonet.js @@ -1,25 +1,19 @@ -import utils, { buildHash } from '../utils.js'; +import { isOneOf, buildHash } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; -/** - * Note, Immonet is rly a piece of sh*t. It is using a weird combination of React and some buttons (instead of links), - * so that if somebody clicks the listing, a new page will open with the actual link to the listing. Of course, a scraper - * cannot do this (which is why I always just return the link to the whole list of listings). - * This is not only bad for us, but also bad for ppl with disabilities... - */ - function normalize(o) { const size = o.size != null ? o.size.replace('Wohnfläche ', '') : 'N/A m²'; const price = o.price.replace('Kaufpreis ', ''); const address = o.address?.split(' • ')?.pop() ?? null; const title = o.title || 'No title available'; - const link = config.url; + const link = o.link != null ? decodeURIComponent(o.link) : config.url; const id = buildHash(title, price); return Object.assign(o, { id, address, price, size, title, link }); } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); return titleNotBlacklisted && descNotBlacklisted; } const config = { @@ -28,15 +22,17 @@ const config = { sortByDateParam: 'sortby=19', waitForSelector: 'div[data-testid="serp-gridcontainer-testid"]', crawlFields: { - id: 'button@title |trim', // immonet is a piece of sh*t. See comment above + id: 'button@title |trim', title: 'button@title |trim', price: 'div[data-testid="cardmfe-price-testid"] | trim', size: 'div[data-testid="cardmfe-keyfacts-testid"] | trim', address: 'div[data-testid="cardmfe-description-box-address"] | trim', image: 'div[data-testid="cardmfe-picture-box-test-id"] img@src', + link: 'button@data-base', }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/immoscout.js b/lib/provider/immoscout.js index 06a4b39..58e1e3b 100644 --- a/lib/provider/immoscout.js +++ b/lib/provider/immoscout.js @@ -3,7 +3,7 @@ * * The mobile API provides the following endpoints: * - GET /search/total?{search parameters}: Returns the total number of listings for the given query - * Example: `curl -H "User-Agent: ImmoScout24_1410_30_._" https://api.mobile.immobilienscout24.de/search/total?searchType=region&realestatetype=apartmentrent&pricetype=calculatedtotalrent&geocodes=%2Fde%2Fberlin%2Fberlin ` + * Example: `curl -H "User-Agent: ImmoScout_27.3_26.0_._" https://api.mobile.immobilienscout24.de/search/total?searchType=region&realestatetype=apartmentrent&pricetype=calculatedtotalrent&geocodes=%2Fde%2Fberlin%2Fberlin ` * * - POST /search/list?{search parameters}: Actually retrieves the listings. Body is json encoded and contains * data specifying additional results (advertisements) to return. The format is as follows: @@ -15,12 +15,12 @@ * ``` * It is not necessary to provide data for the specified keys. * - * Example: `curl -X POST 'https://api.mobile.immobilienscout24.de/search/list?pricetype=calculatedtotalrent&realestatetype=apartmentrent&searchType=region&geocodes=%2Fde%2Fberlin%2Fberlin&pagenumber=1' -H "Connection: keep-alive" -H "User-Agent: ImmoScout24_1410_30_._" -H "Accept: application/json" -H "Content-Type: application/json" -d '{"supportedResultListType": [], "userData": {}}'` + * Example: `curl -X POST 'https://api.mobile.immobilienscout24.de/search/list?pricetype=calculatedtotalrent&realestatetype=apartmentrent&searchType=region&geocodes=%2Fde%2Fberlin%2Fberlin&pagenumber=1' -H "Connection: keep-alive" -H "User-Agent: ImmoScout_27.3_26.0_._" -H "Accept: application/json" -H "Content-Type: application/json" -d '{"supportedResultListType": [], "userData": {}}'` * - GET /expose/{id} - Returns the details of a listing. The response contains additional details not included in the * listing response. * - * Example: `curl -H "User-Agent: ImmoScout24_1410_30_._" "https://api.mobile.immobilienscout24.de/expose/158382494"` + * Example: `curl -H "User-Agent: ImmoScout_27.3_26.0_._" "https://api.mobile.immobilienscout24.de/expose/158382494"` * * * It is necessary to set the correct User Agent (see `getListings`) in the request header. @@ -35,15 +35,19 @@ * */ -import utils, { buildHash } from '../utils.js'; -import { convertWebToMobile } from '../services/immoscout/immoscout-web-translator.js'; +import { buildHash, isOneOf } from '../utils.js'; +import { + convertImmoscoutListingToMobileListing, + convertWebToMobile, +} from '../services/immoscout/immoscout-web-translator.js'; +import logger from '../services/logger.js'; let appliedBlackList = []; async function getListings(url) { const response = await fetch(url, { method: 'POST', headers: { - 'User-Agent': 'ImmoScout24_1410_30_._', + 'User-Agent': 'ImmoScout_27.3_26.0_._', 'Content-Type': 'application/json', }, body: JSON.stringify({ @@ -52,7 +56,7 @@ async function getListings(url) { }), }); if (!response.ok) { - console.error('Error fetching data from ImmoScout Mobile API:', response.statusText); + logger.error('Error fetching data from ImmoScout Mobile API:', response.statusText); return []; } @@ -68,6 +72,7 @@ async function getListings(url) { price: price?.value, size: size?.value, title: item.title, + description: item.description, link: `${metaInformation.baseUrl}expose/${item.id}`, address: item.address?.line, image, @@ -75,6 +80,25 @@ async function getListings(url) { }); } +async function isListingActive(link) { + const result = await fetch(convertImmoscoutListingToMobileListing(link), { + headers: { + 'User-Agent': 'ImmoScout_27.3_26.0_._', + }, + }); + + if (result.status === 200) { + return 1; + } + + if (result.status === 404) { + return 0; + } + + logger.warn('Unknown status for immoscout listing', link); + return -1; +} + function nullOrEmpty(val) { return val == null || val.length === 0; } @@ -85,7 +109,7 @@ function normalize(o) { return Object.assign(o, { id, title, address }); } function applyBlacklist(o) { - return !utils.isOneOf(o.title, appliedBlackList); + return !isOneOf(o.title, appliedBlackList); } const config = { url: null, @@ -102,6 +126,7 @@ const config = { normalize: normalize, filter: applyBlacklist, getListings: getListings, + activeTester: isListingActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/immoswp.js b/lib/provider/immoswp.js index 7bb9548..5c911d1 100755 --- a/lib/provider/immoswp.js +++ b/lib/provider/immoswp.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { isOneOf, buildHash } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; @@ -14,8 +15,8 @@ function normalize(o) { } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); return titleNotBlacklisted && descNotBlacklisted; } @@ -35,6 +36,7 @@ const config = { }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/immowelt.js b/lib/provider/immowelt.js index afdb057..924936e 100755 --- a/lib/provider/immowelt.js +++ b/lib/provider/immowelt.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { buildHash, isOneOf } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; @@ -8,8 +9,8 @@ function normalize(o) { } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); return titleNotBlacklisted && descNotBlacklisted; } @@ -30,6 +31,7 @@ const config = { }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/kleinanzeigen.js b/lib/provider/kleinanzeigen.js index f21cd16..e322c5b 100755 --- a/lib/provider/kleinanzeigen.js +++ b/lib/provider/kleinanzeigen.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { buildHash, isOneOf } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; let appliedBlacklistedDistricts = []; @@ -11,10 +12,10 @@ function normalize(o) { } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); const isBlacklistedDistrict = - appliedBlacklistedDistricts.length === 0 ? false : utils.isOneOf(o.description, appliedBlacklistedDistricts); + appliedBlacklistedDistricts.length === 0 ? false : isOneOf(o.description, appliedBlacklistedDistricts); return o.title != null && !isBlacklistedDistrict && titleNotBlacklisted && descNotBlacklisted; } @@ -36,6 +37,7 @@ const config = { }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const metaInformation = { name: 'Ebay Kleinanzeigen', diff --git a/lib/provider/neubauKompass.js b/lib/provider/neubauKompass.js index bdb2fc0..de4808f 100755 --- a/lib/provider/neubauKompass.js +++ b/lib/provider/neubauKompass.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { isOneOf, buildHash } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; @@ -15,14 +16,14 @@ function normalize(o) { } function applyBlacklist(o) { - return !utils.isOneOf(o.title, appliedBlackList); + return !isOneOf(o.title, appliedBlackList); } const config = { url: null, crawlContainer: '.col-12.mb-4', sortByDateParam: 'Sortierung=Id&Richtung=DESC', - waitForSelector: '.nbk-section', + waitForSelector: 'div[data-live-name-value="SearchList"]', crawlFields: { id: 'a@href', title: 'a@title | removeNewline | trim', @@ -33,6 +34,7 @@ const config = { }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/provider/wgGesucht.js b/lib/provider/wgGesucht.js index a287097..f0d441a 100755 --- a/lib/provider/wgGesucht.js +++ b/lib/provider/wgGesucht.js @@ -1,4 +1,5 @@ -import utils, { buildHash } from '../utils.js'; +import { isOneOf, buildHash } from '../utils.js'; +import checkIfListingIsActive from '../services/listings/listingActiveTester.js'; let appliedBlackList = []; @@ -10,8 +11,8 @@ function normalize(o) { } function applyBlacklist(o) { - const titleNotBlacklisted = !utils.isOneOf(o.title, appliedBlackList); - const descNotBlacklisted = !utils.isOneOf(o.description, appliedBlackList); + const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList); + const descNotBlacklisted = !isOneOf(o.description, appliedBlackList); return o.id != null && titleNotBlacklisted && descNotBlacklisted; } @@ -31,6 +32,7 @@ const config = { }, normalize: normalize, filter: applyBlacklist, + activeTester: checkIfListingIsActive, }; export const init = (sourceConfig, blacklist) => { config.enabled = sourceConfig.enabled; diff --git a/lib/services/crons/demoCleanup-cron.js b/lib/services/crons/demoCleanup-cron.js new file mode 100644 index 0000000..79b5752 --- /dev/null +++ b/lib/services/crons/demoCleanup-cron.js @@ -0,0 +1,23 @@ +import { removeJobsByUserId } from '../storage/jobStorage.js'; +import { config } from '../../utils.js'; +import { getUsers } from '../storage/userStorage.js'; +import logger from '../logger.js'; +import cron from 'node-cron'; + +/** + * if we are running in demo environment, we have to cleanup the db files (specifically the jobs table) + */ +export function cleanupDemoAtMidnight() { + cron.schedule('0 0 * * *', cleanup); +} + +function cleanup() { + if (config.demoMode) { + const demoUser = getUsers(false).find((user) => user.username === 'demo'); + if (demoUser == null) { + logger.error('Demo user not found, cannot remove Jobs'); + return; + } + removeJobsByUserId(demoUser.id); + } +} diff --git a/lib/services/crons/listing-alive-cron.js b/lib/services/crons/listing-alive-cron.js new file mode 100644 index 0000000..60dd5d9 --- /dev/null +++ b/lib/services/crons/listing-alive-cron.js @@ -0,0 +1,13 @@ +import cron from 'node-cron'; +import runActiveChecker from '../listings/listingActiveService.js'; + +async function runTask() { + await runActiveChecker(); +} + +export async function initActiveCheckerCron() { + //run directly on start + await runTask(); + // then every day at 1 am + cron.schedule('0 1 * * *', runTask); +} diff --git a/lib/services/crons/tracker-cron.js b/lib/services/crons/tracker-cron.js new file mode 100644 index 0000000..7cfe8cc --- /dev/null +++ b/lib/services/crons/tracker-cron.js @@ -0,0 +1,17 @@ +import cron from 'node-cron'; +import { config, inDevMode } from '../../utils.js'; +import { trackMainEvent } from '../tracking/Tracker.js'; + +async function runTask() { + //make sure to only send tracking events if the user gave us the green light and we are not in dev mode + if (config.analyticsEnabled && !inDevMode()) { + await trackMainEvent(); + } +} + +export async function initTrackerCron() { + //run directly on start + await runTask(); + // then every 6 hours + cron.schedule('0 */6 * * *', runTask); +} diff --git a/lib/services/demoCleanup.js b/lib/services/demoCleanup.js deleted file mode 100644 index 4d995a4..0000000 --- a/lib/services/demoCleanup.js +++ /dev/null @@ -1,37 +0,0 @@ -import { setInterval } from 'node:timers'; -import { removeJobsByUserName } from './storage/jobStorage.js'; -import { config } from '../utils.js'; -import { getUsers } from './storage/userStorage.js'; - -/** - * if we are running in demo environment, we have to cleanup the db files (specifically the jobs table) - */ -export function cleanupDemoAtMidnight() { - const now = new Date(); - const millisUntilMidnightUTC = - (24 - now.getUTCHours()) * 60 * 60 * 1000 - - now.getUTCMinutes() * 60 * 1000 - - now.getUTCSeconds() * 1000 - - now.getUTCMilliseconds(); - - cleanup(); - setTimeout(() => { - setInterval( - () => { - cleanup(); - }, - 24 * 60 * 60 * 1000, - ); - }, millisUntilMidnightUTC); -} - -function cleanup() { - if (config.demoMode) { - const demoUser = getUsers(false).find((user) => user.username === 'demo'); - if (demoUser == null) { - console.error('Demo user not found, cannot remove Jobs'); - return; - } - removeJobsByUserName(demoUser.id); - } -} diff --git a/lib/services/events/event-bus.js b/lib/services/events/event-bus.js new file mode 100644 index 0000000..e56852b --- /dev/null +++ b/lib/services/events/event-bus.js @@ -0,0 +1,2 @@ +import { EventEmitter } from 'node:events'; +export const bus = new EventEmitter(); diff --git a/lib/services/extractor/extractor.js b/lib/services/extractor/extractor.js index 8e5cbaf..b881141 100644 --- a/lib/services/extractor/extractor.js +++ b/lib/services/extractor/extractor.js @@ -1,6 +1,7 @@ import { setDebug } from './utils.js'; import puppeteerExtractor from './puppeteerExtractor.js'; import { loadParser, parse } from './parser/parser.js'; +import logger from '../logger.js'; const DEFAULT_OPTIONS = { debug: false, @@ -32,7 +33,7 @@ export default class Extractor { loadParser(this.responseText); } } catch (error) { - console.error('Error trying to load page.', error); + logger.error('Error trying to load page.', error); } return this; }; diff --git a/lib/services/extractor/parser/parser.js b/lib/services/extractor/parser/parser.js index be22838..0666ac6 100644 --- a/lib/services/extractor/parser/parser.js +++ b/lib/services/extractor/parser/parser.js @@ -1,4 +1,5 @@ import * as cheerio from 'cheerio'; +import logger from '../../logger.js'; let $ = null; @@ -8,19 +9,19 @@ export function loadParser(text) { export function parse(crawlContainer, crawlFields, text, url) { if (!text) { - console.warn('No content found for ', url); + logger.debug('No content found for ', url); return null; } if (!crawlContainer || !crawlFields) { - console.warn('Cannot parse, selector was empty for url ', url); + logger.debug('Cannot parse, selector was empty for url ', url); return null; } const result = []; if ($(crawlContainer).length === 0) { - console.warn('No elements in crawl container found for url ', url); + logger.debug('No elements in crawl container found for url ', url); return null; } @@ -58,7 +59,7 @@ export function parse(crawlContainer, crawlFields, text, url) { parsedObject[key] = value || null; } catch (error) { - console.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error); + logger.error(`Error parsing field '${key}' with selector '${fieldSelector}':`, error); parsedObject[key] = null; } } @@ -66,7 +67,7 @@ export function parse(crawlContainer, crawlFields, text, url) { if (parsedObject.id != null) { result.push(parsedObject); } else { - console.warn('ID not found. Not relaying object.'); + logger.debug('ID not found. Not relaying object.'); } }); @@ -89,7 +90,7 @@ function applyModifiers(value, modifiers) { value = value.replace(/\n/g, ' '); break; default: - console.warn(`Unknown modifier: ${modifier}`); + logger.warn(`Unknown modifier: ${modifier}`); } }); diff --git a/lib/services/extractor/puppeteerExtractor.js b/lib/services/extractor/puppeteerExtractor.js index 7d858de..0818a5d 100644 --- a/lib/services/extractor/puppeteerExtractor.js +++ b/lib/services/extractor/puppeteerExtractor.js @@ -1,30 +1,57 @@ import puppeteer from 'puppeteer-extra'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; import { debug, DEFAULT_HEADER, botDetected } from './utils.js'; +import logger from '../logger.js'; +import fs from 'fs'; +import os from 'os'; +import path from 'path'; puppeteer.use(StealthPlugin()); export default async function execute(url, waitForSelector, options) { let browser; + let page; + let result = null; + let userDataDir; + let removeUserDataDir = false; try { debug(`Sending request to ${url} using Puppeteer.`); + // Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs + if (options && options.userDataDir) { + userDataDir = options.userDataDir; + removeUserDataDir = !!options.cleanupUserDataDir; + } else { + const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-'); + userDataDir = fs.mkdtempSync(prefix); + removeUserDataDir = true; + } + browser = await puppeteer.launch({ headless: options.puppeteerHeadless ?? true, - args: ['--no-sandbox', '--disable-gpu', '--disable-setuid-sandbox'], + args: [ + '--no-sandbox', + '--disable-gpu', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-crash-reporter', + ], timeout: options.puppeteerTimeout || 30_000, + userDataDir, }); - let page = await browser.newPage(); + page = await browser.newPage(); await page.setExtraHTTPHeaders(DEFAULT_HEADER); const response = await page.goto(url, { waitUntil: 'domcontentloaded', }); let pageSource; - //if we're extracting data from a spa, we must wait for the selector + // if we're extracting data from a SPA, we must wait for the selector if (waitForSelector != null) { - await page.waitForSelector(waitForSelector); + const selectorTimeout = options?.puppeteerSelectorTimeout ?? options?.puppeteerTimeout ?? 30_000; + await page.waitForSelector(waitForSelector, { timeout: selectorTimeout }); pageSource = await page.evaluate((selector) => { - return document.querySelector(selector).innerHTML; + const el = document.querySelector(selector); + return el ? el.innerHTML : ''; }, waitForSelector); } else { pageSource = await page.content(); @@ -33,17 +60,36 @@ export default async function execute(url, waitForSelector, options) { const statusCode = response.status(); if (botDetected(pageSource, statusCode)) { - console.warn('We have been detected as a bot :-/ Tried url: => ', url); - return null; + logger.warn('We have been detected as a bot :-/ Tried url: => ', url); + result = null; + } else { + result = pageSource || (await page.content()); } - - return await page.content(); } catch (error) { - console.error('Error executing with puppeteer executor', error); - return null; + logger.error('Error executing with puppeteer executor', error); + result = null; } finally { - if (browser != null) { - await browser.close(); + try { + if (page) { + await page.close(); + } + } catch { + // ignore + } + try { + if (browser != null) { + await browser.close(); + } + } catch { + // ignore + } + try { + if (removeUserDataDir && userDataDir) { + await fs.promises.rm(userDataDir, { recursive: true, force: true }); + } + } catch { + // ignore } } + return result; } diff --git a/lib/services/extractor/utils.js b/lib/services/extractor/utils.js index a356ece..af42f8d 100644 --- a/lib/services/extractor/utils.js +++ b/lib/services/extractor/utils.js @@ -1,3 +1,5 @@ +import logger from '../logger.js'; + let debuggingOn = false; export const DEFAULT_HEADER = { @@ -15,9 +17,7 @@ export const setDebug = (options) => { export const debug = (message) => { if (debuggingOn) { - /* eslint-disable no-console */ - console.debug(message); - /* eslint-enable no-console */ + logger.debug(message); } }; diff --git a/lib/services/immoscout/immoscout-web-translator.js b/lib/services/immoscout/immoscout-web-translator.js index 18a21e0..ec28957 100644 --- a/lib/services/immoscout/immoscout-web-translator.js +++ b/lib/services/immoscout/immoscout-web-translator.js @@ -60,6 +60,7 @@ https://api.mobile.immobilienscout24.de/search/map/v3?publishedafter=2025-05-14T https://api.mobile.immobilienscout24.de/search/map/v3?features=disableNHBGrouping,nextGen,fairPrice,listingsInListFirstSummary,xxlListingType,contactDetails&publishedafter=2025-05-14T09:19:43&sorting=standard&pagesize=300&searchType=shape&realEstateType=housebuy&pagenumber=1&shape=%7D%7BjwHy%7Cqh@jCKdCgAvB_BdB%7DBzAaCjAqCfAqC~@uCt@iCh@eCZkCLyC?_EO%7DEa@%7DEa@iE_@%7BD%5DaDe@gDi@gDo@uCu@kBcB_AeDOiE?iDCgCMuBOkDCkG?yFRgD%60@cB%5C%7BA%60@eBx@aB%7C@kAbAy@rAe@bBUxCAhE?dFh@fGlAzGbBbHlBxGdB%60FrAhDz@xBh@nAf@l@RNNXkCkMJR~B%7CEnCpErCnDtClCvC~ApCh@rCJpC? */ import queryString from 'query-string'; +import { nullOrEmpty } from '../../utils.js'; const PARAM_NAME_MAP = { heatingtypes: 'heatingtypes', @@ -193,3 +194,14 @@ export function convertWebToMobile(webUrl) { return `https://api.mobile.immobilienscout24.de/search/list?${mobileQuery}`; } + +export function convertImmoscoutListingToMobileListing(url) { + if (nullOrEmpty(url)) { + return null; + } + + return url.replace( + /^https:\/\/www\.immobilienscout24\.de\/expose\//, + 'https://api.mobile.immobilienscout24.de/expose/', + ); +} diff --git a/lib/services/listings/listingActiveService.js b/lib/services/listings/listingActiveService.js new file mode 100644 index 0000000..23c56bb --- /dev/null +++ b/lib/services/listings/listingActiveService.js @@ -0,0 +1,104 @@ +import { deactivateListings, getActiveOrUnknownListings } from '../storage/listingsStorage.js'; +import { getProviders } from '../../utils.js'; +import logger from '../../services/logger.js'; + +/** + * Runs the active-listing checker: + * 1) Loads all listings with unknown or active status. + * 2) Resolves each listing's provider and calls its `activeTester(link)`. + * 3) Collects listings that are no longer active and deactivates them in one batch. + * + * Concurrency: network-bound checks are executed with a configurable concurrency limit. + * + * @param {object} [opts] + * @param {number} [opts.concurrency=8] Max number of parallel activeTester calls. + * @returns {Promise} + */ +export default async function runActiveChecker(opts = {}) { + const { concurrency = 4 } = opts; + + const listings = getActiveOrUnknownListings(); + if (!Array.isArray(listings) || listings.length === 0) { + logger.debug('No listings to check.'); + return; + } + + const providers = await getProviders(); + if (!Array.isArray(providers) || providers.length === 0) { + logger.warn('No providers available. Skipping active checks.'); + return; + } + + // Build a map for O(1) provider lookup by id + /** @type {Record} */ + const providerById = Object.create(null); + for (const p of providers) { + const id = p?.metaInformation?.id; + if (id) providerById[id] = p; + } + + // Small generic mapLimit to cap concurrency without extra deps + /** + * @template T, R + * @param {T[]} items + * @param {number} limit + * @param {(item: T, index: number) => Promise} worker + * @returns {Promise} + */ + async function mapLimit(items, limit, worker) { + const results = new Array(items.length); + let next = 0; + + async function runOne() { + while (next < items.length) { + const i = next++; + try { + results[i] = await worker(items[i], i); + } catch (err) { + results[i] = /** @type {any} */ (err); + } + } + } + + const runners = Array.from({ length: Math.min(limit, items.length) }, runOne); + await Promise.all(runners); + return results; + } + + /** @type {string[]} */ + const listingsSetToInactive = []; + + await mapLimit(listings, concurrency, async (listing) => { + const { provider: listingProviderId, link, id } = listing || {}; + + const matchedProvider = providerById[listingProviderId]; + if (!matchedProvider) { + logger.warn('Could not find matching provider for', listingProviderId); + return; + } + const tester = matchedProvider?.config?.activeTester; + if (typeof tester !== 'function') { + logger.warn('No activeTester configured for', listingProviderId); + return; + } + + // Contract: activeTester(link) returns 1 if active, 0 if inactive + let result; + try { + result = await tester(link); + } catch { + result = -1; + } + + if (result === 0 && id) { + listingsSetToInactive.push(id); + } + }); + + if (listingsSetToInactive.length > 0) { + logger.info(`Setting ${listingsSetToInactive.length} listings to inactive.`); + deactivateListings(listingsSetToInactive); + } else { + logger.debug('No listings need to be set inactive.'); + } +} diff --git a/lib/services/listings/listingActiveTester.js b/lib/services/listings/listingActiveTester.js new file mode 100644 index 0000000..148c925 --- /dev/null +++ b/lib/services/listings/listingActiveTester.js @@ -0,0 +1,68 @@ +import fetch from 'node-fetch'; +import { randomBetween, sleep } from '../../utils.js'; + +const maxAttempts = 3; + +/** + * Check if a listing is still active with up to 3 attempts and exponential backoff. + * Backoff waits are capped and the last wait is at most 2000 ms. + * + * Rules: + * - HTTP 200 => return 1 + * - HTTP 401/403 => return -1 (most certainly detected as a bot) + * - HTTP 404 => return 0 + * - Other statuses or network errors => retry until attempts are exhausted + * + * @returns {Promise} 1 if active, o if not active and -1 if detected as bot + */ +export default async function checkIfListingIsActive(link) { + await sleep(randomBetween(50, 100)); + + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + try { + const res = await fetch(link, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36', + 'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8', + }, + }); + + if (res.status === 200) { + return 1; + } + if (res.status === 401) return -1; + if (res.status === 403) return -1; + if (res.status === 404) return 0; + + // For any other status, only retry if attempts remain + if (attempt < maxAttempts) { + await sleep(backoffDelay(attempt)); + continue; + } + + return 0; + } catch { + // Network error: retry if attempts remain + if (attempt < maxAttempts) { + await sleep(backoffDelay(attempt)); + continue; + } + return 0; + } + } + + return 0; +} + +/** + * Exponential backoff delay with cap. + * attempt: 1 -> 500ms, 2 -> 1000ms, 3 -> 2000ms (cap) + * @param {number} attempt 1-based attempt index + * @returns {number} delay in ms + */ +function backoffDelay(attempt) { + const base = 500; + const cap = 2000; + return Math.min(base * 2 ** (attempt - 1), cap); +} diff --git a/lib/services/logger.js b/lib/services/logger.js new file mode 100644 index 0000000..b60aafe --- /dev/null +++ b/lib/services/logger.js @@ -0,0 +1,59 @@ +const COLORS = { + debug: '\x1b[36m', + info: '\x1b[32m', + warn: '\x1b[33m', + error: '\x1b[31m', + reset: '\x1b[0m', +}; + +const env = process.env.NODE_ENV || 'development'; +const useColor = process.stdout.isTTY || process.stderr.isTTY; + +function ts() { + const d = new Date(); + const yyyy = d.getFullYear(); + const mm = String(d.getMonth() + 1).padStart(2, '0'); + const dd = String(d.getDate()).padStart(2, '0'); + const hh = String(d.getHours()).padStart(2, '0'); + const mi = String(d.getMinutes()).padStart(2, '0'); + const ss = String(d.getSeconds()).padStart(2, '0'); + return `${yyyy}-${mm}-${dd} ${hh}:${mi}:${ss}`; +} + +function lvl(level) { + const upper = level.toUpperCase(); + if (!useColor) return upper; + return `${COLORS[level] || ''}${upper}${COLORS.reset}`; +} + +/* eslint-disable no-console */ +function log(level, ...args) { + if (level === 'debug' && env !== 'development') { + return; // Skip debug logs in non-development environments + } + + const prefix = `[${ts()}] ${lvl(level)}:`; + switch (level) { + case 'debug': + console.debug(prefix, ...args); + break; + case 'info': + console.info(prefix, ...args); + break; + case 'warn': + console.warn(prefix, ...args); + break; + case 'error': + console.error(prefix, ...args); + break; + default: + console.log(prefix, ...args); + } +} + +export default { + debug: (...a) => log('debug', ...a), + info: (...a) => log('info', ...a), + warn: (...a) => log('warn', ...a), + error: (...a) => log('error', ...a), +}; diff --git a/lib/services/similarity-check/SimilarityCacheEntry.js b/lib/services/similarity-check/SimilarityCacheEntry.js deleted file mode 100644 index 65b32c7..0000000 --- a/lib/services/similarity-check/SimilarityCacheEntry.js +++ /dev/null @@ -1,26 +0,0 @@ -import stringSimilarity from 'string-similarity'; -//if the score is higher than this, it will be considered a match -const MAX_DICE_INDEX = 0.7; -export default (class SimilarityCacheEntry { - constructor(time) { - this.time = time; - this.values = []; - } - setCacheEntry = (entry) => { - this.values.push(entry); - }; - getTime = () => { - return this.time; - }; - hasSimilarEntries = (value) => { - if (this.values.length > 0) { - for (let i = 0; i < this.values.length; i++) { - const index = stringSimilarity.compareTwoStrings(value, this.values[i]); - if (index >= MAX_DICE_INDEX) { - return true; - } - } - } - return false; - }; -}); diff --git a/lib/services/similarity-check/similarityCache.js b/lib/services/similarity-check/similarityCache.js index 0e94819..f89f8f3 100644 --- a/lib/services/similarity-check/similarityCache.js +++ b/lib/services/similarity-check/similarityCache.js @@ -1,40 +1,116 @@ -import SimilarityCacheEntry from './SimilarityCacheEntry.js'; -import { config } from '../../utils.js'; -//5 minutes -let retention = 5 * 60 * 1000; -const intervalInMs = config.interval * 60 * 1000; -//an interval below 5 mins sounds crazy, but there are ppl out there doing crazy shit. -if (intervalInMs <= retention) { - retention = Math.floor(intervalInMs / 2); -} -//jobid -> SimilarityCacheEntry -const cache = {}; -let intervalId; +import crypto from 'crypto'; + +const retention = 60 * 60 * 1000; /** - * cleanup + * Internal cache storage. + * Maps a SHA-256 hash (string) to its expiry timestamp (number in ms). + * @type {Map} */ -intervalId = setInterval(() => { - const keysToBeRemoved = []; +const entries = new Map(); + +/** + * Reference to the currently scheduled cleanup timer. + * @type {NodeJS.Timeout | null} + */ +let timer = null; + +/** + * Generate a SHA-256 hash from a list of input strings. + * Null or undefined values are ignored. + * + * @param {...(string|null|undefined)} strings - Input values to hash + * @returns {string} Hexadecimal hash + */ +function toHash(...strings) { + return crypto.createHash('sha256').update(strings.filter(Boolean).join('|')).digest('hex'); +} + +/** + * Cleanup expired cache entries and schedule the next cleanup run. + * This function is invoked automatically by scheduled timers. + * + * @private + */ +function runCleanup() { const now = Date.now(); - Object.keys(cache).forEach((key) => { - if (cache[key].getTime() + retention < now) { - keysToBeRemoved.push(key); - } - }); - if (keysToBeRemoved.length > 0) { - keysToBeRemoved.forEach((key) => delete cache[key]); + for (const [hash, expiry] of entries) { + if (expiry <= now) entries.delete(hash); } -}, 10000); -export const addCacheEntry = (jobId, value) => { - cache[jobId] = cache[jobId] || new SimilarityCacheEntry(Date.now()); - cache[jobId].setCacheEntry(value); -}; -export const hasSimilarEntries = (jobId, value) => { - if (cache[jobId] == null) { + scheduleNext(); +} + +/** + * Find the soonest expiry timestamp among all cache entries + * and schedule a one-shot timer that will trigger at that time. + * Cancels any existing timer before scheduling a new one. + * + * @private + */ +function scheduleNext() { + if (timer) { + clearTimeout(timer); + timer = null; + } + let next = Infinity; + const now = Date.now(); + for (const expiry of entries.values()) { + if (expiry > now && expiry < next) next = expiry; + } + if (next !== Infinity) { + timer = setTimeout(runCleanup, Math.max(0, next - now)); + } +} + +/** + * Add or refresh a cache entry for the given title and address. + * The entry will automatically expire after the configured retention window. + * + * @param {string} title - The title used to build the cache key + * @param {string} address - The address used to build the cache key + */ +export function addCacheEntry(title, address) { + const hash = toHash(title, address); + const expiry = Date.now() + retention; + entries.set(hash, expiry); + scheduleNext(); +} + +/** + * Check if a cache entry with the same title and address exists + * and is still valid (not expired). + * + * @param {string} title - The title used to build the cache key + * @param {string} address - The address used to build the cache key + * @returns {boolean} True if a valid cache entry exists, false otherwise + */ +export function hasSimilarEntries(title, address) { + const hash = toHash(title, address); + const expiry = entries.get(hash); + if (expiry == null) return false; + if (expiry <= Date.now()) { + entries.delete(hash); + scheduleNext(); return false; } - return cache[jobId].hasSimilarEntries(value); -}; -export const stopCacheCleanup = () => { - clearInterval(intervalId); -}; + return true; +} + +/** + * Stop any scheduled cleanup timers and prevent further automatic cleanup. + * Entries that are already in the cache will remain until removed manually + * or until cleanup is started again by adding new entries. + */ +export function stopCacheCleanup() { + if (timer) clearTimeout(timer); + timer = null; +} + +/** + * this is only for test purposes + */ +export function invalidateAllForTest() { + for (const key of entries.keys()) { + entries.set(key, 0); + } + runCleanup(); +} diff --git a/lib/services/storage/LowDashAdapter.js b/lib/services/storage/LowDashAdapter.js deleted file mode 100644 index 1db153f..0000000 --- a/lib/services/storage/LowDashAdapter.js +++ /dev/null @@ -1,8 +0,0 @@ -import lodash from 'lodash'; -import { LowSync } from 'lowdb'; -export default class LowdashAdapter extends LowSync { - constructor(adapter, defaultData = {}) { - super(adapter, defaultData); - this.chain = lodash.chain(this).get('data'); - } -} diff --git a/lib/services/storage/SqliteConnection.js b/lib/services/storage/SqliteConnection.js new file mode 100644 index 0000000..eb4305f --- /dev/null +++ b/lib/services/storage/SqliteConnection.js @@ -0,0 +1,140 @@ +import fs from 'fs'; +import path from 'path'; +import Database from 'better-sqlite3'; +import logger from '../../services/logger.js'; +import { config } from '../../utils.js'; + +/** + * SqliteConnection + * A small, high-performance wrapper around better-sqlite3 that provides a + * singleton connection, sensible PRAGMA tuning, and helper methods. This + * module is safe to import and reuse. + * + * Performance notes: + * - journal_mode = WAL: allows concurrent readers with a single writer and + * yields better performance for server apps. + * - synchronous = NORMAL: trades a bit of durability for significant speed + * while still being safe in most environments. + * - cache_size = -64000: ~64MB page cache (negative value sets KB) to improve + * query performance for frequent reads. + * - foreign_keys = ON: ensure referential integrity is enforced. + * - optimize: runs SQLite's auto-analysis and purges internal caches. It is + * cheap; we call it at startup and before process exit. You can also call + * optimize() manually after large schema changes or bulk operations. + */ +class SqliteConnection { + static #db = null; + + /** + * Returns a singleton instance of better-sqlite3 Database. + * Respects env var SQLITE_DB_PATH and defaults to db/listings.db. + */ + static getConnection() { + if (this.#db) return this.#db; + + // Interpret config.sqlitepath as a directory relative to project root when it starts with '/' + const cfg = typeof config === 'object' && config ? config.sqlitepath : undefined; + const rawDir = cfg && cfg.length > 0 ? cfg : '/db'; + const relDir = rawDir.startsWith('/') ? rawDir.slice(1) : rawDir; + const absDir = path.isAbsolute(relDir) ? relDir : path.join(process.cwd(), relDir); + const dbPath = path.join(absDir, 'listings.db'); + + // Ensure directory exists + const dir = path.dirname(dbPath); + if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }); + + // Open the database synchronously (better-sqlite3 is sync and very fast) + this.#db = new Database(dbPath, { verbose: undefined }); + + // Apply high-performance PRAGMA's + try { + this.#db.pragma('journal_mode = WAL'); + this.#db.pragma('synchronous = NORMAL'); + this.#db.pragma('cache_size = -64000'); + this.#db.pragma('foreign_keys = ON'); + this.#db.pragma('optimize'); + } catch (e) { + logger.warn('Failed to apply one or more PRAGMAs:', e.message); + } + + // Run optimize on exit to persist analysis and cleanup internal caches. + process.once('beforeExit', () => { + try { + this.#db?.pragma('optimize'); + } catch (e) { + logger.debug('PRAGMA optimize on exit failed:', e.message); + } + }); + + return this.#db; + } + + /** + * Execute a write statement (INSERT/UPDATE/DELETE/DDL). Returns better-sqlite3 run info. + */ + static execute(sql, params = {}) { + const db = this.getConnection(); + return db.prepare(sql).run(params); + } + + /** + * Execute a query and returns all rows. + */ + static query(sql, params = {}) { + const db = this.getConnection(); + return db.prepare(sql).all(params); + } + + /** + * Check whether a table exists. + */ + static tableExists(tableName) { + const db = this.getConnection(); + const row = db.prepare("SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?").get(tableName); + return !!row; + } + + /** + * Run the given callback inside a transaction. The callback receives the Database instance. + * If the callback throws, the transaction is rolled back and the error re-thrown. + */ + static withTransaction(callback) { + const db = this.getConnection(); + const trx = db.transaction((cb) => cb(db)); + return trx(callback); + } + + /** + * Run SQLite PRAGMA optimize. See https://sqlite.org/pragma.html#pragma_optimize + * + * Explanation: PRAGMA optimize triggers internal housekeeping, such as + * recomputing query planner statistics (similar to ANALYZE) when appropriate + * and purging unused pages from caches. It is inexpensive and can improve + * performance after schema changes or heavy write activity. + */ + static optimize() { + const db = this.getConnection(); + try { + db.pragma('optimize'); + } catch (e) { + logger.warn('PRAGMA optimize failed:', e.message); + } + } + + /** + * Close the database connection. Typically not needed for long-running apps. + */ + static close() { + if (this.#db) { + try { + this.#db.pragma('optimize'); + } catch (e) { + logger.debug('PRAGMA optimize before close failed:', e.message); + } + this.#db.close(); + this.#db = null; + } + } +} + +export default SqliteConnection; diff --git a/lib/services/storage/jobStorage.js b/lib/services/storage/jobStorage.js index 6697b5c..3248638 100644 --- a/lib/services/storage/jobStorage.js +++ b/lib/services/storage/jobStorage.js @@ -1,107 +1,144 @@ -import { JSONFileSync } from 'lowdb/node'; import { nanoid } from 'nanoid'; -import * as listingStorage from './listingsStorage.js'; -import { getDirName } from '../../utils.js'; -import path from 'path'; -import LowdashAdapter from './LowDashAdapter.js'; - -const file = path.join(getDirName(), '../', 'db/jobs.json'); -const adapter = new JSONFileSync(file); -const db = new LowdashAdapter(adapter, { jobs: [] }); - -db.read(); +import SqliteConnection from './SqliteConnection.js'; +import logger from '../logger.js'; +import { toJson, fromJson } from '../../utils.js'; +/** + * Insert or update a job. Preserves original owner (userId) when updating an existing job. + * + * @param {Object} params + * @param {string} [params.jobId] - Existing job id to update; omit to insert a new job. + * @param {string} [params.name] - Job display name. + * @param {Array} [params.blacklist] - Blacklist entries; defaults to empty array. + * @param {boolean} [params.enabled] - Whether the job is enabled; defaults to true. + * @param {Array} params.provider - Provider configuration list. + * @param {Array} params.notificationAdapter - Notification adapter configuration list. + * @param {string} params.userId - Owner user id for inserts; preserved on updates. + * @returns {void} + */ export const upsertJob = ({ jobId, name, blacklist = [], enabled = true, provider, notificationAdapter, userId }) => { - const currentJob = - jobId == null - ? null - : db.chain - .get('jobs') - .find((job) => job.id === jobId) - .value(); - const jobs = db.chain - .get('jobs') - .filter((job) => job.id !== jobId) - .value(); - jobs.push({ - id: jobId || nanoid(), - //make sure to not overwrite the user id in case an admin changes the job - userId: currentJob == null ? userId : currentJob.userId, - enabled, - name, - blacklist, - provider, - notificationAdapter, - }); - db.chain.set('jobs', jobs).value(); - db.write(); -}; -export const getJob = (jobId) => { - const job = db.chain - .get('jobs') - .find((job) => job.id === jobId) - .value(); - if (job == null) { - return null; + const id = jobId || nanoid(); + const existing = SqliteConnection.query(`SELECT id, user_id FROM jobs WHERE id = @id LIMIT 1`, { id })[0]; + const ownerId = existing ? existing.user_id : userId; + if (existing) { + SqliteConnection.execute( + `UPDATE jobs + SET enabled = @enabled, + name = @name, + blacklist = @blacklist, + provider = @provider, + notification_adapter = @notification_adapter + WHERE id = @id`, + { + id, + enabled: enabled ? 1 : 0, + name: name ?? null, + blacklist: toJson(blacklist ?? []), + provider: toJson(provider ?? []), + notification_adapter: toJson(notificationAdapter ?? []), + }, + ); + } else { + SqliteConnection.execute( + `INSERT INTO jobs (id, user_id, enabled, name, blacklist, provider, notification_adapter) + VALUES (@id, @user_id, @enabled, @name, @blacklist, @provider, @notification_adapter)`, + { + id, + user_id: ownerId, + enabled: enabled ? 1 : 0, + name: name ?? null, + blacklist: toJson(blacklist ?? []), + provider: toJson(provider ?? []), + notification_adapter: toJson(notificationAdapter ?? []), + }, + ); } +}; + +/** + * Get a single job by id. + * @param {string} jobId - Job primary key. + * @returns {Job|null} The job or null if not found. + */ +export const getJob = (jobId) => { + const row = SqliteConnection.query( + `SELECT j.id, + j.user_id AS userId, + j.enabled, + j.name, + j.blacklist, + j.provider, + j.notification_adapter AS notificationAdapter, + (SELECT COUNT(1) FROM listings l WHERE l.job_id = j.id) AS numberOfFoundListings + FROM jobs j + WHERE j.id = @id + LIMIT 1`, + { id: jobId }, + )[0]; + if (!row) return null; return { - ...job, - numberOfFoundListings: listingStorage.getNumberOfAllKnownListings(job.id).length, + ...row, + enabled: !!row.enabled, + blacklist: fromJson(row.blacklist, []), + provider: fromJson(row.provider, []), + notificationAdapter: fromJson(row.notificationAdapter, []), }; }; + +/** + * Update job enabled status. + * @param {{jobId: string, status: boolean}} params - Parameters. + * @returns {void} + */ export const setJobStatus = ({ jobId, status }) => { - db.chain - .get('jobs') - .find((job) => job.id === jobId) - .assign({ enabled: status }) - .value(); - db.write(); + SqliteConnection.execute(`UPDATE jobs SET enabled = @enabled WHERE id = @id`, { + id: jobId, + enabled: status ? 1 : 0, + }); }; + +/** + * Remove a job by id. Listings are deleted automatically due to FK ON DELETE CASCADE. + * @param {string} jobId - Job id. + * @returns {void} + */ export const removeJob = (jobId) => { - listingStorage.removeListings(jobId); - db.chain - .get('jobs') - .remove((job) => job.id === jobId) - .value(); - db.write(); + // listings table has FK ON DELETE CASCADE via job_id + SqliteConnection.execute(`DELETE FROM jobs WHERE id = @id`, { id: jobId }); }; + export const removeJobsByUserId = (userId) => { - db.chain - .get('jobs') - .filter((job) => job.userId === userId) - .forEach((job) => listingStorage.removeListings(job.id)); - db.chain - .get('jobs') - .remove((job) => job.userId === userId) - .value(); - db.write(); -}; -export const removeJobsByUserName = (userId) => { - let removedDemoJobs = 0; - db.chain - .get('jobs') - .filter((job) => job.userId === userId) - .forEach((job) => { - removedDemoJobs++; - listingStorage.removeListings(job.id); - }); - db.chain - .get('jobs') - .remove((job) => job.userId === userId) - .value(); - db.write(); - if (removedDemoJobs > 0) { - /* eslint-disable no-console */ - console.log(`Removed ${removedDemoJobs} demo jobs`); - /* eslint-enable no-console */ + // Count jobs to log similar to previous behavior + const count = + SqliteConnection.query(`SELECT COUNT(1) AS c FROM jobs WHERE user_id = @user_id`, { user_id: userId })[0]?.c ?? 0; + SqliteConnection.execute(`DELETE FROM jobs WHERE user_id = @user_id`, { user_id: userId }); + if (count > 0) { + logger.info(`Removed ${count} jobs for user ${userId}`); } }; + +/** + * Get all jobs. + * @returns {Job[]} List of jobs ordered by name (NULLs last). + */ export const getJobs = () => { - return db.chain - .get('jobs') - .map((job) => ({ - ...job, - numberOfFoundListings: listingStorage.getNumberOfAllKnownListings(job.id), - })) - .value(); + const rows = SqliteConnection.query( + `SELECT j.id, + j.user_id AS userId, + j.enabled, + j.name, + j.blacklist, + j.provider, + j.notification_adapter AS notificationAdapter, + (SELECT COUNT(1) FROM listings l WHERE l.job_id = j.id) AS numberOfFoundListings + FROM jobs j + ORDER BY j.name IS NULL, j.name`, + ); + return rows.map((row) => ({ + ...row, + enabled: !!row.enabled, + blacklist: fromJson(row.blacklist, []), + provider: fromJson(row.provider, []), + notificationAdapter: fromJson(row.notificationAdapter, []), + })); }; diff --git a/lib/services/storage/listingsStorage.js b/lib/services/storage/listingsStorage.js index 08c8197..082767b 100755 --- a/lib/services/storage/listingsStorage.js +++ b/lib/services/storage/listingsStorage.js @@ -1,52 +1,168 @@ -import { JSONFileSync } from 'lowdb/node'; -import { getDirName } from '../../utils.js'; -import path from 'path'; -import LowdashAdapter from './LowDashAdapter.js'; +import { nullOrEmpty } from '../../utils.js'; +import SqliteConnection from './SqliteConnection.js'; +import { nanoid } from 'nanoid'; -const file = path.join(getDirName(), '../', 'db/jobListingData.json'); -const adapter = new JSONFileSync(file); -const db = new LowdashAdapter(adapter, {}); - -db.read(); - -const buildKey = (jobKey, providerId, endpoint) => { - let key = `${jobKey}`; - if (jobKey == null && endpoint == null) { - return key; - } - if (providerId != null) { - key += `.${providerId}`; - } - if (endpoint != null) { - key += `.${endpoint}`; - } - return key; -}; -export const getNumberOfAllKnownListings = (jobId) => { - const data = db.chain.get(`${jobId}.providerData`).value() || {}; - return Object.values(data) - .map((values) => Object.keys(values).length) - .reduce((accumulator, currentValue) => accumulator + currentValue, 0); -}; +/** + * Build analytics data for a given job by grouping all listings by provider and + * mapping each listing hash to its creation timestamp. + * + * SQL shape: + * SELECT json_group_object(provider, json_object(hash, created_at)) AS result + * FROM listings WHERE job_id = @jobId; + * + * The resulting object has the shape: + * { + * providerA: { "": , "": , ... }, + * providerB: { ... } + * } + * + * @param {string} jobId - ID of the job whose listings should be aggregated. + * @returns {Record>} Object grouped by provider mapping listing-hash -> created_at epoch ms. + */ export const getListingProviderDataForAnalytics = (jobId) => { - const key = buildKey(jobId, 'providerData'); - return db.chain.get(key).value() || {}; + const row = SqliteConnection.query( + `SELECT COALESCE( + json_group_object(provider, json(provider_map)), + json('{}') + ) AS result + FROM (SELECT provider, + json_group_object(hash, created_at) AS provider_map + FROM listings + WHERE job_id = @jobId + GROUP BY provider);`, + { jobId }, + ); + + return row?.length > 0 ? JSON.parse(row[0].result) : {}; }; -export const getKnownListings = (jobId, providerId) => { - const providerListingsKey = buildKey(jobId, 'providerData', providerId, 'listings'); - return db.chain.get(providerListingsKey).value() || {}; + +/** + * Return a list of known listing hashes for a given job and provider. + * Useful to de-duplicate before inserting new listings. + * + * @param {string} jobId - The job identifier. + * @param {string} providerId - The provider identifier (e.g., 'immoscout'). + * @returns {string[]} Array of listing hashes. + */ +export const getKnownListingHashesForJobAndProvider = (jobId, providerId) => { + return SqliteConnection.query( + `SELECT hash + FROM listings + WHERE job_id = @jobId AND provider = @providerId`, + { jobId, providerId }, + ).map((r) => r.hash); }; -export const setKnownListings = (jobId, providerId, listings) => { - const providerListingsKey = buildKey(jobId, 'providerData', providerId, 'listings'); - db.chain.set(providerListingsKey, listings).value(); - return db.write(); + +/** + * Return a list of listing that either are active or have an unknown status + * to constantly check if they are still online + * + * @returns {string[]} Array of listings + */ +export const getActiveOrUnknownListings = () => { + return SqliteConnection.query( + `SELECT * + FROM listings + WHERE is_active is null OR is_active = 1 ORDER BY provider`, + ); }; -export const setLastJobExecution = (jobId) => { - const key = buildKey(jobId, null, 'lastExecution'); - db.chain.set(key, Date.now()).value(); - return db.write(); + +/** + * Deactivates listings by setting is_active = 0 for all matching IDs. + * + * @param {string[]} ids - Array of listing IDs to deactivate. + * @returns {object[]} Result of the SQLite query execution. + */ +export const deactivateListings = (ids) => { + const placeholders = ids.map(() => '?').join(','); + return SqliteConnection.execute( + `UPDATE listings + SET is_active = 0 + WHERE id IN (${placeholders})`, + ids, + ); }; -export const removeListings = (jobId) => { - db.chain.unset(jobId).value(); - db.write(); + +/** + * Persist a batch of scraped listings for a given job and provider. + * + * - Empty or non-array inputs are ignored. + * - Each listing is inserted with ON CONFLICT(hash) DO NOTHING to avoid duplicates. + * - Performs inserts in a single transaction for performance. + * + * Listing input shape (minimal expected): + * { + * id: string, // unique id + * hash: string // stable hash/id of the listing (used as unique hash) + * price?: string, // e.g., "1.234 €" or "1,234€" + * size?: string, // e.g., "70 m²" + * title?: string, + * image?: string, // image URL + * description?: string, + * address?: string, // free-text address possibly containing parentheses + * link?: string + * } + * + * @param {string} jobId - The job identifier. + * @param {string} providerId - The provider identifier. + * @param {Array} listings - Array of listing objects as described above. + * @returns {void} + */ +export const storeListings = (jobId, providerId, listings) => { + if (!Array.isArray(listings) || listings.length === 0) { + return; + } + + SqliteConnection.withTransaction((db) => { + const stmt = db.prepare( + `INSERT INTO listings (id, hash, provider, job_id, price, size, title, image_url, description, address, + link, created_at, is_active) + VALUES (@id, @hash, @provider, @job_id, @price, @size, @title, @image_url, @description, @address, @link, + @created_at, 1) + ON CONFLICT(job_id, hash) DO NOTHING`, + ); + + for (const item of listings) { + const params = { + id: nanoid(), + hash: item.id, + provider: providerId, + job_id: jobId, + price: extractNumber(item.price), + size: extractNumber(item.size), + title: item.title, + image_url: item.image, + description: item.description, + address: removeParentheses(item.address), + link: item.link, + created_at: Date.now(), + }; + stmt.run(params); + } + }); + + /** + * Extract the first number from a string like "1.234 €" or "70 m²". + * Removes dots/commas before parsing. Returns null on invalid input. + * @param {string|undefined|null} str + * @returns {number|null} + */ + function extractNumber(str) { + if (!str) return null; + const match = str.replace(/[.,]/g, '').match(/\d+/); + return match ? +match[0] : null; + } + + /** + * Remove any parentheses segments (including surrounding whitespace) from a string. + * Returns null for empty input. + * @param {string|undefined|null} str + * @returns {string|null} + */ + function removeParentheses(str) { + if (nullOrEmpty(str)) { + return null; + } + return str.replace(/\s*\([^)]*\)/g, ''); + } }; diff --git a/lib/services/storage/migrations/migrate.js b/lib/services/storage/migrations/migrate.js new file mode 100644 index 0000000..ab7356d --- /dev/null +++ b/lib/services/storage/migrations/migrate.js @@ -0,0 +1,185 @@ +/** + * Migration Runner for better-sqlite3 + * I know there are external libs out there, but + * a) most of them are pretty bloated + * b) I wanted to have something that fit's this limited use-case + * c) I was searching for justifications anyway to build a migration system on my own. Don't judge me ;) + * + * Executes all migration files in lib/services/storage/migrations/sql in natural order. + * Each migration runs in its own transaction. If a migration fails, only that + * migration is rolled back and the process stops with a non-zero exit code. + * Already applied migrations are skipped using the schema_migrations table. + * + * Usage: + * CLI: yarn run migratedb + * Programmatic: + * import { runMigrations } from './lib/services/storage/migrations/migrate.js'; + * await runMigrations(); + * + * Migration file format (example: lib/services/storage/migrations/sql/1.add-users.js): + * export function up(db) { + * db.exec("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT NOT NULL)"); + * } + * + */ +import fs from 'fs'; +import path from 'path'; +import { pathToFileURL } from 'url'; +import crypto from 'crypto'; +import SqliteConnection from '../SqliteConnection.js'; +import logger from '../../logger.js'; + +const ROOT = path.resolve('.'); +const MIGRATIONS_DIR = path.join(ROOT, 'lib', 'services', 'storage', 'migrations', 'sql'); + +/** + * Ensures that the given directory exists, creating it recursively if needed. + * @param {string} p - Path to the directory. + */ +function ensureDir(p) { + if (!fs.existsSync(p)) fs.mkdirSync(p, { recursive: true }); +} + +/** + * Lists all migration files in the migrations directory. + * Migration files must follow the format: .