-
-
Notifications
You must be signed in to change notification settings - Fork 138
Feature/spec filter #276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Feature/spec filter #276
Changes from all commits
bb6b46b
21db416
2739fab
ae975d9
9578192
a134da3
85643fc
067682b
a7c7273
82d08d7
8b368a1
18cbe24
fdc7a7d
fed77be
27310b0
153c63c
47e3bb9
0074053
7f9ce14
2dedcbc
3a246ff
2bf368d
58c91f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,12 @@ | ||
| { | ||
| "compilerOptions": { | ||
| "module": "NodeNext", | ||
| "moduleResolution": "NodeNext", | ||
| "target": "ESNext", | ||
| "checkJs": true, | ||
| "allowJs": true, | ||
| "noEmit": true, | ||
| "strict": false | ||
| }, | ||
| "exclude": ["node_modules", "ui"] | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,22 +19,14 @@ import { distanceMeters } from './services/listings/distanceCalculator.js'; | |
| import { getUserSettings } from './services/storage/settingsStorage.js'; | ||
| import { updateListingDistance } from './services/storage/listingsStorage.js'; | ||
| import booleanPointInPolygon from '@turf/boolean-point-in-polygon'; | ||
| import { formatListing } from './utils/formatListing.js'; | ||
|
|
||
| /** | ||
| * @typedef {Object} Listing | ||
| * @property {string} id Stable unique identifier (hash) of the listing. | ||
| * @property {string} title Title or headline of the listing. | ||
| * @property {string} [address] Optional address/location text. | ||
| * @property {string} [price] Optional price text/value. | ||
| * @property {string} [url] Link to the listing detail page. | ||
| * @property {any} [meta] Provider-specific additional metadata. | ||
| */ | ||
|
|
||
| /** | ||
| * @typedef {Object} SimilarityCache | ||
| * @property {(title:string, address?:string)=>boolean} hasSimilarEntries Returns true if a similar entry is known. | ||
| * @property {(title:string, address?:string)=>void} addCacheEntry Adds a new entry to the similarity cache. | ||
| */ | ||
| /** @import { ParsedListing } from './types/listing.js' */ | ||
| /** @import { Job } from './types/job.js' */ | ||
| /** @import { ProviderConfig } from './types/providerConfig.js' */ | ||
| /** @import { SpecFilter, SpatialFilter } from './types/filter.js' */ | ||
| /** @import { SimilarityCache } from './types/similarityCache.js' */ | ||
| /** @import { Browser } from './types/browser.js' */ | ||
|
|
||
| /** | ||
| * Runtime orchestrator for fetching, normalizing, filtering, deduplicating, storing, | ||
|
|
@@ -48,42 +40,43 @@ import booleanPointInPolygon from '@turf/boolean-point-in-polygon'; | |
| * 5) Identify new listings (vs. previously stored hashes) | ||
| * 6) Persist new listings | ||
| * 7) Filter out entries similar to already seen ones | ||
| * 8) Dispatch notifications | ||
| * 8) Filter out entries that do not match the job's specFilter | ||
| * 9) Filter out entries that do not match the job's spatialFilter | ||
| * 10) Dispatch notifications | ||
| */ | ||
| class FredyPipelineExecutioner { | ||
| /** | ||
| * Create a new runtime instance for a single provider/job execution. | ||
| * | ||
| * @param {Object} providerConfig Provider configuration. | ||
| * @param {string} providerConfig.url Base URL to crawl. | ||
| * @param {string} [providerConfig.sortByDateParam] Query parameter used to enforce sorting by date (provider-specific). | ||
| * @param {string} [providerConfig.waitForSelector] CSS selector to wait for before parsing content. | ||
| * @param {Object.<string, string>} providerConfig.crawlFields Mapping of field names to selectors/paths to extract. | ||
| * @param {string} providerConfig.crawlContainer CSS selector for the container holding listing items. | ||
| * @param {(raw:any)=>Listing} providerConfig.normalize Function to convert raw scraped data into a Listing shape. | ||
| * @param {(listing:Listing)=>boolean} providerConfig.filter Function to filter out unwanted listings. | ||
| * @param {(url:string, waitForSelector?:string)=>Promise<void>|Promise<Listing[]>} [providerConfig.getListings] Optional override to fetch listings. | ||
| * @param {Object} notificationConfig Notification configuration passed to notification adapters. | ||
| * @param {Object} spatialFilter Optional spatial filter configuration. | ||
| * @param {ProviderConfig} providerConfig Provider configuration. | ||
| * @param {Job} job Job configuration. | ||
| * @param {string} providerId The ID of the provider currently in use. | ||
| * @param {string} jobKey Key of the job that is currently running (from within the config). | ||
| * @param {SimilarityCache} similarityCache Cache instance for checking similar entries. | ||
| * @param browser | ||
| * @param {Browser} browser Puppeteer browser instance. | ||
| */ | ||
| constructor(providerConfig, notificationConfig, spatialFilter, providerId, jobKey, similarityCache, browser) { | ||
| constructor(providerConfig, job, providerId, similarityCache, browser) { | ||
| /** @type {ProviderConfig} */ | ||
| this._providerConfig = providerConfig; | ||
| this._notificationConfig = notificationConfig; | ||
| this._spatialFilter = spatialFilter; | ||
| /** @type {Object} */ | ||
| this._jobNotificationConfig = job.notificationAdapter; | ||
| /** @type {string} */ | ||
| this._jobKey = job.id; | ||
| /** @type {SpecFilter | null} */ | ||
| this._jobSpecFilter = job.specFilter; | ||
| /** @type {SpatialFilter | null} */ | ||
| this._jobSpatialFilter = job.spatialFilter; | ||
| /** @type {string} */ | ||
| this._providerId = providerId; | ||
| this._jobKey = jobKey; | ||
| /** @type {SimilarityCache} */ | ||
| this._similarityCache = similarityCache; | ||
| /** @type {Browser} */ | ||
| this._browser = browser; | ||
| } | ||
|
|
||
| /** | ||
| * Execute the end-to-end pipeline for a single provider run. | ||
| * | ||
| * @returns {Promise<Listing[]|void>} Resolves to the list of new (and similarity-filtered) listings | ||
| * @returns {Promise<ParsedListing[]|void>} Resolves to the list of new (and similarity-filtered) listings | ||
| * after notifications have been sent; resolves to void when there are no new listings. | ||
| */ | ||
| execute() { | ||
|
|
@@ -95,7 +88,8 @@ class FredyPipelineExecutioner { | |
| .then(this._geocode.bind(this)) | ||
| .then(this._save.bind(this)) | ||
| .then(this._calculateDistance.bind(this)) | ||
| .then(this._filterBySimilarListings.bind(this)) | ||
| .then(this._deleteSimilarListings.bind(this)) | ||
| .then(this._filterBySpecs.bind(this)) | ||
| .then(this._filterByArea.bind(this)) | ||
| .then(this._notify.bind(this)) | ||
| .catch(this._handleError.bind(this)); | ||
|
|
@@ -104,8 +98,8 @@ class FredyPipelineExecutioner { | |
| /** | ||
| * Geocode new listings. | ||
| * | ||
| * @param {Listing[]} newListings New listings to geocode. | ||
| * @returns {Promise<Listing[]>} Resolves with the listings (potentially with added coordinates). | ||
| * @param {ParsedListing[]} newListings New listings to geocode. | ||
| * @returns {Promise<ParsedListing[]>} Resolves with the listings (potentially with added coordinates). | ||
| */ | ||
| async _geocode(newListings) { | ||
| for (const listing of newListings) { | ||
|
|
@@ -124,20 +118,19 @@ class FredyPipelineExecutioner { | |
| * Filter listings by area using the job's spatial filter if available. | ||
| * Only filters if the job's spatialFilter contains polygon features AND the listing has coordinates. | ||
| * | ||
| * @param {Listing[]} newListings New listings to filter by area. | ||
| * @returns {Promise<Listing[]>} Resolves with listings that are within the area (or not filtered if no area is set). | ||
| * @param {ParsedListing[]} newListings New listings to filter by area. | ||
| * @returns {ParsedListing[]} Listings that are within the area (or all listings, unfiltered, if no area is set). | ||
| */ | ||
| _filterByArea(newListings) { | ||
| const polygonFeatures = this._spatialFilter?.features?.filter((f) => f.geometry?.type === 'Polygon'); | ||
| const polygonFeatures = this._jobSpatialFilter?.features?.filter((f) => f.geometry?.type === 'Polygon'); | ||
|
|
||
| // If no area filter is set, return all listings | ||
| if (!polygonFeatures?.length) { | ||
| return newListings; | ||
| } | ||
|
|
||
| const filteredIds = []; | ||
| // Filter listings by area - keep only those within the polygon | ||
| const keptListings = newListings.filter((listing) => { | ||
| const filteredListings = newListings.filter((listing) => { | ||
| // If listing doesn't have coordinates, keep it (don't filter out) | ||
| if (listing.latitude == null || listing.longitude == null) { | ||
| return true; | ||
|
|
@@ -147,26 +140,42 @@ class FredyPipelineExecutioner { | |
| const point = [listing.longitude, listing.latitude]; // GeoJSON format: [lon, lat] | ||
| const isInPolygon = polygonFeatures.some((feature) => booleanPointInPolygon(point, feature)); | ||
|
|
||
| if (!isInPolygon) { | ||
| filteredIds.push(listing.id); | ||
| } | ||
|
|
||
| return isInPolygon; | ||
| }); | ||
|
|
||
| if (filteredIds.length > 0) { | ||
| deleteListingsById(filteredIds); | ||
| return filteredListings; | ||
| } | ||
|
|
||
| /** | ||
| * Filter listings based on their specifications (minRooms, minSize, maxPrice). | ||
| * A listing that has no value for a given specification is not filtered out by that specification. | ||
| * | ||
| * @param {ParsedListing[]} newListings New listings to filter. | ||
| * @returns {ParsedListing[]} Listings that pass the specification filters. | ||
| */ | ||
| _filterBySpecs(newListings) { | ||
| const { minRooms, minSize, maxPrice } = this._jobSpecFilter || {}; | ||
|
|
||
| // If no specs are set, return all listings | ||
| if (!minRooms && !minSize && !maxPrice) { | ||
| return newListings; | ||
| } | ||
|
|
||
| return keptListings; | ||
| const filtered = newListings.filter((listing) => { | ||
| if (minRooms && listing.rooms && listing.rooms < minRooms) return false; | ||
| if (minSize && listing.size && listing.size < minSize) return false; | ||
| if (maxPrice && listing.price && listing.price > maxPrice) return false; | ||
| return true; | ||
| }); | ||
|
|
||
| return filtered; | ||
| } | ||
|
|
||
| /** | ||
| * Fetch listings from the provider, using the default Extractor flow unless | ||
| * a provider-specific getListings override is supplied. | ||
| * | ||
| * @param {string} url The provider URL to fetch from. | ||
| * @returns {Promise<Listing[]>} Resolves with an array of listings (empty when none found). | ||
| * @returns {Promise<ParsedListing[]>} Resolves with an array of listings (empty when none found). | ||
| */ | ||
| _getListings(url) { | ||
| const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser }); | ||
|
|
@@ -189,33 +198,42 @@ class FredyPipelineExecutioner { | |
| } | ||
|
|
||
| /** | ||
| * Normalize raw listings into the provider-specific Listing shape. | ||
| * Normalize raw listings into the provider-specific ParsedListing shape. | ||
| * | ||
| * @param {any[]} listings Raw listing entries from the extractor or override. | ||
| * @returns {Listing[]} Normalized listings. | ||
| * @returns {ParsedListing[]} Normalized listings. | ||
| */ | ||
| _normalize(listings) { | ||
| return listings.map(this._providerConfig.normalize); | ||
| return listings.map((listing) => this._providerConfig.normalize(listing)); | ||
| } | ||
|
|
||
| /** | ||
| * Filter out listings that are missing required fields and those rejected by the | ||
| * provider's blacklist/filter function. | ||
| * | ||
| * @param {Listing[]} listings Listings to filter. | ||
| * @returns {Listing[]} Filtered listings that pass validation and provider filter. | ||
| * @param {ParsedListing[]} listings Listings to filter. | ||
| * @returns {ParsedListing[]} Filtered listings that pass validation and provider filter. | ||
| */ | ||
| _filter(listings) { | ||
| const keys = Object.keys(this._providerConfig.crawlFields); | ||
| const filteredListings = listings.filter((item) => keys.every((key) => key in item)); | ||
| return filteredListings.filter(this._providerConfig.filter); | ||
| const requiredKeys = this._providerConfig.fieldNames; | ||
| const requireValues = ['id', 'link', 'title']; | ||
|
|
||
| const filteredListings = listings | ||
| // This should never filter any listings out, because the normalize function should always extract all fields. | ||
| .filter((item) => requiredKeys.every((key) => key in item)) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As you told me, I added it, without knowing what it is for. |
||
| // TODO: move the blacklist filter to this file, so it is handled the same way for all providers. | ||
| .filter(this._providerConfig.filter) | ||
| // filter out listings that are missing required fields | ||
| .filter((item) => requireValues.every((key) => item[key] != null)); | ||
|
|
||
| return filteredListings; | ||
| } | ||
|
|
||
| /** | ||
| * Determine which listings are new by comparing their IDs against stored hashes. | ||
| * | ||
| * @param {Listing[]} listings Listings to evaluate for novelty. | ||
| * @returns {Listing[]} New listings not seen before. | ||
| * @param {ParsedListing[]} listings Listings to evaluate for novelty. | ||
| * @returns {ParsedListing[]} New listings not seen before. | ||
| * @throws {NoNewListingsWarning} When no new listings are found. | ||
| */ | ||
| _findNew(listings) { | ||
|
|
@@ -232,23 +250,30 @@ class FredyPipelineExecutioner { | |
| /** | ||
| * Send notifications for new listings using the configured notification adapter(s). | ||
| * | ||
| * @param {Listing[]} newListings New listings to notify about. | ||
| * @returns {Promise<Listing[]>} Resolves to the provided listings after notifications complete. | ||
| * @param {ParsedListing[]} newListings New listings to notify about. | ||
| * @returns {Promise<ParsedListing[]>} Resolves to the provided listings after notifications complete. | ||
| * @throws {NoNewListingsWarning} When there are no listings to notify about. | ||
| */ | ||
| _notify(newListings) { | ||
| if (newListings.length === 0) { | ||
| throw new NoNewListingsWarning(); | ||
| } | ||
| const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey); | ||
| // TODO: move this to the notification adapter, so it is handled the same way for all providers. | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you resolve this todo before we proceed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't do it because it means changes to all notification files. Also, this doesn't feel correct. I'm not sure where to put it. Maybe I can put it into the
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok I get it. leave it as it is for now :) |
||
| const formattedListings = newListings.map(formatListing); | ||
| const sendNotifications = notify.send( | ||
| this._providerId, | ||
| formattedListings, | ||
| this._jobNotificationConfig, | ||
| this._jobKey, | ||
| ); | ||
| return Promise.all(sendNotifications).then(() => newListings); | ||
| } | ||
|
|
||
| /** | ||
| * Persist new listings and pass them through. | ||
| * | ||
| * @param {Listing[]} newListings Listings to store. | ||
| * @returns {Listing[]} The same listings, unchanged. | ||
| * @param {ParsedListing[]} newListings Listings to store. | ||
| * @returns {ParsedListing[]} The same listings, unchanged. | ||
| */ | ||
| _save(newListings) { | ||
| logger.debug(`Storing ${newListings.length} new listings (Provider: '${this._providerId}')`); | ||
|
|
@@ -259,8 +284,8 @@ class FredyPipelineExecutioner { | |
| /** | ||
| * Calculate distance for new listings. | ||
| * | ||
| * @param {Listing[]} listings | ||
| * @returns {Listing[]} | ||
| * @param {ParsedListing[]} listings | ||
| * @returns {ParsedListing[]} | ||
| * @private | ||
| */ | ||
| _calculateDistance(listings) { | ||
|
|
@@ -296,10 +321,10 @@ class FredyPipelineExecutioner { | |
| * Remove listings that are similar to already known entries according to the similarity cache. | ||
| * Adds the remaining listings to the cache. | ||
| * | ||
| * @param {Listing[]} listings Listings to filter by similarity. | ||
| * @returns {Listing[]} Listings considered unique enough to keep. | ||
| * @param {ParsedListing[]} listings Listings to filter by similarity. | ||
| * @returns {ParsedListing[]} Listings considered unique enough to keep. | ||
| */ | ||
| _filterBySimilarListings(listings) { | ||
| _deleteSimilarListings(listings) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will then also rename this back. Before your changes, all filters looked like filters and this one did not, so I renamed it. Now every filter works with delete, so I will rename it back. |
||
| const filteredIds = []; | ||
| const keptListings = listings.filter((listing) => { | ||
| const similar = this._similarityCache.checkAndAddEntry({ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did you remove the deleteListings here? That was added on purpose so that we do not scrape things infinitely.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might have happened during the merge. I didn't realize that you made changes here.
I will fix it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks ❤️