-
-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Moving to puppeteer | removing scrapingAnt
- Loading branch information
1 parent
58965a6
commit 214e714
Showing
30 changed files
with
1,367 additions
and
1,190 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
{"interval":"60","port":9998,"scrapingAnt":{"apiKey":"","proxy":"datacenter"},"workingHours":{"from":"","to":""},"demoMode":false,"analyticsEnabled":null} | ||
{"interval":"60","port":9998,"workingHours":{"from":"","to":""},"demoMode":false,"analyticsEnabled":null} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,134 +1,118 @@ | ||
import { NoNewListingsWarning } from './errors.js'; | ||
import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js'; | ||
import {NoNewListingsWarning} from './errors.js'; | ||
import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js'; | ||
import * as notify from './notification/notify.js'; | ||
import xray from './services/scraper.js'; | ||
import * as scrapingAnt from './services/scrapingAnt.js'; | ||
import Extractor from './services/extractor/extractor.js'; | ||
import urlModifier from './services/queryStringMutator.js'; | ||
|
||
class FredyRuntime { | ||
/** | ||
* | ||
* @param providerConfig the config for the specific provider, we're going to query at the moment | ||
* @param notificationConfig the config for all notifications | ||
* @param providerId the id of the provider currently in use | ||
* @param jobKey key of the job that is currently running (from within the config) | ||
* @param similarityCache cache instance holding values to check for similarity of entries | ||
*/ | ||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) { | ||
this._providerConfig = providerConfig; | ||
this._notificationConfig = notificationConfig; | ||
this._providerId = providerId; | ||
this._jobKey = jobKey; | ||
this._similarityCache = similarityCache; | ||
} | ||
execute() { | ||
return ( | ||
//modify the url to make sure search order is correctly set | ||
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam)) | ||
//scraping the site and try finding new listings | ||
.then(this._getListings.bind(this)) | ||
//bring them in a proper form (dictated by the provider) | ||
.then(this._normalize.bind(this)) | ||
//filter listings with stuff tagged by the blacklist of the provider | ||
.then(this._filter.bind(this)) | ||
//check if new listings available. if so proceed | ||
.then(this._findNew.bind(this)) | ||
//store everything in db | ||
.then(this._save.bind(this)) | ||
//check for similar listings. if found, remove them before notifying | ||
.then(this._filterBySimilarListings.bind(this)) | ||
//notify the user using the configured notification adapter | ||
.then(this._notify.bind(this)) | ||
//if an error occurred on the way, handle it here. | ||
.catch(this._handleError.bind(this)) | ||
); | ||
} | ||
_getListings(url) { | ||
return new Promise((resolve, reject) => { | ||
const id = this._providerId; | ||
if (scrapingAnt.needScrapingAnt(id) && !scrapingAnt.isScrapingAntApiKeySet()) { | ||
const error = 'Immoscout or Immonet can only be used with if you have set an apikey for scrapingAnt.'; | ||
/* eslint-disable no-console */ | ||
console.log(error); | ||
/* eslint-enable no-console */ | ||
reject(error); | ||
return; | ||
} | ||
const u = scrapingAnt.needScrapingAnt(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url; | ||
try { | ||
if (this._providerConfig.paginate != null) { | ||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields]) | ||
//the first 2 pages should be enough here | ||
.limit(2) | ||
.paginate(this._providerConfig.paginate) | ||
.then((listings) => { | ||
resolve(listings == null ? [] : listings); | ||
}) | ||
.catch((err) => { | ||
reject(err); | ||
console.error(err); | ||
}); | ||
} else { | ||
xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields]) | ||
.then((listings) => { | ||
resolve(listings == null ? [] : listings); | ||
}) | ||
.catch((err) => { | ||
reject(err); | ||
console.error(err); | ||
/** | ||
* | ||
* @param providerConfig the config for the specific provider, we're going to query at the moment | ||
* @param notificationConfig the config for all notifications | ||
* @param providerId the id of the provider currently in use | ||
* @param jobKey key of the job that is currently running (from within the config) | ||
* @param similarityCache cache instance holding values to check for similarity of entries | ||
*/ | ||
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) { | ||
this._providerConfig = providerConfig; | ||
this._notificationConfig = notificationConfig; | ||
this._providerId = providerId; | ||
this._jobKey = jobKey; | ||
this._similarityCache = similarityCache; | ||
} | ||
|
||
execute() { | ||
return ( | ||
//modify the url to make sure search order is correctly set | ||
Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam)) | ||
//scraping the site and try finding new listings | ||
.then(this._getListings.bind(this)) | ||
//bring them in a proper form (dictated by the provider) | ||
.then(this._normalize.bind(this)) | ||
//filter listings with stuff tagged by the blacklist of the provider | ||
.then(this._filter.bind(this)) | ||
//check if new listings available. if so proceed | ||
.then(this._findNew.bind(this)) | ||
//store everything in db | ||
.then(this._save.bind(this)) | ||
//check for similar listings. if found, remove them before notifying | ||
.then(this._filterBySimilarListings.bind(this)) | ||
//notify the user using the configured notification adapter | ||
.then(this._notify.bind(this)) | ||
//if an error occurred on the way, handle it here. | ||
.catch(this._handleError.bind(this)) | ||
); | ||
} | ||
|
||
_getListings(url) { | ||
const extractor = new Extractor(); | ||
return new Promise((resolve, reject) => { | ||
extractor.execute(url,this._providerConfig.waitForSelector) | ||
.then(() => { | ||
const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields); | ||
resolve(listings == null ? [] : listings); | ||
}).catch(err => { | ||
reject(err); | ||
/* eslint-disable no-console */ | ||
console.error(err); | ||
/* eslint-enable no-console */ | ||
}); | ||
}); | ||
} | ||
|
||
_normalize(listings) { | ||
return listings.map(this._providerConfig.normalize); | ||
} | ||
|
||
_filter(listings) { | ||
//only return those where all the fields have been found | ||
const keys = Object.keys(this._providerConfig.crawlFields); | ||
const filteredListings = listings.filter((item) => keys.every((key) => key in item)); | ||
return filteredListings.filter(this._providerConfig.filter); | ||
} | ||
|
||
_findNew(listings) { | ||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null); | ||
if (newListings.length === 0) { | ||
throw new NoNewListingsWarning(); | ||
} | ||
return newListings; | ||
} | ||
|
||
_notify(newListings) { | ||
if (newListings.length === 0) { | ||
throw new NoNewListingsWarning(); | ||
} | ||
} catch (error) { | ||
reject(error); | ||
console.error(error); | ||
} | ||
}); | ||
} | ||
_normalize(listings) { | ||
return listings.map(this._providerConfig.normalize); | ||
} | ||
_filter(listings) { | ||
//only return those where all the fields have been found | ||
const keys = Object.keys(this._providerConfig.crawlFields); | ||
const filteredListings = listings.filter((item) => keys.every((key) => key in item)); | ||
return filteredListings.filter(this._providerConfig.filter); | ||
} | ||
_findNew(listings) { | ||
const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null); | ||
if (newListings.length === 0) { | ||
throw new NoNewListingsWarning(); | ||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey); | ||
return Promise.all(sendNotifications).then(() => newListings); | ||
} | ||
|
||
_save(newListings) { | ||
const currentListings = getKnownListings(this._jobKey, this._providerId) || {}; | ||
newListings.forEach((listing) => { | ||
currentListings[listing.id] = Date.now(); | ||
}); | ||
setKnownListings(this._jobKey, this._providerId, currentListings); | ||
return newListings; | ||
} | ||
|
||
_filterBySimilarListings(listings) { | ||
const filteredList = listings.filter((listing) => { | ||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title); | ||
if (similar) { | ||
/* eslint-disable no-console */ | ||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title); | ||
/* eslint-enable no-console */ | ||
} | ||
return !similar; | ||
}); | ||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title)); | ||
return filteredList; | ||
} | ||
return newListings; | ||
} | ||
_notify(newListings) { | ||
if (newListings.length === 0) { | ||
throw new NoNewListingsWarning(); | ||
|
||
_handleError(err) { | ||
if (err.name !== 'NoNewListingsWarning') console.error(err); | ||
} | ||
const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey); | ||
return Promise.all(sendNotifications).then(() => newListings); | ||
} | ||
_save(newListings) { | ||
const currentListings = getKnownListings(this._jobKey, this._providerId) || {}; | ||
newListings.forEach((listing) => { | ||
currentListings[listing.id] = Date.now(); | ||
}); | ||
setKnownListings(this._jobKey, this._providerId, currentListings); | ||
return newListings; | ||
} | ||
_filterBySimilarListings(listings) { | ||
const filteredList = listings.filter((listing) => { | ||
const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title); | ||
if (similar) { | ||
/* eslint-disable no-console */ | ||
console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title); | ||
/* eslint-enable no-console */ | ||
} | ||
return !similar; | ||
}); | ||
filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title)); | ||
return filteredList; | ||
} | ||
_handleError(err) { | ||
if (err.name !== 'NoNewListingsWarning') console.error(err); | ||
} | ||
} | ||
|
||
export default FredyRuntime; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.