
Puppeteer rewrite (#119)
* Moving to puppeteer | removing scrapingAnt
orangecoding authored Dec 17, 2024
1 parent 58965a6 commit 214e714
Showing 30 changed files with 1,367 additions and 1,190 deletions.
9 changes: 2 additions & 7 deletions README.md
@@ -81,13 +81,8 @@ yarn run test
 # Architecture
 ![Architecture](/doc/architecture.jpg "Architecture")
 
-### Immoscout / Immonet / NeubauKompass
-I have added **experimental** support for Immoscout, Immonet and NeubauKompass. They all are somewhat special, because they have decided to secure their service from bots using Re-Capture. Finding a way around this is barely possible. For _Fredy_ to be able to bypass this check, I'm using a service called [ScrapingAnt](https://scrapingant.com/). The trick is to use a headless browser, rotating proxies and (once successfully validated) to re-send the cookies each time.
-
-To be able to use Immoscout / Immonet, you need to create an account at ScrapingAnt. Configure the API key in the "General Settings" tab (visible when logged in as administrator).
-The rest will be handled by _Fredy_. Keep in mind, the support is experimental. There might be bugs and you might not always pass the re-capture check, but most of the time it works rather well :)
-
-If you need more than the 1000 API calls allowed per month, I'd suggest opting for a paid account... ScrapingAnt loves OpenSource, therefore they have decided to give all _Fredy_ users a 10% discount by using the code **FREDY10** (Disclaimer: I do not earn any money for recommending their service).
+### Immoscout
+Immoscout has implemented advanced bot detection. I’m actively working on bypassing these measures, but until then, selecting Immoscout as a provider will not return any results. I apologize for the inconvenience. 😉
 
 # Analytics
 Fredy is completely free (and will always remain free). However, it would be a huge help if you’d allow me to collect some analytical data.
2 changes: 1 addition & 1 deletion conf/config.json
@@ -1 +1 @@
-{"interval":"60","port":9998,"scrapingAnt":{"apiKey":"","proxy":"datacenter"},"workingHours":{"from":"","to":""},"demoMode":false,"analyticsEnabled":null}
+{"interval":"60","port":9998,"workingHours":{"from":"","to":""},"demoMode":false,"analyticsEnabled":null}
234 changes: 109 additions & 125 deletions lib/FredyRuntime.js
@@ -1,134 +1,118 @@
-import { NoNewListingsWarning } from './errors.js';
-import { setKnownListings, getKnownListings } from './services/storage/listingsStorage.js';
+import {NoNewListingsWarning} from './errors.js';
+import {setKnownListings, getKnownListings} from './services/storage/listingsStorage.js';
 import * as notify from './notification/notify.js';
-import xray from './services/scraper.js';
-import * as scrapingAnt from './services/scrapingAnt.js';
+import Extractor from './services/extractor/extractor.js';
 import urlModifier from './services/queryStringMutator.js';
 
 class FredyRuntime {
   /**
    *
    * @param providerConfig the config for the specific provider, we're going to query at the moment
    * @param notificationConfig the config for all notifications
    * @param providerId the id of the provider currently in use
    * @param jobKey key of the job that is currently running (from within the config)
    * @param similarityCache cache instance holding values to check for similarity of entries
    */
   constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
     this._providerConfig = providerConfig;
     this._notificationConfig = notificationConfig;
     this._providerId = providerId;
     this._jobKey = jobKey;
     this._similarityCache = similarityCache;
   }
 
   execute() {
     return (
       //modify the url to make sure search order is correctly set
       Promise.resolve(urlModifier(this._providerConfig.url, this._providerConfig.sortByDateParam))
         //scraping the site and try finding new listings
         .then(this._getListings.bind(this))
         //bring them in a proper form (dictated by the provider)
         .then(this._normalize.bind(this))
         //filter listings with stuff tagged by the blacklist of the provider
         .then(this._filter.bind(this))
         //check if new listings available. if so proceed
         .then(this._findNew.bind(this))
         //store everything in db
         .then(this._save.bind(this))
         //check for similar listings. if found, remove them before notifying
         .then(this._filterBySimilarListings.bind(this))
         //notify the user using the configured notification adapter
         .then(this._notify.bind(this))
         //if an error occurred on the way, handle it here.
         .catch(this._handleError.bind(this))
     );
   }
 
   _getListings(url) {
-    return new Promise((resolve, reject) => {
-      const id = this._providerId;
-      if (scrapingAnt.needScrapingAnt(id) && !scrapingAnt.isScrapingAntApiKeySet()) {
-        const error = 'Immoscout or Immonet can only be used with if you have set an apikey for scrapingAnt.';
-        /* eslint-disable no-console */
-        console.log(error);
-        /* eslint-enable no-console */
-        reject(error);
-        return;
-      }
-      const u = scrapingAnt.needScrapingAnt(id) ? scrapingAnt.transformUrlForScrapingAnt(url, id) : url;
-      try {
-        if (this._providerConfig.paginate != null) {
-          xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
-            //the first 2 pages should be enough here
-            .limit(2)
-            .paginate(this._providerConfig.paginate)
-            .then((listings) => {
-              resolve(listings == null ? [] : listings);
-            })
-            .catch((err) => {
-              reject(err);
-              console.error(err);
-            });
-        } else {
-          xray(u, this._providerConfig.crawlContainer, [this._providerConfig.crawlFields])
-            .then((listings) => {
-              resolve(listings == null ? [] : listings);
-            })
-            .catch((err) => {
-              reject(err);
-              console.error(err);
-            });
-        }
-      } catch (error) {
-        reject(error);
-        console.error(error);
-      }
-    });
+    const extractor = new Extractor();
+    return new Promise((resolve, reject) => {
+      extractor
+        .execute(url, this._providerConfig.waitForSelector)
+        .then(() => {
+          const listings = extractor.parseResponseText(this._providerConfig.crawlContainer, this._providerConfig.crawlFields);
+          resolve(listings == null ? [] : listings);
+        })
+        .catch((err) => {
+          reject(err);
+          /* eslint-disable no-console */
+          console.error(err);
+          /* eslint-enable no-console */
+        });
+    });
   }
 
   _normalize(listings) {
     return listings.map(this._providerConfig.normalize);
   }
 
   _filter(listings) {
     //only return those where all the fields have been found
     const keys = Object.keys(this._providerConfig.crawlFields);
     const filteredListings = listings.filter((item) => keys.every((key) => key in item));
     return filteredListings.filter(this._providerConfig.filter);
   }
 
   _findNew(listings) {
     const newListings = listings.filter((o) => getKnownListings(this._jobKey, this._providerId)[o.id] == null);
     if (newListings.length === 0) {
       throw new NoNewListingsWarning();
     }
     return newListings;
   }
 
   _notify(newListings) {
     if (newListings.length === 0) {
       throw new NoNewListingsWarning();
     }
     const sendNotifications = notify.send(this._providerId, newListings, this._notificationConfig, this._jobKey);
     return Promise.all(sendNotifications).then(() => newListings);
   }
 
   _save(newListings) {
     const currentListings = getKnownListings(this._jobKey, this._providerId) || {};
     newListings.forEach((listing) => {
       currentListings[listing.id] = Date.now();
     });
     setKnownListings(this._jobKey, this._providerId, currentListings);
     return newListings;
   }
 
   _filterBySimilarListings(listings) {
     const filteredList = listings.filter((listing) => {
       const similar = this._similarityCache.hasSimilarEntries(this._jobKey, listing.title);
       if (similar) {
         /* eslint-disable no-console */
         console.debug(`Filtering similar entry for job with id ${this._jobKey} with title: `, listing.title);
         /* eslint-enable no-console */
       }
       return !similar;
     });
     filteredList.forEach((filter) => this._similarityCache.addCacheEntry(this._jobKey, filter.title));
     return filteredList;
   }
 
   _handleError(err) {
     if (err.name !== 'NoNewListingsWarning') console.error(err);
   }
 }
 
 export default FredyRuntime;
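The rewritten `_getListings` delegates all fetching to the `Extractor` imported above (`lib/services/extractor/extractor.js` is added by this commit but not expanded on this page). Going only by the contract used in the diff — `execute(url, waitForSelector)` followed by `parseResponseText(crawlContainer, crawlFields)` — a minimal Puppeteer-based extractor could be sketched like this; the real implementation, including its handling of the x-ray-style `| filter` suffixes, may differ:

```js
// sketch of a Puppeteer-backed extractor; cheerio is an assumed parsing choice
import puppeteer from 'puppeteer';
import * as cheerio from 'cheerio';

export default class Extractor {
  constructor() {
    this._responseText = null;
  }

  // render the page in a headless browser and keep the resulting HTML
  async execute(url, waitForSelector) {
    const browser = await puppeteer.launch({ headless: true });
    try {
      const page = await browser.newPage();
      await page.goto(url, { waitUntil: 'domcontentloaded' });
      if (waitForSelector != null) {
        await page.waitForSelector(waitForSelector, { timeout: 10000 });
      }
      this._responseText = await page.content();
    } finally {
      await browser.close();
    }
  }

  // apply the provider's container/field selectors to the stored HTML;
  // '| int'-style filters are ignored in this sketch
  parseResponseText(crawlContainer, crawlFields) {
    const $ = cheerio.load(this._responseText);
    return $(crawlContainer)
      .map((_, el) => {
        const listing = {};
        for (const [field, rawSelector] of Object.entries(crawlFields)) {
          const [selector, attr] = rawSelector.split('|')[0].trim().split('@');
          const node = selector === '' ? $(el) : $(el).find(selector).first();
          listing[field] = attr != null ? node.attr(attr) : node.text().trim();
        }
        return listing;
      })
      .get();
  }
}
```

Keeping the browser handling inside the extractor is what lets the promise chain in `execute()` stay untouched: `FredyRuntime` only swaps the xray/ScrapingAnt call for the two extractor calls.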
23 changes: 1 addition & 22 deletions lib/api/routes/jobRouter.js
@@ -5,7 +5,6 @@ import * as userStorage from '../../services/storage/userStorage.js';
 import * as immoscoutProvider from '../../provider/immoscout.js';
 import { config } from '../../utils.js';
 import { isAdmin } from '../security.js';
-import {isScrapingAntApiKeySet} from '../../services/scrapingAnt.js';
 import {trackDemoJobCreated} from '../../services/tracking/Tracker.js';
 const service = restana();
 const jobRouter = service.newRouter();
@@ -27,34 +26,14 @@ jobRouter.get('/', async (req, res) => {
   res.send();
 });
 jobRouter.get('/processingTimes', async (req, res) => {
-  let scrapingAntData = {};
-  if (isScrapingAntApiKeySet()) {
-    try {
-      const response = await fetch(`https://api.scrapingant.com/v2/usage?x-api-key=${config.scrapingAnt.apiKey}`);
-      scrapingAntData = await response.json();
-    } catch (Exception) {
-      console.error('Could not query plan data from scraping ant.', Exception);
-    }
-  }
   res.body = {
     interval: config.interval,
-    lastRun: config.lastRun || null,
-    scrapingAntData,
-    error: scrapingAntData?.detail == null ? null : scrapingAntData?.detail
+    lastRun: config.lastRun || null
   };
   res.send();
 });
 jobRouter.post('/', async (req, res) => {
   const { provider, notificationAdapter, name, blacklist = [], jobId, enabled } = req.body;
-  if (
-    provider.find((p) => p.id === immoscoutProvider.metaInformation.id) != null &&
-    (config.scrapingAnt.apiKey == null || config.scrapingAnt.apiKey.length === 0)
-  ) {
-    res.send(
-      new Error('To use Immoscout as provider, you need to configure ScrapingAnt first. Please check the readme.')
-    );
-    return;
-  }
   try {
     jobStorage.upsertJob({
       userId: req.session.currentUser,
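With the ScrapingAnt usage lookup gone, `/processingTimes` returns only the interval and the last run. A client-side sketch (assuming the router is mounted under `/api/jobs` and a session cookie is already set — both assumptions, as the mounting is not part of this diff):

```js
// hypothetical client call; adjust the prefix to wherever jobRouter is mounted
const res = await fetch('/api/jobs/processingTimes');
const { interval, lastRun } = await res.json();
console.log(`interval: ${interval} min, last run: ${lastRun ? new Date(lastRun).toISOString() : 'never'}`);
```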
1 change: 0 additions & 1 deletion lib/defaultConfig.js
@@ -1,7 +1,6 @@
 export const DEFAULT_CONFIG = {
   'interval': '60',
   'port': 9998,
-  'scrapingAnt': {'apiKey': '', 'proxy': 'datacenter'},
   'workingHours': {'from': '', 'to': ''},
   'demoMode': false,
   'analyticsEnabled': null
14 changes: 5 additions & 9 deletions lib/provider/einsAImmobilien.js
@@ -2,14 +2,10 @@ import utils, { buildHash } from '../utils.js';
 let appliedBlackList = [];
 
 function normalize(o) {
-  let size = `${o.size.replace(' Wohnfläche ', '').trim()}`;
-  if (o.rooms != null) {
-    size += ` / / ${o.rooms.trim()}`;
-  }
   const link = `https://www.1a-immobilienmarkt.de/expose/${o.id}.html`;
   const price = normalizePrice(o.price);
   const id = buildHash(o.id, price);
-  return Object.assign(o, { id, price, size, link });
+  return Object.assign(o, { id, price, link });
 }
 
 /**
@@ -39,12 +35,12 @@ const config = {
   url: null,
   crawlContainer: '.tabelle',
   sortByDateParam: 'sort_type=newest',
+  waitForSelector: 'body',
   crawlFields: {
     id: '.inner_object_data input[name="marker_objekt_id"]@value | int',
-    price: '.tabelle .inner_object_data .single_data_price | removeNewline | trim',
-    size: '.tabelle .inner_object_data .data_boxes div:nth-child(1)',
-    rooms: '.tabelle .inner_object_data .data_boxes div:nth-child(2)',
-    title: '.tabelle .inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
+    price: '.inner_object_data .single_data_price | removeNewline | trim',
+    size: '.tabelle .tabelle_inhalt_infos .single_data_box | removeNewline | trim',
+    title: '.inner_object_data .tabelle_inhalt_titel_black | removeNewline | trim',
   },
   normalize: normalize,
   filter: applyBlacklist,
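The `| int`, `| removeNewline` and `| trim` suffixes in `crawlFields` are x-ray-style post-processing filters applied to each extracted value. Their implementations live in Fredy's scraper/extractor services and are not shown in this commit; a hedged sketch of how such a chain could be evaluated (filter names mirror the ones used above):

```js
// hypothetical filter table covering the names visible in the provider configs
const filters = {
  int: (v) => parseInt(v, 10),
  removeNewline: (v) => v.replace(/\n/g, ' '),
  trim: (v) => v.trim(),
};

// evaluate a '.selector | removeNewline | trim' style definition against a raw value
function applyFilters(raw, fieldDefinition) {
  return fieldDefinition
    .split('|')
    .slice(1) // everything after the selector itself
    .map((name) => name.trim())
    .reduce((value, name) => filters[name](value), raw);
}

// applyFilters(' 850 €\n', 'x | removeNewline | trim') === '850 €'
```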
10 changes: 6 additions & 4 deletions lib/provider/immobilienDe.js
@@ -11,8 +11,9 @@ function normalize(o) {
   const price = o.price || 'N/A €';
   const title = o.title || 'No title available';
   const address = o.address || 'No address available';
-  const link = shortenLink(o.link);
-  const id = buildHash(parseId(shortenLink(o.link)), o.price);
+  const shortLink = shortenLink(o.link);
+  const link = `https://www.immobilien.de/${shortLink}`;
+  const id = buildHash(parseId(shortLink), o.price);
   return Object.assign(o, { id, price, size, title, address, link });
 }
 function applyBlacklist(o) {
@@ -22,17 +23,18 @@ function applyBlacklist(o) {
 }
 const config = {
   url: null,
-  crawlContainer: '.estates_list .list_immo a._ref',
+  crawlContainer: '._ref',
   sortByDateParam: 'sort_col=*created_ts&sort_dir=desc',
+  waitForSelector: 'body',
   crawlFields: {
     id: '@href', //will be transformed later
     price: '.list_entry .immo_preis .label_info',
     size: '.list_entry .flaeche .label_info | removeNewline | trim',
     title: '.list_entry .part_text h3 span',
+    description: '.list_entry .description | trim',
     link: '@href',
     address: '.list_entry .place',
   },
-  paginate: '.list_immo .blocknav .blocknav_list li.next a@href',
   normalize: normalize,
   filter: applyBlacklist,
 };
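Since the listing id is `buildHash(parseId(shortLink), o.price)`, a price change produces a new hash, so the same listing is reported again when its price moves. `buildHash` comes from `../utils.js` and is not part of this diff; a plausible stand-in (an assumption, not Fredy's actual implementation) would be:

```js
// hypothetical buildHash stand-in based on Node's crypto module
import crypto from 'crypto';

function buildHash(...parts) {
  return crypto.createHash('sha256').update(parts.filter(Boolean).join('|')).digest('hex');
}
```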
