From cf52865ccf46095ecb5afa4691d63fe3fca5cb4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20Ambro=C5=BEi=C4=8D?=
Date: Wed, 15 Mar 2017 20:24:39 +0100
Subject: [PATCH] - Added option to configure title filtering in config.json
 - Exposed scraping function
 - Added package.json for easier installation
 - Added .gitignore file

---
Review notes (ignored by git am / git apply, not part of the commit message):
- Bolha ads keep the "/B" database prefix used by the lines this patch removes;
  "/N" is the prefix of the nepremicnine.net branch only.
- processFilters() accepts every ad when config.json has no "filters" section,
  so configs written before this change keep working.
- NOTE(review): leading whitespace inside the hunks was reconstructed; confirm it
  against scraper.js before applying, or use `git apply --ignore-whitespace`.

 .gitignore   |  4 +++
 package.json | 18 ++++++++++++
 scraper.js   | 90 +++++++++++++++++++++++++++++++++-------------------
 3 files changed, 82 insertions(+), 30 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 package.json

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..929ee2b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+/node_modules
+config.json
+.idea
+NepremicnineProd.json
\ No newline at end of file
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..ffc8c8d
--- /dev/null
+++ b/package.json
@@ -0,0 +1,18 @@
+{
+
+  "name": "real-estate-scraper",
+  "version": "0.0.1",
+  "description": "Realestate scraper",
+  "repository": {
+    "type": "git"
+  },
+
+  "dependencies": {
+    "nodemailer": "2.4.2",
+    "tinyreq": "3.1.1",
+    "node-json-db": "0.7.2",
+    "cheerio": "0.20.0",
+    "daemon": "1.1.0"
+  }
+
+}
\ No newline at end of file
diff --git a/scraper.js b/scraper.js
index fd52647..dc7fe84 100644
--- a/scraper.js
+++ b/scraper.js
@@ -69,13 +69,16 @@ sites.push({
                     conn: "www.bolha.com" + bAds[i].children[3].children[1].children[0].attribs.href,
                     price: bAds[i].children[7].children[1].children[0].children[0].data
                 };
-                try {
-                    db.getData("/B"+ad.id);
-                } catch (err){
-                    bolhaDelta += 1;
-                    db.push("/B"+ad.id,ad);
-                    delta.push(ad);
-                }
+
+                if (processFilters(ad)) {
+                    try {
+                        db.getData("/B"+ad.id);
+                    } catch (err){
+                        bolhaDelta += 1;
+                        db.push("/B"+ad.id,ad);
+                        delta.push(ad);
+                    }
+                }
             }
         }
         console.log((new Date()).toString() + " BOLHA.COM Delta: " + bolhaDelta);
@@ -100,12 +103,15 @@ sites.push({
                     conn: "https://www.nepremicnine.net" + nAds[i].children[5].children[5].children[0].attribs.href,//nAds[i].children[1].children[0].attribs.href,
                     price: nAds[i].children[5].children[11].children[13].children[4].children[0].data//nAds[i].children[5].children[13].children[4].children[0].data
                 };
-                try {
-                    db.getData("/N"+ad.id);
-                } catch (err){
-                    nepremDelta += 1;
-                    db.push("/N"+ad.id,ad);
-                    delta.push(ad);
+
+                if (processFilters(ad)) {
+                    try {
+                        db.getData("/N"+ad.id);
+                    } catch (err){
+                        nepremDelta += 1;
+                        db.push("/N"+ad.id,ad);
+                        delta.push(ad);
+                    }
                 }
             }
         }
@@ -115,24 +121,48 @@ sites.push({
     }
 });
 
+/**
+ * Tells whether an ad passes the title filter read from config.filters.
+ * A missing or empty filters.title.contains list accepts every ad; otherwise
+ * ad.title must contain at least one listed substring (case-sensitive).
+ */
+function processFilters(ad) {
+    var filters = config.filters || {};
+    var titleContains = (filters.title && filters.title.contains) || [];
+    if (titleContains.length > 0) {
+        for (var i = 0; i < titleContains.length; i++) {
+            if (ad.title.indexOf(titleContains[i]) !== -1) {
+                return true;
+            }
+        }
+    } else {
+        return true;
+    }
 
 
-setInterval(
-    function(){
-        console.log((new Date()).toString() + " STARTED");
-        if ((new Date()).getHours() < 22 && (new Date()).getHours() >= 7) {
-            for (var i = 0; i < sites.length; i++){
-                request({
-                    url: sites[i].url,
-                    method: "GET",
-                    headers: {
-                        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
-                    }
-                },
-                sites[i].callback
-                );
-            }
-        }
-    },
+    return false;
+}
+
+/**
+ * Logs a start marker, then issues one GET per entry in `sites` - but only when the local hour is in [7, 22).
+ */
+function runScraping() {
+    console.log((new Date()).toString() + " STARTED");
+    if ((new Date()).getHours() < 22 && (new Date()).getHours() >= 7) {
+        for (var i = 0; i < sites.length; i++){
+            request({
+                url: sites[i].url,
+                method: "GET",
+                headers: {
+                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
+                }
+            },
+            sites[i].callback
+            );
+        }
+    }
+}
+
+setInterval(
+    runScraping,
     interval * 60 * 1000
 );
\ No newline at end of file