This repository was archived by the owner on Apr 19, 2022. It is now read-only.

Commit b0b510c

Committed Jun 7, 2019

app review changes: trailing slashes removed, error fixed in worker catch block, comments updated

1 parent 392e734 commit b0b510c

5 files changed, +40 −21 lines
README.md (+1 −1)
@@ -4,12 +4,12 @@ Crawl medium.com urls recursively.
 Local setup
 * make sure node version is > 8.x. To install node via nvm: `nvm install 8.x`
 * install mongodb | follow this link https://www.digitalocean.com/community/tutorials/how-to-install-mongodb-on-ubuntu-18-04
+* make sure the `mongod` service is running: `sudo service mongod status` (for ubuntu)
 * `npm start` to start app in dev env
 
 
 Dev Setup
 * make sure `docker` and `docker-compose` are installed. If not, follow link https://docs.docker.com/install/
-* make sure `mongod` service is running `sudo service mongod status` (for ubunutu)
 * `npm run app:test` to run
 
 Prod setup

components/controllers/params.controller.js (+2 −0)
@@ -4,6 +4,7 @@
 const config = require('../../config');
 const Params = require('../models/params');
 module.exports = {
+    // To save query params data in DB
     save: async function(data) {
         let urlParams = new Params(data);
         let err, result;
@@ -14,6 +15,7 @@ module.exports = {
         }
         return [err, result];
     },
+    // To get record from DB based on queryParams, e.g. find record by md5
     getOneby: async function(queryParams) {
         let err, result;
         try {
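
The diff only shows the head of each method, but the shared pattern is visible: errors are caught and returned, never thrown. A hedged reconstruction of the full save body, assuming Params is a standard Mongoose model (the try/catch interior is not shown in the diff):

    // Sketch (not shown in the diff): how save produces the [err, result] tuple
    save: async function(data) {
        let urlParams = new Params(data);
        let err, result;
        try {
            result = await urlParams.save(); // persist the document
        } catch (e) {
            err = e;
        }
        return [err, result];
    },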

components/controllers/url.controller.js (+2 −0)
@@ -4,6 +4,7 @@
 const config = require('../../config');
 const Urls = require('../models/urls');
 module.exports = {
+    // To save url data in DB
     save: async function(data) {
         let url = new Urls(data);
         let err, result;
@@ -14,6 +15,7 @@ module.exports = {
         }
         return [err, result];
     },
+    // To get record from DB based on queryParams, e.g. find by md5
     getOneby: async function(params) {
         let err, result;
         try {
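
Because both controllers resolve to an [err, result] tuple instead of throwing, callers branch on the tuple, which is exactly what the worker does with urlRes[1]._id below. A minimal caller sketch, assuming the repo layout above (the upsertUrl helper itself is hypothetical):

    // Hypothetical caller showing the shared [err, result] convention
    const urlCtrl = require('./components/controllers/url.controller');
    const lib = require('./lib');

    async function upsertUrl(url) {
        const md5 = lib.generateMD5(url);
        // getOneby resolves to [err, result]; result is the matched record or null
        let [err, existing] = await urlCtrl.getOneby({ md5: md5 });
        if (err) return console.debug(err);
        if (!existing || !existing._id) {
            // New url: save it together with its md5 for cheap future lookups
            [err] = await urlCtrl.save({ url: url, md5: md5 });
            if (err) console.debug(err);
        }
    }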

lib/index.js (+12 −1)
@@ -28,10 +28,20 @@ function fetch(url) {
     return response;
 }
 
+function stripEndSlash(href) {
+    let urlInfo = href.length ? href.split('?') : ['', ''];
+    let url = urlInfo[0];
+    if (url[url.length - 1] == '/') {
+        return url.substr(0, url.length - 1) + '?' + urlInfo[1];
+    }
+    return href;
+}
+
 function filterDuplicates(arr) {
     var uniqueLinks = {};
     var results = [];
-    arr.forEach(element => {
+    arr.forEach(item => {
+        let element = stripEndSlash(item);
         if (!uniqueLinks[element]) {
             uniqueLinks[element] = true;
             results.push(element);
@@ -46,5 +56,6 @@ module.exports = {
     generateMD5: generateMD5,
     filterDuplicates: filterDuplicates,
     fetch: fetch,
+    stripEndSlash: stripEndSlash,
     To: awaitHandler
 }
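
One edge case worth flagging in the new helper: when href has no query string, urlInfo[1] is undefined, so a trailing-slash url like https://medium.com/ comes back as https://medium.com?undefined. A sketch of a variant that only re-appends the query when one was present (illustrative, not part of the commit):

    // Sketch: re-attach the query string only if the original href had one
    function stripEndSlash(href) {
        let [url, query] = href.length ? href.split('?') : ['', ''];
        if (url[url.length - 1] === '/') {
            url = url.slice(0, -1);
            return query ? url + '?' + query : url;
        }
        return href;
    }

With that guard, filterDuplicates collapses https://medium.com and https://medium.com/ into one entry without manufacturing ?undefined keys.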

worker/index.js (+23 −19)
@@ -1,5 +1,6 @@
 /**
  * Worker to initiate crawling of medium.com urls
+ *
  */
 const cheerio = require('cheerio'),
     config = require('../config'),
@@ -11,45 +12,55 @@ let runningProcess = 0;
 let urlCounter = 0;
 let mediumUrl = 'https://medium.com/';
 
+// Max number of processes allowed
 const MAX_PROCESS = config.MAX_PROCESS || 5;
 // Experimental config
 const MAX_URLS = config.MAX_URLS || 8;
 const DELAY = config.DELAY || 1000;
 
 
 queue.push(mediumUrl);
-
+/**
+ * This function crawls medium urls recursively in batches of 5.
+ * If an encountered url is new after duplicate filtering, that
+ * url is saved in the DB; the same happens for its query params.
+ */
 function crawlProcess() {
     let href = queue.shift();
     console.log("Fetching from " + href + "...");
-    // Number of URLs to track
-    // if (href && urlCounter <= (MAX_URLS - MAX_PROCESS)) {
     runningProcess++;
+    // Fetching page with url
     lib.fetch(href).then(async(res) => {
         let htmlRes = res.body;
         const $ = cheerio.load(htmlRes);
         let querySelectorString = `a[href^="${mediumUrl}"]`;
         let anchorUrls = [];
+        // Running query selector to get all anchor nodes
+        // whose href starts with ${mediumUrl}
         $(querySelectorString).filter((key, node) => {
             anchorUrls.push(node.attribs.href);
         });
+        // Stripping end slashes and filtering duplicate urls
        anchorUrls = lib.filterDuplicates(anchorUrls);
         queue = queue.concat(anchorUrls);
+
         let urlInfo = href ? href.split("?") : "";
         let url = urlInfo[0];
+        // Building params for db to store
         let data = {
             url: url,
             md5: lib.generateMD5(url)
         };
-        //console.log("RUNNING PROCESS: ", runningProcess);
+        // Parsing url query params
         let urlParams = urlInfo[1] ? lib.parseQueryParams(urlInfo[1]) : null;
         let urlRes, queryRes;
+
         try {
-            // See if url data already exist or not;
+            // See if url data already exists in DB or not;
             urlRes = await controllers.urlCtrl.getOneby({ md5: data.md5 });
             if (!urlRes[1] || !urlRes[1]._id) {
+                // Saving new url data in DB
                 urlRes = await controllers.urlCtrl.save(data);
-                // update urlCounter on saving new url
                 urlCounter++;
                 console.log("URL COUNTER: ", urlCounter);
             }
@@ -67,39 +78,32 @@ function crawlProcess() {
             },
             md5: lib.generateMD5(href)
         };
-        // See if query data already exist or not;
+        // See if query data already exists in DB or not;
         queryRes = await controllers.paramsCtrl.getOneby({ md5: queryData.md5 });
-        //console.log("queryRes", queryRes);
         if (!queryRes[1] || !queryRes[1]._id) {
+            // Saving new query params data in DB
             queryRes = await controllers.paramsCtrl.save(queryData);
         }
         }
-        //console.log("response ", urlRes, queryRes);
         } catch (ex) {
             console.debug(ex);
         }
         if (runningProcess < MAX_PROCESS && queue.length) {
-            //console.log("sync run");
-            // linear multiplier can be added to provide processing delay e.g. runningProcess*100
             crawlProcess();
         }
         runningProcess--;
     }, (reject) => {
-        console.log("Fetching error ", reject);
-        queue.push(url);
+        console.debug("Fetching error ", reject);
+        queue.push(href);
         setTimeout(() => {
             crawlProcess();
         }, (runningProcess + 1) * 1000);
     });
+    // Kicking off MAX_PROCESS (5) asynchronous crawlProcess calls for the first
+    // batch after fetching the landing page; otherwise it would always process urls one at a time.
     if (runningProcess < MAX_PROCESS && queue.length) {
-        //console.log("async run");
-        // linear multiplier can be added to provide processing delay e.g. runningProcess*100
         crawlProcess();
     }
-    //}
-    // else {
-    //     console.log("I AM DONE !");
-    // }
 }
 
 module.exports = crawlProcess;
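
The "error fixed in worker catch block" from the commit message is the rejection handler: the old code re-queued url, a variable defined only inside the success callback, so any failed fetch raised a ReferenceError; re-queuing href and retrying after (runningProcess + 1) * 1000 ms gives a simple linear backoff. A standalone sketch of the concurrency-plus-retry pattern, with illustrative stand-ins (fetchPage replaces lib.fetch, DB writes are elided, and the once-failing fetch exists only to exercise the retry path):

    // Standalone sketch of the worker's concurrency pattern (illustrative names)
    const MAX_PROCESS = 5;
    const MAX_URLS = 8;
    let queue = ['https://medium.com/'];
    let runningProcess = 0;
    let urlCounter = 0;
    let failedOnce = false;

    function fetchPage(href) {
        // Stand-in for lib.fetch: fail once to exercise the retry path, then succeed
        return new Promise((resolve, reject) => {
            if (!failedOnce) {
                failedOnce = true;
                return setTimeout(() => reject(new Error('fetch failed')), 50);
            }
            setTimeout(() => resolve({ body: '' }), 50);
        });
    }

    function crawlProcess() {
        let href = queue.shift();
        if (!href) return;
        runningProcess++;
        fetchPage(href).then((res) => {
            // ... parse res.body, enqueue discovered links, persist url + params ...
            if (urlCounter < MAX_URLS) {
                urlCounter++;
                queue.push(href + 'next/'); // pretend each page yields one new link
            }
            if (runningProcess < MAX_PROCESS && queue.length) crawlProcess(); // keep the pool full
            runningProcess--;
        }, (err) => {
            console.debug('Fetching error ', err);
            queue.push(href); // re-queue the href that failed (the commit's fix)
            runningProcess--; // release the slot before retrying (the original skips this)
            setTimeout(crawlProcess, (runningProcess + 1) * 1000); // linear backoff
        });
        // Fan out to MAX_PROCESS concurrent crawls once the queue has been seeded;
        // without this tail call the worker would only ever process one url at a time.
        if (runningProcess < MAX_PROCESS && queue.length) crawlProcess();
    }

    crawlProcess();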
