/**
 * Worker to initiate crawling of medium.com urls
 *
 */
const cheerio = require('cheerio'),
    config = require('../config'),
@@ -11,45 +12,55 @@ let runningProcess = 0;
let urlCounter = 0;
let mediumUrl = 'https://medium.com/';

// Max number of processes allowed to run at once
const MAX_PROCESS = config.MAX_PROCESS || 5;
// Experimental config
const MAX_URLS = config.MAX_URLS || 8;
const DELAY = config.DELAY || 1000;
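
// For reference, a minimal ../config satisfying the three reads above could
// look like the following (a sketch; the real config file may define more
// fields):
//
//     module.exports = { MAX_PROCESS: 5, MAX_URLS: 8, DELAY: 1000 };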

queue.push(mediumUrl);

/**
 * This function crawls medium urls recursively, fetching up to MAX_PROCESS
 * (default 5) pages at a time. Each url found on a page is saved in the DB
 * if it is new (after duplicates are filtered out); the same is done for
 * its query params.
 */
function crawlProcess() {
    let href = queue.shift();
    console.log("Fetching from " + href + "...");
    runningProcess++;
    // Fetch the page at this url
    lib.fetch(href).then(async (res) => {
        let htmlRes = res.body;
        const $ = cheerio.load(htmlRes);
        let querySelectorString = `a[href^="${mediumUrl}"]`;
        let anchorUrls = [];
        // Run the query selector to collect all anchor nodes
        // whose href starts with ${mediumUrl}
        $(querySelectorString).each((key, node) => {
            anchorUrls.push(node.attribs.href);
        });
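        // For example, a page containing <a href="https://medium.com/@user/post">
        // contributes "https://medium.com/@user/post" to anchorUrls; links to
        // other hosts never match the ^= prefix selector.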
        // Strip trailing slashes and filter out duplicate urls
        anchorUrls = lib.filterDuplicates(anchorUrls);
        queue = queue.concat(anchorUrls);
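        // lib.filterDuplicates is not shown in this excerpt; going by the
        // comment above, an equivalent helper could be sketched as follows
        // (an assumed implementation, not this repo's code):
        //
        //     const filterDuplicates = (urls) =>
        //         [...new Set(urls.map((u) => u.replace(/\/+$/, "")))];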

        let urlInfo = href ? href.split("?") : [""];
        let url = urlInfo[0];
        // Build the url record to store in the DB
        let data = {
            url: url,
            md5: lib.generateMD5(url)
        };
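        // lib.generateMD5 is assumed to hash the url into a fixed-size key
        // for cheap lookups; a sketch using Node's crypto module (an assumed
        // helper, not this repo's code):
        //
        //     const crypto = require('crypto');
        //     const generateMD5 = (str) =>
        //         crypto.createHash('md5').update(str).digest('hex');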
        // Parse the url query params, if any
        let urlParams = urlInfo[1] ? lib.parseQueryParams(urlInfo[1]) : null;
        let urlRes, queryRes;
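        // lib.parseQueryParams is assumed to turn a query string such as
        // "a=1&b=2" into { a: '1', b: '2' }; a sketch using the built-in
        // URLSearchParams (an assumed helper, not this repo's code):
        //
        //     const parseQueryParams = (qs) =>
        //         Object.fromEntries(new URLSearchParams(qs));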

        try {
            // See if the url data already exists in the DB or not
            urlRes = await controllers.urlCtrl.getOneby({ md5: data.md5 });
            if (!urlRes[1] || !urlRes[1]._id) {
                // Save the new url data in the DB and bump the counter
                urlRes = await controllers.urlCtrl.save(data);
                urlCounter++;
                console.log("URL COUNTER: ", urlCounter);
            }
@@ -67,39 +78,32 @@ function crawlProcess() {
                    },
                    md5: lib.generateMD5(href)
                };
                // See if the query data already exists in the DB or not
                queryRes = await controllers.paramsCtrl.getOneby({ md5: queryData.md5 });
                if (!queryRes[1] || !queryRes[1]._id) {
                    // Save the new query params data in the DB
                    queryRes = await controllers.paramsCtrl.save(queryData);
                }
            }
        } catch (ex) {
            console.debug(ex);
        }
        if (runningProcess < MAX_PROCESS && queue.length) {
            crawlProcess();
        }
        runningProcess--;
    }, (reject) => {
        // On fetch failure, log the error, push the url back on the queue,
        // and retry after a back-off proportional to the running processes
        console.debug("Fetching error ", reject);
        queue.push(href);
        setTimeout(() => {
            crawlProcess();
        }, (runningProcess + 1) * 1000);
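        // e.g. if three crawls are still in flight when a fetch fails, the
        // retry above fires after (3 + 1) * 1000 = 4000 ms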
    });
    // Kick off up to MAX_PROCESS (5) asynchronous crawlProcess calls on the
    // first run, after fetching the landing page; otherwise the crawler
    // would only ever process one url at a time
    if (runningProcess < MAX_PROCESS && queue.length) {
        crawlProcess();
    }
}

module.exports = crawlProcess;
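
// Usage sketch (assumptions: the file lives at e.g. workers/crawler.js and
// the requires above resolve as in the full repo; the real entry point may
// differ):
//
//     const crawlProcess = require('./workers/crawler');
//     crawlProcess(); // first call fetches https://medium.com/, then fans
//                     // out up to MAX_PROCESS concurrent crawls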