-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
116 lines (107 loc) · 3.35 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
const HCCrawler = require('headless-chrome-crawler');
const CSVExporter = require('headless-chrome-crawler/exporter/csv');
const fs = require('fs'); // v 5.0.0
// Command-line interface:
//   -i/--infile          file with one URL per line (defaults to stdin)
//   -o/--outfile         TSV output path handed to the CSV exporter
//   -d/--delay           per-request delay in ms (default 500)
//   -c/--maxConcurrency  concurrent pages (default 5)
var argv = require('yargs')
    .usage('Usage: $0 [options]')
    .example('$0 -in url_list -out url_price_out', 'extracts price from www.flipkart.com and www.amazon.in urls')
    .alias('i', 'infile')
    .alias('o', 'outfile')
    .alias('d', 'delay')
    .alias('c', 'maxConcurrency')
    .nargs('i', 1)
    .describe('i', 'input file containing list of urls to crawl')
    .nargs('o', 1)
    .describe('o', 'output file containing list of urls and corresponding prices') // fixed "outout" typo
    .demandOption([])
    .help('h')
    .alias('h', 'help')
    .epilog('copyright 2021')
    .argv;
// Read URLs from stdin unless an input file was supplied.
var infile = process.stdin
var outfile = argv.outfile
console.log(argv, argv.infile)
if (argv.infile) {
infile = fs.createReadStream(argv.infile);
}
// Explicit undefined checks so an explicit `-d 0` / `-c 0` is honoured;
// the previous `||` defaults silently replaced 0 with the fallback.
if (argv.delay === undefined) argv.delay = 500
if (argv.maxConcurrency === undefined) argv.maxConcurrency = 5
// Page-side extractor: this function is serialized by headless-chrome-crawler's
// `evaluatePage` and executed inside the crawled page, so it may only use
// browser globals (window/document) — nothing from the Node scope.
// Returns {type, title, offer_price, raw_price} for known shops, {} otherwise.
function extract() {
var domain = window.location.host
var item_json = {}
console.log("extracting...", domain)
// Null-safe innerText: a missing element previously threw a TypeError
// (e.g. Amazon pages without #priceblock_ourprice), aborting the whole
// extraction; now the field is simply null.
var textOf = function(el) {
    return el ? el.innerText : null
}
var extractAmazon = function() {
console.log("extracting amazon...")
return {
"type": "Amazon",
"title": textOf(document.getElementById('title')),
"offer_price": textOf(document.getElementById("priceblock_ourprice")),
"raw_price": textOf(document.getElementById('price'))
}
}
var extractFlipkart = function() {
console.log("extracting flipkart...")
var titleEl = document.getElementsByTagName('h1')[0]
var priceEl = document.getElementsByClassName('_16Jk6d')[0]
return {
"type": "Flipkart",
"title": textOf(titleEl),
"offer_price": textOf(priceEl),
"raw_price": textOf(priceEl)
}
}
if (domain == 'www.amazon.in') {
item_json = extractAmazon()
}
else if (domain == 'www.flipkart.com') {
item_json = extractFlipkart()
}
return item_json
}
// Tab-separated exporter: one row per crawled URL with the extracted fields.
const exporter = new CSVExporter({
  file: outfile,
  separator: "\t",
  fields: [
    'result.type',
    'response.url',
    'response.status',
    'result.title',
    'result.offer_price',
    'result.raw_price',
  ],
});
// Entry point: launch the crawler, stream URLs from `infile` into the queue,
// then wait for the crawl to finish and shut down.
(async () => {
const crawler = await HCCrawler.launch({
delay: argv.delay,
maxConcurrency: argv.maxConcurrency,
executablePath: '/usr/bin/google-chrome',
headless: false,
evaluatePage: extract,
waitUntil: 'networkidle2',
retryCount: 0,
jQuery: false,
customCrawl: async (page, crawl) => {
// Skip heavyweight resources we don't need for price extraction.
await page.setRequestInterception(true);
page.on('request', request => {
if (['image', 'stylesheet', 'font','other'].includes(request.resourceType())) {
request.abort();
} else {
request.continue();
}
});
// The result contains options, links, cookies and etc.
const result = await crawl();
// You need to extend and return the crawled result
return result;
},
onSuccess: result => {
console.log(`Crawled ${result.options.url}.`);
},
// BUG FIX: the option is `onError` (`requestfailed` was silently ignored),
// and `console.err` is not a function (`console.error` is).
onError: (err) => {
console.error("Request failed! ", err)
},
exporter
});
// Queue every URL before awaiting onIdle. The previous 'data'-event +
// forEach approach (a) ignored its awaits, (b) let onIdle resolve before
// any URL arrived on a slow stream, and (c) corrupted URLs split across
// chunk boundaries. `carry` holds the trailing partial line of each chunk.
let carry = ''
for await (const chunk of infile) {
carry += chunk.toString()
const lines = carry.split(/\n/)
carry = lines.pop()
for (const url of lines) {
if (url.trim().length) {
await crawler.queue(url)
}
}
}
if (carry.trim().length) {
await crawler.queue(carry)
}
await crawler.onIdle();
await crawler.close();
})();