diff --git a/.eslintrc.js b/.eslintrc.js index 38e3895..ced0800 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -1,7 +1,7 @@ module.exports = { extends: 'eslint:recommended', parserOptions: { - ecmaVersion: 6, + ecmaVersion: 8, sourceType: 'module', ecmaFeatures: {}, }, diff --git a/README.md b/README.md index 159a91c..e56fdcb 100644 --- a/README.md +++ b/README.md @@ -34,19 +34,22 @@ sitemap.fetch('https://wp.seantburke.com/sitemap.xml').then(function(sites) { ```javascript import Sitemapper from 'sitemapper'; -const Google = new Sitemapper({ - url: 'https://www.google.com/work/sitemap.xml', - timeout: 15000, // 15 seconds -}); - -Google.fetch() - .then(data => console.log(data.sites)) - .catch(error => console.log(error)); +(async () => { + const Google = new Sitemapper({ + url: 'https://www.google.com/work/sitemap.xml', + timeout: 15000, // 15 seconds + }); + try { + const { sites } = await Google.fetch(); + console.log(sites); + } catch (error) { + console.log(error); + } +})(); // or - const sitemapper = new Sitemapper(); sitemapper.timeout = 5000; diff --git a/example.es6.js b/example.es6.js index 25b6f5b..5fd9e24 100644 --- a/example.es6.js +++ b/example.es6.js @@ -1,27 +1,41 @@ import Sitemapper from 'sitemapper'; -const sitemapper = new Sitemapper(); +(async () => { + const sitemapper = new Sitemapper(); -const Google = new Sitemapper({ - url: 'https://www.google.com/work/sitemap.xml', - debug: false, - timeout: 15000, // 15 seconds -}); + const Google = new Sitemapper({ + url: 'https://www.google.com/work/sitemap.xml', + debug: false, + timeout: 15000, // 15 seconds + }); -Google.fetch() - .then(data => console.log(data.sites)) - .catch(error => console.log(error)); + try { + const data = await Google.fetch(); + console.log(data.sites); + } catch(error) { + console.log(error); + } -sitemapper.timeout = 5000; + sitemapper.timeout = 5000; -sitemapper.fetch('https://wp.seantburke.com/sitemap.xml') - .then(({ url, sites }) => console.log(`url:${url}`, 'sites:', 
sites)) - .catch(error => console.log(error)); + try { + const { url, sites } = await sitemapper.fetch('https://wp.seantburke.com/sitemap.xml'); + console.log(`url:${url}`, 'sites:', sites); + } catch(error) { + console.log(error) + } -sitemapper.fetch('http://www.cnn.com/sitemaps/sitemap-index.xml') - .then(data => console.log(data)) - .catch(error => console.log(error)); + try { + const { url, sites } = await sitemapper.fetch('http://www.cnn.com/sitemaps/sitemap-index.xml'); + console.log(`url:${url}`, 'sites:', sites); + } catch(error) { + console.log(error) + } -sitemapper.fetch('http://www.stubhub.com/new-sitemap/us/sitemap-US-en-index.xml') - .then((data) => console.log(data)) - .catch(error => console.log(error)); + try { + const { url, sites } = await sitemapper.fetch('http://www.stubhub.com/new-sitemap/us/sitemap-US-en-index.xml'); + console.log(`url:${url}`, 'sites:', sites); + } catch(error) { + console.log(error) + } +})(); diff --git a/lib/assets/sitemapper.js b/lib/assets/sitemapper.js index bb6c1b1..2864765 100644 --- a/lib/assets/sitemapper.js +++ b/lib/assets/sitemapper.js @@ -1,2 +1,2 @@ -"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;var _xml2js=require("xml2js"),_got=_interopRequireDefault(require("got"));function _interopRequireDefault(a){return a&&a.__esModule?a:{default:a}}class Sitemapper{constructor(a){var b=a||{requestHeaders:{}};this.url=b.url,this.timeout=b.timeout||15e3,this.timeoutTable={},this.requestHeaders=b.requestHeaders,this.debug=b.debug}fetch(){var a=0this.crawl(a).then(c=>b({url:a,sites:c})))}static get timeout(){return this.timeout}static set timeout(a){this.timeout=a}static set url(a){this.url=a}static get url(){return this.url}static set debug(a){this.debug=a}static get debug(){return this.debug}parse(){var a=0{var 
d=(0,_got.default)(a,b);d.then(b=>b&&200===b.statusCode?(0,_xml2js.parseStringPromise)(b.body):(clearTimeout(this.timeoutTable[a]),c({error:b.error,data:b}))).then(a=>c({error:null,data:a})).catch(a=>c({error:a.error,data:a})),this.initializeTimeout(a,d,c)})}initializeTimeout(a,b,c){this.timeoutTable[a]=setTimeout(()=>{b.cancel(),this.debug&&console.debug("crawl timed out"),c({error:"request timed out after ".concat(this.timeout," milliseconds for url: '").concat(a,"'"),data:{}})},this.timeout)}crawl(a){return new Promise(b=>{this.parse(a).then((c)=>{var{error:d,data:e}=c;if(clearTimeout(this.timeoutTable[a]),d)return this.debug&&console.error("Error occurred during \"crawl('".concat(a,"')\":\n\r Error: ").concat(d)),b([]);if(e&&e.urlset&&e.urlset.url){this.debug&&console.debug("Urlset found during \"crawl('".concat(a,"')\""));var h=e.urlset.url.map(a=>a.loc&&a.loc[0]);return b([].concat(h))}if(e&&e.sitemapindex){this.debug&&console.debug("Additional sitemap found during \"crawl('".concat(a,"')\""));var f=e.sitemapindex.sitemap.map(a=>a.loc&&a.loc[0]),g=f.map(a=>this.crawl(a));return Promise.all(g).then(a=>{var c=a.filter(a=>!a.error).reduce((a,b)=>a.concat(b),[]);return b(c)})}return this.debug&&console.error("Unknown state during \"crawl(".concat(a,")\":"),d,e),b([])})})}getSites(){var a=0{d=a.sites}).catch(a=>{c=a}),b(c,d)}}exports.default=Sitemapper,module.exports=exports.default,module.exports.default=exports.default; +"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;var _xml2js=require("xml2js"),_got=_interopRequireDefault(require("got"));function _interopRequireDefault(a){return a&&a.__esModule?a:{default:a}}function asyncGeneratorStep(a,b,c,d,e,f,g){try{var h=a[f](g),i=h.value}catch(a){return void c(a)}h.done?b(i):Promise.resolve(i).then(d,e)}function _asyncToGenerator(a){return function(){var b=this,c=arguments;return new Promise(function(d,e){function f(a){asyncGeneratorStep(h,d,e,f,g,"next",a)}function 
g(a){asyncGeneratorStep(h,d,e,f,g,"throw",a)}var h=a.apply(b,c);f(void 0)})}}class Sitemapper{constructor(a){var b=a||{requestHeaders:{}};this.url=b.url,this.timeout=b.timeout||15e3,this.timeoutTable={},this.requestHeaders=b.requestHeaders,this.debug=b.debug}fetch(){var a=arguments,b=this;return _asyncToGenerator(function*(){var c=0b.cancel(),this.timeout)}crawl(a){var b=this;return _asyncToGenerator(function*(){try{var{error:g,data:h}=yield b.parse(a);if(clearTimeout(b.timeoutTable[a]),g)return b.debug&&console.error("Error occurred during \"crawl('".concat(a,"')\":\n\r Error: ").concat(g)),[];if(h&&h.urlset&&h.urlset.url){b.debug&&console.debug("Urlset found during \"crawl('".concat(a,"')\""));var i=h.urlset.url.map(a=>a.loc&&a.loc[0]);return[].concat(i)}if(h&&h.sitemapindex){b.debug&&console.debug("Additional sitemap found during \"crawl('".concat(a,"')\""));var c=h.sitemapindex.sitemap.map(a=>a.loc&&a.loc[0]),d=c.map(a=>b.crawl(a)),e=yield Promise.all(d),f=e.filter(a=>!a.error).reduce((a,b)=>a.concat(b),[]);return f}return b.debug&&console.error("Unknown state during \"crawl('".concat(a,")'\":"),g,h),[]}catch(a){b.debug&&b.debug&&console.error(a)}})()}getSites(){var a=arguments,b=this;return _asyncToGenerator(function*(){var c=0 console.log(sites)); */ - fetch(url = this.url) { - return new Promise(resolve => this.crawl(url).then(sites => resolve({ url, sites }))); + async fetch(url = this.url) { + let sites = []; + try { + // crawl the URL + sites = await this.crawl(url); + } catch (e) { + if (this.debug) { + console.error(e); + } + } + + // If we run into an error, don't throw, but instead return an empty array + return { + url, + sites, + } } /** @@ -111,7 +125,8 @@ export default class Sitemapper { * @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml) * @returns {Promise} */ - parse(url = this.url) { + async parse(url = this.url) { + // setup the response options for the got request const requestOptions = { method: 'GET', 
resolveWithFullResponse: true, @@ -119,20 +134,42 @@ export default class Sitemapper { headers: this.requestHeaders, }; - return new Promise((resolve) => { + try { + // create a request Promise with the url and request options const requester = got(url, requestOptions); - requester.then((response) => { - if (!response || response.statusCode !== 200) { - clearTimeout(this.timeoutTable[url]); - return resolve({ error: response.error, data: response }); - } - return parseStringPromise(response.body); - }) - .then(data => resolve({ error: null, data })) - .catch(response => resolve({ error: response.error, data: response })); - - this.initializeTimeout(url, requester, resolve); - }); + + // initialize the timeout method based on the URL, and pass the request object. + this.initializeTimeout(url, requester); + + // + const response = await requester; + + // if the response does not have a successful status code then clear the timeout for this url. + if (!response || response.statusCode !== 200) { + clearTimeout(this.timeoutTable[url]); + return { error: response.error, data: response }; + } + + // otherwise parse the XML that was returned. 
+ const data = await parseStringPromise(response.body); + + // return the results + return { error: null, data } + } catch (error) { + // If the request was canceled notify the user of the timeout + if (error.name === 'CancelError') { + return { + error: `Request timed out after ${this.timeout} milliseconds for url: '${url}'`, + data: error + } + } + + // Otherwise notify of another error + return { + error: error.error, + data: error + } + } } /** @@ -142,22 +179,10 @@ export default class Sitemapper { * @private * @param {string} url - url to use as a hash in the timeoutTable * @param {Promise} requester - the promise that creates the web request to the url - * @param {Function} callback - the resolve method is used here to resolve the parent promise */ - initializeTimeout(url, requester, callback) { - // this resolves instead of rejects in order to allow other requests to continue - this.timeoutTable[url] = setTimeout(() => { - requester.cancel(); - - if (this.debug) { - console.debug('crawl timed out'); - } - - callback({ - error: `request timed out after ${this.timeout} milliseconds for url: '${url}'`, - data: {}, - }); - }, this.timeout); + initializeTimeout(url, requester) { + // this will throw a CancelError which will be handled in the parent that calls this method. 
+ this.timeoutTable[url] = setTimeout(() => requester.cancel(), this.timeout); } /** @@ -168,47 +193,52 @@ export default class Sitemapper { * @param {string} url - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml) * @returns {Promise | Promise} */ - crawl(url) { - return new Promise((resolve) => { - this.parse(url).then(({ error, data }) => { - // The promise resolved, remove the timeout - clearTimeout(this.timeoutTable[url]); + async crawl(url) { + try { + const { error, data } = await this.parse(url); + // The promise resolved, remove the timeout + clearTimeout(this.timeoutTable[url]); - if (error) { - if (this.debug) { - console.error(`Error occurred during "crawl('${url}')":\n\r Error: ${error}`); - } - // Fail silently - return resolve([]); - } else if (data && data.urlset && data.urlset.url) { - if (this.debug) { - console.debug(`Urlset found during "crawl('${url}')"`); - } - const sites = data.urlset.url.map(site => site.loc && site.loc[0]); - return resolve([].concat(sites)); - } else if (data && data.sitemapindex) { - if (this.debug) { - console.debug(`Additional sitemap found during "crawl('${url}')"`); - } - // Map each child url into a promise to create an array of promises - const sitemap = data.sitemapindex.sitemap.map(map => map.loc && map.loc[0]); - const promiseArray = sitemap.map(site => this.crawl(site)); - - // Make sure all the promises resolve then filter and reduce the array - return Promise.all(promiseArray).then(results => { - const sites = results.filter(result => !result.error) - .reduce((prev, curr) => prev.concat(curr), []); - - return resolve(sites); - }); - } + if (error) { if (this.debug) { - console.error(`Unknown state during "crawl(${url})":`, error, data); - } + console.error(`Error occurred during "crawl('${url}')":\n\r Error: ${error}`); + } // Fail silently - return resolve([]); - }); - }); + return []; + } else if (data && data.urlset && data.urlset.url) { + if (this.debug) { + console.debug(`Urlset found during 
"crawl('${url}')"`); + } + const sites = data.urlset.url.map(site => site.loc && site.loc[0]); + return [].concat(sites); + } else if (data && data.sitemapindex) { + if (this.debug) { + console.debug(`Additional sitemap found during "crawl('${url}')"`); + } + // Map each child url into a promise to create an array of promises + const sitemap = data.sitemapindex.sitemap.map(map => map.loc && map.loc[0]); + const promiseArray = sitemap.map(site => this.crawl(site)); + + // Make sure all the promises resolve then filter and reduce the array + const results = await Promise.all(promiseArray); + const sites = results + .filter(result => !result.error) + .reduce((prev, curr) => prev.concat(curr), []); + + return sites; + } + + if (this.debug) { + console.error(`Unknown state during "crawl('${url})'":`, error, data); + } + + // Fail silently + return []; + } catch (e) { + if (this.debug) { + this.debug &&console.error(e); + } + } } @@ -220,18 +250,19 @@ export default class Sitemapper { * @param {getSitesCallback} callback - callback for sites and error * @callback */ - getSites(url = this.url, callback) { + async getSites(url = this.url, callback) { console.warn( // eslint-disable-line no-console '\r\nWarning:', 'function .getSites() is deprecated, please use the function .fetch()\r\n' ); let err = {}; let sites = []; - this.fetch(url).then(response => { + try { + const response = await this.fetch(url); sites = response.sites; - }).catch(error => { - err = error; - }); + } catch (e) { + err = e; + } return callback(err, sites); } } diff --git a/src/examples/index.js b/src/examples/index.js index b7ef657..c868d54 100644 --- a/src/examples/index.js +++ b/src/examples/index.js @@ -7,7 +7,7 @@ const exampleURL = 'https://www.walmart.com/sitemap_topic.xml'; const sitemapper = new Sitemapper({ url: exampleURL, // url to crawl debug: true, // don't show debug logs - timeout: 10000, // 10 seconds + timeout: 1, // 10 seconds }); /**