diff --git a/README.md b/README.md index d647992..1e2da7e 100644 --- a/README.md +++ b/README.md @@ -62,8 +62,12 @@ sitemapper.fetch('https://wp.seantburke.com/sitemap.xml') You can add options on the initial Sitemapper object when instantiating it. -+ `requestHeaders`: (Object) - Additional Request Headers -+ `timeout`: (Number) - Maximum timeout for a single URL ++ `requestHeaders`: (Object) - Additional Request Headers (e.g. `User-Agent`) ++ `timeout`: (Number) - Maximum timeout in ms for a single URL. Default: 15000 (15 seconds) ++ `url`: (String) - Sitemap URL to crawl ++ `debug`: (Boolean) - Enables/Disables debug console logging. Default: False ++ `concurrency`: (Number) - Sets the maximum number of concurrent sitemap crawling threads. Default: 10 ++ `retries`: (Number) - Sets the maximum number of retries to attempt in case of an error response (e.g. 404 or Timeout). Default: 0 ```javascript @@ -77,6 +81,23 @@ const sitemapper = new Sitemapper({ ``` +An example using all available options: + +```javascript + +const sitemapper = new Sitemapper({ + url: 'https://art-works.community/sitemap.xml', + timeout: 15000, + requestHeaders: { + 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0' + }, + debug: true, + concurrency: 2, + retries: 1, +}); + +``` + ### Examples in ES5 ```javascript var Sitemapper = require('sitemapper'); diff --git a/lib/assets/sitemapper.js b/lib/assets/sitemapper.js index 1b35266..27405d4 100644 --- a/lib/assets/sitemapper.js +++ b/lib/assets/sitemapper.js @@ -1,2 +1,2 @@ -"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;var _xml2js=require("xml2js"),_got=_interopRequireDefault(require("got")),_zlib=_interopRequireDefault(require("zlib")),_url=_interopRequireDefault(require("url")),_path=_interopRequireDefault(require("path"));function _interopRequireDefault(a){return a&&a.__esModule?a:{default:a}}function asyncGeneratorStep(a,b,c,d,e,f,g){try{var 
h=a[f](g),i=h.value}catch(a){return void c(a)}h.done?b(i):Promise.resolve(i).then(d,e)}function _asyncToGenerator(a){return function(){var b=this,c=arguments;return new Promise(function(d,e){function f(a){asyncGeneratorStep(h,d,e,f,g,"next",a)}function g(a){asyncGeneratorStep(h,d,e,f,g,"throw",a)}var h=a.apply(b,c);f(void 0)})}}class Sitemapper{constructor(a){var b=a||{requestHeaders:{}};this.url=b.url,this.timeout=b.timeout||15e3,this.timeoutTable={},this.requestHeaders=b.requestHeaders,this.debug=b.debug}fetch(){var a=arguments,b=this;return _asyncToGenerator(function*(){var c=0b.cancel(),this.timeout)}crawl(a){var b=this;return _asyncToGenerator(function*(){try{var{error:g,data:h}=yield b.parse(a);if(clearTimeout(b.timeoutTable[a]),g)return b.debug&&console.error("Error occurred during \"crawl('".concat(a,"')\":\n\r Error: ").concat(g)),[];if(h&&h.urlset&&h.urlset.url){b.debug&&console.debug("Urlset found during \"crawl('".concat(a,"')\""));var i=h.urlset.url.map(a=>a.loc&&a.loc[0]);return[].concat(i)}if(h&&h.sitemapindex){b.debug&&console.debug("Additional sitemap found during \"crawl('".concat(a,"')\""));var c=h.sitemapindex.sitemap.map(a=>a.loc&&a.loc[0]),d=c.map(a=>b.crawl(a)),e=yield Promise.all(d),f=e.filter(a=>!a.error).reduce((a,b)=>a.concat(b),[]);return f}return b.debug&&console.error("Unknown state during \"crawl('".concat(a,")'\":"),g,h),[]}catch(a){b.debug&&b.debug&&console.error(a)}})()}getSites(){var a=arguments,b=this;return _asyncToGenerator(function*(){var c=0{var d=Buffer.from(a);_zlib.default.gunzip(d,(a,d)=>{a?c(a):b(d)})})}}exports.default=Sitemapper,module.exports=exports.default,module.exports.default=exports.default; +"use strict";Object.defineProperty(exports,"__esModule",{value:!0}),exports.default=void 0;var 
_xml2js=require("xml2js"),_got=_interopRequireDefault(require("got")),_zlib=_interopRequireDefault(require("zlib")),_url=_interopRequireDefault(require("url")),_path=_interopRequireDefault(require("path")),_pLimit=_interopRequireDefault(require("p-limit"));function _interopRequireDefault(a){return a&&a.__esModule?a:{default:a}}function asyncGeneratorStep(a,b,c,d,e,f,g){try{var h=a[f](g),i=h.value}catch(a){return void c(a)}h.done?b(i):Promise.resolve(i).then(d,e)}function _asyncToGenerator(a){return function(){var b=this,c=arguments;return new Promise(function(d,e){function f(a){asyncGeneratorStep(h,d,e,f,g,"next",a)}function g(a){asyncGeneratorStep(h,d,e,f,g,"throw",a)}var h=a.apply(b,c);f(void 0)})}}class Sitemapper{constructor(a){var b=a||{requestHeaders:{}};this.url=b.url,this.timeout=b.timeout||15e3,this.timeoutTable={},this.requestHeaders=b.requestHeaders,this.debug=b.debug,this.concurrency=b.concurrency||10,this.retries=b.retries||0}fetch(){var a=arguments,b=this;return _asyncToGenerator(function*(){var c=0b.cancel(),this.timeout)}crawl(a){var b=arguments,c=this;return _asyncToGenerator(function*(){var d=1a.loc&&a.loc[0]);return{sites:m,errors:[]}}if(l&&l.sitemapindex){c.debug&&console.debug("Additional sitemap found during \"crawl('".concat(a,"')\""));var e=l.sitemapindex.sitemap.map(a=>a.loc&&a.loc[0]),f=(0,_pLimit.default)(c.concurrency),g=e.map(a=>f(()=>c.crawl(a))),h=yield Promise.all(g),i=h.filter(a=>0===a.errors.length).reduce((a,b)=>{var{sites:c}=b;return[...a,...c]},[]),j=h.filter(a=>0!==a.errors.length).reduce((a,b)=>{var{errors:c}=b;return[...a,...c]},[]);return{sites:i,errors:j}}return d{var d=Buffer.from(a);_zlib.default.gunzip(d,(a,d)=>{a?c(a):b(d)})})}}exports.default=Sitemapper,module.exports=exports.default,module.exports.default=exports.default; //# sourceMappingURL=sitemapper.js.map \ No newline at end of file diff --git a/lib/examples/index.js b/lib/examples/index.js index 4d45219..64b6198 100644 --- a/lib/examples/index.js +++ 
b/lib/examples/index.js @@ -1,2 +1,2 @@ -"use strict";var _sitemapper=_interopRequireDefault(require("../assets/sitemapper"));function _interopRequireDefault(a){return a&&a.__esModule?a:{default:a}}function asyncGeneratorStep(a,b,c,d,e,f,g){try{var h=a[f](g),i=h.value}catch(a){return void c(a)}h.done?b(i):Promise.resolve(i).then(d,e)}function _asyncToGenerator(a){return function(){var b=this,c=arguments;return new Promise(function(d,e){function f(a){asyncGeneratorStep(h,d,e,f,g,"next",a)}function g(a){asyncGeneratorStep(h,d,e,f,g,"throw",a)}var h=a.apply(b,c);f(void 0)})}}var exampleURL="https://www.walmart.com/sitemap_topic.xml",sitemapper=new _sitemapper.default({url:"https://www.walmart.com/sitemap_topic.xml",debug:!0,timeout:1});_asyncToGenerator(function*(){try{var a=yield sitemapper.fetch();console.log(a)}catch(a){console.error(a)}})(); +"use strict";var _sitemapper=_interopRequireDefault(require("../assets/sitemapper"));function _interopRequireDefault(a){return a&&a.__esModule?a:{default:a}}function asyncGeneratorStep(a,b,c,d,e,f,g){try{var h=a[f](g),i=h.value}catch(a){return void c(a)}h.done?b(i):Promise.resolve(i).then(d,e)}function _asyncToGenerator(a){return function(){var b=this,c=arguments;return new Promise(function(d,e){function f(a){asyncGeneratorStep(h,d,e,f,g,"next",a)}function g(a){asyncGeneratorStep(h,d,e,f,g,"throw",a)}var h=a.apply(b,c);f(void 0)})}}var exampleURL="https://www.walmart.com/sitemap_topic.xml",sitemapper=new _sitemapper.default({url:"https://www.walmart.com/sitemap_topic.xml",debug:!1,timeout:1e4,concurrency:10,retries:0});_asyncToGenerator(function*(){try{var a=yield sitemapper.fetch();console.log(a)}catch(a){console.error(a)}})(); //# sourceMappingURL=index.js.map \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 47cde49..cbbb50f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "sitemapper", - "version": "3.1.16", + "version": "3.2.0", "lockfileVersion": 1, 
"requires": true, "dependencies": { @@ -3482,6 +3482,17 @@ "dev": true, "requires": { "p-limit": "^2.0.0" + }, + "dependencies": { + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + } } }, "path-exists": { @@ -6342,12 +6353,11 @@ "integrity": "sha512-wvPXDmbMmu2ksjkB4Z3nZWTSkJEb9lqVdMaCKpZUGJG9TMiNp9XcbG3fn9fPKjem04fJMJnXoyFPk2FmgiaiNg==" }, "p-limit": { - "version": "2.3.0", - "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", - "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", - "dev": true, + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", "requires": { - "p-try": "^2.0.0" + "yocto-queue": "^0.1.0" } }, "p-locate": { @@ -6357,6 +6367,17 @@ "dev": true, "requires": { "p-limit": "^2.2.0" + }, + "dependencies": { + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + } } }, "p-try": { @@ -6775,6 +6796,17 @@ "dev": true, "requires": { "p-limit": "^2.0.0" + }, + "dependencies": { + "p-limit": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.3.0.tgz", + "integrity": "sha512-//88mFWSJx8lxCzwdAABTJL2MyWB12+eIY7MDL2SqLmAkeKU9qxRvWuSyTjm3FUmpBEMuFfckAIqEaVGUDxb6w==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + } } }, "path-exists": { @@ -8710,8 +8742,7 @@ "yocto-queue": { "version": "0.1.0", "resolved": 
"https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", - "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", - "dev": true + "integrity": "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==" } } } diff --git a/package.json b/package.json index 4695ac8..dade74d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "sitemapper", - "version": "3.1.16", + "version": "3.2.0", "description": "Parser for XML Sitemaps to be used with Robots.txt and web crawlers", "keywords": [ "parse", @@ -78,6 +78,7 @@ }, "dependencies": { "got": "^11.8.0", + "p-limit": "^3.1.0", "xml2js": "^0.4.23" } } diff --git a/sitemapper.d.ts b/sitemapper.d.ts index df7b328..b0be714 100644 --- a/sitemapper.d.ts +++ b/sitemapper.d.ts @@ -1,26 +1,36 @@ export interface SitemapperResponse { - url: string; - sites: string[]; + url: string; + sites: string[]; + errors: SitemapperErrorData[]; +} + +export interface SitemapperErrorData { + type: string; + url: string; + retries: number; } export interface SitemapperOptions { - url?: string; - timeout?: number; - requestHeaders?: {[name: string]: string}; + url?: string; + timeout?: number; + requestHeaders?: {[name: string]: string}; + debug?: boolean; + concurrency?: number; + retries?: number; } declare class Sitemapper { - timeout: number; + timeout: number; - constructor(options: SitemapperOptions) + constructor(options: SitemapperOptions) - /** - * Gets the sites from a sitemap.xml with a given URL - * - * @param url URL to the sitemap.xml file - */ - fetch(url?: string): Promise; + /** + * Gets the sites from a sitemap.xml with a given URL + * + * @param url URL to the sitemap.xml file + */ + fetch(url?: string): Promise; } export default Sitemapper; diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index b149854..caa7b6a 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -11,6 +11,7 
@@ import got from 'got'; import zlib from 'zlib'; import Url from 'url'; import path from 'path'; +import pLimit from 'p-limit'; /** * @typedef {Object} Sitemapper @@ -22,6 +23,9 @@ export default class Sitemapper { * @params {Object} options to set * @params {string} [options.url] - the Sitemap url (e.g https://wp.seantburke.com/sitemap.xml) * @params {Timeout} [options.timeout] - @see {timeout} + * @params {boolean} [options.debug] - Enables/Disables additional logging + * @params {integer} [options.concurrency] - The number of concurrent sitemaps to crawl (e.g. 2 will crawl no more than 2 sitemaps at the same time) + * @params {integer} [options.retries] - The maximum number of retries to attempt when crawling fails (e.g. 1 for 1 retry, 2 attempts in total) * * @example let sitemap = new Sitemapper({ * url: 'https://wp.seantburke.com/sitemap.xml', @@ -35,6 +39,8 @@ export default class Sitemapper { this.timeoutTable = {}; this.requestHeaders = settings.requestHeaders; this.debug = settings.debug; + this.concurrency = settings.concurrency || 10; + this.retries = settings.retries || 0; } /** @@ -47,23 +53,31 @@ export default class Sitemapper { * .then((sites) => console.log(sites)); */ async fetch(url = this.url) { - let sites = []; + // initialize empty variables + let results = { + url: '', + sites: [], + errors: [], + }; + + // attempt to set the variables with the crawl try { // crawl the URL - sites = await this.crawl(url); + results = await this.crawl(url); } catch (e) { + // show errors that may occur if (this.debug) { console.error(e); } } - // If we run into an error, don't throw, but instead return an empty array return { url, - sites, + sites: results.sites || [], + errors: results.errors || [], }; - } + } /** * Get the timeout * @@ -178,7 +192,7 @@ export default class Sitemapper { // Otherwise notify of another error return { - error: error.error, + error: `Error occurred: ${error.name}`, data: error }; } @@ -203,49 +217,98 @@ export default class 
Sitemapper { * @private * @recursive * @param {string} url - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml) - * @returns {Promise | Promise} + * @param {integer} retryIndex - Number of retry attempts for this URL (e.g. 0 for 1st attempt, 1 for second attempt etc.) + * @returns {Promise} */ - async crawl(url) { + async crawl(url, retryIndex = 0) { try { const { error, data } = await this.parse(url); // The promise resolved, remove the timeout clearTimeout(this.timeoutTable[url]); if (error) { + // Handle errors during sitemap parsing / request + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log (`(Retry attempt: ${retryIndex + 1} / ${this.retries}) ${url} due to ${data.name} on previous request`); + } + return this.crawl(url, retryIndex + 1); + } + if (this.debug) { console.error(`Error occurred during "crawl('${url}')":\n\r Error: ${error}`); } - // Fail silently - return []; + + // Fail and log error + return { + sites: [], + errors: [{ + type: data.name, + url, + retries: retryIndex, + }] + }; + } else if (data && data.urlset && data.urlset.url) { + // Handle URLs found inside the sitemap if (this.debug) { console.debug(`Urlset found during "crawl('${url}')"`); } const sites = data.urlset.url.map(site => site.loc && site.loc[0]); - return [].concat(sites); + return { + sites, + errors: [] + } + } else if (data && data.sitemapindex) { + // Handle child sitemaps found inside the active sitemap if (this.debug) { console.debug(`Additional sitemap found during "crawl('${url}')"`); } // Map each child url into a promise to create an array of promises const sitemap = data.sitemapindex.sitemap.map(map => map.loc && map.loc[0]); - const promiseArray = sitemap.map(site => this.crawl(site)); + + // Parse all child urls within the concurrency limit in the settings + const limit = pLimit(this.concurrency); + const promiseArray = sitemap.map(site => limit(() => 
this.crawl(site))); // Make sure all the promises resolve then filter and reduce the array const results = await Promise.all(promiseArray); const sites = results - .filter(result => !result.error) - .reduce((prev, curr) => prev.concat(curr), []); + .filter(result => (result.errors.length === 0)) + .reduce((prev, { sites }) => [...prev, ...sites], []); + const errors = results + .filter(result => (result.errors.length !== 0)) + .reduce((prev, { errors }) => [...prev, ...errors], []); - return sites; + return { + sites, + errors, + }; } + // Retry on error until you reach the retry limit set in the settings + if (retryIndex < this.retries) { + if (this.debug) { + console.log (`(Retry attempt: ${retryIndex + 1} / ${this.retries}) ${url} due to ${data.name} on previous request`); + } + return this.crawl(url, retryIndex + 1); + } if (this.debug) { console.error(`Unknown state during "crawl('${url})'":`, error, data); } - // Fail silently - return []; + // Fail and log error + return { + sites: [], + errors: [{ + url, + type: data.name || 'UnknownStateError', + retries: retryIndex + }] + }; + } catch (e) { if (this.debug) { this.debug && console.error(e); @@ -340,7 +403,7 @@ export default class Sitemapper { * @property {Object} data.sitemapindex - index of sitemap * @property {string} data.sitemapindex.sitemap - Sitemap * @example { - * error: "There was an error!" + * error: 'There was an error!' 
* data: { * url: 'https://linkedin.com', * urlset: [{ @@ -359,11 +422,24 @@ export default class Sitemapper { * * @property {string} url - the original url used to query the data * @property {SitesArray} sites + * @property {ErrorDataArray} errors * @example { * url: 'https://linkedin.com/sitemap.xml', * sites: [ * 'https://linkedin.com/project1', * 'https://linkedin.com/project2' + * ], + * errors: [ + * { + * type: 'CancelError', + * url: 'https://www.walmart.com/sitemap_tp1.xml', + * retries: 0 + * }, + * { + * type: 'HTTPError', + * url: 'https://www.walmart.com/sitemap_tp2.xml', + * retries: 0 + * }, * ] * } */ @@ -377,3 +453,36 @@ export default class Sitemapper { * 'https://www.linkedin.com' * ] */ + +/** + * An array of Error data objects + * + * @typedef {ErrorData[]} ErrorDataArray + * @example [ + * { + * type: 'CancelError', + * url: 'https://www.walmart.com/sitemap_tp1.xml', + * retries: 0 + * }, + * { + * type: 'HTTPError', + * url: 'https://www.walmart.com/sitemap_tp2.xml', + * retries: 0 + * }, + * ] + */ + +/** + * An object containing details about the errors which occurred during the crawl + * + * @typedef {Object} ErrorData + * + * @property {string} type - The error type which was returned + * @property {string} url - The sitemap URL which returned the error + * @property {Number} retries - The total number of retries attempted after receiving the first error + * @example { + * type: 'CancelError', + * url: 'https://www.walmart.com/sitemap_tp1.xml', + * retries: 0 + * } + */ diff --git a/src/examples/index.js b/src/examples/index.js index c868d54..3cabd7b 100644 --- a/src/examples/index.js +++ b/src/examples/index.js @@ -6,8 +6,10 @@ const exampleURL = 'https://www.walmart.com/sitemap_topic.xml'; // Instantiate an instance const sitemapper = new Sitemapper({ url: exampleURL, // url to crawl - debug: true, // don't show debug logs - timeout: 1, // 10 seconds + debug: false, // don't show debug logs + timeout: 10000, // 10 seconds + concurrency: 
10, // Number of maximum concurrent sitemap crawl threads + retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout) }); /** diff --git a/src/tests/test.es5.js b/src/tests/test.es5.js index 8180954..e5e535b 100644 --- a/src/tests/test.es5.js +++ b/src/tests/test.es5.js @@ -79,6 +79,7 @@ describe('Sitemapper', function () { sitemapper.fetch(url) .then(data => { data.sites.should.be.Array; + data.errors.should.be.Array; done(); }) .catch(error => { @@ -138,6 +139,33 @@ describe('Sitemapper', function () { }); }); + describe('gzipped sitemaps', function () { + beforeEach(() => { + sitemapper = new Sitemapper({ + requestHeaders: { + 'Accept-Encoding': 'gzip,deflate,sdch', + } + }); + }); + + it('https://www.banggood.com/sitemap/category.xml.gz gzip should be a non-empty array', function (done) { + this.timeout(30000); + const url = 'https://www.banggood.com/sitemap/category.xml.gz'; + sitemapper.timeout = 10000; + sitemapper.fetch(url) + .then(data => { + data.sites.should.be.Array; + data.errors.should.be.Array; + data.sites.length.should.be.greaterThan(0); + done(); + }) + .catch(error => { + console.error('Test failed'); + done(error); + }); + }); + }); + describe('getSites method', function () { it('getSites should be backwards compatible', function (done) { this.timeout(30000); diff --git a/src/tests/test.js b/src/tests/test.js index d48943f..81d4706 100644 --- a/src/tests/test.js +++ b/src/tests/test.js @@ -77,9 +77,14 @@ describe('Sitemapper', function () { it('gibberish.gibberish should fail silently with an empty array', function (done) { this.timeout(30000); const url = 'http://gibberish.gibberish'; + sitemapper.debug = true; sitemapper.fetch(url) .then(data => { data.sites.should.be.Array; + data.errors.should.be.Array; + data.errors.length.should.be.greaterThan(0); + data.errors.length.should.be.greaterThan(0); + console.log(data); done(); }) .catch(error => { @@ -130,6 +135,8 @@ describe('Sitemapper', function () { 
sitemapper.fetch(url) .then(data => { data.sites.should.be.Array; + data.errors.should.be.Array; + console.log(data); done(); }) .catch(error => { @@ -138,9 +145,25 @@ describe('Sitemapper', function () { }); }); - it('https://m.banggood.com/sitemap/category.xml.gz gzip should be a non-empty array', function (done) { + it('https://www.golinks.com/blog/sitemap.xml sitemaps should return an empty array when timing out', function (done) { this.timeout(30000); - const url = 'https://m.banggood.com/sitemap/category.xml.gz'; + const url = 'https://www.golinks.com/blog/sitemap.xml'; + sitemapper.timeout = 10000; + sitemapper.fetch(url) + .then(data => { + data.sites.should.be.Array; + data.errors.should.be.Array; + done(); + }) + .catch(error => { + console.error('Test failed'); + done(error); + }); + }); + + it('https://www.banggood.com/sitemap/category.xml.gz gzip should be a non-empty array', function (done) { + this.timeout(30000); + const url = 'https://www.banggood.com/sitemap/category.xml.gz'; sitemapper.timeout = 10000; sitemapper.fetch(url) .then(data => { @@ -164,13 +187,14 @@ describe('Sitemapper', function () { }); }); - it('https://m.banggood.com/sitemap/category.xml.gz gzip should be a non-empty array', function (done) { + it('https://www.banggood.com/sitemap/category.xml.gz gzip should be a non-empty array', function (done) { this.timeout(30000); - const url = 'https://m.banggood.com/sitemap/category.xml.gz'; + const url = 'https://www.banggood.com/sitemap/category.xml.gz'; sitemapper.timeout = 10000; sitemapper.fetch(url) .then(data => { data.sites.should.be.Array; + data.errors.should.be.Array; data.sites.length.should.be.greaterThan(0); done(); }) diff --git a/src/tests/test.ts.ts b/src/tests/test.ts.ts index 488a430..ee0576c 100644 --- a/src/tests/test.ts.ts +++ b/src/tests/test.ts.ts @@ -81,6 +81,7 @@ describe('Sitemapper', function () { sitemapper.fetch(url) .then(data => { data.sites.should.be.Array; + data.errors.should.be.Array; done(); }) 
.catch(error => { @@ -141,6 +142,33 @@ describe('Sitemapper', function () { }); }); + describe('gzipped sitemaps', function () { + beforeEach(() => { + sitemapper = new Sitemapper({ + requestHeaders: { + 'Accept-Encoding': 'gzip,deflate,sdch', + } + }); + }); + + it('https://www.banggood.com/sitemap/category.xml.gz gzip should be a non-empty array', function (done) { + this.timeout(30000); + const url = 'https://www.banggood.com/sitemap/category.xml.gz'; + sitemapper.timeout = 10000; + sitemapper.fetch(url) + .then(data => { + data.sites.should.be.Array; + data.errors.should.be.Array; + data.sites.length.should.be.greaterThan(0); + done(); + }) + .catch(error => { + console.error('Test failed'); + done(error); + }); + }); + }); + describe('getSites method', function () { it('getSites should be backwards compatible', function (done) { this.timeout(30000);