From 9f7017b3f1fa4214e6613f6ffe1c5aa63126677e Mon Sep 17 00:00:00 2001 From: chantorak <22381437+chantorak@users.noreply.github.com> Date: Mon, 23 Sep 2024 21:40:07 +0100 Subject: [PATCH] hpagent option (#131) * hpagent option * updated docs, readme * Update sitemapper.js * updating package-lock.json --------- Co-authored-by: Mohammad Co-authored-by: Sean Thomas Burke --- README.md | 1 + docs.md | 408 ++++++++++++--------------------------- src/assets/sitemapper.js | 3 + src/tests/test.js | 2 +- 4 files changed, 128 insertions(+), 286 deletions(-) diff --git a/README.md b/README.md index 3c45ca9..512fe6a 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,7 @@ You can add options on the initial Sitemapper object when instantiating it. + `rejectUnauthorized`: (Boolean) - If true, it will throw on invalid certificates, such as expired or self-signed ones. Default: True + `lastmod`: (Number) - Timestamp of the minimum lastmod value allowed for returned urls + `field` : (Object) - An object of fields to be returned from the sitemap. For Example: `{ loc: true, lastmod: true, changefreq: true, priority: true }`. Leaving a field out has the same effect as `field: false`. If not specified sitemapper defaults to returning the 'classic' array of urls. 
++ `proxyAgent`: (Object) - agent object passed to npm "got" as its `agent` option, keyed by protocol (`http`, `https`); each value is an instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent, e.g. `{ https: new HttpsProxyAgent({ proxy: 'http://localhost:8080' }) }` ```javascript diff --git a/docs.md b/docs.md index 242bf95..6fbf2aa 100644 --- a/docs.md +++ b/docs.md @@ -1,106 +1,81 @@ - - -### Table of Contents - -- [xmlParse][1] -- [SitesArray][2] - - [Examples][3] -- [SitesData][4] - - [Properties][5] - - [Examples][6] -- [ParseData][7] - - [Properties][8] - - [Examples][9] -- [Timeout][10] -- [getSitesCallback][11] - - [Parameters][12] -- [Sitemapper][13] - - [fetch][14] - - [Parameters][15] - - [Examples][16] - - [getSites][17] - - [Parameters][18] - - [timeout][19] - - [Examples][20] - - [timeout][21] - - [Parameters][22] - - [Examples][23] - - [url][24] - - [Parameters][25] - - [Examples][26] - - [url][27] - - [Examples][28] - -## xmlParse - -Sitemap Parser - -Copyright (c) 2020 Sean Thomas Burke -Licensed under the MIT license. - -**Meta** - -- **author**: Sean Burke <@seantomburke> - -## SitesArray - -An array of urls - -Type: [Array][29]<[String][30]> - -### Examples - -```javascript -[ - 'www.google.com', - 'https://www.linkedin.com' -] -``` - -## SitesData - -Resolve handler type for the promise in this.parse() - -Type: [Object][31] - -### Properties - -- `url` **[string][30]** the original url used to query the data -- `sites` **[SitesArray][32]** - -### Examples - -```javascript -{ - url: 'https://linkedin.com/sitemap.xml', - sites: [ - 'https://linkedin.com/project1', - 'https://linkedin.com/project2' - ] -} -``` - -## ParseData - -Resolve handler type for the promise in this.parse() - -Type: [Object][31] - -### Properties +### Table of Contents* [parseStringPromise][1] +* [Sitemapper][2] + * [fetch][3] + * [Parameters][4] + * [Examples][5] + * [getSites][6] + * [Parameters][7] + * [decompressResponseBody][8] + * [Parameters][9] + * [timeout][10] + * [Examples][11] + * [timeout][12] + * [Parameters][13] + * [Examples][14] + * [lastmod][15] + * [Examples][16] + * [lastmod][17] 
+ * [Parameters][18] + * [Examples][19] + * [url][20] + * [Parameters][21] + * [Examples][22] + * [url][23] + * [Examples][24] + * [debug][25] + * [Parameters][26] + * [Examples][27] + * [debug][28] + * [Examples][29] +* [getSitesCallback][30] + * [Parameters][31] +* [Timeout][32] +* [ParseData][33] + * [Properties][34] + * [Examples][35] +* [SitesData][36] + * [Properties][37] + * [Examples][38] +* [SitesArray][39] + * [Examples][40] +* [ErrorDataArray][41] + * [Examples][42] +* [ErrorData][43] + * [Properties][44] + * [Examples][45]## parseStringPromiseSitemap ParserCopyright (c) 2020 Sean Thomas Burke +Licensed under the MIT license.**Meta*** **author**: Sean Burke <@seantomburke>## SitemapperType: [Object][46]### fetchGets the sites from a sitemap.xml with a given URL#### Parameters* `url` **[string][47]?** the Sitemaps url (e.g [https://wp.seantburke.com/sitemap.xml][48]) (optional, default `this.url`)#### Examples```javascript +sitemapper.fetch('example.xml') + .then((sites) => console.log(sites)); +```Returns **[Promise][49]<[SitesData][36]>** ### getSitesGets the sites from a sitemap.xml with a given URLType: [Function][50]#### Parameters* `url` **[string][47]** url to query (optional, default `this.url`) +* `callback` **[getSitesCallback][30]** callback for sites and error**Meta*** **deprecated**: This is deprecated.### decompressResponseBodyDecompress the gzipped response body using zlib.gunzip#### Parameters* `body` **[Buffer][51]** body of the gzipped fileReturns **[Boolean][52]** ### timeoutGet the timeout#### Examples```javascript +console.log(sitemapper.timeout); +```Returns **[Timeout][10]** ### timeoutSet the timeout#### Parameters* `duration` **[Timeout][10]** #### Examples```javascript +sitemapper.timeout = 15000; // 15 seconds +```### lastmodGet the lastmod minimum value#### Examples```javascript +console.log(sitemapper.lastmod); +```Returns **[Number][53]** ### lastmodSet the lastmod minimum value#### Parameters* `timestamp` **[Number][53]** 
#### Examples```javascript +sitemapper.lastmod = 1630694181; // Unix timestamp +```### url#### Parameters* `url` **[string][47]** url for making requests. Should be a link to a sitemaps.xml#### Examples```javascript +sitemapper.url = 'https://wp.seantburke.com/sitemap.xml' +```### urlGet the url to parse#### Examples```javascript +console.log(sitemapper.url) +```Returns **[string][47]** ### debugSetter for the debug state#### Parameters* `option` **[Boolean][52]** set whether to show debug logs in output.#### Examples```javascript +sitemapper.debug = true; +```### debugGetter for the debug state#### Examples```javascript +console.log(sitemapper.debug) +```Returns **[Boolean][52]** ## getSitesCallbackCallback for the getSites methodType: [Function][50]### Parameters* `error` **[Object][46]** error from callback +* `sites` **[Array][54]** an Array of sitemaps## TimeoutTimeout in millisecondsType: [Number][53]## ParseDataResolve handler type for the promise in this.parse()Type: [Object][46]### Properties* `error` **[Error][55]** that either comes from `parseStringPromise` or `got` or custom error +* `data` **[Object][46]** -- `error` **[Error][33]** that either comes from `xmlParse` or `request` or custom error -- `data` **[Object][31]** - - `data.url` **[string][30]** URL of sitemap - - `data.urlset` **[Array][29]** Array of returned URLs - - `data.urlset.url` **[string][30]** single Url - - `data.sitemapindex` **[Object][31]** index of sitemap - - `data.sitemapindex.sitemap` **[string][30]** Sitemap + * `data.url` **[string][47]** URL of sitemap + * `data.urlset` **[Array][54]** Array of returned URLs -### Examples + * `data.urlset.url` **[string][47]** single Url + * `data.sitemapindex` **[Object][46]** index of sitemap -```javascript + * `data.sitemapindex.sitemap` **[string][47]** Sitemap### Examples```javascript { - error: "There was an error!" + error: 'There was an error!' 
data: { url: 'https://linkedin.com', urlset: [{ @@ -110,189 +85,52 @@ Type: [Object][31] }] } } -``` - -## Timeout - -Timeout in milliseconds - -Type: [Number][34] - -## getSitesCallback - -Callback for the getSites method - -Type: [Function][35] - -### Parameters - -- `error` **[Object][31]** error from callback -- `sites` **[Array][29]** an Array of sitemaps - -## Sitemapper - -Type: [Object][31] - -### fetch - -Gets the sites from a sitemap.xml with a given URL - -#### Parameters - -- `url` **[string][30]?** the Sitemaps url (e.g [https://wp.seantburke.com/sitemap.xml][36]) (optional, default `this.url`) - -#### Examples - -```javascript -sitemapper.fetch('example.xml') - .then((sites) => console.log(sites)); -``` - -Returns **[Promise][37]<[SitesData][38]>** - -### getSites - -/\*\* -Gets the sites from a sitemap.xml with a given URL - -Type: [Function][35] - -#### Parameters - -- `url` **[string][30]** url to query (optional, default `this.url`) -- `callback` **[getSitesCallback][39]** callback for sites and error - -**Meta** - -- **deprecated**: This is deprecated. - - -### timeout - -Get the timeout - -#### Examples - -```javascript -console.log(sitemapper.timeout); -``` - -Returns **[Timeout][40]** - -### timeout - -Set the timeout - -#### Parameters - -- `duration` **[Timeout][40]** - -#### Examples - -```javascript -sitemapper.timeout = 15000; // 15 seconds -``` - -### url - -#### Parameters - -- `url` **[string][30]** url for making requests. 
Should be a link to a sitemaps.xml - -#### Examples - -```javascript -sitemapper.url = 'https://wp.seantburke.com/sitemap.xml' -``` - -### url - -Get the url to parse - -#### Examples - -```javascript -console.log(sitemapper.url) -``` - -Returns **[string][30]** - -[1]: #xmlparse - -[2]: #sitesarray - -[3]: #examples - -[4]: #sitesdata - -[5]: #properties - -[6]: #examples-1 - -[7]: #parsedata - -[8]: #properties-1 - -[9]: #examples-2 - -[10]: #timeout - -[11]: #getsitescallback - -[12]: #parameters - -[13]: #sitemapper - -[14]: #fetch - -[15]: #parameters-1 - -[16]: #examples-3 - -[17]: #getsites - -[18]: #parameters-2 - -[19]: #timeout-1 - -[20]: #examples-4 - -[21]: #timeout-2 - -[22]: #parameters-3 - -[23]: #examples-5 - -[24]: #url - -[25]: #parameters-4 - -[26]: #examples-6 - -[27]: #url-1 - -[28]: #examples-7 - -[29]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array - -[30]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String - -[31]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object - -[32]: #sitesarray - -[33]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Error - -[34]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number - -[35]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Statements/function - -[36]: https://wp.seantburke.com/sitemap.xml - -[37]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Promise - -[38]: #sitesdata - -[39]: #getsitescallback - -[40]: #timeout +```## SitesDataResolve handler type for the promise in this.parse()Type: [Object][46]### Properties* `url` **[string][47]** the original url used to query the data +* `sites` **[SitesArray][39]** +* `errors` **[ErrorDataArray][41]** ### Examples```javascript +{ + url: 'https://linkedin.com/sitemap.xml', + sites: [ + 'https://linkedin.com/project1', + 'https://linkedin.com/project2' + ], + errors: [ + 
{ + type: 'CancelError', + url: 'https://www.walmart.com/sitemap_tp1.xml', + retries: 0 + }, + { + type: 'HTTPError', + url: 'https://www.walmart.com/sitemap_tp2.xml', + retries: 0 + }, + ] +} +```## SitesArrayAn array of urlsType: [Array][54]<[String][47]>### Examples```javascript +[ + 'https://www.google.com', + 'https://www.linkedin.com' +] +```## ErrorDataArrayAn array of Error data objectsType: [Array][54]<[ErrorData][43]>### Examples```javascript +[ + { + type: 'CancelError', + url: 'https://www.walmart.com/sitemap_tp1.xml', + retries: 0 + }, + { + type: 'HTTPError', + url: 'https://www.walmart.com/sitemap_tp2.xml', + retries: 0 + }, +] +```## ErrorDataAn object containing details about the errors which occurred during the crawlType: [Object][46]### Properties* `type` **[string][47]** The error type which was returned +* `url` **[string][47]** The sitemap URL which returned the error +* `retries` **[Number][53]** The total number of retries attempted after receiving the first error### Examples```javascript +{ + type: 'CancelError', + url: 'https://www.walmart.com/sitemap_tp1.xml', + retries: 0 +} +```[1]: #parsestringpromise[2]: #sitemapper[3]: #fetch[4]: #parameters[5]: #examples[6]: #getsites[7]: #parameters-1[8]: #decompressresponsebody[9]: #parameters-2[10]: #timeout[11]: #examples-1[12]: #timeout-1[13]: #parameters-3[14]: #examples-2[15]: #lastmod[16]: #examples-3[17]: #lastmod-1[18]: #parameters-4[19]: #examples-4[20]: #url[21]: #parameters-5[22]: #examples-5[23]: #url-1[24]: #examples-6[25]: #debug[26]: #parameters-6[27]: #examples-7[28]: #debug-1[29]: #examples-8[30]: #getsitescallback[31]: #parameters-7[32]: #timeout-2[33]: #parsedata[34]: #properties[35]: #examples-9[36]: #sitesdata[37]: #properties-1[38]: #examples-10[39]: #sitesarray[40]: #examples-11[41]: #errordataarray[42]: #examples-12[43]: #errordata[44]: #properties-2[45]: #examples-13[46]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object[47]: 
https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String[48]: https://wp.seantburke.com/sitemap.xml[49]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Promise[50]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Statements/function[51]: https://nodejs.org/api/buffer.html[52]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Boolean[53]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Number[54]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array[55]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Error diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index fa74217..811f443 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -27,6 +27,7 @@ export default class Sitemapper { * @params {integer} [options.retries] - The maximum number of retries to attempt when crawling fails (e.g. 1 for 1 retry, 2 attempts in total) * @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones. * @params {lastmod} [options.lastmod] - the minimum lastmod value for urls + * @params {Object} [options.proxyAgent] - agent object passed to npm "got" as its `agent` option, keyed by protocol (`http`, `https`); each value is an instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent * * @example let sitemap = new Sitemapper({ * url: 'https://wp.seantburke.com/sitemap.xml', @@ -47,6 +48,7 @@ export default class Sitemapper { this.rejectUnauthorized = settings.rejectUnauthorized === false ? 
false : true; this.fields = settings.fields || false; + this.proxyAgent = settings.proxyAgent || {}; } /** @@ -186,6 +188,7 @@ export default class Sitemapper { https: { rejectUnauthorized: this.rejectUnauthorized, }, + agent: this.proxyAgent, }; try { diff --git a/src/tests/test.js b/src/tests/test.js index f7cffd3..7c4d0ca 100644 --- a/src/tests/test.js +++ b/src/tests/test.js @@ -199,7 +199,7 @@ describe('Sitemapper', function () { done(); }) .catch(error => { - console.error('Test failed'); + console.error('Test failed'); done(error); }); });