From 14f44f0deef6129cf9d970c383bf3b5f14586ef3 Mon Sep 17 00:00:00 2001 From: Sean Thomas Burke <965298+seantomburke@users.noreply.github.com> Date: Fri, 8 Nov 2024 01:00:38 -0800 Subject: [PATCH] Converting to using fast-xml-parser (#157) * Converting to using fast-xml-parser * Removing xml2js * Don't expose the parser unecessarily * Adding parser options * Adding parser options --------- Co-authored-by: Sean Thomas Burke --- package-lock.json | 87 +++++++++++++++++++--------------------- package.json | 4 +- src/assets/sitemapper.js | 37 ++++++++++------- src/examples/index.js | 4 +- 4 files changed, 68 insertions(+), 64 deletions(-) diff --git a/package-lock.json b/package-lock.json index 6962ed1..e57dd0e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,10 +9,10 @@ "version": "3.2.18", "license": "MIT", "dependencies": { + "fast-xml-parser": "^4.5.0", "got": "^11.8.0", "is-gzip": "2.0.0", - "p-limit": "^3.1.0", - "xml2js": "^0.5.0" + "p-limit": "^3.1.0" }, "devDependencies": { "@babel/cli": "^7.12.8", @@ -3232,6 +3232,27 @@ "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", "dev": true }, + "node_modules/fast-xml-parser": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.0.tgz", + "integrity": "sha512-/PlTQCI96+fZMAOLMZK4CWG1ItCbfZ/0jx7UIJFChPNrx7tcEgerUgWbeieCM9MfHInUDyK8DWYZ+YrywDJuTg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + }, + { + "type": "paypal", + "url": "https://paypal.me/naturalintelligence" + } + ], + "dependencies": { + "strnum": "^1.0.5" + }, + "bin": { + "fxparser": "src/cli/cli.js" + } + }, "node_modules/fastq": { "version": "1.17.1", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", @@ -4599,11 +4620,6 @@ } ] }, - "node_modules/sax": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz", - "integrity": 
"sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==" - }, "node_modules/semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -4751,6 +4767,11 @@ "node": ">=8" } }, + "node_modules/strnum": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz", + "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA==" + }, "node_modules/supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -5053,26 +5074,6 @@ "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" }, - "node_modules/xml2js": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz", - "integrity": "sha512-drPFnkQJik/O+uPKpqSgr22mpuFHqKdbS835iAQrUC73L2F5WkboIRd63ai/2Yg6I1jzifPFKH2NTK+cfglkIA==", - "dependencies": { - "sax": ">=0.6.0", - "xmlbuilder": "~11.0.0" - }, - "engines": { - "node": ">=4.0.0" - } - }, - "node_modules/xmlbuilder": { - "version": "11.0.1", - "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", - "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==", - "engines": { - "node": ">=4.0" - } - }, "node_modules/y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", @@ -7453,6 +7454,14 @@ "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", "dev": true }, + "fast-xml-parser": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.0.tgz", + "integrity": "sha512-/PlTQCI96+fZMAOLMZK4CWG1ItCbfZ/0jx7UIJFChPNrx7tcEgerUgWbeieCM9MfHInUDyK8DWYZ+YrywDJuTg==", + "requires": { + 
"strnum": "^1.0.5" + } + }, "fastq": { "version": "1.17.1", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", @@ -8433,11 +8442,6 @@ "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", "dev": true }, - "sax": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/sax/-/sax-1.4.1.tgz", - "integrity": "sha512-+aWOz7yVScEGoKNd4PA10LZ8sk0A/z5+nXQG5giUO5rprX9jgYsTdov9qCchZiPIZezbZH+jRut8nPodFAX4Jg==" - }, "semver": { "version": "6.3.1", "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", @@ -8564,6 +8568,11 @@ "ansi-regex": "^5.0.1" } }, + "strnum": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz", + "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA==" + }, "supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -8768,20 +8777,6 @@ "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" }, - "xml2js": { - "version": "0.5.0", - "resolved": "https://registry.npmjs.org/xml2js/-/xml2js-0.5.0.tgz", - "integrity": "sha512-drPFnkQJik/O+uPKpqSgr22mpuFHqKdbS835iAQrUC73L2F5WkboIRd63ai/2Yg6I1jzifPFKH2NTK+cfglkIA==", - "requires": { - "sax": ">=0.6.0", - "xmlbuilder": "~11.0.0" - } - }, - "xmlbuilder": { - "version": "11.0.1", - "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-11.0.1.tgz", - "integrity": "sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==" - }, "y18n": { "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", diff --git a/package.json b/package.json index 027af48..408e72f 100644 --- a/package.json +++ b/package.json @@ -86,9 +86,9 @@ "typescript": "^4.1.2" }, "dependencies": { + 
"fast-xml-parser": "^4.5.0", "got": "^11.8.0", "is-gzip": "2.0.0", - "p-limit": "^3.1.0", - "xml2js": "^0.5.0" + "p-limit": "^3.1.0" } } diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index a4513a4..1b1abc6 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -6,7 +6,7 @@ * @author Sean Burke <@seantomburke> */ -import { parseStringPromise } from 'xml2js'; +import { XMLParser } from 'fast-xml-parser'; import got from 'got'; import zlib from 'zlib'; import pLimit from 'p-limit'; @@ -95,6 +95,7 @@ export default class Sitemapper { errors: results.errors || [], }; } + /** * Get the timeout * @@ -174,7 +175,7 @@ export default class Sitemapper { } /** - * Requests the URL and uses parseStringPromise to parse through and find the data + * Requests the URL and uses fast-xml-parser to parse through and find the data * * @private * @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml) @@ -218,8 +219,10 @@ export default class Sitemapper { responseBody = response.body; } - // otherwise parse the XML that was returned. - const data = await parseStringPromise(responseBody); + // Parse XML using fast-xml-parser + const parser = new XMLParser(); + + const data = parser.parse(responseBody.toString()); // return the results return { error: null, data }; @@ -312,26 +315,32 @@ export default class Sitemapper { if (this.debug) { console.debug(`Urlset found during "crawl('${url}')"`); } - // filter out any urls that are older than the lastmod - const sites = data.urlset.url + + // Convert single object to array if needed + const urlArray = Array.isArray(data.urlset.url) + ? 
data.urlset.url + : [data.urlset.url]; + + // Begin filtering the urls + const sites = urlArray .filter((site) => { if (this.lastmod === 0) return true; if (site.lastmod === undefined) return false; - const modified = new Date(site.lastmod[0]).getTime(); + const modified = new Date(site.lastmod).getTime(); return modified >= this.lastmod; }) .filter((site) => { - return !this.isExcluded(site.loc[0]); + return !this.isExcluded(site.loc); }) .map((site) => { if (!this.fields) { - return site.loc && site.loc[0]; + return site.loc; } else { let fields = {}; for (const [field, active] of Object.entries(this.fields)) { if (active && site[field]) { - fields[field] = site[field][0]; + fields[field] = site[field]; } } return fields; @@ -349,7 +358,7 @@ export default class Sitemapper { } // Map each child url into a promise to create an array of promises const sitemap = data.sitemapindex.sitemap - .map((map) => map.loc && map.loc[0]) + .map((map) => map.loc) .filter((url) => { return !this.isExcluded(url); }); @@ -441,8 +450,8 @@ export default class Sitemapper { * @param {Buffer} body - body of the gzipped file * @returns {boolean} */ - decompressResponseBody(body) { - return new Promise((resolve, reject) => { + async decompressResponseBody(body) { + return await new Promise((resolve, reject) => { const buffer = Buffer.from(body); zlib.gunzip(buffer, (err, result) => { if (err) { @@ -488,7 +497,7 @@ export default class Sitemapper { * * @typedef {Object} ParseData * - * @property {Error} error that either comes from `parseStringPromise` or `got` or custom error + * @property {Error} error that either comes from fast-xml-parser or `got` or custom error * @property {Object} data * @property {string} data.url - URL of sitemap * @property {Array} data.urlset - Array of returned URLs diff --git a/src/examples/index.js b/src/examples/index.js index 3cabd7b..b20f106 100644 --- a/src/examples/index.js +++ b/src/examples/index.js @@ -1,12 +1,12 @@ import Sitemapper from 
'../assets/sitemapper'; // URL to be crawled -const exampleURL = 'https://www.walmart.com/sitemap_topic.xml'; +const exampleURL = 'https://wp.seantburke.com/sitemap.xml'; // Instantiate an instance const sitemapper = new Sitemapper({ url: exampleURL, // url to crawl - debug: false, // don't show debug logs + debug: true, // show debug logs timeout: 10000, // 10 seconds concurrency: 10, // Number of maximum concurrent sitemap crawl threads retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)