From 61054545609a1acd4795539c58e015ca9d3d7964 Mon Sep 17 00:00:00 2001 From: Sean Thomas Burke Date: Fri, 8 Nov 2024 00:31:02 -0800 Subject: [PATCH] Converting to using fast-xml-parser --- package-lock.json | 40 ++++++++++++++++++++++++++++++++++++++++ package.json | 1 + src/assets/sitemapper.js | 32 ++++++++++++++++++++------------ src/examples/index.js | 4 ++-- 4 files changed, 63 insertions(+), 14 deletions(-) diff --git a/package-lock.json b/package-lock.json index 6962ed1..a7351d4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,7 @@ "version": "3.2.18", "license": "MIT", "dependencies": { + "fast-xml-parser": "^4.5.0", "got": "^11.8.0", "is-gzip": "2.0.0", "p-limit": "^3.1.0", @@ -3232,6 +3233,27 @@ "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", "dev": true }, + "node_modules/fast-xml-parser": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.0.tgz", + "integrity": "sha512-/PlTQCI96+fZMAOLMZK4CWG1ItCbfZ/0jx7UIJFChPNrx7tcEgerUgWbeieCM9MfHInUDyK8DWYZ+YrywDJuTg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + }, + { + "type": "paypal", + "url": "https://paypal.me/naturalintelligence" + } + ], + "dependencies": { + "strnum": "^1.0.5" + }, + "bin": { + "fxparser": "src/cli/cli.js" + } + }, "node_modules/fastq": { "version": "1.17.1", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", @@ -4751,6 +4773,11 @@ "node": ">=8" } }, + "node_modules/strnum": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz", + "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA==" + }, "node_modules/supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -7453,6 +7480,14 @@ "integrity": 
"sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", "dev": true }, + "fast-xml-parser": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.0.tgz", + "integrity": "sha512-/PlTQCI96+fZMAOLMZK4CWG1ItCbfZ/0jx7UIJFChPNrx7tcEgerUgWbeieCM9MfHInUDyK8DWYZ+YrywDJuTg==", + "requires": { + "strnum": "^1.0.5" + } + }, "fastq": { "version": "1.17.1", "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", @@ -8564,6 +8599,11 @@ "ansi-regex": "^5.0.1" } }, + "strnum": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.0.5.tgz", + "integrity": "sha512-J8bbNyKKXl5qYcR36TIO8W3mVGVHrmmxsd5PAItGkmyzwJvybiw2IVq5nqd0i4LSNSkB/sx9VHllbfFdr9k1JA==" + }, "supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", diff --git a/package.json b/package.json index 027af48..b8b2a19 100644 --- a/package.json +++ b/package.json @@ -86,6 +86,7 @@ "typescript": "^4.1.2" }, "dependencies": { + "fast-xml-parser": "^4.5.0", "got": "^11.8.0", "is-gzip": "2.0.0", "p-limit": "^3.1.0", diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index a4513a4..0ed962c 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -6,7 +6,7 @@ * @author Sean Burke <@seantomburke> */ -import { parseStringPromise } from 'xml2js'; +import { XMLParser } from 'fast-xml-parser'; import got from 'got'; import zlib from 'zlib'; import pLimit from 'p-limit'; @@ -52,6 +52,7 @@ export default class Sitemapper { this.fields = settings.fields || false; this.proxyAgent = settings.proxyAgent || {}; this.exclusions = settings.exclusions || []; + this.parser = new XMLParser(); } /** @@ -95,6 +96,7 @@ export default class Sitemapper { errors: results.errors || [], }; } + /** * Get the timeout * @@ -174,7 +176,7 @@ export default class Sitemapper { } /** - * Requests the URL and uses parseStringPromise to 
parse through and find the data + * Requests the URL and uses fast-xml-parser to parse through and find the data * * @private * @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml) @@ -218,8 +220,8 @@ export default class Sitemapper { responseBody = response.body; } - // otherwise parse the XML that was returned. - const data = await parseStringPromise(responseBody); + // Parse XML using fast-xml-parser + const data = this.parser.parse(responseBody.toString()); // return the results return { error: null, data }; @@ -312,26 +314,32 @@ export default class Sitemapper { if (this.debug) { console.debug(`Urlset found during "crawl('${url}')"`); } - // filter out any urls that are older than the lastmod - const sites = data.urlset.url + + // Convert single object to array if needed + const urlArray = Array.isArray(data.urlset.url) + ? data.urlset.url + : [data.urlset.url]; + + // Begin filtering the urls + const sites = urlArray .filter((site) => { if (this.lastmod === 0) return true; if (site.lastmod === undefined) return false; - const modified = new Date(site.lastmod[0]).getTime(); + const modified = new Date(site.lastmod).getTime(); return modified >= this.lastmod; }) .filter((site) => { - return !this.isExcluded(site.loc[0]); + return !this.isExcluded(site.loc); }) .map((site) => { if (!this.fields) { - return site.loc && site.loc[0]; + return site.loc; } else { let fields = {}; for (const [field, active] of Object.entries(this.fields)) { if (active && site[field]) { - fields[field] = site[field][0]; + fields[field] = site[field]; } } return fields; @@ -349,7 +357,7 @@ export default class Sitemapper { } // Map each child url into a promise to create an array of promises const sitemap = data.sitemapindex.sitemap - .map((map) => map.loc && map.loc[0]) + .map((map) => map.loc) .filter((url) => { return !this.isExcluded(url); }); @@ -488,7 +496,7 @@ export default class Sitemapper { * * @typedef {Object} ParseData * - * @property {Error} 
error that either comes from `parseStringPromise` or `got` or custom error + * @property {Error} error that either comes from fast-xml-parser or `got` or custom error * @property {Object} data * @property {string} data.url - URL of sitemap * @property {Array} data.urlset - Array of returned URLs diff --git a/src/examples/index.js b/src/examples/index.js index 3cabd7b..b20f106 100644 --- a/src/examples/index.js +++ b/src/examples/index.js @@ -1,12 +1,12 @@ import Sitemapper from '../assets/sitemapper'; // URL to be crawled -const exampleURL = 'https://www.walmart.com/sitemap_topic.xml'; +const exampleURL = 'https://wp.seantburke.com/sitemap.xml'; // Instantiate an instance const sitemapper = new Sitemapper({ url: exampleURL, // url to crawl - debug: false, // don't show debug logs + debug: true, // show debug logs timeout: 10000, // 10 seconds concurrency: 10, // Number of maximum concurrent sitemap crawl threads retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)