Converting to fast-xml-parser
seantomburke committed Nov 8, 2024
1 parent 867cd5a commit 6105454
Showing 4 changed files with 63 additions and 14 deletions.
40 changes: 40 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -86,6 +86,7 @@
"typescript": "^4.1.2"
},
"dependencies": {
"fast-xml-parser": "^4.5.0",
"got": "^11.8.0",
"is-gzip": "2.0.0",
"p-limit": "^3.1.0",
32 changes: 20 additions & 12 deletions src/assets/sitemapper.js
@@ -6,7 +6,7 @@
* @author Sean Burke <@seantomburke>
*/

- import { parseStringPromise } from 'xml2js';
+ import { XMLParser } from 'fast-xml-parser';
import got from 'got';
import zlib from 'zlib';
import pLimit from 'p-limit';
@@ -52,6 +52,7 @@ export default class Sitemapper {
this.fields = settings.fields || false;
this.proxyAgent = settings.proxyAgent || {};
this.exclusions = settings.exclusions || [];
+ this.parser = new XMLParser();
}

/**
@@ -95,6 +96,7 @@
errors: results.errors || [],
};
}

/**
* Get the timeout
*
@@ -174,7 +176,7 @@
}

/**
- * Requests the URL and uses parseStringPromise to parse through and find the data
+ * Requests the URL and uses fast-xml-parser to parse through and find the data
*
* @private
* @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml)
@@ -218,8 +220,8 @@
responseBody = response.body;
}

- // otherwise parse the XML that was returned.
- const data = await parseStringPromise(responseBody);
+ // Parse XML using fast-xml-parser
+ const data = this.parser.parse(responseBody.toString());

// return the results
return { error: null, data };
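For context on the downstream changes in this file: the two parsers produce differently shaped objects for the same XML, which is why the array indexing (loc[0], lastmod[0], and so on) disappears below. A minimal comparison sketch, not part of the commit, using an illustrative two-URL sitemap string:

import { parseStringPromise } from 'xml2js';
import { XMLParser } from 'fast-xml-parser';

const xml =
  '<urlset>' +
  '<url><loc>https://example.com/a</loc></url>' +
  '<url><loc>https://example.com/b</loc></url>' +
  '</urlset>';

// xml2js wraps every element and text value in an array:
// { urlset: { url: [ { loc: ['https://example.com/a'] }, { loc: ['https://example.com/b'] } ] } }
const oldShape = await parseStringPromise(xml);

// fast-xml-parser (default options) keeps text values as plain strings and only
// produces an array when an element actually repeats:
// { urlset: { url: [ { loc: 'https://example.com/a' }, { loc: 'https://example.com/b' } ] } }
const newShape = new XMLParser().parse(xml);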
@@ -312,26 +314,32 @@
if (this.debug) {
console.debug(`Urlset found during "crawl('${url}')"`);
}
- // filter out any urls that are older than the lastmod
- const sites = data.urlset.url
+
+ // Convert single object to array if needed
+ const urlArray = Array.isArray(data.urlset.url)
+   ? data.urlset.url
+   : [data.urlset.url];
+
+ // Begin filtering the urls
+ const sites = urlArray
.filter((site) => {
if (this.lastmod === 0) return true;
if (site.lastmod === undefined) return false;
- const modified = new Date(site.lastmod[0]).getTime();
+ const modified = new Date(site.lastmod).getTime();

return modified >= this.lastmod;
})
.filter((site) => {
- return !this.isExcluded(site.loc[0]);
+ return !this.isExcluded(site.loc);
})
.map((site) => {
if (!this.fields) {
- return site.loc && site.loc[0];
+ return site.loc;
} else {
let fields = {};
for (const [field, active] of Object.entries(this.fields)) {
if (active && site[field]) {
- fields[field] = site[field][0];
+ fields[field] = site[field];
}
}
return fields;
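The Array.isArray normalization above is needed because fast-xml-parser only emits an array when an element repeats: a sitemap containing exactly one url entry parses to a single object, and calling .filter()/.map() on it directly would throw. A small illustration, assuming the default XMLParser options used in the constructor:

import { XMLParser } from 'fast-xml-parser';

const parser = new XMLParser();

// Two <url> entries: urlset.url is an array.
parser
  .parse('<urlset><url><loc>https://example.com/a</loc></url><url><loc>https://example.com/b</loc></url></urlset>')
  .urlset.url; // -> [ { loc: 'https://example.com/a' }, { loc: 'https://example.com/b' } ]

// One <url> entry: urlset.url is a single object, hence the normalization above.
parser
  .parse('<urlset><url><loc>https://example.com/a</loc></url></urlset>')
  .urlset.url; // -> { loc: 'https://example.com/a' }

An alternative would be fast-xml-parser's isArray option, which can force selected tags to always parse as arrays; the explicit normalization keeps the parser configuration at its defaults.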
@@ -349,7 +357,7 @@
}
// Map each child url into a promise to create an array of promises
const sitemap = data.sitemapindex.sitemap
- .map((map) => map.loc && map.loc[0])
+ .map((map) => map.loc)
.filter((url) => {
return !this.isExcluded(url);
});
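For reference, a sitemap index parses to the same kind of shape, so map.loc above is now the child sitemap URL string itself rather than a one-element array. A short sketch with illustrative URLs, not taken from the repository:

import { XMLParser } from 'fast-xml-parser';

const index = new XMLParser().parse(
  '<sitemapindex>' +
    '<sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>' +
    '<sitemap><loc>https://example.com/sitemap-2.xml</loc></sitemap>' +
  '</sitemapindex>'
);

const childUrls = index.sitemapindex.sitemap.map((map) => map.loc);
// -> [ 'https://example.com/sitemap-1.xml', 'https://example.com/sitemap-2.xml' ]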
@@ -488,7 +496,7 @@
*
* @typedef {Object} ParseData
*
- * @property {Error} error that either comes from `parseStringPromise` or `got` or custom error
+ * @property {Error} error that either comes from fast-xml-parser or `got` or custom error
* @property {Object} data
* @property {string} data.url - URL of sitemap
* @property {Array} data.urlset - Array of returned URLs
4 changes: 2 additions & 2 deletions src/examples/index.js
@@ -1,12 +1,12 @@
import Sitemapper from '../assets/sitemapper';

// URL to be crawled
- const exampleURL = 'https://www.walmart.com/sitemap_topic.xml';
+ const exampleURL = 'https://wp.seantburke.com/sitemap.xml';

// Instantiate an instance
const sitemapper = new Sitemapper({
url: exampleURL, // url to crawl
- debug: false, // don't show debug logs
+ debug: true, // show debug logs
timeout: 10000, // 10 seconds
concurrency: 10, // Number of maximum concurrent sitemap crawl threads
retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)
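The rest of the example file is truncated in this diff. For completeness, a hedged sketch of how the configured instance is typically used, based on Sitemapper's documented fetch() API and the errors field visible in the fetch() return earlier in this commit; the logging here is illustrative:

sitemapper
  .fetch()
  .then(({ sites, errors }) => {
    console.log(`Found ${sites.length} URLs`);
    if (errors.length) {
      console.error('Errors encountered while crawling:', errors);
    }
  })
  .catch((error) => console.error(error));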
