Converting to fast-xml-parser
seantomburke committed Nov 8, 2024
1 parent 867cd5a commit 6105454
Showing 4 changed files with 63 additions and 14 deletions.
40 changes: 40 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -86,6 +86,7 @@
"typescript": "^4.1.2"
},
"dependencies": {
"fast-xml-parser": "^4.5.0",
"got": "^11.8.0",
"is-gzip": "2.0.0",
"p-limit": "^3.1.0",
32 changes: 20 additions & 12 deletions src/assets/sitemapper.js
@@ -6,7 +6,7 @@
* @author Sean Burke <@seantomburke>
*/

- import { parseStringPromise } from 'xml2js';
+ import { XMLParser } from 'fast-xml-parser';
import got from 'got';
import zlib from 'zlib';
import pLimit from 'p-limit';
@@ -52,6 +52,7 @@ export default class Sitemapper {
this.fields = settings.fields || false;
this.proxyAgent = settings.proxyAgent || {};
this.exclusions = settings.exclusions || [];
+ this.parser = new XMLParser();
}

/**
@@ -95,6 +96,7 @@
errors: results.errors || [],
};
}

/**
* Get the timeout
*
@@ -174,7 +176,7 @@
}

/**
- * Requests the URL and uses parseStringPromise to parse through and find the data
+ * Requests the URL and uses fast-xml-parser to parse through and find the data
*
* @private
* @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml)
@@ -218,8 +220,8 @@
responseBody = response.body;
}

- // otherwise parse the XML that was returned.
- const data = await parseStringPromise(responseBody);
+ // Parse XML using fast-xml-parser
+ const data = this.parser.parse(responseBody.toString());

// return the results
return { error: null, data };
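For context on the downstream changes in this file: the two parsers produce differently shaped objects for the same XML, which is why the array indexing (loc[0], lastmod[0], and so on) disappears below. A minimal comparison sketch, not part of the commit, using an illustrative two-URL sitemap string:

import { parseStringPromise } from 'xml2js';
import { XMLParser } from 'fast-xml-parser';

const xml =
  '<urlset>' +
  '<url><loc>https://example.com/a</loc></url>' +
  '<url><loc>https://example.com/b</loc></url>' +
  '</urlset>';

// xml2js wraps every element and text value in an array:
// { urlset: { url: [ { loc: ['https://example.com/a'] }, { loc: ['https://example.com/b'] } ] } }
const oldShape = await parseStringPromise(xml);

// fast-xml-parser (default options) keeps text values as plain strings and only
// produces an array when an element actually repeats:
// { urlset: { url: [ { loc: 'https://example.com/a' }, { loc: 'https://example.com/b' } ] } }
const newShape = new XMLParser().parse(xml);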
@@ -312,26 +314,32 @@
if (this.debug) {
console.debug(`Urlset found during "crawl('${url}')"`);
}
- // filter out any urls that are older than the lastmod
- const sites = data.urlset.url
+
+ // Convert single object to array if needed
+ const urlArray = Array.isArray(data.urlset.url)
+   ? data.urlset.url
+   : [data.urlset.url];
+
+ // Begin filtering the urls
+ const sites = urlArray
.filter((site) => {
if (this.lastmod === 0) return true;
if (site.lastmod === undefined) return false;
- const modified = new Date(site.lastmod[0]).getTime();
+ const modified = new Date(site.lastmod).getTime();

return modified >= this.lastmod;
})
.filter((site) => {
- return !this.isExcluded(site.loc[0]);
+ return !this.isExcluded(site.loc);
})
.map((site) => {
if (!this.fields) {
- return site.loc && site.loc[0];
+ return site.loc;
} else {
let fields = {};
for (const [field, active] of Object.entries(this.fields)) {
if (active && site[field]) {
- fields[field] = site[field][0];
+ fields[field] = site[field];
}
}
return fields;
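The Array.isArray normalization above is needed because fast-xml-parser only emits an array when an element repeats: a sitemap containing exactly one url entry parses to a single object, and calling .filter()/.map() on it directly would throw. A small illustration, assuming the default XMLParser options used in the constructor:

import { XMLParser } from 'fast-xml-parser';

const parser = new XMLParser();

// Two <url> entries: urlset.url is an array.
parser
  .parse('<urlset><url><loc>https://example.com/a</loc></url><url><loc>https://example.com/b</loc></url></urlset>')
  .urlset.url; // -> [ { loc: 'https://example.com/a' }, { loc: 'https://example.com/b' } ]

// One <url> entry: urlset.url is a single object, hence the normalization above.
parser
  .parse('<urlset><url><loc>https://example.com/a</loc></url></urlset>')
  .urlset.url; // -> { loc: 'https://example.com/a' }

An alternative would be fast-xml-parser's isArray option, which can force selected tags to always parse as arrays; the explicit normalization keeps the parser configuration at its defaults.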
@@ -349,7 +357,7 @@
}
// Map each child url into a promise to create an array of promises
const sitemap = data.sitemapindex.sitemap
- .map((map) => map.loc && map.loc[0])
+ .map((map) => map.loc)
.filter((url) => {
return !this.isExcluded(url);
});
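For reference, a sitemap index parses to the same kind of shape, so map.loc above is now the child sitemap URL string itself rather than a one-element array. A short sketch with illustrative URLs, not taken from the repository:

import { XMLParser } from 'fast-xml-parser';

const index = new XMLParser().parse(
  '<sitemapindex>' +
    '<sitemap><loc>https://example.com/sitemap-1.xml</loc></sitemap>' +
    '<sitemap><loc>https://example.com/sitemap-2.xml</loc></sitemap>' +
  '</sitemapindex>'
);

const childUrls = index.sitemapindex.sitemap.map((map) => map.loc);
// -> [ 'https://example.com/sitemap-1.xml', 'https://example.com/sitemap-2.xml' ]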
@@ -488,7 +496,7 @@
*
* @typedef {Object} ParseData
*
- * @property {Error} error that either comes from `parseStringPromise` or `got` or custom error
+ * @property {Error} error that either comes from fast-xml-parser or `got` or custom error
* @property {Object} data
* @property {string} data.url - URL of sitemap
* @property {Array} data.urlset - Array of returned URLs
4 changes: 2 additions & 2 deletions src/examples/index.js
@@ -1,12 +1,12 @@
import Sitemapper from '../assets/sitemapper';

// URL to be crawled
- const exampleURL = 'https://www.walmart.com/sitemap_topic.xml';
+ const exampleURL = 'https://wp.seantburke.com/sitemap.xml';

// Instantiate an instance
const sitemapper = new Sitemapper({
url: exampleURL, // url to crawl
- debug: false, // don't show debug logs
+ debug: true, // show debug logs
timeout: 10000, // 10 seconds
concurrency: 10, // Number of maximum concurrent sitemap crawl threads
retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)
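The rest of the example file is truncated in this diff. For completeness, a hedged sketch of how the configured instance is typically used, based on Sitemapper's documented fetch() API and the errors field visible in the fetch() return earlier in this commit; the logging here is illustrative:

sitemapper
  .fetch()
  .then(({ sites, errors }) => {
    console.log(`Found ${sites.length} URLs`);
    if (errors.length) {
      console.error('Errors encountered while crawling:', errors);
    }
  })
  .catch((error) => console.error(error));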
