Skip to content

Commit

Permalink
Converting to using fast-xml-parser (#157)
Browse files Browse the repository at this point in the history
* Converting to using fast-xml-parser

* Removing xml2js

* Don't expose the parser unnecessarily

* Adding parser options

* Adding parser options

---------

Co-authored-by: Sean Thomas Burke <[email protected]>
  • Loading branch information
seantomburke and seantomburke authored Nov 8, 2024
1 parent 867cd5a commit 14f44f0
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 64 deletions.
87 changes: 41 additions & 46 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@
"typescript": "^4.1.2"
},
"dependencies": {
"fast-xml-parser": "^4.5.0",
"got": "^11.8.0",
"is-gzip": "2.0.0",
"p-limit": "^3.1.0",
"xml2js": "^0.5.0"
"p-limit": "^3.1.0"
}
}
37 changes: 23 additions & 14 deletions src/assets/sitemapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* @author Sean Burke <@seantomburke>
*/

import { parseStringPromise } from 'xml2js';
import { XMLParser } from 'fast-xml-parser';
import got from 'got';
import zlib from 'zlib';
import pLimit from 'p-limit';
Expand Down Expand Up @@ -95,6 +95,7 @@ export default class Sitemapper {
errors: results.errors || [],
};
}

/**
* Get the timeout
*
Expand Down Expand Up @@ -174,7 +175,7 @@ export default class Sitemapper {
}

/**
* Requests the URL and uses parseStringPromise to parse through and find the data
* Requests the URL and uses fast-xml-parser to parse through and find the data
*
* @private
* @param {string} [url] - the Sitemaps url (e.g https://wp.seantburke.com/sitemap.xml)
Expand Down Expand Up @@ -218,8 +219,10 @@ export default class Sitemapper {
responseBody = response.body;
}

// otherwise parse the XML that was returned.
const data = await parseStringPromise(responseBody);
// Parse XML using fast-xml-parser
const parser = new XMLParser();

const data = parser.parse(responseBody.toString());

// return the results
return { error: null, data };
Expand Down Expand Up @@ -312,26 +315,32 @@ export default class Sitemapper {
if (this.debug) {
console.debug(`Urlset found during "crawl('${url}')"`);
}
// filter out any urls that are older than the lastmod
const sites = data.urlset.url

// Convert single object to array if needed
const urlArray = Array.isArray(data.urlset.url)
? data.urlset.url
: [data.urlset.url];

// Begin filtering the urls
const sites = urlArray
.filter((site) => {
if (this.lastmod === 0) return true;
if (site.lastmod === undefined) return false;
const modified = new Date(site.lastmod[0]).getTime();
const modified = new Date(site.lastmod).getTime();

return modified >= this.lastmod;
})
.filter((site) => {
return !this.isExcluded(site.loc[0]);
return !this.isExcluded(site.loc);
})
.map((site) => {
if (!this.fields) {
return site.loc && site.loc[0];
return site.loc;
} else {
let fields = {};
for (const [field, active] of Object.entries(this.fields)) {
if (active && site[field]) {
fields[field] = site[field][0];
fields[field] = site[field];
}
}
return fields;
Expand All @@ -349,7 +358,7 @@ export default class Sitemapper {
}
// Map each child url into a promise to create an array of promises
const sitemap = data.sitemapindex.sitemap
.map((map) => map.loc && map.loc[0])
.map((map) => map.loc)
.filter((url) => {
return !this.isExcluded(url);
});
Expand Down Expand Up @@ -441,8 +450,8 @@ export default class Sitemapper {
* @param {Buffer} body - body of the gzipped file
* @returns {boolean}
*/
decompressResponseBody(body) {
return new Promise((resolve, reject) => {
async decompressResponseBody(body) {
return await new Promise((resolve, reject) => {
const buffer = Buffer.from(body);
zlib.gunzip(buffer, (err, result) => {
if (err) {
Expand Down Expand Up @@ -488,7 +497,7 @@ export default class Sitemapper {
*
* @typedef {Object} ParseData
*
* @property {Error} error that either comes from `parseStringPromise` or `got` or custom error
* @property {Error} error that either comes from fast-xml-parser or `got` or custom error
* @property {Object} data
* @property {string} data.url - URL of sitemap
* @property {Array} data.urlset - Array of returned URLs
Expand Down
4 changes: 2 additions & 2 deletions src/examples/index.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import Sitemapper from '../assets/sitemapper';

// URL to be crawled
const exampleURL = 'https://www.walmart.com/sitemap_topic.xml';
const exampleURL = 'https://wp.seantburke.com/sitemap.xml';

// Instantiate an instance
const sitemapper = new Sitemapper({
url: exampleURL, // url to crawl
debug: false, // don't show debug logs
debug: true, // show debug logs
timeout: 10000, // 10 seconds
concurrency: 10, // Number of maximum concurrent sitemap crawl threads
retries: 0, // Number of retry attempts in case of error response (e.g. 404 or timeout)
Expand Down

0 comments on commit 14f44f0

Please sign in to comment.