From 2b0894af97f762a8a0f8e23c580861ecf8433664 Mon Sep 17 00:00:00 2001 From: Adam Chapman Date: Thu, 24 Oct 2024 17:02:55 +1100 Subject: [PATCH] Exclusions option to filter urls (#148) --- sitemapper.d.ts | 1 + src/assets/sitemapper.js | 27 +++++++++++--- src/tests/test.js | 78 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/sitemapper.d.ts b/sitemapper.d.ts index 67dc261..6e6105b 100644 --- a/sitemapper.d.ts +++ b/sitemapper.d.ts @@ -20,6 +20,7 @@ export interface SitemapperOptions { timeout?: number; url?: string; fields?: {[name: string]: boolean}; + exclusions?: RegExp[]; } declare class Sitemapper { diff --git a/src/assets/sitemapper.js b/src/assets/sitemapper.js index 811f443..9b32939 100644 --- a/src/assets/sitemapper.js +++ b/src/assets/sitemapper.js @@ -28,11 +28,13 @@ export default class Sitemapper { * @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones. * @params {lastmod} [options.lastmod] - the minimum lastmod value for urls * @params {hpagent.HttpProxyAgent|hpagent.HttpsProxyAgent} [options.proxyAgent] - instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent to be passed to npm "got" + * @params {Array} [options.exclusions] - Array of regex patterns to exclude URLs * * @example let sitemap = new Sitemapper({ * url: 'https://wp.seantburke.com/sitemap.xml', * timeout: 15000, - * lastmod: 1630693759 + * lastmod: 1630693759, + * exclusions: [/foo.com/, /bar.xml/] // Filters out URLs matching these patterns * }); */ constructor(options) { @@ -49,6 +51,7 @@ export default class Sitemapper { settings.rejectUnauthorized === false ? 
false : true; this.fields = settings.fields || false; this.proxyAgent = settings.proxyAgent || {}; + this.exclusions = settings.exclusions || []; } /** @@ -319,6 +322,9 @@ export default class Sitemapper { return modified >= this.lastmod; }) + .filter((site) => { + return !this.isExcluded(site.loc[0]) + }) .map((site) => { if( !this.fields) { return site.loc && site.loc[0]; @@ -343,9 +349,11 @@ export default class Sitemapper { console.debug(`Additional sitemap found during "crawl('${url}')"`); } // Map each child url into a promise to create an array of promises - const sitemap = data.sitemapindex.sitemap.map( - (map) => map.loc && map.loc[0] - ); + const sitemap = data.sitemapindex.sitemap + .map((map) => map.loc && map.loc[0]) + .filter((url) => { + return !this.isExcluded(url) + }); // Parse all child urls within the concurrency limit in the settings const limit = pLimit(this.concurrency); @@ -446,6 +454,17 @@ export default class Sitemapper { }); }); } + + /** + * Checks if a URL is excluded based on the exclusion patterns. + * + * @param {string} url - The URL to check. + * @returns {boolean} Returns true if the URL is excluded, false otherwise. 
+ */ + isExcluded(url) { + if (this.exclusions.length === 0) return false; + return this.exclusions.some((pattern) => pattern.test(url)); + } } /** diff --git a/src/tests/test.js b/src/tests/test.js index 7c4d0ca..77f65c7 100644 --- a/src/tests/test.js +++ b/src/tests/test.js @@ -263,4 +263,82 @@ describe('Sitemapper', function () { }); }); }); + + describe('exclusions option', function () { + // check for the url that should be excluded in a later test + it('should prevent false positive', function (done) { + this.timeout(30000); + const url = 'https://wp.seantburke.com/sitemap.xml'; + // exclude video and image sitemap index urls + sitemapper.exclusions = [/video/,/image/] + sitemapper.fetch(url) + .then(data => { + data.sites.should.be.Array; + data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.true + done(); + }) + .catch(error => { + console.error('Test failed'); + done(error); + }); + }); + + it('should filter out page_id urls', function (done) { + this.timeout(30000); + const url = 'https://wp.seantburke.com/sitemap.xml'; + // exclude page_id=2 + sitemapper.exclusions = [/page_id/] + sitemapper.fetch(url) + .then(data => { + data.sites.should.be.Array; + data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.false; + done(); + }) + .catch(error => { + console.error('Test failed'); + done(error); + }); + }); + }); + + describe('isExcluded method', function () { + it('should return false when no exclusions are set', function () { + const result = sitemapper.isExcluded('https://foo.com/page1'); + result.should.be.false(); + }); + + it('should return false when url does not match any exclusion patterns', function () { + sitemapper.exclusions = [/\.pdf$/, /private/]; + const result = sitemapper.isExcluded('https://foo.com/page1'); + result.should.be.false(); + }); + + it('should return true when url matches an exclusion pattern', function () { + sitemapper.exclusions = [/\.pdf$/, /private/]; + const result = 
sitemapper.isExcluded('https://foo.com/document.pdf'); + result.should.be.true(); + }); + + it('should return true when url matches any of multiple exclusion patterns', function () { + sitemapper.exclusions = [/\.pdf$/, /private/, /temp/]; + const result = sitemapper.isExcluded('https://foo.com/private/temp.html'); + result.should.be.true(); + }); + + it('should handle complex regex patterns correctly', function () { + sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/] + const result1 = sitemapper.isExcluded('https://foo.com/en/private/page'); + const result2 = sitemapper.isExcluded('https://foo.com/en/public/page'); + result1.should.be.true(); + result2.should.be.false(); + }); + + it('should handle case sensitivity correctly', function () { + sitemapper.exclusions = [/private/i]; + const result1 = sitemapper.isExcluded('https://foo.com/PRIVATE/page'); + const result2 = sitemapper.isExcluded('https://foo.com/Private/page'); + result1.should.be.true(); + result2.should.be.true(); + }); + }); });