Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exclusions option to filter urls #148

Merged
merged 8 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sitemapper.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export interface SitemapperOptions {
timeout?: number;
url?: string;
fields?: {[name: string]: boolean};
exclusions?: RegExp[];
}

declare class Sitemapper {
Expand Down
27 changes: 23 additions & 4 deletions src/assets/sitemapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@ export default class Sitemapper {
* @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones.
* @params {lastmod} [options.lastmod] - the minimum lastmod value for urls
* @params {hpagent.HttpProxyAgent|hpagent.HttpsProxyAgent} [options.proxyAgent] - instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent to be passed to npm "got"
* @params {Array<RegExp>} [options.exclusions] - Array of regex patterns to exclude URLs
*
* @example let sitemap = new Sitemapper({
* url: 'https://wp.seantburke.com/sitemap.xml',
* timeout: 15000,
* lastmod: 1630693759
* lastmod: 1630693759,
* exclusions: [/foo.com/, /bar.xml/] // Filters out URLs matching these patterns
* });
*/
constructor(options) {
Expand All @@ -49,6 +51,7 @@ export default class Sitemapper {
settings.rejectUnauthorized === false ? false : true;
this.fields = settings.fields || false;
this.proxyAgent = settings.proxyAgent || {};
this.exclusions = settings.exclusions || [];
}

/**
Expand Down Expand Up @@ -319,6 +322,9 @@ export default class Sitemapper {

return modified >= this.lastmod;
})
.filter((site) => {
return !this.isExcluded(site.loc[0])
})
.map((site) => {
if( !this.fields) {
return site.loc && site.loc[0];
Expand All @@ -343,9 +349,11 @@ export default class Sitemapper {
console.debug(`Additional sitemap found during "crawl('${url}')"`);
}
// Map each child url into a promise to create an array of promises
const sitemap = data.sitemapindex.sitemap.map(
(map) => map.loc && map.loc[0]
);
const sitemap = data.sitemapindex.sitemap
.map((map) => map.loc && map.loc[0])
.filter((url) => {
return !this.isExcluded(url)
});

// Parse all child urls within the concurrency limit in the settings
const limit = pLimit(this.concurrency);
Expand Down Expand Up @@ -446,6 +454,17 @@ export default class Sitemapper {
});
});
}

/**
* Checks if a urls is excluded based on the exclusion patterns.
*
* @param {string} url - The URL to check.
* @returns {boolean} Returns true if the urls is excluded, false otherwise.
*/
isExcluded(url) {
if (this.exclusions.length === 0) return false;
return this.exclusions.some((pattern) => pattern.test(url));
}
}

/**
Expand Down
78 changes: 78 additions & 0 deletions src/tests/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -263,4 +263,82 @@ describe('Sitemapper', function () {
});
});
});

// Integration tests for the `exclusions` option. Both tests fetch a live
// sitemap over the network, hence the extended timeouts.
describe('exclusions option', function () {
  // Control case: confirms the URL that a later test excludes is present
  // when the exclusion patterns do not match it.
  it('should prevent false positive', function (done) {
    this.timeout(30000);
    const url = 'https://wp.seantburke.com/sitemap.xml';
    // exclude video and image sitemap index urls
    sitemapper.exclusions = [/video/, /image/];
    sitemapper.fetch(url)
      .then(data => {
        // Assertions are invoked as methods, matching the `isExcluded method`
        // suite; a bare property reference would not assert anything.
        data.sites.should.be.Array();
        data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.true();
        done();
      })
      .catch(error => {
        console.error('Test failed');
        done(error);
      });
  });

  it('should filter out page_id urls', function (done) {
    this.timeout(30000);
    const url = 'https://wp.seantburke.com/sitemap.xml';
    // exclude every url containing "page_id" (including ?page_id=2)
    sitemapper.exclusions = [/page_id/];
    sitemapper.fetch(url)
      .then(data => {
        data.sites.should.be.Array();
        data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.false();
        done();
      })
      .catch(error => {
        console.error('Test failed');
        done(error);
      });
  });
});

// Unit tests for Sitemapper#isExcluded. Each test assigns the patterns it
// needs to `sitemapper.exclusions` before calling the method.
describe('isExcluded method', function () {
  // NOTE(review): relies on `sitemapper.exclusions` not matching this url at
  // this point (presumably a fresh instance per test) — confirm against the
  // suite's setup hook.
  it('should return false when no exclusions are set', function () {
    const result = sitemapper.isExcluded('https://foo.com/page1');
    result.should.be.false();
  });

  it('should return false when url does not match any exclusion patterns', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/];
    const result = sitemapper.isExcluded('https://foo.com/page1');
    result.should.be.false();
  });

  // Title corrected: a matching url is excluded, so the expected result is true.
  it('should return true when url matches an exclusion pattern', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/];
    const result = sitemapper.isExcluded('https://foo.com/document.pdf');
    result.should.be.true();
  });

  it('should return true when url matches any of multiple exclusion patterns', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/, /temp/];
    const result = sitemapper.isExcluded('https://foo.com/private/temp.html');
    result.should.be.true();
  });

  it('should handle complex regex patterns correctly', function () {
    // Anchored pattern: a two-letter path segment followed by /private.
    sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/];
    const result1 = sitemapper.isExcluded('https://foo.com/en/private/page');
    const result2 = sitemapper.isExcluded('https://foo.com/en/public/page');
    result1.should.be.true();
    result2.should.be.false();
  });

  it('should handle case sensitivity correctly', function () {
    // The `i` flag makes the pattern match regardless of letter case.
    sitemapper.exclusions = [/private/i];
    const result1 = sitemapper.isExcluded('https://foo.com/PRIVATE/page');
    const result2 = sitemapper.isExcluded('https://foo.com/Private/page');
    result1.should.be.true();
    result2.should.be.true();
  });
});
});