Skip to content

Commit 3ecbd9e

Browse files
authored
Merge pull request #149 from remusao/next
* Performance improvements in all methods of public API (up to x2 faster) * `extractHostname`: will now avoid lower-casing the result in some cases * `extractHostname`: handles single or triple '/' after protocol * `extractHostname`: has fast-path for validation of common protocols (e.g. https) * `isProbablyIpv4`: performs first quick check on length of hostname * `isProbablyIpv6`: performs first quick check on length of hostname * `isValidHostname`: make use of `charCodeAt` instead of `codePointAt` * `lookupInTrie`: makes use of Trie with more stable structure (faster) * `lookupInTrie`: lazily allocate memory for result * `suffixLookup`: uses fast-path for most common suffixes (massive speed-up) * `suffixLookup`: does not allocate memory for result anymore * `setDefaults`: fast-path in case no argument was provided * `getSubdomain`: fast-path if subdomain is empty * Add more options to fine-tune behavior and performance * `detectIp` allows to disable IP check * `mixedInput` allows to specify if we expect a mix of URLs and hostnames as input. If only hostnames are expected then `extractHostname` can be set to `false` to speed-up parsing. If only URLs are expected then `mixedInputs` can be set to `false`. The `mixedInputs` is only a hint and will not change the behavior of the library. * `validateHostname` can be set to `false` to disable validation and speed-up processing further. * Check that input is string before parsing * Fix support for reserved keywords in hostnames * Add tests and bring back coverage to 100% * Minified bundle is now also tested with the same suite * Migrate utils scripts from `bin/` folder to TypeScript * Add small `tldts` cli which can be used to parse URLs * Update README with more accurate information
2 parents dc10e28 + cd26b20 commit 3ecbd9e

31 files changed

+1432
-1750
lines changed

.editorconfig

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ insert_final_newline = true
77
charset = utf-8
88
trim_trailing_whitespace = true
99

10-
[*.{es,js,json,jsm,jsx}]
10+
[*.{es,js,json,jsm,jsx,ts}]
1111
indent_style = space
1212
indent_size = 2

.eslintrc

-15
This file was deleted.

.travis.yml

+4-15
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,18 @@
1-
sudo: false
2-
dist: trusty
1+
dist: xenial
32
language: node_js
43

54
node_js:
6-
- 6
7-
- 8
8-
- 9
5+
- lts/*
96
- node
107

11-
cache:
12-
directories:
13-
- node_modules
8+
cache: npm
149

1510
notifications:
1611
email:
1712
on_failure: change
1813

1914
before_install:
20-
- npm install -g npm@latest
21-
22-
install:
23-
- npm ci
24-
25-
script:
26-
- npm run test
15+
- npm install --global npm@latest
2716

2817
after_script:
2918
- cat ./coverage/lcov.info | ./node_modules/coveralls/bin/coveralls.js && rm -rf ./coverage

CHANGELOG.md

+35
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,41 @@
22

33
### Not Released
44

5+
### 5.0.0
6+
7+
*2019-05-23*
8+
9+
- Improvements in various areas [#149](https://github.com/remusao/tldts/pull/149)
10+
* Performance improvements in all methods of public API (up to x2 faster)
11+
* `extractHostname`: will now avoid lower-casing the result in some cases
12+
* `extractHostname`: handles single or triple '/' after protocol
13+
* `extractHostname`: has fast-path for validation of common protocols (e.g. https)
14+
* `isProbablyIpv4`: performs first quick check on length of hostname
15+
* `isProbablyIpv6`: performs first quick check on length of hostname
16+
* `isValidHostname`: make use of `charCodeAt` instead of `codePointAt`
17+
* `lookupInTrie`: makes use of Trie with more stable structure (faster)
18+
* `lookupInTrie`: lazily allocate memory for result
19+
* `suffixLookup`: uses fast-path for most common suffixes (massive speed-up)
20+
* `suffixLookup`: does not allocate memory for result anymore
21+
* `setDefaults`: fast-path in case no argument was provided
22+
* `getSubdomain`: fast-path if subdomain is empty
23+
* Add more options to fine-tune behavior and performance
24+
* `detectIp` allows disabling the IP check
25+
* `mixedInputs` allows specifying whether we expect a mix of URLs and hostnames as
26+
input. If only hostnames are expected then `extractHostname` can be set to
27+
`false` to speed-up parsing. If only URLs are expected then `mixedInputs`
28+
can be set to `false`. The `mixedInputs` is only a hint and will not
29+
change the behavior of the library.
30+
* `validateHostname` can be set to `false` to disable validation and
31+
speed-up processing further.
32+
* Check that input is string before parsing
33+
* Fix support for reserved keywords in hostnames
34+
* Add tests and bring back coverage to 100%
35+
* Minified bundle is now also tested with the same suite
36+
* Migrate utils scripts from `bin/` folder to TypeScript
37+
* Add small `tldts` cli which can be used to parse URLs
38+
* Update README with more accurate information
39+
540
### 4.0.6
641

742
*2019-04-15*

README.md

+32-13
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,23 @@
1-
# tldts - Hostname and Domain Parsing using Public Suffix Lists
1+
# tldts - Blazing Fast URL Parsing
22

33
[![NPM](https://nodei.co/npm/tldts.png?downloads=true&downloadRank=true)](https://nodei.co/npm/tldts/)
44

55
[![Build Status][badge-ci]](http://travis-ci.org/remusao/tldts) ![][badge-downloads]
66
![Coverage Status](https://coveralls.io/repos/github/remusao/tldts/badge.svg?branch=master)
77
[![Known Vulnerabilities](https://snyk.io/test/github/remusao/tldts/badge.svg?targetFile=package.json)](https://snyk.io/test/github/remusao/tldts?targetFile=package.json)
88

9-
10-
> `tldts` is a Typescript library to parse hostnames, domains, public suffixes, top-level domains and subdomains from URLs.
11-
9+
`tldts` is a JavaScript library to extract hostnames, domains, public suffixes, top-level domains and subdomains from URLs.
1210

1311
**Features**:
14-
1. **Fastest library** around (up to 2M operations per second, that's 3 orders of
15-
magnitude faster than the most popular library out there)
16-
2. Written in **TypeScript**, ships with `umd`, `esm`, `cjs` bundles and *type definitions*
12+
1. Tuned for **performance** (order of 0.1 to 1 μs per input)
13+
2. Handles both URLs and hostnames
1714
3. Full Unicode/IDNA support
18-
4. Support both ICANN and Private suffixes
19-
5. Ships with continuously updated version of the list: it works *out of the box*!
20-
6. Support parsing full URLs or hostnames
21-
7. Small bundles and small memory footprint
15+
4. Support parsing email addresses
16+
5. Detect IPv4 and IPv6 addresses
17+
6. Continuously updated version of the public suffix list
18+
7. **TypeScript**, ships with `umd`, `esm`, `cjs` bundles and *type definitions*
19+
8. Small bundles and small memory footprint
20+
9. Battle tested: full test coverage and production use
2221

2322
# Install
2423

@@ -29,7 +28,7 @@ npm install --save tldts
2928
# Usage
3029

3130
```js
32-
const tldts = require('tldts');
31+
const { parse } = require('tldts');
3332

3433
// Retrieving hostname-related information for a given URL
3534
parse('http://www.writethedocs.org/conf/eu/2017/');
@@ -42,6 +41,12 @@ parse('http://www.writethedocs.org/conf/eu/2017/');
4241
// subdomain: 'www' }
4342
```
4443

44+
Modern *ES6 modules import* is also supported:
45+
46+
```js
47+
import { parse } from 'tldts';
48+
```
49+
4550
# API
4651

4752
* `tldts.parse(url | hostname, options)`
@@ -51,7 +56,9 @@ parse('http://www.writethedocs.org/conf/eu/2017/');
5156
* `tldts.getSubdomain(url | hostname, options)`
5257

5358
The behavior of `tldts` can be customized using an `options` argument for all
54-
the functions exposed as part of the public API.
59+
the functions exposed as part of the public API. This is useful to both change
60+
the behavior of the library as well as fine-tune the performance depending on
61+
your inputs.
5562

5663
```js
5764
{
@@ -60,7 +67,19 @@ the functions exposed as part of the public API.
6067
// Use suffixes from Private section (default: false)
6168
allowPrivateDomains: boolean;
6269
// Extract and validate hostname (default: true)
70+
// When set to `false`, inputs will be considered valid hostnames.
6371
extractHostname: boolean;
72+
// Validate hostnames after parsing (default: true)
73+
// If a hostname is not valid, no further processing is performed. When set
74+
// to `false`, inputs to the library will be considered valid and parsing will
75+
// proceed regardless.
76+
validateHostname: boolean;
77+
// Perform IP address detection (default: true).
78+
detectIp: boolean;
79+
// Assume that both URLs and hostnames can be given as input (default: true)
80+
// If set to `false` we assume only URLs will be given as input, which
81+
// speeds up processing.
82+
mixedInputs: boolean;
6483
// Specifies extra valid suffixes (default: null)
6584
validHosts: string[] | null;
6685
}

bench/benchmark.js

100755100644
+64-39
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,13 @@
1-
#!/usr/bin/env node
2-
1+
const Benchmark = require('benchmark');
2+
const chalk = require('chalk');
33
const { URL } = require('url');
44
const fs = require('fs');
55
const path = require('path');
6-
7-
const tldtsExperimental = require(path.resolve(
8-
__dirname,
9-
'../dist/tldts-experimental.umd.min.js',
10-
));
11-
const tldtsDefault = require(path.resolve(
12-
__dirname,
13-
'../dist/tldts.umd.min.js',
14-
));
15-
16-
function bench(title, tldts, inputs) {
17-
console.log(`* Start: ${title}`);
18-
const t0 = Date.now();
19-
for (let i = 0; i < inputs.length; i += 1) {
20-
tldts.parse(inputs[i]);
21-
tldts.parse(inputs[i]);
22-
tldts.parse(inputs[i]);
23-
tldts.parse(inputs[i]);
24-
tldts.parse(inputs[i]);
25-
}
26-
const total = Date.now() - t0;
27-
console.log(` - ${total / 5} ms`);
28-
console.log(` - ${total / inputs.length / 5} ms per input`);
29-
console.log(
30-
` - ${Math.floor(1000 / (total / inputs.length / 5))} calls per second`,
31-
);
32-
}
6+
const tldts = require('..');
337

348
function main() {
35-
const urls = [
36-
...new Set(
9+
const urls = Array.from(
10+
new Set(
3711
fs
3812
.readFileSync(path.resolve(__dirname, './requests.json'), {
3913
encoding: 'utf-8',
@@ -42,16 +16,67 @@ function main() {
4216
.map(JSON.parse)
4317
.map(({ url }) => url),
4418
),
45-
];
46-
console.log('urls', urls.length);
19+
);
20+
const hostnames = Array.from(new Set(urls.map(url => new URL(url).hostname)));
21+
22+
function bench(name, args, fn) {
23+
const suite = new Benchmark.Suite();
24+
suite
25+
.add(name, () => fn(args))
26+
.on('cycle', event => {
27+
console.log(
28+
` + ${name} ${Math.floor(event.target.hz * args.length)} ops/second`,
29+
);
30+
})
31+
.run({ async: false });
32+
}
4733

48-
const hostnames = [...new Set(urls.map(url => new URL(url).hostname))];
49-
console.log('Hosts', hostnames.length);
34+
for (const method of [
35+
'parse',
36+
'getHostname',
37+
'getPublicSuffix',
38+
'getDomain',
39+
'getSubdomain',
40+
]) {
41+
console.log(`= ${chalk.bold(method)}`);
42+
const fn = tldts[method];
5043

51-
bench('tldts URLs', tldtsDefault, urls);
52-
bench('tldts-experimental URLs', tldtsExperimental, urls);
53-
bench('tldts hostnames', tldtsDefault, hostnames);
54-
bench('tldts-experimental hostnames', tldtsExperimental, hostnames);
44+
for (const options of [
45+
undefined, // defaults
46+
{ validateHostname: false },
47+
{ validateHostname: false, detectIp: false, mixedInputs: false },
48+
]) {
49+
bench(
50+
`#${chalk.bold(method)}(url, ${chalk.underline(
51+
JSON.stringify(options),
52+
)})`,
53+
urls,
54+
urls => {
55+
for (let i = 0; i < urls.length; i += 1) {
56+
fn(urls[i], options);
57+
}
58+
},
59+
);
60+
}
61+
62+
for (const options of [
63+
undefined, // defaults
64+
{ validateHostname: false },
65+
{ validateHostname: false, detectIp: false, extractHostname: false },
66+
]) {
67+
bench(
68+
`#${chalk.bold(method)}(hostname, ${chalk.underline(
69+
JSON.stringify(options),
70+
)})`,
71+
hostnames,
72+
hostnames => {
73+
for (let i = 0; i < hostnames.length; i += 1) {
74+
fn(hostnames[i], options);
75+
}
76+
},
77+
);
78+
}
79+
}
5580
}
5681

5782
main();

bin/builders/hashes.js bin/builders/hashes.ts

+19-15
Original file line numberDiff line numberDiff line change
@@ -84,32 +84,37 @@
8484
* 4. Hash can be computed on-the-fly from end to start without any string copy
8585
*/
8686

87-
const parse = require('../parser');
87+
import parse from '../parser';
8888

8989
/**
9090
* Compute 32 bits hash of `str` backward.
9191
*/
92-
function fastHash(str) {
93-
let hash = 5381;
92+
function fastHash(str: string): number {
93+
let hash: number = 5381;
9494
for (let j = str.length - 1; j >= 0; j -= 1) {
9595
hash = (hash * 33) ^ str.charCodeAt(j);
9696
}
9797
return hash >>> 0;
9898
}
9999

100+
interface IRules {
101+
icann: any[];
102+
priv: any[];
103+
}
104+
100105
/**
101106
* Build packed typed array given the raw public list as a string.
102107
*/
103-
module.exports = (body) => {
104-
const rules = {
108+
export default (body: string) => {
109+
const rules: IRules = {
105110
icann: [],
106111
priv: [],
107112
};
108-
const wildcards = {
113+
const wildcards: IRules = {
109114
icann: [],
110115
priv: [],
111116
};
112-
const exceptions = {
117+
const exceptions: IRules = {
113118
icann: [],
114119
priv: [],
115120
};
@@ -121,13 +126,7 @@ module.exports = (body) => {
121126
let maximumNumberOfLabels = 0;
122127

123128
// Iterate on public suffix rules
124-
parse(body, ({
125-
rule,
126-
isIcann,
127-
isException,
128-
isWildcard,
129-
isNormal,
130-
}) => {
129+
parse(body, ({ rule, isIcann, isException, isWildcard, isNormal }) => {
131130
// Select correct section to insert the rule
132131
let hashesPerLabels = null;
133132
if (isException) {
@@ -140,6 +139,10 @@ module.exports = (body) => {
140139
hashesPerLabels = isIcann ? rules.icann : rules.priv;
141140
}
142141

142+
if (hashesPerLabels === null) {
143+
return;
144+
}
145+
143146
// Count number of labels in this suffix
144147
const numberOfLabels = rule.split('.').length;
145148

@@ -164,7 +167,7 @@ module.exports = (body) => {
164167
});
165168

166169
// Pack everything together
167-
const chunks = [];
170+
const chunks: number[][] = [];
168171
const pushHashes = (hashes = []) => {
169172
chunks.push([
170173
hashes.length,
@@ -189,5 +192,6 @@ module.exports = (body) => {
189192
pushHashes(rules.priv[label]);
190193
}
191194

195+
// @ts-ignore
192196
return new Uint32Array([maximumNumberOfLabels, ...[].concat(...chunks)]);
193197
};

0 commit comments

Comments
 (0)