Skip to content

Commit

Permalink
perf: rework decoder interface (#22)
Browse files Browse the repository at this point in the history
The updated decoder interface reads codepoints directly from a reader
rather than being driven one byte at a time as a state machine. This turns
out to be considerably more efficient than the previous implementation,
reducing wall time by a little over 25% on the `token_reader` and `reader` benchmarks:

```
Benchmark 1 (27 runs): zig-out/bin-old/token_reader Gtk-4.0.gir
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           188ms ± 14.5ms     168ms …  205ms          0 ( 0%)        0%
  peak_rss           7.31MB ± 58.5KB    7.21MB … 7.34MB          0 ( 0%)        0%
  cpu_cycles          688M  ± 4.20M      684M  …  706M           1 ( 4%)        0%
  instructions       1.19G  ± 29.4      1.19G  … 1.19G           0 ( 0%)        0%
  cache_references    412K  ±  763K      239K  … 4.21M           2 ( 7%)        0%
  cache_misses       10.0K  ± 7.40K     7.90K  … 46.8K           2 ( 7%)        0%
  branch_misses       814K  ± 1.37K      813K  …  821K           1 ( 4%)        0%
Benchmark 2 (37 runs): zig-out/bin/token_reader Gtk-4.0.gir
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           136ms ± 13.8ms     115ms …  147ms          0 ( 0%)        ⚡- 27.7% ±  3.8%
  peak_rss           7.31MB ± 54.7KB    7.21MB … 7.34MB          8 (22%)          +  0.1% ±  0.4%
  cpu_cycles          462M  ± 1.87M      459M  …  466M           0 ( 0%)        ⚡- 32.8% ±  0.2%
  instructions       1.14G  ± 26.6      1.14G  … 1.14G           0 ( 0%)        ⚡-  4.1% ±  0.0%
  cache_references    236K  ± 4.86K      227K  …  244K           0 ( 0%)          - 42.7% ± 60.7%
  cache_misses       9.40K  ± 1.25K     7.88K  … 11.5K           0 ( 0%)          -  6.5% ± 24.6%
  branch_misses       815K  ± 1.01K      813K  …  817K           0 ( 0%)          +  0.1% ±  0.1%
```

```
Benchmark 1 (23 runs): zig-out/bin-old/reader Gtk-4.0.gir
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           225ms ± 14.2ms     199ms …  249ms          0 ( 0%)        0%
  peak_rss           7.25MB ±  100KB    7.08MB … 7.34MB          0 ( 0%)        0%
  cpu_cycles          823M  ± 12.2M      813M  …  847M           0 ( 0%)        0%
  instructions       1.43G  ± 23.0      1.43G  … 1.43G           0 ( 0%)        0%
  cache_references    757K  ±  129K      635K  … 1.07M           1 ( 4%)        0%
  cache_misses       13.7K  ± 1.18K     12.5K  … 17.2K           2 ( 9%)        0%
  branch_misses      1.43M  ± 3.35K     1.42M  … 1.43M           0 ( 0%)        0%
Benchmark 2 (31 runs): zig-out/bin/reader Gtk-4.0.gir
  measurement          mean ± σ            min … max           outliers         delta
  wall_time           166ms ± 13.9ms     144ms …  175ms          0 ( 0%)        ⚡- 26.5% ±  3.4%
  peak_rss           7.27MB ± 81.8KB    7.08MB … 7.34MB          0 ( 0%)          +  0.3% ±  0.7%
  cpu_cycles          581M  ± 1.54M      579M  …  584M           0 ( 0%)        ⚡- 29.4% ±  0.5%
  instructions       1.38G  ± 16.0      1.38G  … 1.38G           9 (29%)        ⚡-  3.8% ±  0.0%
  cache_references    715K  ±  219K      563K  … 1.71M           3 (10%)          -  5.5% ± 13.6%
  cache_misses       13.5K  ± 1.31K     11.4K  … 16.5K           2 ( 6%)          -  1.2% ±  5.1%
  branch_misses      1.07M  ± 20.3K     1.05M  … 1.11M           5 (16%)        ⚡- 25.3% ±  0.6%
```
  • Loading branch information
ianprime0509 authored Oct 15, 2023
1 parent ea0fcb1 commit 9c6389d
Show file tree
Hide file tree
Showing 6 changed files with 311 additions and 312 deletions.
84 changes: 70 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,23 +123,79 @@ parsing through an entire XML file without doing any additional processing. The
XML file is loaded completely into memory first, then the parser is executed on
it until it completes.

Below are some benchmarking results as of commit
`e9809855f7ee3403efa1fdc5f9010182f47361d0`, as performed on my laptop. The
results were obtained by executing [poop](https://github.com/andrewrk/poop) on
the benchmark implementations.
Below are some benchmarking results as of August 14, 2023, measured on my
laptop using Zig `0.12.0-dev.906+2d7d037c4`. The results were produced by
running [poop](https://github.com/andrewrk/poop) on the benchmark
implementations.

### GTK 4 GIR

This is a 7.6MiB XML file containing GObject introspection metadata for GTK 4.

| Implementation | Execution time | Memory usage |
| -------------------------- | --------------- | --------------- |
| zig-xml (`Reader`) | 242ms ± 5.50ms | 9.12MB ± 66.5KB |
| zig-xml (`TokenReader`) | 169ms ± 13.4ms | 9.07MB ± 97.9KB |
| zig-xml (`Scanner`) | 40.2ms ± 2.25ms | 9.09MB ± 97.0KB |
| libxml2 (`xmlreader.h`) | 74.0ms ± 3.16ms | 10.4MB ± 104KB |
| mxml (`mxmlSAXLoadString`) | 97.1ms ± 1.63ms | 9.12MB ± 64.9KB |
| yxml | 36.2ms ± 999us | 9.09MB ± 92.3KB |
This is a 5.7MB XML file containing GObject introspection metadata for GTK 4. In
the output below, libxml2 is used as the baseline. The three benchmarks
`reader`, `token_reader`, and `scanner` test the three APIs provided by this
library, and the mxml and yxml libraries are also included for comparison.

```
Benchmark 1 (78 runs): zig-out/bin/libxml2 Gtk-4.0.gir
measurement mean ± σ min … max outliers delta
wall_time 64.2ms ± 1.87ms 55.5ms … 70.1ms 4 ( 5%) 0%
peak_rss 14.6MB ± 76.4KB 14.4MB … 14.7MB 0 ( 0%) 0%
cpu_cycles 196M ± 1.03M 194M … 200M 3 ( 4%) 0%
instructions 409M ± 43.1 409M … 409M 0 ( 0%) 0%
cache_references 5.44M ± 325K 5.08M … 6.97M 5 ( 6%) 0%
cache_misses 66.0K ± 5.36K 55.0K … 91.0K 3 ( 4%) 0%
branch_misses 874K ± 3.80K 868K … 890K 1 ( 1%) 0%
Benchmark 2 (30 runs): zig-out/bin/reader Gtk-4.0.gir
measurement mean ± σ min … max outliers delta
wall_time 170ms ± 1.59ms 167ms … 173ms 0 ( 0%) 💩+164.2% ± 1.2%
peak_rss 7.29MB ± 73.8KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.0% ± 0.2%
cpu_cycles 583M ± 2.88M 579M … 590M 0 ( 0%) 💩+196.9% ± 0.4%
instructions 1.38G ± 32.2 1.38G … 1.38G 0 ( 0%) 💩+237.2% ± 0.0%
cache_references 751K ± 135K 580K … 1.12M 0 ( 0%) ⚡- 86.2% ± 2.2%
cache_misses 17.5K ± 5.41K 12.9K … 34.5K 3 (10%) ⚡- 73.5% ± 3.5%
branch_misses 1.06M ± 10.9K 1.05M … 1.11M 2 ( 7%) 💩+ 21.5% ± 0.3%
Benchmark 3 (38 runs): zig-out/bin/token_reader Gtk-4.0.gir
measurement mean ± σ min … max outliers delta
wall_time 135ms ± 1.59ms 132ms … 138ms 0 ( 0%) 💩+110.4% ± 1.1%
peak_rss 7.31MB ± 54.2KB 7.21MB … 7.34MB 8 (21%) ⚡- 49.8% ± 0.2%
cpu_cycles 462M ± 2.20M 459M … 467M 0 ( 0%) 💩+135.5% ± 0.3%
instructions 1.14G ± 21.0 1.14G … 1.14G 0 ( 0%) 💩+179.9% ± 0.0%
cache_references 237K ± 7.40K 225K … 255K 0 ( 0%) ⚡- 95.6% ± 1.9%
cache_misses 10.1K ± 1.29K 8.16K … 13.2K 0 ( 0%) ⚡- 84.8% ± 2.7%
branch_misses 815K ± 919 813K … 816K 3 ( 8%) ⚡- 6.8% ± 0.1%
Benchmark 4 (103 runs): zig-out/bin/scanner Gtk-4.0.gir
measurement mean ± σ min … max outliers delta
wall_time 48.6ms ± 1.82ms 45.8ms … 55.2ms 4 ( 4%) ⚡- 24.3% ± 0.8%
peak_rss 7.27MB ± 87.8KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.1% ± 0.2%
cpu_cycles 152M ± 3.48M 151M … 177M 5 ( 5%) ⚡- 22.4% ± 0.4%
instructions 472M ± 19.9 472M … 472M 0 ( 0%) 💩+ 15.6% ± 0.0%
cache_references 209K ± 1.80K 207K … 222K 4 ( 4%) ⚡- 96.2% ± 1.2%
cache_misses 7.95K ± 179 7.59K … 8.50K 0 ( 0%) ⚡- 88.0% ± 1.6%
branch_misses 511K ± 874 510K … 518K 13 (13%) ⚡- 41.6% ± 0.1%
Benchmark 5 (63 runs): zig-out/bin/mxml Gtk-4.0.gir
measurement mean ± σ min … max outliers delta
wall_time 80.2ms ± 2.44ms 76.0ms … 87.9ms 3 ( 5%) 💩+ 24.9% ± 1.1%
peak_rss 7.44MB ± 56.3KB 7.34MB … 7.47MB 15 (24%) ⚡- 48.9% ± 0.2%
cpu_cycles 262M ± 2.95M 258M … 281M 1 ( 2%) 💩+ 33.4% ± 0.4%
instructions 762M ± 56.7K 762M … 762M 3 ( 5%) 💩+ 86.4% ± 0.0%
cache_references 401K ± 473K 272K … 3.08M 10 (16%) ⚡- 92.6% ± 2.4%
cache_misses 14.2K ± 2.62K 12.0K … 31.1K 2 ( 3%) ⚡- 78.5% ± 2.2%
branch_misses 1.02M ± 99.5K 998K … 1.79M 4 ( 6%) 💩+ 16.3% ± 2.5%
Benchmark 6 (196 runs): zig-out/bin/yxml Gtk-4.0.gir
measurement mean ± σ min … max outliers delta
wall_time 25.4ms ± 1.03ms 23.9ms … 34.3ms 3 ( 2%) ⚡- 60.4% ± 0.5%
peak_rss 7.29MB ± 77.0KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.0% ± 0.1%
cpu_cycles 71.0M ± 1.03M 70.5M … 84.2M 5 ( 3%) ⚡- 63.8% ± 0.1%
instructions 236M ± 20.1 236M … 236M 0 ( 0%) ⚡- 42.2% ± 0.0%
cache_references 202K ± 805 201K … 210K 7 ( 4%) ⚡- 96.3% ± 0.8%
cache_misses 8.00K ± 215 7.64K … 9.57K 4 ( 2%) ⚡- 87.9% ± 1.1%
branch_misses 239K ± 787 238K … 248K 21 (11%) ⚡- 72.7% ± 0.1%
```

## License

Expand Down
1 change: 1 addition & 0 deletions bench/build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ pub fn build(b: *Build) !void {
bench_reader.linkLibC();

const libxml2 = b.dependency("libxml2", .{
.optimize = .ReleaseFast,
.iconv = false,
.lzma = false,
.zlib = false,
Expand Down
11 changes: 7 additions & 4 deletions bench/src/scanner.zig
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
const std = @import("std");
const xml = @import("xml");

pub const main = @import("common.zig").main;

pub fn runBench(data: []const u8) !void {
var scanner = xml.Scanner{};
var data_stream = std.io.fixedBufferStream(data);
var decoder = xml.encoding.Utf8Decoder{};
for (data) |b| {
if (try decoder.next(b)) |c| {
_ = try scanner.next(c, 1);
}
var buf: [4]u8 = undefined;
while (true) {
const c = try decoder.readCodepoint(data_stream.reader(), &buf);
if (!c.present) break;
_ = try scanner.next(c.codepoint, c.byte_length);
}
}
20 changes: 6 additions & 14 deletions examples/scan.zig
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,19 @@ pub fn main() !void {

var line: usize = 1;
var column: usize = 1;
read: while (true) {
var codepoint_bytes: usize = 0;
const c = while (true) {
const b = input_reader.readByte() catch |e| switch (e) {
error.EndOfStream => break :read,
else => |other| return other,
};
codepoint_bytes += 1;
if (try decoder.next(b)) |codepoint| {
break codepoint;
}
};
const token = scanner.next(c, codepoint_bytes) catch |e| {
while (true) {
var buf: [4]u8 = undefined;
const c = try decoder.readCodepoint(input_reader, &buf);
if (!c.present) break;
const token = scanner.next(c.codepoint, c.byte_length) catch |e| {
try stdout_buffered_writer.flush();
try stderr.print("error: {} ({}:{}): {}\n", .{ scanner.pos, line, column, e });
return;
};
if (token != .ok) {
try stdout.print("{} ({}:{}): {}\n", .{ scanner.pos, line, column, token });
}
if (c == '\n') {
if (c.codepoint == '\n') {
line += 1;
column = 1;
} else {
Expand Down
Loading

0 comments on commit 9c6389d

Please sign in to comment.