diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..98d0681
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,11 @@
+root = true
+
+[*]
+charset = utf-8
+end_of_line = lf
+indent_style = space
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+[*.xml]
+indent_size = 2
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a3ee6e5..cd6e3f8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- zig-version: [0.12.0, 0.13.0, master]
+ zig-version: [0.13.0, master]
steps:
- name: Checkout
uses: actions/checkout@v3
diff --git a/.gitignore b/.gitignore
index f33e3bd..d8c8979 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,2 @@
-bench/*.xml
-callgrind.out.*
-core*
-fuzz/outputs
-test/xmlconf
.zig-cache
-zig-cache
zig-out
diff --git a/README.md b/README.md
index 3abe6ee..88a5af4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# zig-xml
-zig-xml is an XML library for Zig, currently supporting Zig 0.12.0, 0.13.0, and
-the latest master at the time of writing.
+zig-xml is an XML library for Zig, currently supporting Zig 0.13.0 and the
+latest master at the time of writing.
See the documentation in the code for more information about the available APIs
(start in `xml.zig`). Autodocs are also published to GitHub Pages:
@@ -12,185 +12,26 @@ The library aims to conform with the following standards:
- [XML 1.0 Fifth Edition](https://www.w3.org/TR/2008/REC-xml-20081126/)
- [XML Namespaces 1.0 Third Edition](https://www.w3.org/TR/2009/REC-xml-names-20091208/)
-Other standards (such as XML 1.1 or XML 1.0 prior to the fifth edition) are only
-supported insofar as they are compatible with the above standards. In practice,
-this should not make much difference, since XML 1.1 is rarely used, and the
-differences between XML 1.0 editions are minor (the XML 1.0 fifth edition
-standard allows many more characters in names than previous editions, subsuming
-the
-[only non-harmful feature of XML 1.1](http://www.ibiblio.org/xml/books/effectivexml/chapters/03.html)).
-
-## Feature overview
-
-Key for the list:
+Currently, DTDs (DOCTYPE) are not supported, nor is any non-UTF-8 encoding.
-- ✅ Supported
-- 🚧 Partially supported
-- ❌ Unsupported, but planned
-- ❓️ Unsupported, maybe planned (long-term)
-- 👎️ Unsupported, not planned
-
-Features:
-
-- ✅ Streaming parser (three options are available, `Reader` is the most
- general-purpose but also the slowest)
- - ✅ Core XML 1.0 language minus `DOCTYPE`
- - ✅ Well-formedness checks not involving DTD (varying degrees of lesser
- support in `TokenReader` and `Scanner`)
- - ✅ End-of-line and attribute value normalization (in `Reader` and
- `TokenReader` only, optional)
- - ✅ Namespace support (in `Reader` only, optional)
- - 🚧 Detailed errors
- - 🚧 Source location tracking
- - ❌ `DOCTYPE` (just parsing, not doing anything with it)
- (https://github.com/ianprime0509/zig-xml/issues/9)
- - ❓️ Non-validating `DOCTYPE` handling (entity expansion, further attribute
- value normalization for non-`CDATA` types) (no external DTD content)
- - ❓️ Hooks for loading external DTD content
- - ❓️ XML 1.1
- - 👎️ Validation
-- 🚧 DOM parser (current `Node` abstraction is limited and read-only)
-- ✅ Unicode
- - ✅ UTF-8
- - ✅ UTF-16
- - ✅ UTF-8 vs UTF-16 auto-detection (`DefaultDecoder`)
- - ❌ US-ASCII (this is for support of US-ASCII as its own encoding; note that
- all ASCII can be treated as UTF-8)
- - ❌ ISO 8859-1
- - ❓️ Other encodings besides these
- - ✅ User-definable additional encodings (meaning even though this library
- doesn't provide other encodings out of the box, you can write them yourself)
-- 🚧 XML writer (https://github.com/ianprime0509/zig-xml/issues/10)
-- 👎️ XPath, XML Schema, other XML-related stuff
+Other standards (such as XML 1.1 or XML 1.0 prior to the fifth edition) are only
+supported insofar as they are compatible with the above standards.
## Examples
-See the `examples` directory (these examples are not very good right now but
-they do show how to use most of the library).
-
-Another ("real-world") example can be found in the zig-gobject project:
-https://github.com/ianprime0509/zig-gobject/blob/main/src/gir.zig
+A basic example of usage can be found in the `examples` directory, and can be
+built using `zig build install-examples`.
## Tests
-There are several tests in the project itself using the standard Zig test
-system. These tests can be run using `zig build test`.
-
-There is also a runner for the
-[W3C XML Conformance Test Suite](https://www.w3.org/XML/Test/) under
-`test/xmlconf.zig`. To build this runner as a standalone executable, run
-`zig build install-xmlconf`. If you download the 20130923 version of the test
-suite and place the `xmlconf` directory under `test`, you can also use
-`zig build run-xmlconf` to run all the test suites the runner can currently
-understand. The test suite files are not contained directly in this repository
-due to unclear licensing and file size (16MB uncompressed).
-
-At the time of writing, the library passes all the conformance tests it is able
-to run (353 of them); the other tests are skipped because they involve doctype
-in one way or another or are for XML standards which aren't supported (XML 1.1,
-editions of XML 1.0 besides the fifth edition).
-
-## Fuzzing
-
-This library has some basic support for fuzz testing, taking its basic method
-from the article
-[Fuzzing Zig Code Using AFL++](https://www.ryanliptak.com/blog/fuzzing-zig-code/).
-To start fuzzing, you will need
-[AFL++](https://github.com/AFLplusplus/AFLplusplus), specifically
-`afl-clang-lto` and `afl-fuzz`, in your path. Then, you can run
-`zig build fuzz`. To resume a prior fuzzing session, pass `-Dresume=true`.
-
-You can also run `zig build install-fuzz` to just build the fuzz executable and
-then run it with `afl-fuzz` separately.
-
-Finally, if any crashes are identified during fuzzing, they can be replayed by
-feeding the crash input back to `zig build fuzz-reproduce`, which will yield an
-error trace for further debugging.
-
-## Benchmarking and performance
-
-**TL;DR:** `Reader` and `TokenReader` are relatively slow compared to other
-popular libraries. `Scanner` is faster (on a similar level as yxml), but
-comparatively doesn't do very much.
-
-There is a benchmarking setup in the `bench` directory. The benchmark is for
-parsing through an entire XML file without doing any additional processing. The
-XML file is loaded completely into memory first, then the parser is executed on
-it until it completes.
-
-Below are some benchmarking results as of August 14, 2023, using Zig
-`0.12.0-dev.906+2d7d037c4`, as performed on my laptop. The results were obtained
-by executing [poop](https://github.com/andrewrk/poop) on the benchmark
-implementations.
-
-### GTK 4 GIR
-
-This is a 5.7MB XML file containing GObject introspection metadata for GTK 4. In
-the output below, libxml2 is used as the baseline. The three benchmarks
-`reader`, `token_reader`, and `scanner` test the three APIs provided by this
-library, and the mxml and yxml libraries are also included for comparison.
-
-```
-Benchmark 1 (78 runs): zig-out/bin/libxml2 Gtk-4.0.gir
- measurement mean ± σ min … max outliers delta
- wall_time 64.2ms ± 1.87ms 55.5ms … 70.1ms 4 ( 5%) 0%
- peak_rss 14.6MB ± 76.4KB 14.4MB … 14.7MB 0 ( 0%) 0%
- cpu_cycles 196M ± 1.03M 194M … 200M 3 ( 4%) 0%
- instructions 409M ± 43.1 409M … 409M 0 ( 0%) 0%
- cache_references 5.44M ± 325K 5.08M … 6.97M 5 ( 6%) 0%
- cache_misses 66.0K ± 5.36K 55.0K … 91.0K 3 ( 4%) 0%
- branch_misses 874K ± 3.80K 868K … 890K 1 ( 1%) 0%
-
-Benchmark 2 (30 runs): zig-out/bin/reader Gtk-4.0.gir
- measurement mean ± σ min … max outliers delta
- wall_time 170ms ± 1.59ms 167ms … 173ms 0 ( 0%) 💩+164.2% ± 1.2%
- peak_rss 7.29MB ± 73.8KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.0% ± 0.2%
- cpu_cycles 583M ± 2.88M 579M … 590M 0 ( 0%) 💩+196.9% ± 0.4%
- instructions 1.38G ± 32.2 1.38G … 1.38G 0 ( 0%) 💩+237.2% ± 0.0%
- cache_references 751K ± 135K 580K … 1.12M 0 ( 0%) ⚡- 86.2% ± 2.2%
- cache_misses 17.5K ± 5.41K 12.9K … 34.5K 3 (10%) ⚡- 73.5% ± 3.5%
- branch_misses 1.06M ± 10.9K 1.05M … 1.11M 2 ( 7%) 💩+ 21.5% ± 0.3%
-
-Benchmark 3 (38 runs): zig-out/bin/token_reader Gtk-4.0.gir
- measurement mean ± σ min … max outliers delta
- wall_time 135ms ± 1.59ms 132ms … 138ms 0 ( 0%) 💩+110.4% ± 1.1%
- peak_rss 7.31MB ± 54.2KB 7.21MB … 7.34MB 8 (21%) ⚡- 49.8% ± 0.2%
- cpu_cycles 462M ± 2.20M 459M … 467M 0 ( 0%) 💩+135.5% ± 0.3%
- instructions 1.14G ± 21.0 1.14G … 1.14G 0 ( 0%) 💩+179.9% ± 0.0%
- cache_references 237K ± 7.40K 225K … 255K 0 ( 0%) ⚡- 95.6% ± 1.9%
- cache_misses 10.1K ± 1.29K 8.16K … 13.2K 0 ( 0%) ⚡- 84.8% ± 2.7%
- branch_misses 815K ± 919 813K … 816K 3 ( 8%) ⚡- 6.8% ± 0.1%
-
-Benchmark 4 (103 runs): zig-out/bin/scanner Gtk-4.0.gir
- measurement mean ± σ min … max outliers delta
- wall_time 48.6ms ± 1.82ms 45.8ms … 55.2ms 4 ( 4%) ⚡- 24.3% ± 0.8%
- peak_rss 7.27MB ± 87.8KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.1% ± 0.2%
- cpu_cycles 152M ± 3.48M 151M … 177M 5 ( 5%) ⚡- 22.4% ± 0.4%
- instructions 472M ± 19.9 472M … 472M 0 ( 0%) 💩+ 15.6% ± 0.0%
- cache_references 209K ± 1.80K 207K … 222K 4 ( 4%) ⚡- 96.2% ± 1.2%
- cache_misses 7.95K ± 179 7.59K … 8.50K 0 ( 0%) ⚡- 88.0% ± 1.6%
- branch_misses 511K ± 874 510K … 518K 13 (13%) ⚡- 41.6% ± 0.1%
-
-Benchmark 5 (63 runs): zig-out/bin/mxml Gtk-4.0.gir
- measurement mean ± σ min … max outliers delta
- wall_time 80.2ms ± 2.44ms 76.0ms … 87.9ms 3 ( 5%) 💩+ 24.9% ± 1.1%
- peak_rss 7.44MB ± 56.3KB 7.34MB … 7.47MB 15 (24%) ⚡- 48.9% ± 0.2%
- cpu_cycles 262M ± 2.95M 258M … 281M 1 ( 2%) 💩+ 33.4% ± 0.4%
- instructions 762M ± 56.7K 762M … 762M 3 ( 5%) 💩+ 86.4% ± 0.0%
- cache_references 401K ± 473K 272K … 3.08M 10 (16%) ⚡- 92.6% ± 2.4%
- cache_misses 14.2K ± 2.62K 12.0K … 31.1K 2 ( 3%) ⚡- 78.5% ± 2.2%
- branch_misses 1.02M ± 99.5K 998K … 1.79M 4 ( 6%) 💩+ 16.3% ± 2.5%
+The library has several tests of its own, which can be run using `zig build test`.
-Benchmark 6 (196 runs): zig-out/bin/yxml Gtk-4.0.gir
- measurement mean ± σ min … max outliers delta
- wall_time 25.4ms ± 1.03ms 23.9ms … 34.3ms 3 ( 2%) ⚡- 60.4% ± 0.5%
- peak_rss 7.29MB ± 77.0KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.0% ± 0.1%
- cpu_cycles 71.0M ± 1.03M 70.5M … 84.2M 5 ( 3%) ⚡- 63.8% ± 0.1%
- instructions 236M ± 20.1 236M … 236M 0 ( 0%) ⚡- 42.2% ± 0.0%
- cache_references 202K ± 805 201K … 210K 7 ( 4%) ⚡- 96.3% ± 0.8%
- cache_misses 8.00K ± 215 7.64K … 9.57K 4 ( 2%) ⚡- 87.9% ± 1.1%
- branch_misses 239K ± 787 238K … 248K 21 (11%) ⚡- 72.7% ± 0.1%
-```
+The `xmlconf` directory additionally contains a runner for the [W3C XML
+Conformance Test Suite](https://www.w3.org/XML/Test/). Running `zig build test`
+in that directory will fetch the test suite distribution tarball and run the
+tests within. Due to features missing in the current parser implementation (DTD
+support), many tests are currently skipped. At the time of writing, 250 tests
+pass, and 924 are skipped due to unsupported features.
## License
diff --git a/bench/build.zig b/bench/build.zig
index 2be01c6..dd8058d 100644
--- a/bench/build.zig
+++ b/bench/build.zig
@@ -5,14 +5,6 @@ const Step = Build.Step;
pub fn build(b: *Build) !void {
const xml = b.dependency("xml", .{}).module("xml");
- const bench_scanner = addBench(b, "scanner");
- bench_scanner.root_module.addImport("xml", xml);
- bench_scanner.linkLibC();
-
- const bench_token_reader = addBench(b, "token_reader");
- bench_token_reader.root_module.addImport("xml", xml);
- bench_token_reader.linkLibC();
-
const bench_reader = addBench(b, "reader");
bench_reader.root_module.addImport("xml", xml);
bench_reader.linkLibC();
diff --git a/bench/build.zig.zon b/bench/build.zig.zon
index 50bfb4f..bfdc142 100644
--- a/bench/build.zig.zon
+++ b/bench/build.zig.zon
@@ -12,8 +12,8 @@
.path = "..",
},
.libxml2 = .{
- .url = "git+https://github.com/ianprime0509/zig-libxml2#9a88110c7ea7a541cb6ead6a648c69a8fc929141",
- .hash = "1220b556b7c193580caa53db7e95ad31c0ac589af8adcd894594b93dd1f7875b9405",
+ .url = "git+https://github.com/ianprime0509/zig-libxml2?ref=main#6cebb963e0ad5789825eb2333a4d21fab8f35a92",
+ .hash = "12200f672ceb8df0c715a7018e5c53ad434db17f900c620e6238f178cc9a9d80b88e",
},
.mxml = .{
.url = "git+https://github.com/michaelrsweet/mxml.git#809204a3051607f54b57e2950f3a5520d79ae383",
diff --git a/bench/src/reader.zig b/bench/src/reader.zig
index 8a82fbb..dedbae3 100644
--- a/bench/src/reader.zig
+++ b/bench/src/reader.zig
@@ -4,10 +4,8 @@ const xml = @import("xml");
pub const main = @import("common.zig").main;
pub fn runBench(data: []const u8) !void {
- var data_stream = std.io.fixedBufferStream(data);
- var reader = xml.reader(std.heap.c_allocator, data_stream.reader(), .{
- .DecoderType = xml.encoding.Utf8Decoder,
- });
+ var doc = xml.StaticDocument.init(data);
+ var reader = doc.reader(std.heap.c_allocator, .{});
defer reader.deinit();
- while (try reader.next()) |_| {}
+ while (try reader.read() != .eof) {}
}
diff --git a/bench/src/scanner.zig b/bench/src/scanner.zig
deleted file mode 100644
index 933f168..0000000
--- a/bench/src/scanner.zig
+++ /dev/null
@@ -1,16 +0,0 @@
-const std = @import("std");
-const xml = @import("xml");
-
-pub const main = @import("common.zig").main;
-
-pub fn runBench(data: []const u8) !void {
- var scanner = xml.Scanner{};
- var data_stream = std.io.fixedBufferStream(data);
- var decoder = xml.encoding.Utf8Decoder{};
- var buf: [4]u8 = undefined;
- while (true) {
- const c = try decoder.readCodepoint(data_stream.reader(), &buf);
- if (!c.present) break;
- _ = try scanner.next(c.codepoint, c.byte_length);
- }
-}
diff --git a/bench/src/token_reader.zig b/bench/src/token_reader.zig
deleted file mode 100644
index 8858949..0000000
--- a/bench/src/token_reader.zig
+++ /dev/null
@@ -1,15 +0,0 @@
-const std = @import("std");
-const xml = @import("xml");
-
-pub const main = @import("common.zig").main;
-
-pub fn runBench(data: []const u8) !void {
- var data_stream = std.io.fixedBufferStream(data);
- var token_reader = xml.tokenReader(data_stream.reader(), .{
- .DecoderType = xml.encoding.Utf8Decoder,
- });
- while (true) {
- const token = try token_reader.next();
- if (token == .eof) break;
- }
-}
diff --git a/build.zig b/build.zig
index bfe58f4..55fdf6b 100644
--- a/build.zig
+++ b/build.zig
@@ -8,173 +8,40 @@ pub fn build(b: *Build) void {
const xml = b.addModule("xml", .{
.root_source_file = b.path("src/xml.zig"),
- });
-
- addTests(b, target, optimize, xml);
- addDocs(b, target);
- addExamples(b, target, optimize, xml);
- addFuzz(b, target, xml);
-}
-
-fn addTests(b: *Build, target: Build.ResolvedTarget, optimize: Mode, xml: *Build.Module) void {
- const main_tests = b.addTest(.{
- .root_source_file = b.path("src/xml.zig"),
.target = target,
.optimize = optimize,
});
- const run_main_tests = b.addRunArtifact(main_tests);
-
- const test_step = b.step("test", "Run library tests");
- test_step.dependOn(&run_main_tests.step);
-
- const xmlconf_exe = b.addExecutable(.{
- .name = "xmlconf",
- .root_source_file = b.path("test/xmlconf.zig"),
+ const test_step = b.step("test", "Run the tests");
+ const xml_test = b.addTest(.{
+ .root_source_file = b.path("src/xml.zig"),
.target = target,
- .optimize = optimize,
- });
- xmlconf_exe.root_module.addImport("xml", xml);
-
- const install_xmlconf_step = b.step("install-xmlconf", "Install xmlconf test runner");
- install_xmlconf_step.dependOn(&b.addInstallArtifact(xmlconf_exe, .{}).step);
-
- const run_xmlconf_exe = b.addRunArtifact(xmlconf_exe);
- if (b.args) |args| {
- run_xmlconf_exe.addArgs(args);
- }
- // Since we can't yet handle doctypes, the test files need to be specified
- // individually
- run_xmlconf_exe.addArgs(&.{
- "test/xmlconf/eduni/errata-2e/errata2e.xml",
- "test/xmlconf/eduni/errata-3e/errata3e.xml",
- "test/xmlconf/eduni/errata-4e/errata4e.xml",
- "test/xmlconf/eduni/misc/ht-bh.xml",
- "test/xmlconf/eduni/namespaces/1.0/rmt-ns10.xml",
- "test/xmlconf/eduni/namespaces/1.1/rmt-ns11.xml",
- "test/xmlconf/eduni/namespaces/errata-1e/errata1e.xml",
- "test/xmlconf/eduni/xml-1.1/xml11.xml",
- "test/xmlconf/ibm/ibm_oasis_invalid.xml",
- "test/xmlconf/ibm/ibm_oasis_not-wf.xml",
- "test/xmlconf/ibm/ibm_oasis_valid.xml",
- "test/xmlconf/japanese/japanese.xml",
- "test/xmlconf/oasis/oasis.xml",
- // The test case files in the sun directory do not have an enclosing
- // TESTCASES element, and only work when directly substituted as entity
- // content, so they cannot be used at this time.
- "test/xmlconf/xmltest/xmltest.xml",
});
+ const xml_test_run = b.addRunArtifact(xml_test);
+ test_step.dependOn(&xml_test_run.step);
- const run_xmlconf_step = b.step("run-xmlconf", "Run xmlconf test cases");
- run_xmlconf_step.dependOn(&run_xmlconf_exe.step);
-}
-
-fn addDocs(b: *Build, target: Build.ResolvedTarget) void {
- const obj = b.addObject(.{
+ const docs_step = b.step("docs", "Build the documentation");
+ const xml_docs = b.addObject(.{
.name = "xml",
.root_source_file = b.path("src/xml.zig"),
.target = target,
.optimize = .Debug,
});
- const docs_path = obj.getEmittedDocs();
-
- const install_docs = b.addInstallDirectory(.{
- .source_dir = docs_path,
+ const xml_docs_copy = b.addInstallDirectory(.{
+ .source_dir = xml_docs.getEmittedDocs(),
.install_dir = .prefix,
.install_subdir = "docs",
});
+ docs_step.dependOn(&xml_docs_copy.step);
- const docs_step = b.step("docs", "Generate documentation");
- docs_step.dependOn(&install_docs.step);
-}
-
-fn addExamples(b: *Build, target: Build.ResolvedTarget, optimize: Mode, xml: *Build.Module) void {
- const install_examples_step = b.step("install-examples", "Install examples");
-
- const scan_exe = b.addExecutable(.{
- .name = "scan",
- .root_source_file = b.path("examples/scan.zig"),
- .target = target,
- .optimize = optimize,
- });
- scan_exe.root_module.addImport("xml", xml);
- install_examples_step.dependOn(&b.addInstallArtifact(scan_exe, .{}).step);
-
- const run_scan_exe = b.addRunArtifact(scan_exe);
- if (b.args) |args| {
- run_scan_exe.addArgs(args);
- }
-
- const run_scan_step = b.step("run-example-scan", "Run scan example");
- run_scan_step.dependOn(&run_scan_exe.step);
-
- const read_exe = b.addExecutable(.{
- .name = "read",
- .root_source_file = b.path("examples/read.zig"),
+ const install_examples_step = b.step("install-examples", "Build and install the example programs");
+ const example_reader_exe = b.addExecutable(.{
+ .name = "example-reader",
+ .root_source_file = b.path("examples/reader.zig"),
.target = target,
.optimize = optimize,
});
- read_exe.root_module.addImport("xml", xml);
- install_examples_step.dependOn(&b.addInstallArtifact(read_exe, .{}).step);
-
- const run_read_exe = b.addRunArtifact(read_exe);
- if (b.args) |args| {
- run_read_exe.addArgs(args);
- }
-
- const run_read_step = b.step("run-example-read", "Run read example");
- run_read_step.dependOn(&run_read_exe.step);
-}
-
-fn addFuzz(b: *Build, target: Build.ResolvedTarget, xml: *Build.Module) void {
- // Thanks to https://www.ryanliptak.com/blog/fuzzing-zig-code/ for the basis of this!
- const fuzz_lib = b.addStaticLibrary(.{
- .name = "fuzz",
- .root_source_file = b.path("fuzz/main.zig"),
- .target = target,
- .optimize = .Debug,
- });
- fuzz_lib.want_lto = true;
- fuzz_lib.bundle_compiler_rt = true;
- fuzz_lib.root_module.addImport("xml", xml);
-
- const fuzz_compile = b.addSystemCommand(&.{ "afl-clang-lto", "-o" });
- const fuzz_exe = fuzz_compile.addOutputFileArg("fuzz");
- fuzz_compile.addArtifactArg(fuzz_lib);
- const fuzz_install = b.addInstallBinFile(fuzz_exe, "fuzz");
-
- const run_fuzz_compile_step = b.step("install-fuzz", "Build executable for fuzz testing using afl-clang-lto");
- run_fuzz_compile_step.dependOn(&fuzz_install.step);
-
- const run_fuzz = b.addSystemCommand(&.{"afl-fuzz"});
- run_fuzz.addArg("-i");
- if (b.option(bool, "resume", "Resume fuzzing rather than starting a new run") orelse false) {
- run_fuzz.addArg("-");
- } else {
- run_fuzz.addArg(b.pathJoin(&.{ "fuzz", "inputs" }));
- }
- run_fuzz.addArgs(&.{ "-o", b.pathJoin(&.{ "fuzz", "outputs" }) });
- const dictionaries = &[_][]const u8{ "xml.dict", "xml_UTF_16.dict", "xml_UTF_16BE.dict", "xml_UTF_16LE.dict" };
- for (dictionaries) |dictionary| {
- run_fuzz.addArgs(&.{ "-x", b.pathJoin(&.{ "fuzz", "dictionaries", dictionary }) });
- }
- run_fuzz.addFileArg(fuzz_exe);
- const run_fuzz_step = b.step("fuzz", "Execute afl-fuzz with the fuzz testing executable");
- run_fuzz_step.dependOn(&run_fuzz.step);
-
- const fuzz_reproduce_exe = b.addExecutable(.{
- .name = "fuzz-reproduce",
- .root_source_file = b.path("fuzz/main.zig"),
- .target = target,
- .optimize = .Debug,
- });
- fuzz_reproduce_exe.root_module.addImport("xml", xml);
-
- const run_fuzz_reproduce_exe = b.addRunArtifact(fuzz_reproduce_exe);
- if (b.args) |args| {
- run_fuzz_reproduce_exe.addArgs(args);
- }
-
- const run_fuzz_reproduce_step = b.step("fuzz-reproduce", "Reproduce crash found by fuzzing");
- run_fuzz_reproduce_step.dependOn(&run_fuzz_reproduce_exe.step);
+ example_reader_exe.root_module.addImport("xml", xml);
+ const example_reader_install = b.addInstallArtifact(example_reader_exe, .{});
+ install_examples_step.dependOn(&example_reader_install.step);
}
diff --git a/build.zig.zon b/build.zig.zon
index 8d81938..6eb80a1 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -1,5 +1,5 @@
.{
- .name = "zig-xml",
+ .name = "xml",
.version = "0.1.0",
.paths = .{
"src",
diff --git a/examples/read.zig b/examples/read.zig
deleted file mode 100644
index 3fb77f3..0000000
--- a/examples/read.zig
+++ /dev/null
@@ -1,46 +0,0 @@
-const std = @import("std");
-const xml = @import("xml");
-
-pub fn main() !void {
- var gpa = std.heap.GeneralPurposeAllocator(.{}){};
- defer _ = gpa.deinit();
- const allocator = gpa.allocator();
-
- const args = try std.process.argsAlloc(allocator);
- defer std.process.argsFree(allocator, args);
- if (args.len != 2) {
- return error.InvalidArguments;
- }
- const input_path = args[1];
-
- const stdout_raw = std.io.getStdOut().writer();
- var stdout_buffered_writer = std.io.bufferedWriter(stdout_raw);
- const stdout = stdout_buffered_writer.writer();
-
- const input_file = try std.fs.cwd().openFile(input_path, .{});
- defer input_file.close();
- var input_buffered_reader = std.io.bufferedReader(input_file.reader());
- var reader = xml.reader(allocator, input_buffered_reader.reader(), .{});
- defer reader.deinit();
-
- while (try reader.next()) |event| {
- try printEvent(stdout, event);
- }
- try stdout_buffered_writer.flush();
-}
-
-fn printEvent(out: anytype, event: xml.Event) !void {
- switch (event) {
- .xml_declaration => |xml_declaration| try out.print(" |element_start| {
- try out.print("<{?s}({?s}):{s}\n", .{ element_start.name.prefix, element_start.name.ns, element_start.name.local });
- for (element_start.attributes) |attr| {
- try out.print(" @{?s}({?s}):{s}={s}\n", .{ attr.name.prefix, attr.name.ns, attr.name.local, attr.value });
- }
- },
- .element_content => |element_content| try out.print(" {s}\n", .{element_content.content}),
- .element_end => |element_end| try out.print("/{?s}({?s}):{s}\n", .{ element_end.name.prefix, element_end.name.ns, element_end.name.local }),
- .comment => |comment| try out.print("
diff --git a/fuzz/inputs/valid-utf16be.xml b/fuzz/inputs/valid-utf16be.xml
deleted file mode 100644
index 027c1f4..0000000
Binary files a/fuzz/inputs/valid-utf16be.xml and /dev/null differ
diff --git a/fuzz/inputs/valid-utf16le.xml b/fuzz/inputs/valid-utf16le.xml
deleted file mode 100644
index 958ccc7..0000000
Binary files a/fuzz/inputs/valid-utf16le.xml and /dev/null differ
diff --git a/fuzz/inputs/valid.xml b/fuzz/inputs/valid.xml
deleted file mode 100644
index a243053..0000000
--- a/fuzz/inputs/valid.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
- Hello, world!
-
- Hello, world!
-
diff --git a/fuzz/main.zig b/fuzz/main.zig
deleted file mode 100644
index 903422e..0000000
--- a/fuzz/main.zig
+++ /dev/null
@@ -1,30 +0,0 @@
-const std = @import("std");
-const xml = @import("xml");
-
-fn cMain() callconv(.C) void {
- main();
-}
-
-comptime {
- @export(cMain, .{ .name = "main" });
-}
-
-pub fn main() void {
- var gpa = std.heap.GeneralPurposeAllocator(.{}){};
- defer std.debug.assert(gpa.deinit() == .ok);
- const allocator = gpa.allocator();
-
- var stdin_buf = std.io.bufferedReader(std.io.getStdIn().reader());
- var reader = xml.reader(allocator, stdin_buf.reader(), .{});
- defer reader.deinit();
-
- var stdout_buf = std.io.bufferedWriter(std.io.getStdOut().writer());
- const stdout = stdout_buf.writer();
- const stderr = std.io.getStdErr().writer();
- while (reader.next() catch |e| {
- stderr.print("Error at {}: {}\n", .{ reader.token_reader.scanner.pos, e }) catch {};
- return;
- }) |event| {
- stdout.print("{} {}\n", .{ reader.token_reader.scanner.pos, event }) catch {};
- }
-}
diff --git a/fuzz/src/fuzz.zig b/fuzz/src/fuzz.zig
new file mode 100644
index 0000000..86f553b
--- /dev/null
+++ b/fuzz/src/fuzz.zig
@@ -0,0 +1,26 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const assert = std.debug.assert;
+const xml = @import("xml");
+
+export fn zig_fuzz_init() void {}
+
+export fn zig_fuzz_test(buf: [*]u8, len: isize) void {
+ var gpa_state: std.heap.GeneralPurposeAllocator(.{}) = .{};
+ defer assert(gpa_state.deinit() == .ok);
+ const gpa = gpa_state.allocator();
+ fuzz(gpa, buf[0..@intCast(len)]) catch @panic("OOM");
+}
+
+fn fuzz(gpa: Allocator, input: []const u8) !void {
+ var doc = xml.StaticDocument.init(input);
+ var reader = doc.reader(gpa, .{});
+ defer reader.deinit();
+ while (true) {
+ const node = reader.read() catch |err| switch (err) {
+ error.MalformedXml => break,
+ error.OutOfMemory => return error.OutOfMemory,
+ };
+ if (node == .eof) break;
+ }
+}
diff --git a/src/Reader.zig b/src/Reader.zig
new file mode 100644
index 0000000..bce392f
--- /dev/null
+++ b/src/Reader.zig
@@ -0,0 +1,2216 @@
+//! A streaming XML parser, aiming to conform to the [XML 1.0 (Fifth
+//! Edition)](https://www.w3.org/TR/2008/REC-xml-20081126) and [Namespaces in
+//! XML 1.0 (Third Edition)](https://www.w3.org/TR/2009/REC-xml-names-20091208/)
+//! specifications.
+//!
+//! This is the core, type-erased reader implementation. Generally, users will
+//! not use this directly, but will use `xml.GenericReader`, which is a thin
+//! wrapper around this type providing type safety for returned errors.
+//!
+//! A reader gets its raw data from a `Source`, which acts as a forward-only
+//! window of an XML document. In a simple case (`xml.StaticDocument`), this
+//! may just be slices of a document loaded completely in memory, but the same
+//! interface works just as well for a document streamed from a byte reader
+//! (`xml.StreamingDocument`).
+//!
+//! Calling `read` returns the next `Node` in the document, and other reader
+//! functions specific to each node type can be used to obtain more information
+//! about the current node. The convention is that functions associated with a
+//! specific node type have names starting with the node type (and `attribute`
+//! functions can only be called on an `element_start` node).
+//!
+//! Some reader functions end in `Ns`, providing namespace-aware functionality.
+//! These functions must only be called on a reader configured to be
+//! namespace-aware (namespace awareness is on by default in `Options`).
+
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const assert = std.debug.assert;
+const expectError = std.testing.expectError;
+const expectEqual = std.testing.expectEqual;
+const expectEqualDeep = std.testing.expectEqualDeep;
+const expectEqualStrings = std.testing.expectEqualStrings;
+
+const Location = @import("xml.zig").Location;
+const StaticDocument = @import("xml.zig").StaticDocument;
+const QName = @import("xml.zig").QName;
+const PrefixedQName = @import("xml.zig").PrefixedQName;
+const predefined_entities = @import("xml.zig").predefined_entities;
+const predefined_namespace_uris = @import("xml.zig").predefined_namespace_uris;
+const ns_xml = @import("xml.zig").ns_xml;
+const ns_xmlns = @import("xml.zig").ns_xmlns;
+
+options: Options,
+
+state: State,
+/// An array of buffer spans relevant to the current node.
+/// The layout of the spans depends on the node type:
+/// - `eof` - none
+/// - `xml_declaration` - "xml" (NAME VALUE)...
+/// - `element_start` - NAME (NAME VALUE)...
+/// - `element_end` - NAME
+/// - `comment` - COMMENT
+/// - `pi` - TARGET DATA
+/// - `text` - none
+/// - `cdata` - CDATA
+/// - `character_reference` - REF
+/// - `entity_reference` - REF
+spans: std.ArrayListUnmanaged(BufSpan),
+/// A map of attribute names to indexes.
+/// The keys are slices into `buf`.
+attributes: std.StringArrayHashMapUnmanaged(usize),
+/// A map of attribute qnames to indexes.
+/// The key `ns` and `local` values are slices into `buf`.
+q_attributes: std.ArrayHashMapUnmanaged(QName, usize, QNameContext, true),
+/// String data for the current element nesting context.
+/// Each element start node appends the name of the element to this buffer, and
+/// the element name is followed by any namespace prefixes and URIs declared on
+/// the element so they can be referenced by `ns_prefixes`.
+strings: std.ArrayListUnmanaged(u8),
+/// The start indexes of the element names in `strings`.
+element_names: std.ArrayListUnmanaged(StringIndex),
+/// The namespace prefixes declared by the current nesting context of elements.
+ns_prefixes: std.ArrayListUnmanaged(std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex)),
+/// The Unicode code point associated with the current character reference.
+character: u21,
+
+source: Source,
+/// The source location of the beginning of `buf`.
+loc: Location,
+/// Buffered data read from `source`.
+buf: []const u8,
+/// The current position of the reader in `buf`.
+pos: usize,
+
+/// The last node returned by `read` (that is, the current node).
+node: ?Node,
+/// The current error code (only valid if `read` returned `error.MalformedXml`).
+error_code: ErrorCode,
+/// The position of the current error in `buf`.
+error_pos: usize,
+
+scratch: std.ArrayListUnmanaged(u8),
+
+gpa: Allocator,
+
+const Reader = @This();
+
+pub const Options = struct {
+ /// Whether the reader should handle namespaces in element and attribute
+ /// names. The `Ns`-suffixed functions of `Reader` may only be used when
+ /// this is enabled.
+ namespace_aware: bool = true,
+ /// Whether the reader should track the source location (line and column)
+ /// of nodes in the document. The `location` functions of `Reader` may only
+ /// be used when this is enabled.
+ location_aware: bool = true,
+ /// Whether the reader may assume that its input data is valid UTF-8.
+ assume_valid_utf8: bool = false,
+};
+
/// The type of node returned by `read`.
pub const Node = enum {
    /// End of document.
    eof,
    /// The XML declaration (`<?xml ... ?>`).
    xml_declaration,
    /// The start of an element.
    element_start,
    /// The end of an element.
    element_end,
    /// A comment (`<!-- ... -->`).
    comment,
    /// A processing instruction (`<?target data?>`).
    pi,
    /// Character data.
    text,
    /// A CDATA section (`<![CDATA[ ... ]]>`).
    cdata,
    /// A character reference (`&#...;`).
    character_reference,
    /// An entity reference (`&name;`).
    entity_reference,
};
+
+pub const ErrorCode = enum {
+ xml_declaration_attribute_unsupported,
+ xml_declaration_version_missing,
+ xml_declaration_version_unsupported,
+ xml_declaration_encoding_unsupported,
+ xml_declaration_standalone_malformed,
+ doctype_unsupported,
+ directive_unknown,
+ attribute_missing_space,
+ attribute_duplicate,
+ attribute_prefix_undeclared,
+ attribute_illegal_character,
+ element_end_mismatched,
+ element_end_unclosed,
+ comment_malformed,
+ comment_unclosed,
+ pi_unclosed,
+ pi_target_disallowed,
+ pi_missing_space,
+ text_cdata_end_disallowed,
+ cdata_unclosed,
+ entity_reference_unclosed,
+ entity_reference_undefined,
+ character_reference_unclosed,
+ character_reference_malformed,
+ name_malformed,
+ namespace_prefix_unbound,
+ namespace_binding_illegal,
+ namespace_prefix_illegal,
+ unexpected_character,
+ unexpected_eof,
+ expected_equals,
+ expected_quote,
+ missing_end_quote,
+ invalid_utf8,
+ illegal_character,
+};
+
/// A type-erased source of XML document data, acting as a forward-only
/// window over the document (see the module doc comment).
pub const Source = struct {
    context: *const anyopaque,
    /// Implementation of `move`; see its documentation for the contract.
    moveFn: *const fn (context: *const anyopaque, advance: usize, len: usize) anyerror![]const u8,

    /// Discards `advance` bytes from the front of the window and requests a
    /// window of at least `len` bytes, returning the new window contents.
    /// NOTE(review): presumably a returned slice shorter than `len` signals
    /// the end of the document — confirm against the `StaticDocument` and
    /// `StreamingDocument` implementations.
    pub fn move(source: Source, advance: usize, len: usize) anyerror![]const u8 {
        return source.moveFn(source.context, advance, len);
    }
};
+
+const State = enum {
+ invalid,
+ start,
+ after_xml_declaration,
+ after_doctype,
+ in_root,
+ empty_element,
+ empty_root,
+ after_root,
+ eof,
+};
+
/// Initializes a reader reading from `source`.
/// The caller must call `deinit` when done to free the reader's resources.
pub fn init(gpa: Allocator, source: Source, options: Options) Reader {
    return .{
        .options = options,

        .state = .start,
        .spans = .{},
        .attributes = .{},
        .q_attributes = .{},
        .strings = .{},
        .element_names = .{},
        .ns_prefixes = .{},
        // Only meaningful while the current node is a character reference.
        .character = undefined,

        .source = source,
        .loc = if (options.location_aware) Location.start else undefined,
        .buf = &.{},
        .pos = 0,

        .node = null,
        // Only meaningful after `read` returns `error.MalformedXml`.
        .error_code = undefined,
        .error_pos = undefined,

        .scratch = .{},

        .gpa = gpa,
    };
}

/// Frees all resources owned by the reader and invalidates it.
pub fn deinit(reader: *Reader) void {
    reader.spans.deinit(reader.gpa);
    reader.attributes.deinit(reader.gpa);
    reader.q_attributes.deinit(reader.gpa);
    reader.strings.deinit(reader.gpa);
    // Each element nesting level owns its own prefix map.
    for (reader.ns_prefixes.items) |*map| map.deinit(reader.gpa);
    reader.ns_prefixes.deinit(reader.gpa);
    reader.scratch.deinit(reader.gpa);
    reader.* = undefined;
}
+
+/// Returns the location of the node.
+/// Asserts that the reader is location-aware and there is a current node (`read` was called and did not return an error).
+pub fn location(reader: Reader) Location {
+ assert(reader.options.location_aware and reader.node != null);
+ return reader.loc;
+}
+
+test location {
+ var doc = StaticDocument.init(
+ \\
+ \\ Hello, world!
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualDeep(Location{ .line = 1, .column = 1 }, reader.location());
+
+ try expectEqual(.text, try reader.read());
+ try expectEqualDeep(Location{ .line = 1, .column = 7 }, reader.location());
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualDeep(Location{ .line = 2, .column = 3 }, reader.location());
+
+ try expectEqual(.text, try reader.read());
+ try expectEqualDeep(Location{ .line = 2, .column = 8 }, reader.location());
+
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualDeep(Location{ .line = 2, .column = 21 }, reader.location());
+
+ try expectEqual(.text, try reader.read());
+ try expectEqualDeep(Location{ .line = 2, .column = 27 }, reader.location());
+
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualDeep(Location{ .line = 3, .column = 1 }, reader.location());
+}
+
+/// Returns the error code associated with the error.
+/// Asserts that `error.MalformedXml` was returned by the last call to `read`.
+pub fn errorCode(reader: Reader) ErrorCode {
+ assert(reader.state == .invalid);
+ return reader.error_code;
+}
+
+test errorCode {
+ var doc = StaticDocument.init(
+ \\
+ \\ <123>Hello, world!123>
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqual(.text, try reader.read());
+ try expectError(error.MalformedXml, reader.read());
+ try expectEqual(.name_malformed, reader.errorCode());
+}
+
+/// Returns the location where the error occurred.
+/// Asserts that the reader is location-aware and `error.MalformedXml` was returned by the last call to `read`.
+pub fn errorLocation(reader: Reader) Location {
+ assert(reader.state == .invalid);
+ var loc = reader.loc;
+ loc.update(reader.buf[0..reader.error_pos]);
+ return loc;
+}
+
+test errorLocation {
+ var doc = StaticDocument.init(
+ \\
+ \\ <123>Hello, world!123>
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqual(.text, try reader.read());
+ try expectError(error.MalformedXml, reader.read());
+ try expectEqualDeep(Location{ .line = 2, .column = 4 }, reader.errorLocation());
+}
+
/// Returns the version declared in the XML declaration.
/// Asserts that the current node is `Node.xml_declaration`.
pub fn xmlDeclarationVersion(reader: Reader) []const u8 {
    assert(reader.node == .xml_declaration);
    // The version is always the first attribute span of the declaration.
    return reader.attributeValueUnchecked(0);
}

test xmlDeclarationVersion {
    var doc = StaticDocument.init(
        \\<?xml version="1.0"?>
        \\<root/>
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.xml_declaration, try reader.read());
    try expectEqualStrings("1.0", reader.xmlDeclarationVersion());
}

/// Returns the encoding declared in the XML declaration, or null if none
/// was declared.
/// Asserts that the current node is `Node.xml_declaration`.
pub fn xmlDeclarationEncoding(reader: Reader) ?[]const u8 {
    assert(reader.node == .xml_declaration);
    const n = reader.attributes.get("encoding") orelse return null;
    return reader.attributeValueUnchecked(n);
}

test xmlDeclarationEncoding {
    var doc = StaticDocument.init(
        \\<?xml version="1.0" encoding="UTF-8"?>
        \\<root/>
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.xml_declaration, try reader.read());
    try expectEqualStrings("UTF-8", reader.xmlDeclarationEncoding().?);
}

/// Returns whether the XML declaration declares the document to be
/// standalone, or null if no standalone declaration was present.
/// Asserts that the current node is `Node.xml_declaration`.
pub fn xmlDeclarationStandalone(reader: Reader) ?bool {
    assert(reader.node == .xml_declaration);
    const n = reader.attributes.get("standalone") orelse return null;
    return std.mem.eql(u8, reader.attributeValueUnchecked(n), "yes");
}

test xmlDeclarationStandalone {
    var doc = StaticDocument.init(
        \\<?xml version="1.0" standalone="yes"?>
        \\<root/>
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.xml_declaration, try reader.read());
    try expectEqual(true, reader.xmlDeclarationStandalone());
}
+
+/// Returns the name of the element.
+/// Asserts that the current node is `Node.element_start` or `Node.element_end`.
+pub fn elementName(reader: Reader) []const u8 {
+ assert(reader.node == .element_start or reader.node == .element_end);
+ return reader.elementNameUnchecked();
+}
+
+test elementName {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("root", reader.elementName());
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualStrings("root", reader.elementName());
+}
+
+/// Returns the name of the element as a `PrefixedQName`.
+/// Asserts that the current node is `Node.element_start` or `Node.element_end` and that `reader` is namespace-aware.
+pub fn elementNameNs(reader: Reader) PrefixedQName {
+ assert(reader.options.namespace_aware);
+ return reader.parseQName(reader.elementName());
+}
+
+test elementNameNs {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("", reader.elementNameNs().prefix);
+ try expectEqualStrings("https://example.com/ns", reader.elementNameNs().ns);
+ try expectEqualStrings("root", reader.elementNameNs().local);
+
+ try expectEqual(.text, try reader.read());
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("a", reader.elementNameNs().prefix);
+ try expectEqualStrings("https://example.com/ns2", reader.elementNameNs().ns);
+ try expectEqualStrings("a", reader.elementNameNs().local);
+
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualStrings("a", reader.elementNameNs().prefix);
+ try expectEqualStrings("https://example.com/ns2", reader.elementNameNs().ns);
+ try expectEqualStrings("a", reader.elementNameNs().local);
+
+ try expectEqual(.text, try reader.read());
+
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualStrings("", reader.elementNameNs().prefix);
+ try expectEqualStrings("https://example.com/ns", reader.elementNameNs().ns);
+ try expectEqualStrings("root", reader.elementNameNs().local);
+}
+
+fn elementNameUnchecked(reader: Reader) []const u8 {
+ return reader.bufSlice(reader.spans.items[0]);
+}
+
+fn elementNamePos(reader: Reader) usize {
+ return reader.spans.items[0].start;
+}
+
+/// Returns the number of attributes of the element.
+/// Asserts that the current node is `Node.element_start`.
+pub fn attributeCount(reader: Reader) usize {
+ assert(reader.node == .element_start);
+ return reader.attributeCountUnchecked();
+}
+
+test attributeCount {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqual(3, reader.attributeCount());
+}
+
+fn attributeCountUnchecked(reader: Reader) usize {
+ return @divExact(reader.spans.items.len - 1, 2);
+}
+
/// Returns the name of the `n`th attribute of the element.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeName(reader: Reader, n: usize) []const u8 {
    assert(reader.node == .element_start and n < reader.attributeCount());
    return reader.attributeNameUnchecked(n);
}
+
+test attributeName {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("a", reader.attributeName(0));
+ try expectEqualStrings("b", reader.attributeName(1));
+ try expectEqualStrings("c", reader.attributeName(2));
+}
+
/// Returns the name of the `n`th attribute of the element as a `PrefixedQName`.
/// If the reader is not namespace-aware, only the `local` part will be non-empty.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeNameNs(reader: Reader, n: usize) PrefixedQName {
    const name = reader.attributeName(n);
    if (!reader.options.namespace_aware) {
        // Without namespace processing, the whole name is the local part.
        return .{ .prefix = "", .ns = "", .local = name };
    }
    return reader.parseQName(name);
}
+
+test attributeNameNs {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+
+ try expectEqualStrings("xmlns", reader.attributeNameNs(0).prefix);
+ try expectEqualStrings("http://www.w3.org/2000/xmlns/", reader.attributeNameNs(0).ns);
+ try expectEqualStrings("pre", reader.attributeNameNs(0).local);
+
+ try expectEqualStrings("", reader.attributeNameNs(1).prefix);
+ try expectEqualStrings("", reader.attributeNameNs(1).ns);
+ try expectEqualStrings("a", reader.attributeNameNs(1).local);
+
+ try expectEqualStrings("pre", reader.attributeNameNs(2).prefix);
+ try expectEqualStrings("https://example.com/ns", reader.attributeNameNs(2).ns);
+ try expectEqualStrings("b", reader.attributeNameNs(2).local);
+}
+
+fn attributeNameUnchecked(reader: Reader, n: usize) []const u8 {
+ return reader.bufSlice(reader.spans.items[n * 2 + 1]);
+}
+
+fn attributeNamePos(reader: Reader, n: usize) usize {
+ return reader.spans.items[n * 2 + 1].start;
+}
+
/// Returns the value of the `n`th attribute of the element.
/// This function may incur allocations if the attribute value contains entity or character
/// references, or CR, LF, or TAB characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValue(reader: *Reader, n: usize) Allocator.Error![]const u8 {
    const raw = reader.attributeValueRaw(n);
    // Fast path: nothing to expand or normalize, so the raw slice can be
    // returned directly without touching the scratch buffer.
    if (std.mem.indexOfAny(u8, raw, "&\t\r\n") == null) return raw;
    reader.scratch.clearRetainingCapacity();
    const writer = reader.scratch.writer(reader.gpa);
    reader.attributeValueWrite(n, writer.any()) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
        // The scratch ArrayList writer's only possible error is OutOfMemory.
        else => unreachable,
    };
    return reader.scratch.items;
}
+
+test attributeValue {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("1", try reader.attributeValue(0));
+ try expectEqualStrings("2", try reader.attributeValue(1));
+ try expectEqualStrings("1 & 2", try reader.attributeValue(2));
+}
+
/// Returns the value of the `n`th attribute of the element.
/// Caller owns the returned slice and must free it with `gpa`.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValueAlloc(reader: Reader, gpa: Allocator, n: usize) Allocator.Error![]u8 {
    var buf = std.ArrayList(u8).init(gpa);
    defer buf.deinit();
    const buf_writer = buf.writer();
    reader.attributeValueWrite(n, buf_writer.any()) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
        // The ArrayList writer's only possible error is OutOfMemory.
        else => unreachable,
    };
    // toOwnedSlice empties the list, so the deferred deinit frees nothing.
    return buf.toOwnedSlice();
}
+
+test attributeValueAlloc {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+
+ const attr0 = try reader.attributeValueAlloc(std.testing.allocator, 0);
+ defer std.testing.allocator.free(attr0);
+ try expectEqualStrings("1", attr0);
+ const attr1 = try reader.attributeValueAlloc(std.testing.allocator, 1);
+ defer std.testing.allocator.free(attr1);
+ try expectEqualStrings("2", attr1);
+ const attr2 = try reader.attributeValueAlloc(std.testing.allocator, 2);
+ defer std.testing.allocator.free(attr2);
+ try expectEqualStrings("1 & 2", attr2);
+}
+
/// Writes the value of the `n`th attribute of the element to `writer`,
/// expanding entity and character references and normalizing TAB, LF, CR,
/// and CRLF to single spaces per the attribute-value normalization rules.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValueWrite(reader: Reader, n: usize, writer: std.io.AnyWriter) anyerror!void {
    const raw = reader.attributeValueRaw(n);
    var pos: usize = 0;
    while (std.mem.indexOfAnyPos(u8, raw, pos, "&\t\r\n")) |split_pos| {
        try writer.writeAll(raw[pos..split_pos]);
        pos = split_pos;
        switch (raw[pos]) {
            '&' => {
                // The reader already validated the reference, so a
                // terminating ';' is guaranteed to exist.
                const entity_end = std.mem.indexOfScalarPos(u8, raw, pos, ';') orelse unreachable;
                if (raw[pos + "&".len] == '#') {
                    // Character reference: "&#n;" (decimal) or "&#xn;" (hex).
                    // The digit slice must start after the "&#"/"&#x" prefix;
                    // a zero offset here would include '&' and '#' in the
                    // slice passed to parseInt, which can never succeed.
                    const c = if (raw[pos + "&#".len] == 'x')
                        std.fmt.parseInt(u21, raw[pos + "&#x".len .. entity_end], 16) catch unreachable
                    else
                        std.fmt.parseInt(u21, raw[pos + "&#".len .. entity_end], 10) catch unreachable;
                    var buf: [4]u8 = undefined;
                    const len = std.unicode.utf8Encode(c, &buf) catch unreachable;
                    try writer.writeAll(buf[0..len]);
                } else {
                    // Predefined entity reference, e.g. "&amp;".
                    try writer.writeAll(predefined_entities.get(raw[pos + "&".len .. entity_end]) orelse unreachable);
                }
                pos = entity_end + 1;
            },
            '\t', '\n' => {
                try writer.writeByte(' ');
                pos += 1;
            },
            '\r' => {
                // A CRLF pair collapses to a single space.
                try writer.writeByte(' ');
                if (pos + 1 < raw.len and raw[pos + 1] == '\n') {
                    pos += 2;
                } else {
                    pos += 1;
                }
            },
            else => unreachable,
        }
    }
    try writer.writeAll(raw[pos..]);
}
+
+test attributeValueWrite {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+
+ var buf = std.ArrayList(u8).init(std.testing.allocator);
+ defer buf.deinit();
+
+ try reader.attributeValueWrite(0, buf.writer());
+ try expectEqualStrings("1", buf.items);
+
+ buf.clearRetainingCapacity();
+ try reader.attributeValueWrite(1, buf.writer());
+ try expectEqualStrings("2", buf.items);
+
+ buf.clearRetainingCapacity();
+ try reader.attributeValueWrite(2, buf.writer());
+ try expectEqualStrings("1 & 2", buf.items);
+}
+
/// Returns the raw value of the `n`th attribute of the element, as it appears in the source
/// (references unexpanded, line endings unnormalized).
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValueRaw(reader: Reader, n: usize) []const u8 {
    assert(reader.node == .element_start and n < reader.attributeCount());
    return reader.attributeValueUnchecked(n);
}
+
+test attributeValueRaw {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("1", reader.attributeValueRaw(0));
+ try expectEqualStrings("2", reader.attributeValueRaw(1));
+ try expectEqualStrings("1 & 2", reader.attributeValueRaw(2));
+}
+
+fn attributeValueUnchecked(reader: Reader, n: usize) []const u8 {
+ return reader.bufSlice(reader.spans.items[n * 2 + 2]);
+}
+
+fn attributeValuePos(reader: Reader, n: usize) usize {
+ return reader.spans.items[n * 2 + 2].start;
+}
+
+fn attributeValueEndPos(reader: Reader, n: usize) usize {
+ return reader.spans.items[n * 2 + 2].end;
+}
+
/// Returns the location of the `n`th attribute of the element.
/// Asserts that the reader is location-aware, the current node is `Node.element_start`, and `n` is less than `reader.attributeCount()`.
pub fn attributeLocation(reader: Reader, n: usize) Location {
    assert(reader.options.location_aware and reader.node == .element_start and n < reader.attributeCount());
    var loc = reader.loc;
    // Advance from the location of the buffer start up to the attribute name.
    loc.update(reader.buf[0..reader.attributeNamePos(n)]);
    return loc;
}
+
+/// Returns the index of the attribute named `name`.
+/// Asserts that the current node is `Node.element_start`.
+pub fn attributeIndex(reader: Reader, name: []const u8) ?usize {
+ assert(reader.node == .element_start);
+ return reader.attributes.get(name);
+}
+
+test attributeIndex {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqual(0, reader.attributeIndex("one"));
+ try expectEqual(1, reader.attributeIndex("two"));
+ try expectEqual(2, reader.attributeIndex("three"));
+ try expectEqual(null, reader.attributeIndex("four"));
+}
+
+/// Returns the index of the attribute with namespace `ns` and local name `local`.
+/// Asserts that the current node is `Node.element_start` and `reader` is namespace-aware.
+pub fn attributeIndexNs(reader: Reader, ns: []const u8, local: []const u8) ?usize {
+ assert(reader.node == .element_start and reader.options.namespace_aware);
+ return reader.q_attributes.get(.{ .ns = ns, .local = local });
+}
+
+test attributeIndexNs {
+ var doc = StaticDocument.init(
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.element_start, try reader.read());
+ try expectEqual(0, reader.attributeIndexNs("", "xmlns"));
+ try expectEqual(1, reader.attributeIndexNs("http://www.w3.org/2000/xmlns/", "foo"));
+ try expectEqual(2, reader.attributeIndexNs("", "one"));
+ try expectEqual(3, reader.attributeIndexNs("http://example.com/foo", "two"));
+ try expectEqual(null, reader.attributeIndexNs("http://example.com", "one"));
+ try expectEqual(null, reader.attributeIndexNs("", "three"));
+}
+
+/// Returns the text of the comment.
+/// This function may incur allocations if the comment text contains CR
+/// characters which must be normalized according to the spec.
+/// The returned value is owned by `reader` and is only valid until the next call to another
+/// function on `reader`.
+/// Asserts that the current node is `Node.comment`.
+pub fn comment(reader: *Reader) Allocator.Error![]const u8 {
+ return reader.newlineNormalizedScratch(reader.commentRaw());
+}
+
+test comment {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.comment, try reader.read());
+ try expectEqualStrings(" Hello, world! ", try reader.comment());
+}
+
+/// Writes the text of the comment to `writer`.
+/// Asserts that the current node is `Node.comment`.
+pub fn commentWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
+ try writeNewlineNormalized(reader.commentRaw(), writer);
+}
+
+test commentWrite {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.comment, try reader.read());
+
+ var buf = std.ArrayList(u8).init(std.testing.allocator);
+ defer buf.deinit();
+ try reader.commentWrite(buf.writer());
+ try expectEqualStrings(" Hello, world! ", buf.items);
+}
+
+/// Returns the raw text of the comment, as it appears in the source.
+/// Asserts that the current node is `Node.comment`.
+pub fn commentRaw(reader: Reader) []const u8 {
+ assert(reader.node == .comment);
+ return reader.commentUnchecked();
+}
+
+test commentRaw {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.comment, try reader.read());
+ try expectEqualStrings(" Hello, world! ", reader.commentRaw());
+}
+
+fn commentUnchecked(reader: Reader) []const u8 {
+ return reader.bufSlice(reader.spans.items[0]);
+}
+
+fn commentPos(reader: Reader) usize {
+ return reader.spans.items[0].start;
+}
+
+/// Returns the target of the PI.
+/// Asserts that the current node is `Node.pi`.
+pub fn piTarget(reader: Reader) []const u8 {
+ assert(reader.node == .pi);
+ return reader.piTargetUnchecked();
+}
+
+test piTarget {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.pi, try reader.read());
+ try expectEqualStrings("pi-target", reader.piTarget());
+}
+
+fn piTargetUnchecked(reader: Reader) []const u8 {
+ return reader.bufSlice(reader.spans.items[0]);
+}
+
+fn piTargetPos(reader: Reader) usize {
+ return reader.spans.items[0].start;
+}
+
+fn piTargetEndPos(reader: Reader) usize {
+ return reader.spans.items[0].end;
+}
+
+/// Returns the data of the PI.
+/// This function may incur allocations if the PI data contains CR
+/// characters which must be normalized according to the spec.
+/// The returned value is owned by `reader` and is only valid until the next call to another
+/// function on `reader`.
+/// Asserts that the current node is `Node.pi`.
+pub fn piData(reader: *Reader) Allocator.Error![]const u8 {
+ return reader.newlineNormalizedScratch(reader.piDataRaw());
+}
+
+test piData {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.pi, try reader.read());
+ try expectEqualStrings("pi-data", try reader.piData());
+}
+
+/// Writes the data of the PI to `writer`.
+/// Asserts that the current node is `Node.pi`.
+pub fn piDataWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
+ try writeNewlineNormalized(reader.piDataRaw(), writer);
+}
+
+test piDataWrite {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.pi, try reader.read());
+
+ var buf = std.ArrayList(u8).init(std.testing.allocator);
+ defer buf.deinit();
+ try reader.piDataWrite(buf.writer());
+ try expectEqualStrings("pi-data", buf.items);
+}
+
+/// Returns the raw data of the PI, as it appears in the source.
+/// Asserts that the current node is `Node.pi`.
+pub fn piDataRaw(reader: Reader) []const u8 {
+ assert(reader.node == .pi);
+ return reader.piDataUnchecked();
+}
+
+test piDataRaw {
+ var doc = StaticDocument.init(
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+ try expectEqual(.pi, try reader.read());
+ try expectEqualStrings("pi-data", reader.piDataRaw());
+}
+
+fn piDataUnchecked(reader: Reader) []const u8 {
+ return reader.bufSlice(reader.spans.items[1]);
+}
+
+fn piDataPos(reader: Reader) usize {
+ return reader.spans.items[1].start;
+}
+
fn piDataEndPos(reader: Reader) usize {
    // spans.items[1] is the PI data span recorded by readPiContent
    // (spans.items[0] is the PI target name).
    return reader.spans.items[1].end;
}

/// Returns the text.
/// This function may incur allocations if the text contains CR
/// characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.text`.
pub fn text(reader: *Reader) Allocator.Error![]const u8 {
    return reader.newlineNormalizedScratch(reader.textRaw());
}

test text {
    // NOTE(review): the document literal appears to have lost its element
    // markup in transit (the test expects an `element_start` before the
    // text) — confirm against the upstream source.
    var doc = StaticDocument.init(
        \\Hello, world!
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());
    try expectEqualStrings("Hello, world!", try reader.text());
}

/// Writes the text to `writer`.
/// Asserts that the current node is `Node.text`.
pub fn textWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
    try writeNewlineNormalized(reader.textRaw(), writer);
}

test textWrite {
    // NOTE(review): document literal likely mangled — see `test text`.
    var doc = StaticDocument.init(
        \\Hello, world!
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    try reader.textWrite(buf.writer());
    try expectEqualStrings("Hello, world!", buf.items);
}

/// Returns the raw text, as it appears in the source.
/// Asserts that the current node is `Node.text`.
pub fn textRaw(reader: Reader) []const u8 {
    assert(reader.node == .text);
    return reader.textUnchecked();
}

test textRaw {
    // NOTE(review): document literal likely mangled — see `test text`.
    var doc = StaticDocument.init(
        \\Hello, world!
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());
    try expectEqualStrings("Hello, world!", reader.textRaw());
}

fn textUnchecked(reader: Reader) []const u8 {
    // Text content always occupies the buffer from its start up to the
    // current position (the buffer is shifted before reading text).
    return reader.buf[0..reader.pos];
}

fn textPos(reader: Reader) usize {
    // Text starts at buffer offset 0 by construction (see textUnchecked).
    _ = reader;
    return 0;
}
+
/// Returns the text of the CDATA section.
/// This function may incur allocations if the text contains CR
/// characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.cdata`.
pub fn cdata(reader: *Reader) Allocator.Error![]const u8 {
    return reader.newlineNormalizedScratch(reader.cdataRaw());
}

test cdata {
    // NOTE(review): the CDATA document literal appears to have been stripped
    // in transit — confirm against the upstream source.
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.cdata, try reader.read());
    try expectEqualStrings("Hello, world!", try reader.cdata());
}

/// Writes the text of the CDATA section to `writer`.
/// Asserts that the current node is `Node.cdata`.
pub fn cdataWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
    try writeNewlineNormalized(reader.cdataRaw(), writer);
}

test cdataWrite {
    // NOTE(review): document literal likely mangled — see `test cdata`.
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.cdata, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    try reader.cdataWrite(buf.writer());
    try expectEqualStrings("Hello, world!", buf.items);
}

/// Returns the raw text of the CDATA section, as it appears in the source.
/// Asserts that the current node is `Node.cdata`.
pub fn cdataRaw(reader: Reader) []const u8 {
    assert(reader.node == .cdata);
    return reader.cdataUnchecked();
}

test cdataRaw {
    // NOTE(review): document literal likely mangled — see `test cdata`.
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.cdata, try reader.read());
    try expectEqualStrings("Hello, world!", reader.cdataRaw());
}

fn cdataUnchecked(reader: Reader) []const u8 {
    // spans.items[0] is the CDATA content span recorded by readCdata.
    return reader.bufSlice(reader.spans.items[0]);
}

fn cdataPos(reader: Reader) usize {
    return reader.spans.items[0].start;
}
+
/// Returns the name of the referenced entity.
/// Asserts that the current node is `Node.entity_reference`.
pub fn entityReferenceName(reader: Reader) []const u8 {
    assert(reader.node == .entity_reference);
    return reader.entityReferenceNameUnchecked();
}

test entityReferenceName {
    // NOTE(review): document literal likely mangled in transit (an `&amp;`
    // reference inside an element is expected) — confirm upstream.
    var doc = StaticDocument.init(
        \\&
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.entity_reference, try reader.read());
    try expectEqualStrings("amp", reader.entityReferenceName());
}

fn entityReferenceNameUnchecked(reader: Reader) []const u8 {
    // spans.items[0] is the reference name span (without '&' and ';').
    return reader.bufSlice(reader.spans.items[0]);
}

fn entityReferenceNamePos(reader: Reader) usize {
    return reader.spans.items[0].start;
}
+
/// Returns the referenced character (Unicode codepoint).
/// Asserts that the current node is `Node.character_reference`.
pub fn characterReferenceChar(reader: Reader) u21 {
    assert(reader.node == .character_reference);
    // `character` is populated by checkCharacterReference.
    return reader.character;
}

test characterReferenceChar {
    // NOTE(review): document literal likely mangled in transit (a `&#x20;`
    // reference inside an element is expected) — confirm upstream.
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.character_reference, try reader.read());
    try expectEqual(0x20, reader.characterReferenceChar());
}

/// Returns the "name" of the referenced character, as it appears in the source.
/// Asserts that the current node is `Node.character_reference`.
pub fn characterReferenceName(reader: Reader) []const u8 {
    assert(reader.node == .character_reference);
    return reader.characterReferenceNameUnchecked();
}

test characterReferenceName {
    // NOTE(review): document literal likely mangled — see above.
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.character_reference, try reader.read());
    try expectEqualStrings("x20", reader.characterReferenceName());
}

fn characterReferenceNameUnchecked(reader: Reader) []const u8 {
    // spans.items[0] is the reference body span (without '&#' and ';').
    return reader.bufSlice(reader.spans.items[0]);
}

fn characterReferenceNamePos(reader: Reader) usize {
    return reader.spans.items[0].start;
}
+
/// Returns `raw` with XML newline normalization applied (CR and CRLF → LF).
/// Fast path: if `raw` contains no CR, it is returned as-is with no
/// allocation; otherwise the normalized copy lives in `reader.scratch` and is
/// invalidated by the next call that uses the scratch buffer.
fn newlineNormalizedScratch(reader: *Reader, raw: []const u8) Allocator.Error![]const u8 {
    if (std.mem.indexOfScalar(u8, raw, '\r') == null) return raw;
    reader.scratch.clearRetainingCapacity();
    const writer = reader.scratch.writer(reader.gpa);
    writeNewlineNormalized(raw, writer.any()) catch |err| switch (err) {
        // The ArrayList writer can only fail with OutOfMemory.
        error.OutOfMemory => return error.OutOfMemory,
        else => unreachable,
    };
    return reader.scratch.items;
}
+
/// Writes `raw` to `writer`, applying XML end-of-line normalization: every
/// lone CR and every CRLF pair is emitted as a single LF.
fn writeNewlineNormalized(raw: []const u8, writer: std.io.AnyWriter) anyerror!void {
    var rest = raw;
    while (std.mem.indexOfScalar(u8, rest, '\r')) |cr_index| {
        // Emit everything before the CR, then a single LF in its place.
        try writer.writeAll(rest[0..cr_index]);
        try writer.writeByte('\n');
        const after_cr = cr_index + 1;
        // A CRLF pair collapses to one LF, so skip a directly following LF.
        rest = if (after_cr < rest.len and rest[after_cr] == '\n')
            rest[after_cr + 1 ..]
        else
            rest[after_cr..];
    }
    try writer.writeAll(rest);
}
+
/// Returns the namespace URI bound to `prefix`, or an empty string if none.
/// If the reader is not namespace-aware, always returns an empty string.
pub fn namespaceUri(reader: Reader, prefix: []const u8) []const u8 {
    if (!reader.options.namespace_aware) return "";
    // The `xml` and `xmlns` prefixes are predefined by the namespaces spec.
    if (predefined_namespace_uris.get(prefix)) |uri| return uri;
    // Search innermost-scope-first so inner bindings shadow outer ones.
    var i = reader.ns_prefixes.items.len;
    const index = while (i > 0) {
        i -= 1;
        if (reader.ns_prefixes.items[i].getAdapted(prefix, StringIndexAdapter{
            .strings = reader.strings.items,
        })) |uri| break uri;
    } else return "";
    return reader.string(index);
}

test namespaceUri {
    // NOTE(review): the three-line document literal appears to have been
    // stripped in transit (nested elements with xmlns declarations are
    // expected) — confirm against the upstream source.
    var doc = StaticDocument.init(
        \\
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("https://example.com/default", reader.namespaceUri(""));
    try expectEqualStrings("https://example.com/other", reader.namespaceUri("other"));
    try expectEqualStrings("", reader.namespaceUri("child"));

    try expectEqual(.text, try reader.read());
    try expectEqualStrings("https://example.com/default", reader.namespaceUri(""));
    try expectEqualStrings("https://example.com/other", reader.namespaceUri("other"));
    try expectEqualStrings("", reader.namespaceUri("child"));

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("https://example.com/default", reader.namespaceUri(""));
    try expectEqualStrings("https://example.com/other", reader.namespaceUri("other"));
    try expectEqualStrings("https://example.com/child", reader.namespaceUri("child"));

    try expectEqual(.element_end, try reader.read());
    try expectEqualStrings("https://example.com/default", reader.namespaceUri(""));
    try expectEqualStrings("https://example.com/other", reader.namespaceUri("other"));
    try expectEqualStrings("https://example.com/child", reader.namespaceUri("child"));

    try expectEqual(.text, try reader.read());
    try expectEqualStrings("https://example.com/default", reader.namespaceUri(""));
    try expectEqualStrings("https://example.com/other", reader.namespaceUri("other"));
    try expectEqualStrings("", reader.namespaceUri("child"));

    try expectEqual(.element_end, try reader.read());
    try expectEqualStrings("https://example.com/default", reader.namespaceUri(""));
    try expectEqualStrings("https://example.com/other", reader.namespaceUri("other"));
    try expectEqualStrings("", reader.namespaceUri("child"));
}
+
/// Splits `name` into its prefix and local part at the first ':' (the prefix
/// is empty when there is no colon) and resolves the prefix to its bound
/// namespace URI via `namespaceUri`.
fn parseQName(reader: Reader, name: []const u8) PrefixedQName {
    var prefix: []const u8 = "";
    var local: []const u8 = name;
    if (std.mem.indexOfScalar(u8, name, ':')) |sep| {
        prefix = name[0..sep];
        local = name[sep + 1 ..];
    }
    return .{
        .prefix = prefix,
        .ns = reader.namespaceUri(prefix),
        .local = local,
    };
}
+
+pub const ReadError = error{MalformedXml} || Allocator.Error;
+
+/// Reads and returns the next node in the document.
+pub fn read(reader: *Reader) anyerror!Node {
+ errdefer reader.node = null;
+ const node: Node = while (true) {
+ switch (reader.state) {
+ .invalid => return error.MalformedXml,
+ .start => {
+ try reader.shift();
+ try reader.skipBom();
+ if (try reader.readMatch("")) {
+ try reader.readName();
+ if (std.mem.eql(u8, reader.piTargetUnchecked(), "xml")) {
+ try reader.readXmlDeclarationContent();
+ reader.state = .after_xml_declaration;
+ try reader.checkXmlDeclaration();
+ break .xml_declaration;
+ } else {
+ try reader.readPiContent();
+ reader.state = .after_xml_declaration;
+ try reader.checkPi();
+ break .pi;
+ }
+ }
+ reader.state = .after_xml_declaration;
+ continue;
+ },
+ .after_xml_declaration => {
+ try reader.skipSpace();
+ if (try reader.readMatch("")) {
+ try reader.readName();
+ try reader.readPiContent();
+ try reader.checkPi();
+ break .pi;
+ } else if (try reader.readMatch("
+ \\
+ \\
+ \\
+ );
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+
+ try reader.skipProlog();
+ try expectEqualStrings("root", reader.elementName());
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualStrings("root", reader.elementName());
+ try expectEqual(.eof, try reader.read());
+}
+
/// Reads and discards all document content until the end of the containing
/// element, which is the current node after this function returns
/// successfully.
/// Asserts that the reader is currently inside an element (not before or
/// after the root element).
pub fn skipElement(reader: *Reader) anyerror!void {
    assert(reader.state == .in_root or reader.state == .empty_element or reader.state == .empty_root);
    // Nested element_starts grow element_names; the element_end that brings
    // the stack back to this depth closes the containing element.
    const target_depth = reader.element_names.items.len;
    while (true) {
        const node = try reader.read();
        if (node == .element_end and reader.element_names.items.len == target_depth) break;
    }
}
+
test skipElement {
    // NOTE(review): the document literal appears to have lost its element
    // markup in transit (a root element with nested children is expected) —
    // confirm against the upstream source.
    var doc = StaticDocument.init(
        \\
        \\  Hello, world!
        \\
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("root", reader.elementName());
    try reader.skipElement();
    try expectEqualStrings("root", reader.elementName());
    try expectEqual(.eof, try reader.read());
}
+
/// Reads the name="value" pairs of an XML declaration up to and including
/// the closing "?>".
fn readXmlDeclarationContent(reader: *Reader) !void {
    while (true) {
        try reader.readSpace();
        if (try reader.readMatch("?>")) return;
        try reader.readPair();
    }
}

/// Validates the pseudo-attributes of an XML declaration: `version` is
/// required and must come first, optionally followed by `encoding` and/or
/// `standalone`, in that order (per the XMLDecl grammar).
fn checkXmlDeclaration(reader: *Reader) !void {
    try reader.checkAttributes();
    // Tracks which pseudo-attributes are still permitted.
    var state: enum {
        start,
        after_version,
        after_encoding,
        end,
    } = .start;
    for (0..reader.attributeCountUnchecked()) |i| {
        const name = reader.attributeNameUnchecked(i);
        const value = reader.attributeValueUnchecked(i);
        switch (state) {
            .start => if (std.mem.eql(u8, name, "version")) {
                try reader.checkXmlVersion(value, i);
                state = .after_version;
            } else {
                return reader.fatal(.xml_declaration_version_missing, 0);
            },
            .after_version => if (std.mem.eql(u8, name, "encoding")) {
                try reader.checkXmlEncoding(value, i);
                state = .after_encoding;
            } else if (std.mem.eql(u8, name, "standalone")) {
                try reader.checkXmlStandalone(value, i);
                state = .end;
            } else {
                return reader.fatal(.xml_declaration_attribute_unsupported, reader.attributeNamePos(i));
            },
            .after_encoding => if (std.mem.eql(u8, name, "standalone")) {
                try reader.checkXmlStandalone(value, i);
                state = .end;
            } else {
                return reader.fatal(.xml_declaration_attribute_unsupported, reader.attributeNamePos(i));
            },
            .end => return reader.fatal(.xml_declaration_attribute_unsupported, reader.attributeNamePos(i)),
        }
    }
    // A declaration with no attributes at all is also missing `version`.
    if (state == .start) {
        return reader.fatal(.xml_declaration_version_missing, 0);
    }
}
+
/// Rejects any declared XML version other than "1." followed only by ASCII
/// digits; only XML 1.x documents are supported.
fn checkXmlVersion(reader: *Reader, version: []const u8, n_attr: usize) !void {
    const supported = std.mem.startsWith(u8, version, "1.") and for (version["1.".len..]) |c| {
        if (!std.ascii.isDigit(c)) break false;
    } else true;
    if (!supported) {
        return reader.fatal(.xml_declaration_version_unsupported, reader.attributeValuePos(n_attr));
    }
}
+
/// Rejects any declared encoding other than UTF-8 (case-insensitively);
/// this reader only handles UTF-8 input.
fn checkXmlEncoding(reader: *Reader, encoding: []const u8, n_attr: usize) !void {
    if (!std.ascii.eqlIgnoreCase(encoding, "utf-8")) {
        return reader.fatal(.xml_declaration_encoding_unsupported, reader.attributeValuePos(n_attr));
    }
}

/// Validates that the standalone pseudo-attribute is exactly "yes" or "no".
fn checkXmlStandalone(reader: *Reader, standalone: []const u8, n_attr: usize) !void {
    if (!std.mem.eql(u8, standalone, "yes") and !std.mem.eql(u8, standalone, "no")) {
        return reader.fatal(.xml_declaration_standalone_malformed, reader.attributeValuePos(n_attr));
    }
}

/// Reads attributes until the start tag closes.
/// Returns true for a self-closing ("/>") tag, false for ">".
fn readElementStartContent(reader: *Reader) !bool {
    while (true) {
        try reader.readSpace();
        if (try reader.readMatch("/>")) {
            return true;
        } else if (try reader.readMatch(">")) {
            return false;
        } else {
            try reader.readPair();
        }
    }
}
+
/// Validates an element start tag: checks the element name and attributes,
/// records the element name for end-tag matching, and (when namespace-aware)
/// opens a new prefix-binding scope and validates the qualified name.
fn checkElementStart(reader: *Reader) !void {
    const element_name = reader.elementNameUnchecked();
    const element_name_pos = reader.elementNamePos();
    try reader.checkName(element_name, element_name_pos);
    try reader.checkAttributes();

    // Store the name so checkElementEnd can match the closing tag even after
    // the buffer is shifted.
    const element_name_index = try reader.addString(element_name);
    try reader.element_names.append(reader.gpa, element_name_index);

    if (reader.options.namespace_aware) {
        // New binding scope; populated by checkAttributesNs from xmlns attrs.
        try reader.ns_prefixes.append(reader.gpa, .{});
        try reader.checkAttributesNs();
        if (std.mem.indexOfScalar(u8, element_name, ':')) |colon_pos| {
            const prefix = element_name[0..colon_pos];
            // "xmlns" may never be used as an element prefix.
            if (std.mem.eql(u8, prefix, "xmlns")) return reader.fatal(.namespace_prefix_illegal, element_name_pos);
            try reader.checkNcName(prefix, element_name_pos);
            const local = element_name[colon_pos + 1 ..];
            try reader.checkNcName(local, element_name_pos);
            if (reader.namespaceUri(prefix).len == 0) return reader.fatal(.namespace_prefix_unbound, element_name_pos);
        }
    }
}
+
/// Validates all attributes of the current tag: whitespace separation
/// between attributes, well-formed names, no duplicate names, and legal
/// attribute values. Also populates the by-name attribute lookup table.
fn checkAttributes(reader: *Reader) !void {
    const n_attributes = reader.attributeCountUnchecked();
    try reader.attributes.ensureUnusedCapacity(reader.gpa, n_attributes);
    for (0..n_attributes) |i| {
        const name_pos = reader.attributeNamePos(i);
        // If this name starts right after the previous value's closing quote,
        // the required whitespace between attributes is missing.
        if (i > 0 and name_pos == reader.attributeValueEndPos(i - 1) + 1) {
            return reader.fatal(.attribute_missing_space, name_pos);
        }

        const name = reader.attributeNameUnchecked(i);
        try reader.checkName(name, name_pos);

        const gop = reader.attributes.getOrPutAssumeCapacity(name);
        if (gop.found_existing) return reader.fatal(.attribute_duplicate, name_pos);
        gop.value_ptr.* = i;

        try reader.checkAttributeValue(i);
    }
}
+
/// Validates the raw value of attribute `n`: all characters must be legal XML
/// `Char`s, a literal '<' is forbidden, and every '&' must begin a
/// well-formed character reference or a predefined entity reference.
fn checkAttributeValue(reader: *Reader, n: usize) !void {
    const s = reader.attributeValueUnchecked(n);
    const pos = reader.attributeValuePos(n);
    try reader.validateUtf8(s, pos);
    var i: usize = 0;
    while (i < s.len) : (i += 1) {
        switch (s[i]) {
            '\t',
            '\n',
            '\r',
            0x20...('&' - 1),
            ('&' + 1)...('<' - 1),
            ('<' + 1)...0xEE,
            0xF0...0xFF,
            => {},
            0xEF => {
                // We already validated for correct UTF-8, so we know 2 bytes follow.
                // The Unicode codepoints U+FFFE and U+FFFF are not allowed as characters:
                // U+FFFE: EF BF BE
                // U+FFFF: EF BF BF
                if (s[i + 1] == 0xBF and (s[i + 2] == 0xBE or s[i + 2] == 0xBF)) {
                    return reader.fatal(.illegal_character, pos + i);
                }
            },
            '<' => return reader.fatal(.attribute_illegal_character, pos + i),
            '&' => {
                if (std.mem.startsWith(u8, s[i + "&".len ..], "#")) {
                    const end = std.mem.indexOfScalarPos(u8, s, i, ';') orelse return reader.fatal(.character_reference_unclosed, pos + i);
                    // Fix: the digits of a character reference start after
                    // "&#", not at the '&' itself (the original slice started
                    // at `i + "".len`, so parsing always failed).
                    const ref = s[i + "&#".len .. end];
                    const c = if (std.mem.startsWith(u8, ref, "x"))
                        std.fmt.parseInt(u21, ref["x".len..], 16) catch return reader.fatal(.character_reference_malformed, pos + i)
                    else
                        std.fmt.parseInt(u21, ref, 10) catch return reader.fatal(.character_reference_malformed, pos + i);
                    if (!isChar(c)) return reader.fatal(.character_reference_malformed, pos + i);
                    // Skip past the reference, mirroring the entity branch.
                    i = end;
                } else {
                    const end = std.mem.indexOfScalarPos(u8, s, i, ';') orelse return reader.fatal(.entity_reference_unclosed, pos + i);
                    const ref = s[i + "&".len .. end];
                    if (!predefined_entities.has(ref)) return reader.fatal(.entity_reference_undefined, pos + i);
                    i = end;
                }
            },
            else => return reader.fatal(.illegal_character, pos + i),
        }
    }
}
+
/// Namespace-aware attribute validation: first processes all xmlns
/// declarations on this tag into the current prefix-binding scope, then
/// resolves every prefixed attribute and rejects duplicate expanded
/// (namespace, local) attribute names.
fn checkAttributesNs(reader: *Reader) !void {
    const n_attributes = reader.attributeCountUnchecked();
    try reader.q_attributes.ensureUnusedCapacity(reader.gpa, n_attributes);
    // Scope pushed by checkElementStart for this element.
    const prefix_bindings = &reader.ns_prefixes.items[reader.ns_prefixes.items.len - 1];

    for (0..n_attributes) |i| {
        const name = reader.attributeNameUnchecked(i);
        const pos = reader.attributeNamePos(i);
        if (std.mem.eql(u8, name, "xmlns")) {
            // Default namespace declaration.
            const value = reader.attributeValueUnchecked(i);
            const uri_index = try reader.addAttributeValueString(value);
            const uri = reader.string(uri_index);
            // The xml and xmlns namespace URIs may not be bound as a default.
            if (std.mem.eql(u8, uri, ns_xml) or std.mem.eql(u8, uri, ns_xmlns)) {
                return reader.fatal(.namespace_binding_illegal, pos);
            }
            try prefix_bindings.putNoClobber(reader.gpa, .empty, uri_index);
        } else if (std.mem.startsWith(u8, name, "xmlns:")) {
            const prefix = name["xmlns:".len..];
            // The "xmlns" prefix itself may never be declared.
            if (std.mem.eql(u8, prefix, "xmlns")) return reader.fatal(.namespace_binding_illegal, pos);
            try reader.checkNcName(prefix, pos);
            const prefix_index = try reader.addString(prefix);
            const value = reader.attributeValueUnchecked(i);
            // Un-declaring a prefix (empty URI) is not allowed in XML NS 1.0.
            if (value.len == 0) return reader.fatal(.attribute_prefix_undeclared, pos);
            const uri_index = try reader.addAttributeValueString(value);
            const uri = reader.string(uri_index);
            // Fix: the constraint is that the *prefix* "xml" is bound if and
            // only if the URI is the xml namespace (the original compared the
            // URI against the literal "xml").
            if (std.mem.eql(u8, prefix, "xml") != std.mem.eql(u8, uri, ns_xml)) return reader.fatal(.namespace_binding_illegal, pos);
            // The xmlns namespace URI may not be bound to any prefix.
            if (std.mem.eql(u8, uri, ns_xmlns)) return reader.fatal(.namespace_binding_illegal, pos);
            try prefix_bindings.putNoClobber(reader.gpa, prefix_index, uri_index);
        }
    }

    for (0..n_attributes) |i| {
        const name = reader.attributeNameUnchecked(i);
        const pos = reader.attributeNamePos(i);
        const colon_pos = std.mem.indexOfScalar(u8, name, ':') orelse {
            // Unprefixed attributes are in no namespace.
            reader.q_attributes.putAssumeCapacityNoClobber(.{ .ns = "", .local = name }, i);
            continue;
        };
        const prefix = name[0..colon_pos];
        try reader.checkNcName(prefix, pos);
        const local = name[colon_pos + 1 ..];
        try reader.checkNcName(local, pos);
        const uri = reader.namespaceUri(prefix);
        if (uri.len == 0) return reader.fatal(.namespace_prefix_unbound, pos);
        // Two attributes may not share the same expanded name even if their
        // prefixes differ.
        const gop = reader.q_attributes.getOrPutAssumeCapacity(.{ .ns = uri, .local = local });
        if (gop.found_existing) return reader.fatal(.attribute_duplicate, pos);
        gop.value_ptr.* = i;
    }
}
+
/// Interns the normalized form of `raw_value` into the string arena and
/// returns its index. Normalization per the XML spec: tab/LF/CR(LF) become a
/// single space, and character/entity references are expanded.
/// Assumes `raw_value` was already validated by checkAttributeValue, so every
/// '&' is known to start a terminated, well-formed reference.
fn addAttributeValueString(reader: *Reader, raw_value: []const u8) !StringIndex {
    // Strings in the arena are NUL-separated; see `string`.
    try reader.strings.append(reader.gpa, 0);
    const start = reader.strings.items.len;
    var i: usize = 0;
    while (i < raw_value.len) : (i += 1) {
        switch (raw_value[i]) {
            '\t', '\n' => try reader.strings.append(reader.gpa, ' '),
            '\r' => {
                try reader.strings.append(reader.gpa, ' ');
                // CRLF collapses to a single space.
                if (i + 1 < raw_value.len and raw_value[i + 1] == '\n') i += 1;
            },
            '&' => {
                const entity_end = std.mem.indexOfScalarPos(u8, raw_value, i, ';') orelse unreachable;
                if (raw_value[i + "&".len] == '#') {
                    // Fix: the reference digits begin after "&#" (or "&#x"
                    // for hex); the original sliced from `i + "".len`, i.e.
                    // from the '&' itself, so parsing could never succeed.
                    const c = if (raw_value[i + "&#".len] == 'x')
                        std.fmt.parseInt(u21, raw_value[i + "&#x".len .. entity_end], 16) catch unreachable
                    else
                        std.fmt.parseInt(u21, raw_value[i + "&#".len .. entity_end], 10) catch unreachable;
                    try reader.strings.ensureUnusedCapacity(reader.gpa, 4);
                    // Fix: encode into the unused capacity at the end of the
                    // arena; the original encoded into `strings.items`, which
                    // overwrites the beginning of the arena.
                    reader.strings.items.len += std.unicode.utf8Encode(c, reader.strings.unusedCapacitySlice()) catch unreachable;
                } else {
                    const expansion = predefined_entities.get(raw_value[i + "&".len .. entity_end]) orelse unreachable;
                    try reader.strings.appendSlice(reader.gpa, expansion);
                }
                i = entity_end;
            },
            else => |b| try reader.strings.append(reader.gpa, b),
        }
    }
    return @enumFromInt(start);
}
+
/// Validates that the end tag's name matches the innermost open element.
fn checkElementEnd(reader: *Reader) !void {
    const element_name = reader.string(reader.element_names.getLast());
    if (!std.mem.eql(u8, reader.elementNameUnchecked(), element_name)) {
        return reader.fatal(.element_end_mismatched, reader.elementNamePos());
    }
}

/// Reads comment content up to and including "-->", recording the content
/// span. "--" inside a comment is only legal as part of the terminator.
fn readCommentContent(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        reader.pos = std.mem.indexOfPos(u8, reader.buf, reader.pos, "--") orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            if (!std.mem.startsWith(u8, reader.buf[reader.pos + "--".len ..], ">")) {
                return reader.fatal(.comment_malformed, reader.pos);
            }
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += "-->".len;
            return;
        }
        // "--" not found in the buffered data; request more input.
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.comment_unclosed, reader.pos);
    }
}

/// Validates that the comment content consists of legal XML characters.
fn checkComment(reader: *Reader) !void {
    try reader.checkChars(reader.commentUnchecked(), reader.commentPos());
}

/// Reads PI data (after the target name) up to and including "?>",
/// recording the data span as spans.items[1].
fn readPiContent(reader: *Reader) !void {
    try reader.readSpace();
    const start = reader.pos;
    while (true) {
        reader.pos = std.mem.indexOfPos(u8, reader.buf, reader.pos, "?>") orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += "?>".len;
            return;
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.pi_unclosed, reader.pos);
    }
}

/// Validates a processing instruction: the target must be a legal name, not
/// any case variant of "xml", colon-free when namespace-aware, and separated
/// from non-empty data by whitespace; the data must be legal characters.
fn checkPi(reader: *Reader) !void {
    const target = reader.piTargetUnchecked();
    if (std.ascii.eqlIgnoreCase(target, "xml")) {
        return reader.fatal(.pi_target_disallowed, reader.piTargetPos());
    }
    try reader.checkName(target, reader.piTargetPos());
    if (reader.options.namespace_aware and std.mem.indexOfScalar(u8, target, ':') != null) {
        return reader.fatal(.name_malformed, reader.piTargetPos());
    }
    // Data starting directly at the end of the target means no space between.
    if (reader.piTargetEndPos() == reader.piDataPos() and reader.piDataEndPos() > reader.piDataPos()) {
        return reader.fatal(.pi_missing_space, reader.piDataPos());
    }
    try reader.checkChars(reader.piDataUnchecked(), reader.piDataPos());
}
+
/// Advances `pos` over a run of character data, stopping at '&' or '<' (or
/// end of buffered input). Takes care never to stop in the middle of a UTF-8
/// sequence, between a CR and a following LF, or inside "]]>", since later
/// stages need those intact.
fn readText(reader: *Reader) !void {
    while (reader.pos < reader.buf.len) {
        const b = reader.buf[reader.pos];
        if (b == '&' or b == '<') return;
        // We don't care about validating UTF-8 strictly here.
        // We just don't want to end in the possible middle of a codepoint.
        const nb: usize = if (b < 0x80) {
            reader.pos += 1;
            continue;
        } else if (b < 0xE0)
            2
        else if (b < 0xF0)
            3
        else
            4;
        if (reader.pos + nb > reader.buf.len) try reader.more();
        reader.pos = @min(reader.pos + nb, reader.buf.len);
    }
    // We don't want to end on a CR right before an LF, or CRLF normalization will not be possible.
    if (reader.pos > 0 and reader.buf[reader.pos - 1] == '\r') {
        try reader.more();
        if (reader.pos < reader.buf.len and reader.buf[reader.pos] == '\n') {
            reader.pos += 1;
        }
        return;
    }
    // We also don't want to end in the middle of ']]>' which checkText needs to reject.
    if (reader.pos > 0 and reader.buf[reader.pos - 1] == ']') {
        try reader.more();
        if (std.mem.startsWith(u8, reader.buf[reader.pos..], "]>")) {
            reader.pos += "]>".len;
        }
        return;
    }
}

/// Validates a text node: characters must be legal XML `Char`s and the
/// sequence "]]>" may not appear literally in character data.
fn checkText(reader: *Reader) !void {
    const s = reader.textUnchecked();
    const pos = reader.textPos();
    try reader.validateUtf8(s, pos);
    for (s, 0..) |c, i| {
        switch (c) {
            '\t',
            '\n',
            '\r',
            0x20...(']' - 1),
            (']' + 1)...0xEE,
            0xF0...0xFF,
            => {},
            ']' => {
                // ']' followed by "]>" forms the forbidden "]]>" sequence.
                if (std.mem.startsWith(u8, s[i + 1 ..], "]>")) {
                    return reader.fatal(.text_cdata_end_disallowed, pos + i);
                }
            },
            0xEF => {
                // We already validated for correct UTF-8, so we know 2 bytes follow.
                // The Unicode codepoints U+FFFE and U+FFFF are not allowed as characters:
                // U+FFFE: EF BF BE
                // U+FFFF: EF BF BF
                if (s[i + 1] == 0xBF and (s[i + 2] == 0xBE or s[i + 2] == 0xBF)) {
                    return reader.fatal(.illegal_character, pos + i);
                }
            },
            else => return reader.fatal(.illegal_character, pos + i),
        }
    }
}
+
/// Reads CDATA content up to and including "]]>", recording the content span.
fn readCdata(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        reader.pos = std.mem.indexOfPos(u8, reader.buf, reader.pos, "]]>") orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += "]]>".len;
            return;
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.cdata_unclosed, reader.pos);
    }
}

/// Validates that the CDATA content consists of legal XML characters.
fn checkCdata(reader: *Reader) !void {
    try reader.checkChars(reader.cdataUnchecked(), reader.cdataPos());
}

/// Validates that the referenced entity is one of the five predefined
/// entities (no DTD-declared entities are supported).
fn checkEntityReference(reader: *Reader) !void {
    if (!predefined_entities.has(reader.entityReferenceNameUnchecked())) {
        return reader.fatal(.entity_reference_undefined, reader.entityReferenceNamePos());
    }
}

/// Reads the body of a character reference (the alphanumeric run after
/// "&#"), recording its span; the terminating ';' is handled by the caller.
fn readCharacterReference(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                '0'...'9', 'A'...'Z', 'a'...'z' => reader.pos += 1,
                else => {
                    try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
                    return;
                },
            }
        }
        try reader.more();
        // No more input: end the span at the end of the buffer.
        if (reader.pos == reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            return;
        }
    }
}
+
/// Parses and validates the character reference body ("xHH.." hex or
/// decimal digits), checks the codepoint is a legal XML `Char`, and caches
/// it in `reader.character` for characterReferenceChar.
fn checkCharacterReference(reader: *Reader) !void {
    const ref = reader.characterReferenceNameUnchecked();
    const pos = reader.characterReferenceNamePos();
    const c = if (std.mem.startsWith(u8, ref, "x"))
        std.fmt.parseInt(u21, ref["x".len..], 16) catch return reader.fatal(.character_reference_malformed, pos)
    else
        std.fmt.parseInt(u21, ref, 10) catch return reader.fatal(.character_reference_malformed, pos);
    if (!isChar(c)) return reader.fatal(.character_reference_malformed, pos);
    reader.character = c;
}
+
/// Reads a run of name-like bytes (a superset of legal XML names; strict
/// validation happens later in checkName), recording its span.
fn readName(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                // 0x80...0xFF: accept any non-ASCII byte here; checkName
                // validates the actual codepoints.
                'A'...'Z', 'a'...'z', '0'...'9', ':', '_', '-', '.', 0x80...0xFF => reader.pos += 1,
                else => {
                    try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
                    return;
                },
            }
        }
        try reader.more();
        if (reader.pos == reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            return;
        }
    }
}

/// Reads one name="value" pair (with optional surrounding whitespace around
/// the '='), recording name and value spans.
fn readPair(reader: *Reader) !void {
    try reader.readName();
    try reader.readSpace();
    if (!try reader.readMatch("=")) return reader.fatal(.expected_equals, reader.pos);
    try reader.readSpace();
    try reader.readQuotedValue();
}

/// Reads a quoted value delimited by matching '"' or '\'', recording the
/// span of the content between the quotes.
fn readQuotedValue(reader: *Reader) !void {
    const quote = quote: {
        if (reader.pos == reader.buf.len) {
            try reader.more();
            if (reader.pos == reader.buf.len) return reader.fatal(.expected_quote, reader.pos);
        }
        break :quote switch (reader.buf[reader.pos]) {
            '"', '\'' => |c| c,
            else => return reader.fatal(.expected_quote, reader.pos),
        };
    };
    reader.pos += 1;
    const start = reader.pos;
    while (true) {
        reader.pos = std.mem.indexOfScalarPos(u8, reader.buf, reader.pos, quote) orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += 1;
            return;
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.missing_end_quote, reader.pos);
    }
}

/// Consumes `needle` at the current position if present, fetching more input
/// first when the buffered remainder is shorter than `needle`.
/// Returns whether `needle` was matched (and consumed).
fn readMatch(reader: *Reader, needle: []const u8) !bool {
    if (reader.pos + needle.len > reader.buf.len) {
        try reader.more();
        if (reader.pos + needle.len > reader.buf.len) return false;
    }
    if (std.mem.eql(u8, reader.buf[reader.pos..][0..needle.len], needle)) {
        reader.pos += needle.len;
        return true;
    }
    return false;
}
+
/// Consumes any run of XML whitespace (space, tab, CR, LF), fetching more
/// input as needed. Does not shift the buffer (unlike skipSpace).
fn readSpace(reader: *Reader) !void {
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                ' ', '\t', '\r', '\n' => reader.pos += 1,
                else => return,
            }
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return;
    }
}

/// Validates `s` as an XML `Name`: first codepoint must be a NameStartChar,
/// the rest NameChars. Reports errors at `pos`.
fn checkName(reader: *Reader, s: []const u8, pos: usize) !void {
    const view = try reader.viewUtf8(s, pos);
    var iter = view.iterator();
    // An empty name is malformed.
    if (!isNameStartChar(iter.nextCodepoint() orelse return reader.fatal(.name_malformed, pos))) {
        return reader.fatal(.name_malformed, pos);
    }
    while (iter.nextCodepoint()) |c| {
        if (!isNameChar(c)) return reader.fatal(.name_malformed, pos);
    }
}

/// Validates `s` as an NCName (a name with no ':').
/// NOTE(review): `s[0]` is a raw byte, not a decoded codepoint, so for
/// multi-byte UTF-8 names this tests the lead byte against codepoint ranges,
/// and trailing characters are not checked at all — confirm against upstream
/// whether a full codepoint-wise NCName check was intended here.
fn checkNcName(reader: *Reader, s: []const u8, pos: usize) !void {
    if (s.len == 0 or !isNameStartChar(s[0]) or std.mem.indexOfScalar(u8, s, ':') != null) {
        return reader.fatal(.name_malformed, pos);
    }
}
+
/// Reports whether `c` may begin an XML `Name`, per the NameStartChar
/// production of XML 1.0 (fifth edition).
fn isNameStartChar(c: u21) bool {
    // ASCII fast path.
    if (c == ':' or c == '_') return true;
    if ((c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')) return true;
    // Non-ASCII ranges from the NameStartChar production.
    return switch (c) {
        0xC0...0xD6,
        0xD8...0xF6,
        0xF8...0x2FF,
        0x370...0x37D,
        0x37F...0x1FFF,
        0x200C...0x200D,
        0x2070...0x218F,
        0x2C00...0x2FEF,
        0x3001...0xD7FF,
        0xF900...0xFDCF,
        0xFDF0...0xFFFD,
        0x10000...0xEFFFF,
        => true,
        else => false,
    };
}
+
/// Reports whether `c` may appear after the first character of an XML
/// `Name`, per the NameChar production (NameStartChar plus digits, '-', '.',
/// U+00B7, combining marks U+0300–U+036F, and U+203F–U+2040).
fn isNameChar(c: u21) bool {
    if (isNameStartChar(c)) return true;
    if (c == '-' or c == '.' or c == 0xB7) return true;
    if (c >= '0' and c <= '9') return true;
    return (c >= 0x0300 and c <= 0x036F) or (c >= 0x203F and c <= 0x2040);
}
+
/// Validates that `s` is well-formed UTF-8 consisting only of legal XML
/// `Char`s (control characters other than tab/LF/CR, and the noncharacters
/// U+FFFE/U+FFFF, are rejected). Reports errors at `pos`.
fn checkChars(reader: *Reader, s: []const u8, pos: usize) !void {
    try reader.validateUtf8(s, pos);
    for (s, 0..) |c, i| {
        switch (c) {
            '\t', '\n', '\r', 0x20...0xEE, 0xF0...0xFF => {},
            0xEF => {
                // We already validated for correct UTF-8, so we know 2 bytes follow.
                // The Unicode codepoints U+FFFE and U+FFFF are not allowed as characters:
                // U+FFFE: EF BF BE
                // U+FFFF: EF BF BF
                if (s[i + 1] == 0xBF and (s[i + 2] == 0xBE or s[i + 2] == 0xBF)) {
                    return reader.fatal(.illegal_character, pos + i);
                }
            },
            else => return reader.fatal(.illegal_character, pos + i),
        }
    }
}
+
/// Reports whether `c` is a legal XML `Char`: tab, LF, CR, or any codepoint
/// in 0x20–0xD7FF, 0xE000–0xFFFD, or 0x10000–0x10FFFF.
fn isChar(c: u21) bool {
    if (c < 0x20) return c == 0x9 or c == 0xA or c == 0xD;
    if (c <= 0xD7FF) return true;
    if (c < 0xE000) return false; // surrogate range
    if (c <= 0xFFFD) return true;
    return c >= 0x10000 and c <= 0x10FFFF;
}
+
/// Consumes a UTF-8 byte-order mark at the start of input, if present, and
/// shifts the buffer past it.
fn skipBom(reader: *Reader) !void {
    const bom = "\u{FEFF}";
    if (std.mem.startsWith(u8, reader.buf[reader.pos..], bom)) {
        reader.pos += bom.len;
        try reader.shift();
    }
}

/// Consumes any run of XML whitespace and shifts the buffer so the discarded
/// content does not accumulate (used between top-level constructs).
fn skipSpace(reader: *Reader) !void {
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                ' ', '\t', '\r', '\n' => reader.pos += 1,
                else => {
                    try reader.shift();
                    return;
                },
            }
        }
        try reader.shift();
        if (reader.pos == reader.buf.len) return;
    }
}

/// Validates that `s` is well-formed UTF-8, unless the user opted out via
/// `assume_valid_utf8`.
fn validateUtf8(reader: *Reader, s: []const u8, pos: usize) !void {
    if (reader.options.assume_valid_utf8) return;
    if (!std.unicode.utf8ValidateSlice(s)) return reader.fatalInvalidUtf8(s, pos);
}

/// Returns a UTF-8 view over `s`, validating it first unless the user opted
/// out via `assume_valid_utf8`.
fn viewUtf8(reader: *Reader, s: []const u8, pos: usize) !std.unicode.Utf8View {
    if (reader.options.assume_valid_utf8) return std.unicode.Utf8View.initUnchecked(s);
    return std.unicode.Utf8View.init(s) catch reader.fatalInvalidUtf8(s, pos);
}

/// Reports an invalid-UTF-8 error with the byte offset of the first invalid
/// sequence. Precondition: `s` is known to contain invalid UTF-8 (otherwise
/// the scan below would walk off the end).
fn fatalInvalidUtf8(reader: *Reader, s: []const u8, pos: usize) error{MalformedXml} {
    // We need to backtrack and redo the UTF-8 validation to set the correct
    // error location; the standard "validate UTF-8" function doesn't provide
    // an index for the invalid data.
    var invalid_pos: usize = 0;
    while (true) {
        const cp_len = std.unicode.utf8ByteSequenceLength(s[invalid_pos]) catch break;
        if (invalid_pos + cp_len > s.len) break;
        if (!std.unicode.utf8ValidateSlice(s[invalid_pos..][0..cp_len])) break;
        invalid_pos += cp_len;
    }
    return reader.fatal(.invalid_utf8, pos + invalid_pos);
}
+
// Minimum amount of fresh input requested from the source on each shift.
const base_read_size = 4096;

/// Discards the consumed portion of the buffer and resets per-node state
/// (spans, attribute tables). Also finalizes an element_end node by popping
/// its namespace scope and its stored name from the string arena.
fn shift(reader: *Reader) !void {
    if (reader.options.location_aware) {
        // Account for the consumed bytes before they are discarded.
        reader.loc.update(reader.buf[0..reader.pos]);
    }

    reader.buf = try reader.source.move(reader.pos, base_read_size);
    reader.pos = 0;
    reader.spans.clearRetainingCapacity();
    reader.attributes.clearRetainingCapacity();
    reader.q_attributes.clearRetainingCapacity();

    if (reader.node == .element_end) {
        if (reader.options.namespace_aware) {
            var prefix_bindings = reader.ns_prefixes.pop();
            prefix_bindings.deinit(reader.gpa);
        }
        // The element's interned name (and everything after it) is no longer
        // needed once the element is closed.
        const element_name_start = reader.element_names.pop();
        reader.strings.shrinkRetainingCapacity(@intFromEnum(element_name_start));
    }
}

/// Requests more buffered input without discarding anything.
/// NOTE(review): asks the source to grow to `buf.len * 2`; presumably the
/// source guarantees a non-empty initial buffer, otherwise this could never
/// grow — confirm against the Source interface contract.
fn more(reader: *Reader) !void {
    reader.buf = try reader.source.move(0, reader.buf.len * 2);
}

/// Records a fatal parse error (code and byte position), transitions the
/// reader to the invalid state, and returns error.MalformedXml.
fn fatal(reader: *Reader, error_code: ErrorCode, error_pos: usize) error{MalformedXml} {
    reader.state = .invalid;
    reader.error_code = error_code;
    reader.error_pos = error_pos;
    return error.MalformedXml;
}
+
/// Hash-map context for expanded attribute names: two QNames are equal when
/// both their namespace URI and local name match.
const QNameContext = struct {
    pub fn hash(ctx: @This(), qname: QName) u32 {
        _ = ctx;
        var w = std.hash.Wyhash.init(0);
        w.update(qname.ns);
        w.update(qname.local);
        return @truncate(w.final());
    }

    pub fn eql(ctx: @This(), a: QName, b: QName, b_index: usize) bool {
        _ = ctx;
        _ = b_index;
        return std.mem.eql(u8, a.ns, b.ns) and std.mem.eql(u8, a.local, b.local);
    }
};

// A half-open [start, end) byte range into `reader.buf`.
const BufSpan = struct {
    start: usize,
    end: usize,
};

fn bufSlice(reader: Reader, span: BufSpan) []const u8 {
    return reader.buf[span.start..span.end];
}

// Index of a NUL-terminated string in the `strings` arena.
// `.empty` works because addString never returns 0 (it always appends a
// separator byte before the string content).
const StringIndex = enum(usize) { empty = 0, _ };

/// Adapter so a `[]const u8` key can be looked up in a map whose stored keys
/// are StringIndex values into the strings arena.
const StringIndexAdapter = struct {
    strings: []const u8,

    pub fn hash(ctx: @This(), key: []const u8) u32 {
        _ = ctx;
        return @truncate(std.hash.Wyhash.hash(0, key));
    }

    pub fn eql(ctx: @This(), a: []const u8, b: StringIndex, b_index: usize) bool {
        _ = b_index;
        const b_val = std.mem.sliceTo(ctx.strings[@intFromEnum(b)..], 0);
        return std.mem.eql(u8, a, b_val);
    }
};

/// Interns `s` into the strings arena (preceded by a 0 separator, which also
/// terminates the previous string) and returns its index.
fn addString(reader: *Reader, s: []const u8) !StringIndex {
    try reader.strings.ensureUnusedCapacity(reader.gpa, s.len + 1);
    reader.strings.appendAssumeCapacity(0);
    const start = reader.strings.items.len;
    reader.strings.appendSliceAssumeCapacity(s);
    return @enumFromInt(start);
}

/// Returns the interned string at `index` (up to the next 0 separator or the
/// end of the arena).
fn string(reader: Reader, index: StringIndex) []const u8 {
    return std.mem.sliceTo(reader.strings.items[@intFromEnum(index)..], 0);
}
diff --git a/src/Scanner.zig b/src/Scanner.zig
deleted file mode 100644
index 3939ea9..0000000
--- a/src/Scanner.zig
+++ /dev/null
@@ -1,2045 +0,0 @@
-//! A simple, low-level streaming XML parser.
-//!
-//! The design of the parser is strongly inspired by
-//! [Yxml](https://dev.yorhel.nl/yxml). Codepoints are fed to the parser one by one
-//! using the `next` function, then the `endInput` function should be used to
-//! check that the parser is in a valid state for the end of input (e.g. not in
-//! the middle of parsing an element). The tokens returned by the parser
-//! reference the input data using `pos` ranges (the meaning of `pos` depends
-//! on the meaning of the `len` passed to `next`).
-//!
-//! A higher-level parser which wants to do anything useful with the returned
-//! tokens will need to store the input text fed to the `next` function in some
-//! sort of buffer. If the document is stored entirely in memory, this buffer
-//! could be the document content itself. If the document is being read in a
-//! streaming manner, however, then an auxiliary buffer will be needed. To
-//! avoid requiring such higher-level APIs to maintain an unbounded input
-//! buffer, the `resetPos` function exists to reset `pos` to 0, if possible.
-//! The approach taken by `TokenReader` is to call `resetPos` after every
-//! token, and after reaching a state where space for a further codepoint is
-//! not guaranteed. With this approach, the length of the buffer bounds the
-//! maximum size of "unsplittable" content, such as element and attribute
-//! names, but not "splittable" content such as element text content and
-//! attribute values.
-//!
-//! Intentional (permanent) limitations (which can be addressed by
-//! higher-level APIs, such as `Reader`):
-//!
-//! - Does not validate that corresponding open and close tags match.
-//! - Does not validate that attribute names are not duplicated.
-//! - Does not do any special handling of namespaces.
-//! - Does not perform any sort of processing on text content or attribute
-//! values (including normalization, expansion of entities, etc.).
-//! - However, note that entity and character references in text content and
-//! attribute values _are_ validated for correct syntax, although their
-//! content is not (they may reference non-existent entities).
-//! - Does not process DTDs in any way besides parsing them (TODO: see below).
-//!
-//! Unintentional (temporary) limitations (which will be removed over time):
-//!
-//! - Does not support `DOCTYPE` at all (using one will result in an error).
-//! - Not extensively tested/fuzzed.
-
-/// The data for the most recently returned token.
-token_data: Token.Data = undefined,
-/// The current state of the scanner.
-state: State = .start,
-/// Data associated with the current state of the scanner.
-state_data: State.Data = undefined,
-/// The current position in the input.
-///
-/// The meaning of this position is determined by the meaning of the `len`
-/// value passed to `next`, which is determined by the user. For example, a
-/// user with a byte slice or reader would probably want to pass `len` as the
-/// number of bytes making up the codepoint, which would make `pos` a byte
-/// offset.
-pos: usize = 0,
-/// The current element nesting depth.
-depth: usize = 0,
-/// Whether the root element has been seen already.
-seen_root_element: bool = false,
-
-const std = @import("std");
-const testing = std.testing;
-const unicode = std.unicode;
-const syntax = @import("syntax.zig");
-
-const Scanner = @This();
-
-/// A range of byte positions in the input.
-pub const Range = struct {
- /// The start of the range (inclusive).
- start: usize,
- /// The end of the range (exclusive).
- end: usize,
-
- pub fn isEmpty(self: Range) bool {
- return self.start == self.end;
- }
-
- pub fn format(self: Range, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
- try writer.print("{}..{}", .{ self.start, self.end });
- }
-};
-
-/// A single XML token.
-///
-/// The choice of tokens is designed to allow the buffer position to be reset as
-/// often as reasonably possible ("forgetting" any range information before the
-/// reset), supported by the following design decisions:
-///
-/// - Tokens contain only the immediately necessary context: for example, the
-/// `attribute_content` token does not store any information about the
-/// attribute name, since it may have been processed many resets ago (if the
-/// attribute content is very long).
-/// - Multiple `content` tokens may be returned for a single enclosing context
-/// (e.g. element or attribute) if the buffer is reset in the middle of
-/// content or there are other necessary intervening factors, such as CDATA
-/// in the middle of normal (non-CDATA) element content.
-///
-/// For efficiency (avoiding copying when passing around tokens), this is
-/// merely an enum specifying the token type. The actual token data is available
-/// in `Token.Data`, in the scanner's `token_data` field. The `fullToken`
-/// function can be used to get a `Token.Full`, which is a tagged union type and
-/// may be easier to consume in certain circumstances.
-pub const Token = enum {
- /// Continue processing: no new token to report yet.
- ok,
- /// XML declaration.
- xml_declaration,
- /// Element start tag.
- element_start,
- /// Element content.
- element_content,
- /// Element end tag.
- element_end,
- /// End of an empty element.
- element_end_empty,
- /// Attribute start.
- attribute_start,
- /// Attribute value content.
- attribute_content,
- /// Comment start.
- comment_start,
- /// Comment content.
- comment_content,
- /// Processing instruction (PI) start.
- pi_start,
- /// PI content.
- pi_content,
-
- /// The data associated with a token.
- ///
- /// Even token types which have no associated data are represented here, to
- /// provide some additional safety in safe build modes (where it can be
- /// checked whether the caller is referencing the correct data field).
- pub const Data = union {
- ok: void,
- xml_declaration: XmlDeclaration,
- element_start: ElementStart,
- element_content: ElementContent,
- element_end: ElementEnd,
- element_end_empty: void,
- attribute_start: AttributeStart,
- attribute_content: AttributeContent,
- comment_start: void,
- comment_content: CommentContent,
- pi_start: PiStart,
- pi_content: PiContent,
- };
-
- /// A token type plus data represented as a tagged union.
- pub const Full = union(Token) {
- ok,
- xml_declaration: XmlDeclaration,
- element_start: ElementStart,
- element_content: ElementContent,
- element_end: ElementEnd,
- element_end_empty,
- attribute_start: AttributeStart,
- attribute_content: AttributeContent,
- comment_start,
- comment_content: CommentContent,
- pi_start: PiStart,
- pi_content: PiContent,
- };
-
- pub const XmlDeclaration = struct {
- version: Range,
- encoding: ?Range = null,
- standalone: ?bool = null,
- };
-
- pub const ElementStart = struct {
- name: Range,
- };
-
- pub const ElementContent = struct {
- content: Content,
- };
-
- pub const ElementEnd = struct {
- name: Range,
- };
-
- pub const AttributeStart = struct {
- name: Range,
- };
-
- pub const AttributeContent = struct {
- content: Content,
- final: bool = false,
- };
-
- pub const CommentContent = struct {
- content: Range,
- final: bool = false,
- };
-
- pub const PiStart = struct {
- target: Range,
- };
-
- pub const PiContent = struct {
- content: Range,
- final: bool = false,
- };
-
- /// A bit of content of an element or attribute.
- pub const Content = union(enum) {
- /// Raw text content (does not contain any entities).
- text: Range,
- /// A Unicode codepoint.
- codepoint: u21,
- /// An entity reference, such as `&`. The range covers the name (`amp`).
- entity: Range,
- };
-};
-
-/// Returns the full token (including data) from the most recent call to `next`
-/// or `resetPos`. `token` must be the token returned from the last call to one
-/// of those functions.
-///
-/// ---
-///
-/// API note: the use of `self: *const Scanner` rather than `self: Scanner` is
-/// important to elimiate a potential footgun with the following code:
-///
-/// ```
-/// const full_token = scanner.fullToken(try scanner.next(c, len));
-/// ```
-///
-/// If `self: Scanner` is used, then Zig will evaluate `scanner` in its current
-/// state (for the expression `scanner.fullToken`) before calling
-/// `scanner.next`. This leads to the result being incorrect, since the `scanner`
-/// used for the `fullToken` call will have the old token data.
-pub fn fullToken(self: *const Scanner, token: Token) Token.Full {
- return switch (token) {
- inline else => |tag| @unionInit(Token.Full, @tagName(tag), @field(self.token_data, @tagName(tag))),
- };
-}
-
-/// The possible states of the parser.
-///
-/// The parser is designed as a state machine. A state may need to hold
-/// associated data to allow the necessary information to be included in a
-/// future token. One shortcut used to avoid creating many unnecessary
-/// additional states is to store a `left` byte slice tracking expected bytes
-/// remaining in a state (the slice is always pointing to static strings, so
-/// there are no lifetime considerations): for example, the word "version" in
-/// an XML declaration is parsed in the xml_decl_version_name state, and
-/// successive bytes are validated using the `left` slice (e.g. after parsing
-/// "v", left is "ersion", so that when we handle the next character, we can
-/// fail parsing if it is not "e", and then set `left` to "rsion", and so on).
-pub const State = enum {
- /// Start of document.
- start,
- /// Start of document after BOM.
- start_after_bom,
-
- /// Same as unknown_start, but also allows the XML declaration.
- start_unknown_start,
- /// Start of a PI or XML declaration after ''.
- ///
- /// Some part of 'xml' may have been matched. If this is not matched, the
- /// state will transition to a normal `pi_start`.
- ///
- /// Uses `start`, `left`.
- pi_or_xml_decl_start,
- /// Start of a PI or XML declaration after ' if (c == 0xFEFF) {
- self.state = .start_after_bom;
- return .ok;
- } else if (c == '<') {
- self.state = .start_unknown_start;
- return .ok;
- } else if (syntax.isSpace(c)) {
- self.state = .start_after_xml_decl;
- return .ok;
- },
-
- .start_after_bom => if (c == '<') {
- self.state = .start_unknown_start;
- return .ok;
- } else if (syntax.isSpace(c)) {
- self.state = .start_after_xml_decl;
- return .ok;
- },
-
- .start_unknown_start => if (syntax.isNameStartChar(c)) {
- self.state = .element_start_name;
- self.state_data.start = self.pos;
- return .ok;
- } else if (c == '?') {
- self.state = .pi_or_xml_decl_start;
- self.state_data.start = self.pos + len;
- self.state_data.left = "xml";
- return .ok;
- } else if (c == '!') {
- self.state = .unknown_start_bang;
- return .ok;
- },
-
- .pi_or_xml_decl_start => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .pi_or_xml_decl_start_after_xml;
- // self.state_data.start = self.state_data.start;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- } else if (syntax.isNameStartChar(c) or (syntax.isNameChar(c) and self.pos > self.state_data.start)) {
- self.state = .pi_target;
- // self.state_data.start = self.state_data.start;
- return .ok;
- } else if (syntax.isSpace(c) and self.pos > self.state_data.start) {
- const target = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .pi_after_target;
- self.token_data = .{ .pi_start = .{ .target = target } };
- return .pi_start;
- } else if (c == '?' and self.pos > self.state_data.start) {
- const target = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .pi_maybe_end;
- self.state_data.start = self.pos;
- self.state_data.end = self.pos;
- self.token_data = .{ .pi_start = .{ .target = target } };
- return .pi_start;
- },
-
- .pi_or_xml_decl_start_after_xml => if (syntax.isSpace(c)) {
- self.state = .xml_decl;
- return .ok;
- } else if (syntax.isNameChar(c)) {
- self.state = .pi_target;
- // self.state_data.start = self.state_data.start;
- return .ok;
- },
-
- .xml_decl => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == 'v') {
- self.state = .xml_decl_version_name;
- self.state_data.left = "ersion";
- return .ok;
- },
-
- .xml_decl_version_name => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .xml_decl_after_version_name;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- },
-
- .xml_decl_after_version_name => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '=') {
- self.state = .xml_decl_after_version_equals;
- return .ok;
- },
-
- .xml_decl_after_version_equals => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '"' or c == '\'') {
- self.state = .xml_decl_version_value_start;
- self.state_data.start = self.pos + len;
- self.state_data.quote = @intCast(c);
- self.state_data.left = "1.";
- return .ok;
- },
-
- .xml_decl_version_value_start => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .xml_decl_version_value;
- // self.state_data.start = self.state_data.start;
- // self.state_data.quote = self.state_data.quote;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- },
-
- .xml_decl_version_value => if (c == self.state_data.quote and self.pos > self.state_data.start + "1.".len) {
- self.state = .xml_decl_after_version_value;
- self.state_data.version = .{ .start = self.state_data.start, .end = self.pos };
- return .ok;
- } else if (syntax.isDigit(c)) {
- return .ok;
- },
-
- .xml_decl_after_version_value => if (syntax.isSpace(c)) {
- self.state = .xml_decl_after_version;
- // self.state_data.version = self.state_data.version;
- return .ok;
- } else if (c == '?') {
- const version = self.state_data.version;
- self.state = .xml_decl_end;
- self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = null, .standalone = null } };
- return .xml_declaration;
- },
-
- .xml_decl_after_version => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == 'e') {
- self.state = .xml_decl_encoding_name;
- // self.state_data.version = self.state_data.version;
- self.state_data.left = "ncoding";
- return .ok;
- } else if (c == 's') {
- self.state = .xml_decl_standalone_name;
- // self.state_data.version = self.state_data.version;
- self.state_data.encoding = null;
- self.state_data.left = "tandalone";
- return .ok;
- } else if (c == '?') {
- const version = self.state_data.version;
- self.state = .xml_decl_end;
- self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = null, .standalone = null } };
- return .xml_declaration;
- },
-
- .xml_decl_encoding_name => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .xml_decl_after_encoding_name;
- // self.state_data.version = self.state_data.version;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- },
-
- .xml_decl_after_encoding_name => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '=') {
- self.state = .xml_decl_after_encoding_equals;
- // self.state_data.version = self.state_data.version;
- return .ok;
- },
-
- .xml_decl_after_encoding_equals => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '"' or c == '\'') {
- self.state = .xml_decl_encoding_value_start;
- // self.state_data.version = self.state_data.version;
- self.state_data.start = self.pos + len;
- self.state_data.quote = @as(u8, @intCast(c));
- return .ok;
- },
-
- .xml_decl_encoding_value_start => if (syntax.isEncodingStartChar(c)) {
- self.state = .xml_decl_encoding_value;
- // self.state_data.version = self.state_data.version;
- // self.state_data.start = self.state_data.start;
- // self.state_data.quote = self.state_data.quote;
- return .ok;
- },
-
- .xml_decl_encoding_value => if (c == self.state_data.quote) {
- self.state = .xml_decl_after_encoding_value;
- // self.state_data.version = self.state_data.version;
- self.state_data.encoding = .{ .start = self.state_data.start, .end = self.pos };
- return .ok;
- } else if (syntax.isEncodingChar(c)) {
- return .ok;
- },
-
- .xml_decl_after_encoding_value => if (syntax.isSpace(c)) {
- self.state = .xml_decl_after_encoding;
- // self.state_data.version = self.state_data.version;
- // self.state_data.encoding = self.state_data.encoding;
- return .ok;
- } else if (c == '?') {
- const version = self.state_data.version;
- const encoding = self.state_data.encoding;
- self.state = .xml_decl_end;
- self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = null } };
- return .xml_declaration;
- },
-
- .xml_decl_after_encoding => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == 's') {
- self.state = .xml_decl_standalone_name;
- // self.state_data.version = self.state_data.version;
- // self.state_data.encoding = self.state_data.encoding;
- self.state_data.left = "tandalone";
- return .ok;
- } else if (c == '?') {
- const version = self.state_data.version;
- const encoding = self.state_data.encoding;
- self.state = .xml_decl_end;
- self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = null } };
- return .xml_declaration;
- },
-
- .xml_decl_standalone_name => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .xml_decl_after_standalone_name;
- // self.state_data.version = self.state_data.version;
- // self.state_data.encoding = self.state_data.encoding;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- },
-
- .xml_decl_after_standalone_name => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '=') {
- self.state = .xml_decl_after_standalone_equals;
- // self.state_data.version = self.state_data.version;
- // self.state_data.encoding = self.state_data.encoding;
- return .ok;
- },
-
- .xml_decl_after_standalone_equals => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '"' or c == '\'') {
- self.state = .xml_decl_standalone_value_start;
- // self.state_data.version = self.state_data.version;
- // self.state_data.encoding = self.state_data.encoding;
- self.state_data.quote = @as(u8, @intCast(c));
- return .ok;
- },
-
- .xml_decl_standalone_value_start => if (c == 'y') {
- const version = self.state_data.version;
- const encoding = self.state_data.encoding;
- self.state = .xml_decl_standalone_value;
- // self.state_data.quote = self.state_data.quote;
- self.state_data.left = "es";
- self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = true } };
- return .xml_declaration;
- } else if (c == 'n') {
- const version = self.state_data.version;
- const encoding = self.state_data.encoding;
- self.state = .xml_decl_standalone_value;
- // self.state_data.quote = self.state_data.quote;
- self.state_data.left = "o";
- self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = false } };
- return .xml_declaration;
- },
-
- .xml_decl_standalone_value => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .xml_decl_standalone_value_end;
- // self.state_data.quote = self.state_data.quote;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- },
-
- .xml_decl_standalone_value_end => if (c == self.state_data.quote) {
- self.state = .xml_decl_after_standalone;
- return .ok;
- },
-
- .xml_decl_after_standalone => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '?') {
- self.state = .xml_decl_end;
- return .ok;
- },
-
- .xml_decl_end => if (c == '>') {
- self.state = .start_after_xml_decl;
- return .ok;
- },
-
- .start_after_xml_decl => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '<') {
- self.state = .unknown_start;
- return .ok;
- },
-
- .doctype_start => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- return error.DoctypeNotSupported;
- } else {
- self.state_data.left = self.state_data.left[1..];
- return .ok;
- }
- },
-
- .document_content => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '<') {
- self.state = .unknown_start;
- return .ok;
- },
-
- .unknown_start => if (syntax.isNameStartChar(c) and !self.seen_root_element) {
- self.state = .element_start_name;
- self.state_data.start = self.pos;
- return .ok;
- } else if (c == '/' and self.depth > 0) {
- self.state = .element_end;
- return .ok;
- } else if (c == '!') {
- self.state = .unknown_start_bang;
- return .ok;
- } else if (c == '?') {
- self.state = .pi;
- return .ok;
- },
-
- .unknown_start_bang => if (c == '-') {
- self.state = .comment_before_start;
- return .ok;
- } else if (self.depth > 0 and c == '[') {
- // Textual content is not allowed outside the root element.
- self.state = .cdata_before_start;
- self.state_data.left = "CDATA[";
- return .ok;
- } else if (self.depth == 0 and !self.seen_root_element and c == 'D') {
- self.state = .doctype_start;
- self.state_data.left = "OCTYPE ";
- return .ok;
- },
-
- .comment_before_start => if (c == '-') {
- self.state = .comment;
- self.state_data.start = self.pos + len;
- self.token_data = .{ .comment_start = {} };
- return .comment_start;
- },
-
- .comment => if (c == '-') {
- self.state = .comment_maybe_before_end;
- // self.state_data.start = self.state_data.start;
- self.state_data.end = self.pos;
- return .ok;
- } else if (syntax.isChar(c)) {
- return .ok;
- },
-
- .comment_maybe_before_end => if (c == '-') {
- const content = Range{ .start = self.state_data.start, .end = self.state_data.end };
- self.state = .comment_before_end;
- self.token_data = .{ .comment_content = .{ .content = content, .final = true } };
- return .comment_content;
- } else if (syntax.isChar(c)) {
- self.state = .comment;
- // self.state_data.start = self.state_data.start;
- return .ok;
- },
-
- .comment_before_end => if (c == '>') {
- if (self.depth == 0) {
- self.state = .document_content;
- } else {
- self.state = .content;
- self.state_data.start = self.pos + len;
- }
- return .ok;
- },
-
- .pi => if (syntax.isNameStartChar(c)) {
- self.state = .pi_target;
- self.state_data.start = self.pos;
- return .ok;
- },
-
- .pi_target => if (syntax.isNameChar(c)) {
- return .ok;
- } else if (syntax.isSpace(c)) {
- const target = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .pi_after_target;
- self.token_data = .{ .pi_start = .{ .target = target } };
- return .pi_start;
- } else if (c == '?') {
- const target = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .pi_maybe_end;
- self.state_data.start = self.pos;
- self.state_data.end = self.pos;
- self.token_data = .{ .pi_start = .{ .target = target } };
- return .pi_start;
- },
-
- .pi_after_target => if (syntax.isSpace(c)) {
- return .ok;
- } else if (syntax.isChar(c)) {
- self.state = .pi_content;
- self.state_data.start = self.pos;
- return .ok;
- } else if (c == '?') {
- self.state = .pi_maybe_end;
- self.state_data.start = self.pos;
- self.state_data.end = self.pos;
- return .ok;
- },
-
- .pi_content => if (c == '?') {
- self.state = .pi_maybe_end;
- // self.state_data.start = self.state_data.start;
- self.state_data.end = self.pos;
- return .ok;
- } else if (syntax.isChar(c)) {
- return .ok;
- },
-
- .pi_maybe_end => if (c == '>') {
- const content = Range{ .start = self.state_data.start, .end = self.state_data.end };
- if (self.depth == 0) {
- self.state = .document_content;
- } else {
- self.state = .content;
- self.state_data.start = self.pos + len;
- }
- self.token_data = .{ .pi_content = .{ .content = content, .final = true } };
- return .pi_content;
- } else if (syntax.isChar(c)) {
- self.state = .pi_content;
- // self.state_data.start = self.state_data.start;
- return .ok;
- },
-
- .cdata_before_start => if (c == self.state_data.left[0]) {
- if (self.state_data.left.len == 1) {
- self.state = .cdata;
- self.state_data.start = self.pos + len;
- } else {
- self.state_data.left = self.state_data.left[1..];
- }
- return .ok;
- },
-
- .cdata => if (c == ']') {
- self.state = .cdata_maybe_before_end;
- // self.state_data.start = self.state_data.start;
- self.state_data.end = self.pos;
- return .ok;
- } else if (syntax.isChar(c)) {
- return .ok;
- },
-
- .cdata_maybe_before_end => if (c == ']') {
- self.state = .cdata_maybe_end;
- // self.state_data.start = self.state_data.start;
- // self.state_data.end = self.state_data.end;
- return .ok;
- } else if (syntax.isChar(c)) {
- self.state = .cdata;
- // self.state_data.start = self.state_data.start;
- return .ok;
- },
-
- .cdata_maybe_end => if (c == ']') {
- // For every ']' after two have been encountered, the end
- // position is incremented so only the final ']]>' marks the end of
- // CDATA.
- self.state_data.end += 1;
- return .ok;
- } else if (c == '>') {
- const text = Range{ .start = self.state_data.start, .end = self.state_data.end };
- self.state = .content;
- self.state_data.start = self.pos + len;
- self.token_data = .{ .element_content = .{ .content = .{ .text = text } } };
- return .element_content;
- } else if (syntax.isChar(c)) {
- self.state = .cdata;
- // self.state_data.start = self.state_data.start;
- return .ok;
- },
-
- .element_start_name => if (syntax.isNameChar(c)) {
- return .ok;
- } else if (syntax.isSpace(c)) {
- self.depth += 1;
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .element_start_after_name;
- self.token_data = .{ .element_start = .{ .name = name } };
- return .element_start;
- } else if (c == '/') {
- self.depth += 1;
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .element_start_empty;
- self.token_data = .{ .element_start = .{ .name = name } };
- return .element_start;
- } else if (c == '>') {
- self.depth += 1;
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .content;
- self.state_data.start = self.pos + len;
- self.token_data = .{ .element_start = .{ .name = name } };
- return .element_start;
- },
-
- .element_start_after_name => if (syntax.isSpace(c)) {
- return .ok;
- } else if (syntax.isNameStartChar(c)) {
- self.state = .attribute_name;
- self.state_data.start = self.pos;
- return .ok;
- } else if (c == '/') {
- self.state = .element_start_empty;
- return .ok;
- } else if (c == '>') {
- self.state = .content;
- self.state_data.start = self.pos + len;
- return .ok;
- },
-
- .element_start_empty => if (c == '>') {
- self.depth -= 1;
- if (self.depth == 0) {
- self.seen_root_element = true;
- }
- if (self.depth == 0) {
- self.state = .document_content;
- } else {
- self.state = .content;
- self.state_data.start = self.pos + len;
- }
- self.token_data = .{ .element_end_empty = {} };
- return .element_end_empty;
- },
-
- .attribute_name => if (syntax.isNameChar(c)) {
- return .ok;
- } else if (syntax.isSpace(c)) {
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .attribute_after_name;
- self.token_data = .{ .attribute_start = .{ .name = name } };
- return .attribute_start;
- } else if (c == '=') {
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .attribute_after_equals;
- self.token_data = .{ .attribute_start = .{ .name = name } };
- return .attribute_start;
- },
-
- .attribute_after_name => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '=') {
- self.state = .attribute_after_equals;
- return .ok;
- },
-
- .attribute_after_equals => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '"' or c == '\'') {
- self.state = .attribute_content;
- self.state_data.start = self.pos + len;
- self.state_data.quote = @as(u8, @intCast(c));
- return .ok;
- },
-
- .attribute_content => if (c == self.state_data.quote) {
- const text = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .attribute_after_content;
- self.token_data = .{ .attribute_content = .{ .content = .{ .text = text }, .final = true } };
- return .attribute_content;
- } else if (c == '&') {
- const text = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .attribute_content_ref_start;
- // self.state_data.quote = self.state_data.quote;
- if (text.isEmpty()) {
- // We do not want to emit an empty text content token between entities
- return .ok;
- } else {
- self.token_data = .{ .attribute_content = .{ .content = .{ .text = text } } };
- return .attribute_content;
- }
- } else if (c != '<' and syntax.isChar(c)) {
- return .ok;
- },
-
- .attribute_content_ref_start => if (syntax.isNameStartChar(c)) {
- self.state = .attribute_content_entity_ref_name;
- self.state_data.start = self.pos;
- // self.state_data.quote = self.state_data.quote;
- return .ok;
- } else if (c == '#') {
- self.state = .attribute_content_char_ref_start;
- // self.state_data.quote = self.state_data.quote;
- return .ok;
- },
-
- .attribute_content_entity_ref_name => if (syntax.isNameChar(c)) {
- return .ok;
- } else if (c == ';') {
- const entity = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .attribute_content;
- self.state_data.start = self.pos + len;
- // self.state_data.quote = self.state_data.quote;
- self.token_data = .{ .attribute_content = .{ .content = .{ .entity = entity } } };
- return .attribute_content;
- },
-
- .attribute_content_char_ref_start => if (syntax.isDigit(c)) {
- self.state = .attribute_content_char_ref;
- self.state_data.hex = false;
- self.state_data.value = syntax.digitValue(c);
- // self.state_data.quote = self.state_data.quote;
- return .ok;
- } else if (c == 'x') {
- self.state = .attribute_content_char_ref;
- self.state_data.hex = true;
- self.state_data.value = 0;
- // self.state_data.quote = self.state_data.quote;
- return .ok;
- },
-
- .attribute_content_char_ref => if (!self.state_data.hex and syntax.isDigit(c)) {
- const value = 10 * @as(u32, self.state_data.value) + syntax.digitValue(c);
- if (value > std.math.maxInt(u21)) {
- return error.InvalidCharacterReference;
- }
- self.state_data.value = @as(u21, @intCast(value));
- return .ok;
- } else if (self.state_data.hex and syntax.isHexDigit(c)) {
- const value = 16 * @as(u32, self.state_data.value) + syntax.hexDigitValue(c);
- if (value > std.math.maxInt(u21)) {
- return error.InvalidCharacterReference;
- }
- self.state_data.value = @as(u21, @intCast(value));
- return .ok;
- } else if (c == ';') {
- const codepoint = self.state_data.value;
- if (!syntax.isChar(codepoint)) {
- return error.InvalidCharacterReference;
- }
- self.state = .attribute_content;
- self.state_data.start = self.pos + len;
- // self.state_data.quote = self.state_data.quote;
- self.token_data = .{ .attribute_content = .{ .content = .{ .codepoint = codepoint } } };
- return .attribute_content;
- },
-
- .attribute_after_content => if (syntax.isSpace(c)) {
- self.state = .element_start_after_name;
- return .ok;
- } else if (c == '/') {
- self.state = .element_start_empty;
- return .ok;
- } else if (c == '>') {
- self.state = .content;
- self.state_data.start = self.pos + len;
- return .ok;
- },
-
- .element_end => if (syntax.isNameStartChar(c)) {
- self.state = .element_end_name;
- self.state_data.start = self.pos;
- return .ok;
- },
-
- .element_end_name => if (syntax.isNameChar(c)) {
- return .ok;
- } else if (syntax.isSpace(c)) {
- self.depth -= 1;
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .element_end_after_name;
- self.token_data = .{ .element_end = .{ .name = name } };
- return .element_end;
- } else if (c == '>') {
- self.depth -= 1;
- if (self.depth == 0) {
- self.seen_root_element = true;
- }
- const name = Range{ .start = self.state_data.start, .end = self.pos };
- if (self.depth == 0) {
- self.state = .document_content;
- } else {
- self.state = .content;
- self.state_data.start = self.pos + len;
- }
- self.token_data = .{ .element_end = .{ .name = name } };
- return .element_end;
- },
-
- .element_end_after_name => if (syntax.isSpace(c)) {
- return .ok;
- } else if (c == '>') {
- if (self.depth == 0) {
- self.seen_root_element = true;
- }
- if (self.depth == 0) {
- self.state = .document_content;
- } else {
- self.state = .content;
- self.state_data.start = self.pos + len;
- }
- return .ok;
- },
-
- inline .content,
- .content_cdata_maybe_before_end,
- .content_cdata_maybe_end,
- => |state| if (c == ']') {
- switch (state) {
- .content => {
- self.state = .content_cdata_maybe_before_end;
- // self.state_data.start = self.state_data.start;
- },
- .content_cdata_maybe_before_end => {
- self.state = .content_cdata_maybe_end;
- // self.state_data.start = self.state_data.start;
- },
- else => {},
- }
- return .ok;
- } else if (state == .content_cdata_maybe_end and c == ']') {
- return .ok;
- } else if (state == .content_cdata_maybe_end and c == '>') {
- return error.SyntaxError;
- } else if (c == '<') {
- const text = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .unknown_start;
- if (text.isEmpty()) {
- // Do not report empty text content between elements, e.g.
- // (there is no text content between or
- // within e1 and e2).
- return .ok;
- } else {
- self.token_data = .{ .element_content = .{ .content = .{ .text = text } } };
- return .element_content;
- }
- } else if (c == '&') {
- const text = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .content_ref_start;
- if (text.isEmpty()) {
- return .ok;
- } else {
- self.token_data = .{ .element_content = .{ .content = .{ .text = text } } };
- return .element_content;
- }
- } else if (syntax.isChar(c)) {
- if (state != .content) {
- self.state = .content;
- // self.state_data.start = self.state_data.start;
- }
- return .ok;
- },
-
- .content_ref_start => if (syntax.isNameStartChar(c)) {
- self.state = .content_entity_ref_name;
- self.state_data.start = self.pos;
- return .ok;
- } else if (c == '#') {
- self.state = .content_char_ref_start;
- return .ok;
- },
-
- .content_entity_ref_name => if (syntax.isNameChar(c)) {
- return .ok;
- } else if (c == ';') {
- const entity = Range{ .start = self.state_data.start, .end = self.pos };
- self.state = .content;
- self.state_data.start = self.pos + len;
- self.token_data = .{ .element_content = .{ .content = .{ .entity = entity } } };
- return .element_content;
- },
-
- .content_char_ref_start => if (syntax.isDigit(c)) {
- self.state = .content_char_ref;
- self.state_data.hex = false;
- self.state_data.value = syntax.digitValue(c);
- return .ok;
- } else if (c == 'x') {
- self.state = .content_char_ref;
- self.state_data.hex = true;
- self.state_data.value = 0;
- return .ok;
- },
-
- .content_char_ref => if (!self.state_data.hex and syntax.isDigit(c)) {
- const value = 10 * @as(u32, self.state_data.value) + syntax.digitValue(c);
- if (value > std.math.maxInt(u21)) {
- return error.InvalidCharacterReference;
- }
- self.state_data.value = @as(u21, @intCast(value));
- return .ok;
- } else if (self.state_data.hex and syntax.isHexDigit(c)) {
- const value = 16 * @as(u32, self.state_data.value) + syntax.hexDigitValue(c);
- if (value > std.math.maxInt(u21)) {
- return error.InvalidCharacterReference;
- }
- self.state_data.value = @as(u21, @intCast(value));
- return .ok;
- } else if (c == ';') {
- const codepoint = self.state_data.value;
- if (!syntax.isChar(codepoint)) {
- return error.InvalidCharacterReference;
- }
- self.state = .content;
- self.state_data.start = self.pos + len;
- self.token_data = .{ .element_content = .{ .content = .{ .codepoint = codepoint } } };
- return .element_content;
- },
-
- .@"error" => return error.SyntaxError,
- }
-
- return error.SyntaxError;
-}
-
-/// Signals that there is no further input to scan, and returns an error if
-/// the scanner is not in a valid state to handle this (for example, if this
-/// is called while in the middle of element content).
-pub fn endInput(self: *Scanner) error{UnexpectedEndOfInput}!void {
- if (self.state != .document_content or !self.seen_root_element) {
- return error.UnexpectedEndOfInput;
- }
-}
-
-test Scanner {
- try testValid(
- \\
- \\
- \\
- \\
- \\
- \\ Hello,
- \\
- \\
- \\ Text content goes here.
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 } } },
- .{ .pi_start = .{ .target = .{ .start = 24, .end = 31 } } }, // some-pi
- .{ .pi_content = .{ .content = .{ .start = 31, .end = 31 }, .final = true } },
- .comment_start,
- .{ .comment_content = .{ .content = .{ .start = 38, .end = 85 }, .final = true } },
- .{ .pi_start = .{ .target = .{ .start = 91, .end = 111 } } }, // some-pi-with-content
- .{ .pi_content = .{ .content = .{ .start = 112, .end = 119 }, .final = true } },
- .{ .element_start = .{ .name = .{ .start = 123, .end = 127 } } }, // root
- .{ .element_content = .{ .content = .{ .text = .{ .start = 128, .end = 131 } } } },
- .{ .element_start = .{ .name = .{ .start = 132, .end = 133 } } }, // p
- .{ .attribute_start = .{ .name = .{ .start = 134, .end = 139 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 141, .end = 145 } }, .final = true } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 147, .end = 154 } } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 163, .end = 169 } } } },
- .{ .element_end = .{ .name = .{ .start = 174, .end = 175 } } }, // /p
- .{ .element_content = .{ .content = .{ .text = .{ .start = 176, .end = 179 } } } },
- .{ .element_start = .{ .name = .{ .start = 180, .end = 184 } } }, // line
- .element_end_empty,
- .{ .element_content = .{ .content = .{ .text = .{ .start = 187, .end = 190 } } } },
- .{ .pi_start = .{ .target = .{ .start = 192, .end = 202 } } }, // another-pi
- .{ .pi_content = .{ .content = .{ .start = 202, .end = 202 }, .final = true } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 204, .end = 233 } } } },
- .{ .element_start = .{ .name = .{ .start = 234, .end = 237 } } }, // div
- .{ .element_start = .{ .name = .{ .start = 239, .end = 240 } } }, // p
- .{ .element_content = .{ .content = .{ .entity = .{ .start = 242, .end = 245 } } } },
- .{ .element_end = .{ .name = .{ .start = 248, .end = 249 } } }, // /p
- .{ .element_end = .{ .name = .{ .start = 252, .end = 255 } } }, // /div
- .{ .element_content = .{ .content = .{ .text = .{ .start = 256, .end = 257 } } } },
- .{ .element_end = .{ .name = .{ .start = 259, .end = 263 } } }, // /root
- .comment_start,
- .{ .comment_content = .{ .content = .{ .start = 269, .end = 325 }, .final = true } },
- .{ .pi_start = .{ .target = .{ .start = 332, .end = 339 } } }, // comment
- .{ .pi_content = .{ .content = .{ .start = 340, .end = 351 }, .final = true } },
- });
-}
-
-test "BOM" {
- try testValid("\u{FEFF}", &.{
- .{ .element_start = .{ .name = .{ .start = 4, .end = 11 } } },
- .element_end_empty,
- });
-}
-
-test "empty root element" {
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .element_end_empty,
- });
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .element_end_empty,
- });
-}
-
-test "root element with no content" {
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_end = .{ .name = .{ .start = 11, .end = 18 } } },
- });
-}
-
-test "element content" {
- try testValid("Hello, world!", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 22 } } } },
- .{ .element_end = .{ .name = .{ .start = 24, .end = 31 } } },
- });
-}
-
-test "element nesting" {
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 5 } } },
- .{ .element_start = .{ .name = .{ .start = 7, .end = 10 } } },
- .{ .element_start = .{ .name = .{ .start = 12, .end = 17 } } },
- .element_end_empty,
- .{ .element_end = .{ .name = .{ .start = 21, .end = 24 } } },
- .{ .element_end = .{ .name = .{ .start = 27, .end = 31 } } },
- });
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 5 } } },
- .{ .element_start = .{ .name = .{ .start = 10, .end = 13 } } },
- .{ .element_start = .{ .name = .{ .start = 16, .end = 21 } } },
- .element_end_empty,
- .{ .element_end = .{ .name = .{ .start = 26, .end = 29 } } },
- .{ .element_end = .{ .name = .{ .start = 33, .end = 37 } } },
- });
- try testInvalid("", error.SyntaxError, 14);
- try testInvalid("", error.SyntaxError, 16);
- try testIncomplete("");
- try testIncomplete("");
-}
-
-test "XML declaration" {
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 } } },
- .{ .element_start = .{ .name = .{ .start = 23, .end = 27 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 } } },
- .{ .element_start = .{ .name = .{ .start = 25, .end = 29 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 } } },
- .{ .element_start = .{ .name = .{ .start = 23, .end = 27 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 20 } } },
- .{ .element_start = .{ .name = .{ .start = 25, .end = 29 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } },
- .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 }, .encoding = .{ .start = 34, .end = 39 } } },
- .{ .element_start = .{ .name = .{ .start = 44, .end = 48 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } },
- .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } },
- .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } },
- .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .standalone = true } },
- .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .standalone = false } },
- .{ .element_start = .{ .name = .{ .start = 39, .end = 43 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 }, .standalone = true } },
- .{ .element_start = .{ .name = .{ .start = 44, .end = 48 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 }, .standalone = true } },
- .{ .element_start = .{ .name = .{ .start = 57, .end = 61 } } },
- .element_end_empty,
- });
- try testValid(
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 }, .encoding = .{ .start = 34, .end = 39 }, .standalone = true } },
- .{ .element_start = .{ .name = .{ .start = 63, .end = 67 } } },
- .element_end_empty,
- });
- try testInvalid("", error.SyntaxError, 19);
- try testInvalid("", error.SyntaxError, 36);
-}
-
-test "doctype" {
- try testInvalid("", error.DoctypeNotSupported, 9);
- try testInvalid("", error.DoctypeNotSupported, 30);
- try testInvalid("", error.SyntaxError, 10);
- try testInvalid("", error.SyntaxError, 8);
-}
-
-test "CDATA" {
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 18, .end = 20 } } } },
- .{ .element_end = .{ .name = .{ .start = 25, .end = 32 } } },
- });
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 18, .end = 21 } } } },
- .{ .element_end = .{ .name = .{ .start = 26, .end = 33 } } },
- });
- try testValid("]]]]]]]>", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 18, .end = 27 } } } },
- .{ .element_end = .{ .name = .{ .start = 32, .end = 39 } } },
- });
-}
-
-test "references" {
- try testValid(
- \\<Hi!!>
- , &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .attribute_start = .{ .name = .{ .start = 9, .end = 18 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 20, .end = 25 } } } },
- .{ .attribute_content = .{ .content = .{ .codepoint = 0x2C } } },
- .{ .attribute_content = .{ .content = .{ .codepoint = 32 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 36, .end = 42 } } } },
- .{ .attribute_content = .{ .content = .{ .entity = .{ .start = 43, .end = 46 } } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 47, .end = 56 } }, .final = true } },
- .{ .element_content = .{ .content = .{ .entity = .{ .start = 59, .end = 61 } } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 62, .end = 64 } } } },
- .{ .element_content = .{ .content = .{ .codepoint = 33 } } },
- .{ .element_content = .{ .content = .{ .codepoint = 0x21 } } },
- .{ .element_content = .{ .content = .{ .entity = .{ .start = 76, .end = 78 } } } },
- .{ .element_end = .{ .name = .{ .start = 81, .end = 88 } } },
- });
-}
-
-test "PI at document start" {
- try testValid("", &.{
- .{ .pi_start = .{ .target = .{ .start = 2, .end = 9 } } },
- .{ .pi_content = .{ .content = .{ .start = 9, .end = 9 }, .final = true } },
- .{ .element_start = .{ .name = .{ .start = 12, .end = 16 } } },
- .element_end_empty,
- });
- try testValid("", &.{
- .{ .pi_start = .{ .target = .{ .start = 2, .end = 4 } } },
- .{ .pi_content = .{ .content = .{ .start = 4, .end = 4 }, .final = true } },
- .{ .element_start = .{ .name = .{ .start = 7, .end = 11 } } },
- .element_end_empty,
- });
- try testValid("", &.{
- .{ .pi_start = .{ .target = .{ .start = 2, .end = 6 } } },
- .{ .pi_content = .{ .content = .{ .start = 6, .end = 6 }, .final = true } },
- .{ .element_start = .{ .name = .{ .start = 9, .end = 13 } } },
- .element_end_empty,
- });
-}
-
-test "invalid top-level text" {
- try testInvalid("Hello, world!", error.SyntaxError, 0);
- try testInvalid(
- \\
- \\Hello, world!
- , error.SyntaxError, 22);
- try testInvalid(
- \\
- \\Hello, world!
- , error.SyntaxError, 9);
-}
-
-test "invalid XML declaration" {
- try testInvalid("", error.SyntaxError, 5);
- try testInvalid(" xml version='1.0' ?>", error.SyntaxError, 2);
- try testInvalid("", error.SyntaxError, 37);
- try testInvalid("", error.SyntaxError, 15);
- try testInvalid("", error.SyntaxError, 17);
- try testInvalid("", error.SyntaxError, 16);
- try testInvalid("", error.SyntaxError, 15);
- try testInvalid("", error.SyntaxError, 30);
- try testInvalid("", error.SyntaxError, 30);
- try testInvalid("", error.SyntaxError, 34);
- try testInvalid("", error.SyntaxError, 33);
- try testInvalid("&", error.SyntaxError, 10);
- try testInvalid("&", error.SyntaxError, 13);
- try testInvalid("ABC;", error.SyntaxError, 11);
- try testInvalid("C;", error.SyntaxError, 13);
- try testInvalid("xx;", error.SyntaxError, 12);
- try testInvalid("", error.InvalidCharacterReference, 12);
- try testInvalid("", error.InvalidCharacterReference, 18);
- try testInvalid("", error.InvalidCharacterReference, 16);
- try testInvalid("", error.InvalidCharacterReference, 18);
- try testInvalid("", error.SyntaxError, 16);
- try testInvalid("", error.SyntaxError, 19);
- try testInvalid("", error.SyntaxError, 17);
- try testInvalid("", error.SyntaxError, 19);
- try testInvalid("", error.SyntaxError, 18);
- try testInvalid("", error.InvalidCharacterReference, 18);
- try testInvalid("", error.InvalidCharacterReference, 24);
- try testInvalid("", error.InvalidCharacterReference, 22);
- try testInvalid("", error.InvalidCharacterReference, 24);
-}
-
-test "invalid content" {
- try testInvalid("Illegal: ]]>", error.SyntaxError, 20);
- try testInvalid("Also illegal: ]]]>", error.SyntaxError, 26);
- try testValid("]]>", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 11 } } } },
- .{ .element_content = .{ .content = .{ .entity = .{ .start = 12, .end = 14 } } } },
- .{ .element_end = .{ .name = .{ .start = 17, .end = 24 } } },
- });
- try testValid("[lol]
[lmao]", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 14 } } } },
- .{ .element_start = .{ .name = .{ .start = 15, .end = 17 } } },
- .element_end_empty,
- .{ .element_content = .{ .content = .{ .text = .{ .start = 19, .end = 25 } } } },
- .{ .element_end = .{ .name = .{ .start = 27, .end = 34 } } },
- });
-}
-
-test "attributes" {
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .attribute_start = .{ .name = .{ .start = 9, .end = 14 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 16, .end = 17 } }, .final = true } },
- .{ .attribute_start = .{ .name = .{ .start = 19, .end = 24 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 26, .end = 27 } }, .final = true } },
- .element_end_empty,
- });
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .attribute_start = .{ .name = .{ .start = 9, .end = 14 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 16, .end = 17 } }, .final = true } },
- .{ .attribute_start = .{ .name = .{ .start = 19, .end = 24 } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 26, .end = 27 } }, .final = true } },
- .element_end_empty,
- });
- try testInvalid("", error.SyntaxError, 18);
- try testInvalid("", error.SyntaxError, 13);
-
- try testInvalid("", error.SyntaxError, 15);
- try testValid("", &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .attribute_start = .{ .name = .{ .start = 9, .end = 13 } } },
- .{ .attribute_content = .{ .content = .{ .entity = .{ .start = 16, .end = 18 } } } },
- .{ .attribute_content = .{ .content = .{ .entity = .{ .start = 20, .end = 22 } } } },
- .{ .attribute_content = .{ .content = .{ .text = .{ .start = 23, .end = 23 } }, .final = true } },
- .element_end_empty,
- });
-}
-
-test "missing root element" {
- try testIncomplete("");
- try testIncomplete("");
-}
-
-test "incomplete document" {
- try testIncomplete("<");
- try testIncomplete("");
- try testIncomplete(" .ok,
-
- // States which contain positional information but cannot immediately
- // be emitted as a token cannot be reset
- .pi_or_xml_decl_start,
- .pi_or_xml_decl_start_after_xml,
-
- .xml_decl_version_value_start,
- .xml_decl_version_value,
- .xml_decl_after_version_value,
- .xml_decl_after_version,
- .xml_decl_encoding_name,
- .xml_decl_after_encoding_name,
- .xml_decl_after_encoding_equals,
- .xml_decl_encoding_value_start,
- .xml_decl_encoding_value,
- .xml_decl_after_encoding_value,
- .xml_decl_after_encoding,
- .xml_decl_standalone_name,
- .xml_decl_after_standalone_name,
- .xml_decl_after_standalone_equals,
- .xml_decl_standalone_value_start,
-
- // None of the "maybe_end" states can be reset because we don't know if
- // the resulting content token should include the possible ending
- // characters until we read further to unambiguously determine whether
- // the state is ending.
- .comment_maybe_before_end,
-
- .pi_target,
- .pi_maybe_end,
-
- .cdata_maybe_before_end,
- .cdata_maybe_end,
-
- .element_start_name,
-
- .attribute_name,
- .attribute_content_entity_ref_name,
-
- .element_end_name,
-
- .content_entity_ref_name,
- => return error.CannotReset,
-
- // Some states (specifically, content states) can be reset by emitting
- // a token with the content seen so far
- .comment => token: {
- const range = Range{ .start = self.state_data.start, .end = self.pos };
- self.state_data.start = 0;
- if (range.isEmpty()) {
- break :token .ok;
- } else {
- self.token_data = .{ .comment_content = .{ .content = range } };
- break :token .comment_content;
- }
- },
-
- .pi_content => token: {
- const range = Range{ .start = self.state_data.start, .end = self.pos };
- self.state_data.start = 0;
- if (range.isEmpty()) {
- break :token .ok;
- } else {
- self.token_data = .{ .pi_content = .{ .content = range } };
- break :token .pi_content;
- }
- },
-
- .cdata => token: {
- const range = Range{ .start = self.state_data.start, .end = self.pos };
- self.state_data.start = 0;
- if (range.isEmpty()) {
- break :token .ok;
- } else {
- self.token_data = .{ .element_content = .{ .content = .{ .text = range } } };
- break :token .element_content;
- }
- },
-
- .attribute_content => token: {
- const range = Range{ .start = self.state_data.start, .end = self.pos };
- self.state_data.start = 0;
- if (range.isEmpty()) {
- break :token .ok;
- } else {
- self.token_data = .{ .attribute_content = .{ .content = .{ .text = range } } };
- break :token .attribute_content;
- }
- },
-
- .content, .content_cdata_maybe_before_end, .content_cdata_maybe_end => token: {
- const range = Range{ .start = self.state_data.start, .end = self.pos };
- self.state_data.start = 0;
- if (range.isEmpty()) {
- break :token .ok;
- } else {
- self.token_data = .{ .element_content = .{ .content = .{ .text = range } } };
- break :token .element_content;
- }
- },
- };
- self.pos = 0;
- return token;
-}
-
-test resetPos {
- var scanner = Scanner{};
- var tokens = std.ArrayListUnmanaged(Token.Full){};
- defer tokens.deinit(testing.allocator);
-
- for ("Hello,") |c| {
- switch (try scanner.next(c, 1)) {
- .ok => {},
- else => |token| try tokens.append(testing.allocator, scanner.fullToken(token)),
- }
- }
- try tokens.append(testing.allocator, scanner.fullToken(try scanner.resetPos()));
- for (" world!") |c| {
- switch (try scanner.next(c, 1)) {
- .ok => {},
- else => |token| try tokens.append(testing.allocator, scanner.fullToken(token)),
- }
- }
-
- try testing.expectEqualSlices(Token.Full, &.{
- .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 15 } } } },
- .{ .element_content = .{ .content = .{ .text = .{ .start = 0, .end = 7 } } } },
- .{ .element_end = .{ .name = .{ .start = 9, .end = 16 } } },
- }, tokens.items);
-}
-
-test "resetPos inside element reference name" {
- var scanner = Scanner{};
-
- for ("Hello, world &am") |c| {
- _ = try scanner.next(c, 1);
- }
- try testing.expectError(error.CannotReset, scanner.resetPos());
-}
diff --git a/src/Writer.zig b/src/Writer.zig
new file mode 100644
index 0000000..45a64ad
--- /dev/null
+++ b/src/Writer.zig
@@ -0,0 +1,198 @@
+const std = @import("std");
+const assert = std.debug.assert;
+
+options: Options,
+
+state: State,
+indent_level: u32,
+
+sink: Sink,
+
+const Writer = @This();
+
+pub const Options = struct {
+ indent: []const u8 = "",
+};
+
+pub const Sink = struct {
+ context: *const anyopaque,
+ writeFn: *const fn (context: *const anyopaque, data: []const u8) anyerror!void,
+
+ pub fn write(sink: *Sink, data: []const u8) anyerror!void {
+ return sink.writeFn(sink.context, data);
+ }
+};
+
+const State = enum {
+ start,
+ after_bom,
+ after_xml_declaration,
+ element_start,
+ after_structure_end,
+ text,
+ end,
+};
+
+pub fn init(sink: Sink, options: Options) Writer {
+ return .{
+ .options = options,
+
+ .state = .start,
+ .indent_level = 0,
+
+ .sink = sink,
+ };
+}
+
+pub const WriteError = error{};
+
+pub fn bom(writer: *Writer) anyerror!void {
+ assert(writer.state == .start);
+ try writer.raw("\u{FEFF}");
+ writer.state = .after_bom;
+}
+
+pub fn xmlDeclaration(writer: *Writer, encoding: ?[]const u8, standalone: ?bool) anyerror!void {
+ assert(writer.state == .start or writer.state == .after_bom);
+ try writer.raw("");
+ if (writer.options.indent.len > 0) try writer.newLineAndIndent();
+ writer.state = .after_xml_declaration;
+}
+
+pub fn elementStart(writer: *Writer, name: []const u8) anyerror!void {
+ switch (writer.state) {
+ .start, .after_bom, .after_xml_declaration, .text => {},
+ .element_start => {
+ try writer.raw(">");
+ try writer.newLineAndIndent();
+ },
+ .after_structure_end => {
+ try writer.newLineAndIndent();
+ },
+ .end => unreachable,
+ }
+ try writer.raw("<");
+ try writer.raw(name);
+ writer.state = .element_start;
+ writer.indent_level += 1;
+}
+
+pub fn elementEnd(writer: *Writer, name: []const u8) anyerror!void {
+ writer.indent_level -= 1;
+ switch (writer.state) {
+ .text => {},
+ .element_start => {
+ try writer.raw(">");
+ try writer.newLineAndIndent();
+ },
+ .after_structure_end => {
+ try writer.newLineAndIndent();
+ },
+ .start, .after_bom, .after_xml_declaration, .end => unreachable,
+ }
+ try writer.raw("");
+ try writer.raw(name);
+ try writer.raw(">");
+ writer.state = if (writer.indent_level > 0) .after_structure_end else .end;
+}
+
+pub fn elementEndEmpty(writer: *Writer) anyerror!void {
+ assert(writer.state == .element_start);
+ try writer.raw("/>");
+ writer.state = .after_structure_end;
+ writer.indent_level -= 1;
+}
+
+pub fn attribute(writer: *Writer, name: []const u8, value: []const u8) anyerror!void {
+ assert(writer.state == .element_start);
+ try writer.raw(" ");
+ try writer.raw(name);
+ try writer.raw("=\"");
+ try writer.attributeText(value);
+ try writer.raw("\"");
+}
+
+fn attributeText(writer: *Writer, s: []const u8) anyerror!void {
+ var pos: usize = 0;
+ while (std.mem.indexOfAnyPos(u8, s, pos, "\r\n\t&<\"")) |esc_pos| {
+ try writer.raw(s[pos..esc_pos]);
+ try writer.raw(switch (s[esc_pos]) {
+ '\r' => "
",
+ '\n' => "
",
+ '\t' => " ",
+ '&' => "&",
+ '<' => "<",
+ '"' => """,
+ else => unreachable,
+ });
+ pos = esc_pos + 1;
+ }
+ try writer.raw(s[pos..]);
+}
+
+pub fn pi(writer: *Writer, target: []const u8, data: []const u8) anyerror!void {
+ switch (writer.state) {
+ .start, .after_bom, .after_xml_declaration, .text, .end => {},
+ .element_start => {
+ try writer.raw(">");
+ try writer.newLineAndIndent();
+ },
+ .after_structure_end => {
+ try writer.newLineAndIndent();
+ },
+ }
+ try writer.raw("");
+ try writer.raw(target);
+ try writer.raw(" ");
+ try writer.raw(data);
+ try writer.raw("?>");
+ writer.state = .after_structure_end;
+}
+
+pub fn text(writer: *Writer, s: []const u8) anyerror!void {
+ switch (writer.state) {
+ .after_structure_end, .text => {},
+ .element_start => try writer.raw(">"),
+ .start, .after_bom, .after_xml_declaration, .end => unreachable,
+ }
+ var pos: usize = 0;
+ while (std.mem.indexOfAnyPos(u8, s, pos, "\r&<")) |esc_pos| {
+ try writer.raw(s[pos..esc_pos]);
+ try writer.raw(switch (s[esc_pos]) {
+ '\r' => "
",
+ '&' => "&",
+ '<' => "<",
+ else => unreachable,
+ });
+ pos = esc_pos + 1;
+ }
+ try writer.raw(s[pos..]);
+ writer.state = .text;
+}
+
+fn newLineAndIndent(writer: *Writer) anyerror!void {
+ if (writer.options.indent.len == 0) return;
+
+ try writer.raw("\n");
+ var n: usize = 0;
+ while (n < writer.indent_level) : (n += 1) {
+ try writer.raw(writer.options.indent);
+ }
+}
+
+fn raw(writer: *Writer, s: []const u8) anyerror!void {
+ try writer.sink.write(s);
+}
diff --git a/src/compat.zig b/src/compat.zig
deleted file mode 100644
index 79a65f0..0000000
--- a/src/compat.zig
+++ /dev/null
@@ -1,17 +0,0 @@
-//! Compatibility wrappers for APIs changed since Zig 0.12.
-
-const std = @import("std");
-
-pub fn ComptimeStringMapType(comptime V: type) type {
- return if (@hasDecl(std, "ComptimeStringMap"))
- type
- else
- std.StaticStringMap(V);
-}
-
-pub fn ComptimeStringMap(comptime V: type, comptime kvs_list: anytype) ComptimeStringMapType(V) {
- return if (@hasDecl(std, "ComptimeStringMap"))
- std.ComptimeStringMap(V, kvs_list)
- else
- std.StaticStringMap(V).initComptime(kvs_list);
-}
diff --git a/src/encoding.zig b/src/encoding.zig
deleted file mode 100644
index df06d4e..0000000
--- a/src/encoding.zig
+++ /dev/null
@@ -1,451 +0,0 @@
-//! Various encoding-related utilities.
-//!
-//! The central "interface" of this file is `Decoder`, which decodes XML
-//! content into Unicode codepoints for further processing. It consists
-//! of an error type `Error` and several declarations:
-//!
-//! - `const max_encoded_codepoint_len` - the maximum number of bytes a
-//! single Unicode codepoint may occupy in encoded form.
-//! - `fn readCodepoint(self: *Decoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error))!ReadResult` -
-//! reads a single codepoint from a `std.io.GenericReader` and writes its UTF-8
-//! encoding to `buf`. Should return `error.UnexpectedEndOfInput` if a full
-//! codepoint cannot be read, `error.Overflow` if the UTF-8-encoded form cannot
-//! be written to `buf`; other decoder-specific errors can also be used.
-//! - `fn adaptTo(self: *Decoder, encoding: []const u8) error{InvalidEncoding}!void` -
-//! accepts a UTF-8-encoded encoding name and returns an error if the desired
-//! encoding cannot be handled by the decoder. This is intended to support
-//! `Decoder` implementations which adapt to the encoding declared by an XML
-//! document.
-
-const std = @import("std");
-const ascii = std.ascii;
-const testing = std.testing;
-const unicode = std.unicode;
-const Allocator = std.mem.Allocator;
-const ArrayListUnmanaged = std.ArrayListUnmanaged;
-const BoundedArray = std.BoundedArray;
-
-/// The result of reading a single codepoint successfully.
-pub const ReadResult = packed struct(u32) {
- /// The codepoint read.
- codepoint: u21,
- /// The length of the codepoint encoded in UTF-8.
- byte_length: u10,
- /// If https://github.com/ziglang/zig/issues/104 is implemented, a much
- /// better API would be to make `ReadResult` a `packed struct(u31)` instead
- /// and use `?ReadResult` elsewhere. But, for now, this indicates whether
- /// `codepoint` and `byte_length` are present, so that the whole thing fits
- /// in a `u32` rather than unnecessarily taking up 8 bytes.
- present: bool = true,
-
- pub const none: ReadResult = .{
- .codepoint = 0,
- .byte_length = 0,
- .present = false,
- };
-};
-
-/// A decoder which handles UTF-8 or UTF-16, using a BOM to detect UTF-16
-/// endianness.
-///
-/// This is the bare minimum encoding support required of a standard-compliant
-/// XML parser.
-pub const DefaultDecoder = struct {
- state: union(enum) {
- start,
- utf8: Utf8Decoder,
- utf16_le: Utf16Decoder(.little),
- utf16_be: Utf16Decoder(.big),
- } = .start,
-
- pub const Error = Utf8Decoder.Error || Utf16Decoder(.little).Error || Utf16Decoder(.big).Error;
-
- pub const max_encoded_codepoint_len = 4;
- const bom = 0xFEFF;
- const bom_byte_length = unicode.utf8CodepointSequenceLength(bom) catch unreachable;
-
- pub fn readCodepoint(self: *DefaultDecoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult {
- switch (self.state) {
- .start => {},
- inline else => |*inner| return inner.readCodepoint(reader, buf),
- }
- // If attempting to match the UTF-16 BOM fails for whatever reason, we
- // will assume we are reading UTF-8.
- self.state = .{ .utf8 = .{} };
- const b = reader.readByte() catch |e| switch (e) {
- error.EndOfStream => return error.UnexpectedEndOfInput,
- else => |other| return other,
- };
- switch (b) {
- 0xFE => {
- const b2 = reader.readByte() catch |e| switch (e) {
- error.EndOfStream => return error.InvalidUtf8,
- else => |other| return other,
- };
- if (b2 != 0xFF) return error.InvalidUtf8;
- self.state = .{ .utf16_be = .{} };
- if (bom_byte_length > buf.len) return error.Overflow;
- _ = unicode.utf8Encode(bom, buf) catch unreachable;
- return .{ .codepoint = bom, .byte_length = bom_byte_length };
- },
- 0xFF => {
- const b2 = reader.readByte() catch |e| switch (e) {
- error.EndOfStream => return error.InvalidUtf8,
- else => |other| return other,
- };
- if (b2 != 0xFE) return error.InvalidUtf8;
- self.state = .{ .utf16_le = .{} };
- if (bom_byte_length > buf.len) return error.Overflow;
- _ = unicode.utf8Encode(bom, buf) catch unreachable;
- return .{ .codepoint = bom, .byte_length = bom_byte_length };
- },
- else => {
- // The rest of this branch is copied from Utf8Decoder
- const byte_length = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8;
- if (byte_length > buf.len) return error.Overflow;
- buf[0] = b;
- if (byte_length == 1) return .{ .codepoint = b, .byte_length = 1 };
- reader.readNoEof(buf[1..byte_length]) catch |e| switch (e) {
- error.EndOfStream => return error.UnexpectedEndOfInput,
- else => |other| return other,
- };
- const codepoint = switch (byte_length) {
- 2 => unicode.utf8Decode2(buf[0..2]),
- 3 => unicode.utf8Decode3(buf[0..3]),
- 4 => unicode.utf8Decode4(buf[0..4]),
- else => unreachable,
- } catch return error.InvalidUtf8;
- return .{ .codepoint = codepoint, .byte_length = byte_length };
- },
- }
- }
-
- pub fn adaptTo(self: *DefaultDecoder, encoding: []const u8) error{InvalidEncoding}!void {
- switch (self.state) {
- .start => {},
- inline else => |*decoder| try decoder.adaptTo(encoding),
- }
- }
-};
-
-test DefaultDecoder {
- // UTF-8 no BOM
- {
- const input = "Hü日😀";
- var decoder = try testDecode(DefaultDecoder, input, &.{
- 'H',
- 'ü',
- '日',
- '😀',
- });
- try decoder.adaptTo("utf-8");
- try decoder.adaptTo("UTF-8");
- }
-
- // UTF-8 BOM
- {
- const input = "\u{FEFF}Hü日😀";
- var decoder = try testDecode(DefaultDecoder, input, &.{
- 0xFEFF,
- 'H',
- 'ü',
- '日',
- '😀',
- });
- try decoder.adaptTo("utf-8");
- try decoder.adaptTo("UTF-8");
- }
-
- // Invalid UTF-8 BOM
- {
- const input = "\xEF\x00\x00H";
- var decoder = try testDecode(DefaultDecoder, input, &.{
- error.InvalidUtf8,
- 'H',
- });
- try decoder.adaptTo("utf-8");
- try decoder.adaptTo("UTF-8");
- }
-
- // UTF-16BE BOM
- {
- const input = "\xFE\xFF" ++ // U+FEFF
- "\x00H" ++
- "\x00\xFC" ++ // ü
- "\x65\xE5" ++ // 日
- "\xD8\x3D\xDE\x00"; // 😀
- var decoder = try testDecode(DefaultDecoder, input, &.{
- 0xFEFF,
- 'H',
- 'ü',
- '日',
- '😀',
- });
- try decoder.adaptTo("utf-16");
- try decoder.adaptTo("UTF-16");
- try decoder.adaptTo("utf-16be");
- try decoder.adaptTo("UTF-16BE");
- }
-
- // Invalid UTF-16BE BOM
- {
- const input = "\xFE\x00H";
- var decoder = try testDecode(DefaultDecoder, input, &.{
- error.InvalidUtf8,
- 'H',
- });
- try decoder.adaptTo("utf-8");
- try decoder.adaptTo("UTF-8");
- }
-
- // UTF-16LE BOM
- {
- const input = "\xFF\xFE" ++ // U+FEFF
- "H\x00" ++
- "\xFC\x00" ++ // ü
- "\xE5\x65" ++ // 日
- "\x3D\xD8\x00\xDE"; // 😀
- var decoder = try testDecode(DefaultDecoder, input, &.{
- 0xFEFF,
- 'H',
- 'ü',
- '日',
- '😀',
- });
- try decoder.adaptTo("utf-16");
- try decoder.adaptTo("UTF-16");
- try decoder.adaptTo("utf-16le");
- try decoder.adaptTo("UTF-16LE");
- }
-
- // Invalid UTF-16LE BOM
- {
- const input = "\xFF\xFFH";
- var decoder = try testDecode(DefaultDecoder, input, &.{
- error.InvalidUtf8,
- 'H',
- });
- try decoder.adaptTo("utf-8");
- try decoder.adaptTo("UTF-8");
- }
-}
-
-/// A decoder which handles only UTF-8.
-pub const Utf8Decoder = struct {
- pub const max_encoded_codepoint_len = 4;
-
- pub const Error = error{ InvalidUtf8, Overflow, UnexpectedEndOfInput };
-
- pub fn readCodepoint(_: *Utf8Decoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult {
- const b = reader.readByte() catch |e| switch (e) {
- error.EndOfStream => return ReadResult.none,
- else => |other| return other,
- };
- const byte_length = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8;
- if (byte_length > buf.len) return error.Overflow;
- buf[0] = b;
- if (byte_length == 1) return .{ .codepoint = b, .byte_length = 1 };
- reader.readNoEof(buf[1..byte_length]) catch |e| switch (e) {
- error.EndOfStream => return error.UnexpectedEndOfInput,
- else => |other| return other,
- };
- const codepoint = switch (byte_length) {
- 2 => unicode.utf8Decode2(buf[0..2]),
- 3 => unicode.utf8Decode3(buf[0..3]),
- 4 => unicode.utf8Decode4(buf[0..4]),
- else => unreachable,
- } catch return error.InvalidUtf8;
- return .{ .codepoint = codepoint, .byte_length = byte_length };
- }
-
- pub fn adaptTo(_: *Utf8Decoder, encoding: []const u8) error{InvalidEncoding}!void {
- if (!ascii.eqlIgnoreCase(encoding, "utf-8")) {
- return error.InvalidEncoding;
- }
- }
-};
-
-test Utf8Decoder {
- const input =
- // 1-byte encodings
- "\x00\x01 ABC abc 123" ++
- // 2-byte encodings
- "éèçñåβΘ" ++
- // 3-byte encodings
- "日本語AESTHETIC" ++
- // 4-byte encodings
- "😳😂❤️👩👩👧👧" ++
- // Overlong encodings
- "\xC0\x80\xE0\x80\x80\xF0\x80\x80\x80" ++
- // Out of bounds codepoint
- "\xF7\xBF\xBF\xBF" ++
- // Surrogate halves
- "\xED\xA0\x80\xED\xBF\xBF";
- _ = try testDecode(Utf8Decoder, input, &.{
- '\x00',
- '\x01',
- ' ',
- 'A',
- 'B',
- 'C',
- ' ',
- 'a',
- 'b',
- 'c',
- ' ',
- '1',
- '2',
- '3',
- 'é',
- 'è',
- 'ç',
- 'ñ',
- 'å',
- 'β',
- 'Θ',
- '日',
- '本',
- '語',
- 'A',
- 'E',
- 'S',
- 'T',
- 'H',
- 'E',
- 'T',
- 'I',
- 'C',
- '😳',
- '😂',
- '❤',
- '\u{FE0F}', // variation selector-16
- '👩',
- '\u{200D}', // zero-width joiner
- '👩',
- '\u{200D}', // zero-width joiner
- '👧',
- '\u{200D}', // zero-width joiner
- '👧',
- error.InvalidUtf8, // 2-byte U+0000
- error.InvalidUtf8, // 3-byte U+0000
- error.InvalidUtf8, // 4-byte U+0000
- error.InvalidUtf8, // attempted U+1FFFFF
- error.InvalidUtf8, // U+D800
- error.InvalidUtf8, // U+DFFF
- });
-}
-
-/// A decoder which handles only UTF-16 of a given endianness.
-pub fn Utf16Decoder(comptime endian: std.builtin.Endian) type {
- return struct {
- const Self = @This();
-
- pub const Error = error{ InvalidUtf16, Overflow, UnexpectedEndOfInput };
-
- pub const max_encoded_codepoint_len = 4;
-
- pub fn readCodepoint(_: *Self, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult {
- var u_buf: [2]u8 = undefined;
- const u_len = try reader.readAll(&u_buf);
- switch (u_len) {
- 0 => return ReadResult.none,
- 1 => return error.UnexpectedEndOfInput,
- else => {},
- }
- const u = std.mem.readInt(u16, &u_buf, endian);
- const code_unit_length = unicode.utf16CodeUnitSequenceLength(u) catch return error.InvalidUtf16;
- const codepoint = switch (code_unit_length) {
- 1 => u,
- 2 => codepoint: {
- const low = reader.readInt(u16, endian) catch |e| switch (e) {
- error.EndOfStream => return error.UnexpectedEndOfInput,
- else => |other| return other,
- };
- break :codepoint unicode.utf16DecodeSurrogatePair(&.{ u, low }) catch return error.InvalidUtf16;
- },
- else => unreachable,
- };
- const byte_length = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
- if (byte_length > buf.len) return error.Overflow;
- _ = unicode.utf8Encode(codepoint, buf) catch unreachable;
- return .{ .codepoint = codepoint, .byte_length = byte_length };
- }
-
- pub fn adaptTo(_: *Self, encoding: []const u8) error{InvalidEncoding}!void {
- if (!(ascii.eqlIgnoreCase(encoding, "utf-16") or
- (endian == .big and ascii.eqlIgnoreCase(encoding, "utf-16be")) or
- (endian == .little and ascii.eqlIgnoreCase(encoding, "utf-16le"))))
- {
- return error.InvalidEncoding;
- }
- }
- };
-}
-
-test Utf16Decoder {
- // little-endian
- {
- const input = "\x00\x00" ++ // U+0000
- "A\x00" ++ // A
- "b\x00" ++ // b
- "5\x00" ++ // 5
- "\xE5\x65" ++ // 日
- "\x3D\xD8\x33\xDE" ++ // 😳
- "\x00\xD8\x00\x00" ++ // unpaired high surrogate followed by U+0000
- "\xFF\xDF" // unpaired low surrogate
- ;
- _ = try testDecode(Utf16Decoder(.little), input, &.{
- '\x00',
- 'A',
- 'b',
- '5',
- '日',
- '😳',
- error.InvalidUtf16,
- error.InvalidUtf16,
- });
- }
-
- // big-endian
- {
- const input = "\x00\x00" ++ // U+0000
- "\x00A" ++ // A
- "\x00b" ++ // b
- "\x005" ++ // 5
- "\x65\xE5" ++ // 日
- "\xD8\x3D\xDE\x33" ++ // 😳
- "\xD8\x00\x00\x00" ++ // unpaired high surrogate followed by U+0000
- "\xDF\xFF" // unpaired low surrogate
- ;
- _ = try testDecode(Utf16Decoder(.big), input, &.{
- '\x00',
- 'A',
- 'b',
- '5',
- '日',
- '😳',
- error.InvalidUtf16,
- error.InvalidUtf16,
- });
- }
-}
-
-fn testDecode(comptime Decoder: type, input: []const u8, expected: []const (Decoder.Error!u21)) !Decoder {
- var decoder: Decoder = .{};
- var decoded = ArrayListUnmanaged(Decoder.Error!u21){};
- defer decoded.deinit(testing.allocator);
- var input_stream = std.io.fixedBufferStream(input);
- var buf: [4]u8 = undefined;
- while (true) {
- if (decoder.readCodepoint(input_stream.reader(), &buf)) |c| {
- if (!c.present) break;
- try decoded.append(testing.allocator, c.codepoint);
- } else |err| {
- try decoded.append(testing.allocator, err);
- }
- }
-
- try testing.expectEqualDeep(expected, decoded.items);
-
- return decoder;
-}
diff --git a/src/node.zig b/src/node.zig
deleted file mode 100644
index d27f61a..0000000
--- a/src/node.zig
+++ /dev/null
@@ -1,60 +0,0 @@
-const std = @import("std");
-const mem = std.mem;
-const ArenaAllocator = std.heap.ArenaAllocator;
-const QName = @import("reader.zig").QName;
-
-/// A node value along with an `ArenaAllocator` used to allocate all memory
-/// backing it.
-pub fn OwnedValue(comptime T: type) type {
- return struct {
- value: T,
- arena: ArenaAllocator,
-
- const Self = @This();
-
- pub fn deinit(self: *Self) void {
- self.arena.deinit();
- self.* = undefined;
- }
- };
-}
-
-/// A node in an XML document.
-pub const Node = union(enum) {
- document: Document,
- element: Element,
- attribute: Attribute,
- comment: Comment,
- pi: Pi,
- text: Text,
-
- pub const Document = struct {
- version: []const u8 = "1.0",
- encoding: ?[]const u8 = null,
- standalone: ?bool = null,
- children: []const Node,
- };
-
- pub const Element = struct {
- name: QName,
- children: []const Node = &.{},
- };
-
- pub const Attribute = struct {
- name: QName,
- value: []const u8,
- };
-
- pub const Comment = struct {
- content: []const u8,
- };
-
- pub const Pi = struct {
- target: []const u8,
- content: []const u8,
- };
-
- pub const Text = struct {
- content: []const u8,
- };
-};
diff --git a/src/reader.zig b/src/reader.zig
deleted file mode 100644
index d01a137..0000000
--- a/src/reader.zig
+++ /dev/null
@@ -1,1149 +0,0 @@
-const std = @import("std");
-const fmt = std.fmt;
-const mem = std.mem;
-const testing = std.testing;
-const unicode = std.unicode;
-const Allocator = mem.Allocator;
-const ArenaAllocator = std.heap.ArenaAllocator;
-const ArrayListUnmanaged = std.ArrayListUnmanaged;
-const ComptimeStringMap = @import("compat.zig").ComptimeStringMap;
-const StringArrayHashMapUnmanaged = std.StringArrayHashMapUnmanaged;
-const StringHashMapUnmanaged = std.StringHashMapUnmanaged;
-const encoding = @import("encoding.zig");
-const syntax = @import("syntax.zig");
-const Node = @import("node.zig").Node;
-const OwnedValue = @import("node.zig").OwnedValue;
-const Scanner = @import("Scanner.zig");
-const Token = @import("token_reader.zig").Token;
-const TokenReader = @import("token_reader.zig").TokenReader;
-
-const max_encoded_codepoint_len = 4;
-
-/// A qualified name.
-pub const QName = struct {
- prefix: ?[]const u8 = null,
- ns: ?[]const u8 = null,
- local: []const u8,
-
- /// Returns whether this name has the given namespace and local name.
- pub fn is(self: QName, ns: ?[]const u8, local: []const u8) bool {
- if (self.ns) |self_ns| {
- if (!mem.eql(u8, self_ns, ns orelse return false)) {
- return false;
- }
- } else if (ns != null) {
- return false;
- }
- return mem.eql(u8, self.local, local);
- }
-
- test is {
- try testing.expect((QName{ .local = "abc" }).is(null, "abc"));
- try testing.expect((QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns/", "abc"));
- try testing.expect(!(QName{ .local = "abc" }).is(null, "def"));
- try testing.expect(!(QName{ .local = "abc" }).is("http://example.com/ns/", "abc"));
- try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is(null, "abc"));
- try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns2/", "abc"));
- try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns/", "def"));
- try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns2/", "def"));
- }
-
- fn clone(self: QName, allocator: Allocator) !QName {
- const prefix = if (self.prefix) |prefix| try allocator.dupe(u8, prefix) else null;
- errdefer if (prefix) |p| allocator.free(p);
- const ns = if (self.ns) |ns| try allocator.dupe(u8, ns) else null;
- errdefer if (ns) |n| allocator.free(n);
- const local = try allocator.dupe(u8, self.local);
- return .{ .prefix = prefix, .ns = ns, .local = local };
- }
-
- /// Duplicates the `ns` value, if any.
- ///
- /// This is to allow the `QName` to outlive the closure of its containing
- /// scope.
- inline fn dupNs(self: *QName, allocator: Allocator) !void {
- if (self.ns) |*ns| {
- ns.* = try allocator.dupe(u8, ns.*);
- }
- }
-};
-
-/// A hash map `Context` which compares namespace URIs and local names (that is,
-/// name identity according to the XML namespaces spec, since the prefix does
-/// not contribute to the identity of a QName).
-const QNameContext = struct {
- const Self = @This();
-
- pub fn hash(_: Self, name: QName) u64 {
- var h = std.hash.Wyhash.init(0);
- if (name.ns) |ns| {
- h.update(ns);
- }
- h.update(name.local);
- return h.final();
- }
-
- pub fn eql(_: Self, name1: QName, name2: QName) bool {
- return name1.is(name2.ns, name2.local);
- }
-};
-
-const QNameSet = std.HashMapUnmanaged(QName, void, QNameContext, std.hash_map.default_max_load_percentage);
-
-/// An event emitted by a reader.
-pub const Event = union(enum) {
- xml_declaration: XmlDeclaration,
- element_start: ElementStart,
- element_content: ElementContent,
- element_end: ElementEnd,
- comment: Comment,
- pi: Pi,
-
- pub const XmlDeclaration = struct {
- version: []const u8,
- encoding: ?[]const u8 = null,
- standalone: ?bool = null,
- };
-
- pub const ElementStart = struct {
- name: QName,
- attributes: []const Attribute = &.{},
- };
-
- pub const Attribute = struct {
- name: QName,
- value: []const u8,
- };
-
- pub const ElementContent = struct {
- content: []const u8,
- };
-
- pub const ElementEnd = struct {
- name: QName,
- };
-
- pub const Comment = struct {
- content: []const u8,
- };
-
- pub const Pi = struct {
- target: []const u8,
- content: []const u8,
- };
-};
-
-/// A map of predefined XML entities to their replacement text.
-///
-/// Until DTDs are understood and parsed, these are the only named entities
-/// supported by this parser.
-const entities = ComptimeStringMap([]const u8, .{
- .{ "amp", "&" },
- .{ "lt", "<" },
- .{ "gt", ">" },
- .{ "apos", "'" },
- .{ "quot", "\"" },
-});
-
-const xml_ns = "http://www.w3.org/XML/1998/namespace";
-const xmlns_ns = "http://www.w3.org/2000/xmlns/";
-
-const predefined_ns_prefixes = ComptimeStringMap([]const u8, .{
- .{ "xml", xml_ns },
- .{ "xmlns", xmlns_ns },
-});
-
-/// A context for namespace information in a document.
-///
-/// The context maintains a hierarchy of namespace scopes. Initially, there is
-/// no active scope (corresponding to the beginning of a document, before the
-/// start of the root element).
-pub const NamespaceContext = struct {
- scopes: ArrayListUnmanaged(StringHashMapUnmanaged([]const u8)) = .{},
-
- pub const Error = error{
- CannotUndeclareNsPrefix,
- InvalidNsBinding,
- InvalidQName,
- UndeclaredNsPrefix,
- QNameNotAllowed,
- };
-
- pub fn deinit(self: *NamespaceContext, allocator: Allocator) void {
- while (self.scopes.items.len > 0) {
- self.endScope(allocator);
- }
- self.scopes.deinit(allocator);
- self.* = undefined;
- }
-
- /// Starts a new scope.
- pub fn startScope(self: *NamespaceContext, allocator: Allocator) !void {
- try self.scopes.append(allocator, .{});
- }
-
- /// Ends the current scope.
- ///
- /// Only valid if there is a current scope.
- pub fn endScope(self: *NamespaceContext, allocator: Allocator) void {
- var bindings = self.scopes.pop();
- var iter = bindings.iterator();
- while (iter.next()) |entry| {
- allocator.free(entry.key_ptr.*);
- allocator.free(entry.value_ptr.*);
- }
- bindings.deinit(allocator);
- }
-
- /// Binds the default namespace in the current scope.
- ///
- /// Only valid if there is a current scope.
- pub fn bindDefault(self: *NamespaceContext, allocator: Allocator, uri: []const u8) !void {
- if (mem.eql(u8, uri, xml_ns) or mem.eql(u8, uri, xmlns_ns)) {
- return error.InvalidNsBinding;
- }
- try self.bindInner(allocator, "", uri);
- }
-
- /// Binds a prefix in the current scope.
- ///
- /// Only valid if there is a current scope.
- pub fn bindPrefix(self: *NamespaceContext, allocator: Allocator, prefix: []const u8, uri: []const u8) !void {
- if (!syntax.isNcName(prefix)) {
- return error.InvalidQName;
- }
- if (mem.eql(u8, prefix, "xml") and !mem.eql(u8, uri, xml_ns)) {
- return error.InvalidNsBinding;
- }
- if (mem.eql(u8, uri, xml_ns) and !mem.eql(u8, prefix, "xml")) {
- return error.InvalidNsBinding;
- }
- if (mem.eql(u8, prefix, "xmlns")) {
- return error.InvalidNsBinding;
- }
- if (mem.eql(u8, uri, xmlns_ns) and !mem.eql(u8, prefix, "xmlns")) {
- return error.InvalidNsBinding;
- }
- try self.bindInner(allocator, prefix, uri);
- }
-
- fn bindInner(self: *NamespaceContext, allocator: Allocator, prefix: []const u8, uri: []const u8) !void {
- // TODO: validate that uri is a valid URI reference
- if (prefix.len != 0 and uri.len == 0) {
- return error.CannotUndeclareNsPrefix;
- }
- var bindings = &self.scopes.items[self.scopes.items.len - 1];
- const key = try allocator.dupe(u8, prefix);
- errdefer allocator.free(key);
- const value = try allocator.dupe(u8, uri);
- errdefer allocator.free(value);
- // We cannot clobber an existing prefix in this scope because that
- // would imply a duplicate attribute name, which is validated earlier.
- try bindings.putNoClobber(allocator, key, value);
- }
-
- /// Returns the URI, if any, bound to the given prefix.
- pub fn getUri(self: NamespaceContext, prefix: []const u8) ?[]const u8 {
- if (predefined_ns_prefixes.get(prefix)) |uri| {
- return uri;
- }
- return for (0..self.scopes.items.len) |i| {
- if (self.scopes.items[self.scopes.items.len - i - 1].get(prefix)) |uri| {
- break if (uri.len > 0) uri else null;
- }
- } else null;
- }
-
- /// Parses a possibly prefixed name and returns the corresponding `QName`.
- ///
- /// `use_default_ns` specifies if the default namespace (if any) should be
- /// implied for the given name if it is unprefixed. This is appropriate for
- /// element names but not attribute names, per the namespaces spec.
- pub fn parseName(self: NamespaceContext, name: []const u8, use_default_ns: bool) !QName {
- if (mem.indexOfScalar(u8, name, ':')) |sep_pos| {
- const prefix = name[0..sep_pos];
- const local = name[sep_pos + 1 ..];
- if (!syntax.isNcName(prefix) or !syntax.isNcName(local)) {
- return error.InvalidQName;
- }
- const ns = self.getUri(prefix) orelse return error.UndeclaredNsPrefix;
- return .{ .prefix = prefix, .ns = ns, .local = local };
- } else if (use_default_ns) {
- return .{ .ns = self.getUri(""), .local = name };
- } else {
- return .{ .local = name };
- }
- }
-};
-
-/// A drop-in replacement for `NamespaceContext` which doesn't actually do any
-/// namespace processing.
-pub const NoOpNamespaceContext = struct {
- pub const Error = error{};
-
- pub inline fn deinit(_: *NoOpNamespaceContext, _: Allocator) void {}
-
- pub inline fn startScope(_: *NoOpNamespaceContext, _: Allocator) !void {}
-
- pub inline fn endScope(_: *NoOpNamespaceContext, _: Allocator) void {}
-
- pub inline fn bindDefault(_: *NoOpNamespaceContext, _: Allocator, _: []const u8) !void {}
-
- pub inline fn bindPrefix(_: *NoOpNamespaceContext, _: Allocator, _: []const u8, _: []const u8) !void {}
-
- pub inline fn getUri(_: NoOpNamespaceContext, _: []const u8) ?[]const u8 {
- return null;
- }
-
- pub inline fn parseName(_: NoOpNamespaceContext, name: []const u8, _: bool) !QName {
- return .{ .local = name };
- }
-};
-
-/// Returns a `Reader` wrapping a `std.io.Reader`.
-pub fn reader(
- allocator: Allocator,
- r: anytype,
- comptime options: ReaderOptions,
-) Reader(@TypeOf(r), options) {
- return Reader(@TypeOf(r), options).init(allocator, r, .{});
-}
-
-/// Reads a full XML document from a `std.io.Reader`.
-pub fn readDocument(
- allocator: Allocator,
- r: anytype,
- comptime options: ReaderOptions,
-) !OwnedValue(Node.Document) {
- var arena = ArenaAllocator.init(allocator);
- errdefer arena.deinit();
- const node_allocator = arena.allocator();
-
- var decl_version: []const u8 = "1.0";
- var decl_encoding: ?[]const u8 = null;
- var decl_standalone: ?bool = null;
- var children = ArrayListUnmanaged(Node){};
-
- var xml_reader = reader(allocator, r, options);
- defer xml_reader.deinit();
- while (try xml_reader.next()) |event| {
- switch (event) {
- .xml_declaration => |xml_declaration| {
- decl_version = try node_allocator.dupe(u8, xml_declaration.version);
- if (xml_declaration.encoding) |e| {
- decl_encoding = try node_allocator.dupe(u8, e);
- }
- decl_standalone = xml_declaration.standalone;
- },
- .element_start => |element_start| try children.append(node_allocator, .{
- .element = try xml_reader.nextElementNode(node_allocator, element_start),
- }),
- .comment => |comment| try children.append(node_allocator, .{ .comment = .{
- .content = try node_allocator.dupe(u8, comment.content),
- } }),
- .pi => |pi| try children.append(node_allocator, .{ .pi = .{
- .target = try node_allocator.dupe(u8, pi.target),
- .content = try node_allocator.dupe(u8, pi.content),
- } }),
- else => unreachable,
- }
- }
-
- return .{
- .value = .{
- .version = decl_version,
- .encoding = decl_encoding,
- .standalone = decl_standalone,
- .children = children.items,
- },
- .arena = arena,
- };
-}
-
-/// Options for a `Reader`.
-pub const ReaderOptions = struct {
- /// The type of decoder to use.
- DecoderType: type = encoding.DefaultDecoder,
- /// The size of the internal buffer.
- ///
- /// This limits the byte length of "non-splittable" content, such as
- /// element and attribute names. Longer such content will result in
- /// `error.Overflow`.
- buffer_size: usize = 4096,
- /// Whether to normalize line endings and attribute values according to the
- /// XML specification.
- ///
- /// If this is set to false, no normalization will be done: for example,
- /// the line ending sequence `\r\n` will appear as-is in returned events
- /// rather than the normalized `\n`.
- enable_normalization: bool = true,
- /// Whether namespace information should be processed.
- ///
- /// If this is false, then `QName`s in the returned events will have only
- /// their `local` field populated, containing the full name of the element
- /// or attribute.
- namespace_aware: bool = true,
- /// Whether to keep track of the current location in the document.
- track_location: bool = false,
-};
-
-/// A streaming, pull-based XML parser wrapping a `std.io.Reader`.
-///
-/// This parser behaves similarly to Go's `encoding/xml` package. It is a
-/// higher-level abstraction over a `TokenReader` which uses an internal
-/// allocator to keep track of additional context. It performs additional
-/// well-formedness checks which the lower-level parsers cannot perform due to
-/// their design, such as ensuring element start and end tags match and
-/// attribute names are not duplicated. It is also able to process namespace
-/// information.
-///
-/// Since this parser wraps a `TokenReader`, the caveats on the `buffer_size`
-/// bounding the length of "non-splittable" content which are outlined in its
-/// documentation apply here as well.
-pub fn Reader(comptime ReaderType: type, comptime options: ReaderOptions) type {
- return struct {
- token_reader: TokenReaderType,
- /// A stack of element names enclosing the current context.
- element_names: ArrayListUnmanaged([]u8) = .{},
- /// The namespace context of the reader.
- namespace_context: NamespaceContextType = .{},
- /// A pending token which has been read but has not yet been handled as
- /// part of an event.
- pending_token: ?Token = null,
- /// A buffer for storing encoded Unicode codepoint data.
- codepoint_buf: [max_encoded_codepoint_len]u8 = undefined,
- /// A "buffer" for handling the contents of the next pending event.
- pending_event: union(enum) {
- none,
- element_start: struct {
- name: []const u8,
- attributes: StringArrayHashMapUnmanaged(ArrayListUnmanaged(u8)) = .{},
- },
- comment: struct { content: ArrayListUnmanaged(u8) = .{} },
- pi: struct { target: []const u8, content: ArrayListUnmanaged(u8) = .{} },
- } = .none,
- /// An arena to store memory for `pending_event` (and the event after
- /// it's returned).
- event_arena: ArenaAllocator,
- allocator: Allocator,
-
- const Self = @This();
- const TokenReaderType = TokenReader(ReaderType, .{
- .DecoderType = options.DecoderType,
- .buffer_size = options.buffer_size,
- .enable_normalization = options.enable_normalization,
- .track_location = options.track_location,
- });
- const NamespaceContextType = if (options.namespace_aware) NamespaceContext else NoOpNamespaceContext;
-
- pub const Error = error{
- DuplicateAttribute,
- MismatchedEndTag,
- UndeclaredEntityReference,
- } || Allocator.Error || TokenReaderType.Error || NamespaceContextType.Error;
-
- pub fn init(allocator: Allocator, r: ReaderType, decoder: options.DecoderType) Self {
- return .{
- .token_reader = TokenReaderType.init(r, decoder),
- .event_arena = ArenaAllocator.init(allocator),
- .allocator = allocator,
- };
- }
-
- pub fn deinit(self: *Self) void {
- for (self.element_names.items) |name| {
- self.allocator.free(name);
- }
- self.element_names.deinit(self.allocator);
- self.namespace_context.deinit(self.allocator);
- self.event_arena.deinit();
- self.* = undefined;
- }
-
- /// Returns the next event from the input.
- ///
- /// The returned event is only valid until the next reader operation.
- pub fn next(self: *Self) Error!?Event {
- _ = self.event_arena.reset(.retain_capacity);
- const event_allocator = self.event_arena.allocator();
- while (true) {
- switch (try self.nextToken()) {
- .eof => return null,
- .xml_declaration => return .{ .xml_declaration = .{
- .version = self.token_reader.token_data.xml_declaration.version,
- .encoding = self.token_reader.token_data.xml_declaration.encoding,
- .standalone = self.token_reader.token_data.xml_declaration.standalone,
- } },
- .element_start => {
- if (try self.finalizePendingEvent()) |event| {
- self.pending_token = .element_start;
- return event;
- }
- const name = try self.allocator.dupe(u8, self.token_reader.token_data.element_start.name);
- errdefer self.allocator.free(name);
- try self.element_names.append(self.allocator, name);
- errdefer _ = self.element_names.pop();
- try self.namespace_context.startScope(self.allocator);
- self.pending_event = .{ .element_start = .{ .name = name } };
- },
- .element_content => {
- if (try self.finalizePendingEvent()) |event| {
- self.pending_token = .element_content;
- return event;
- }
- return .{ .element_content = .{
- .content = try self.contentText(self.token_reader.token_data.element_content.content),
- } };
- },
- .element_end => {
- if (try self.finalizePendingEvent()) |event| {
- self.pending_token = .element_end;
- return event;
- }
- const expected_name = self.element_names.pop();
- defer self.allocator.free(expected_name);
- if (!mem.eql(u8, expected_name, self.token_reader.token_data.element_end.name)) {
- return error.MismatchedEndTag;
- }
- var qname = try self.namespace_context.parseName(self.token_reader.token_data.element_end.name, true);
- try qname.dupNs(event_allocator);
- self.namespace_context.endScope(self.allocator);
- return .{ .element_end = .{ .name = qname } };
- },
- .element_end_empty => {
- if (try self.finalizePendingEvent()) |event| {
- self.pending_token = .element_end_empty;
- return event;
- }
- const name = self.element_names.pop();
- defer self.allocator.free(name);
- const dup_name = try event_allocator.dupe(u8, name);
- var qname = try self.namespace_context.parseName(dup_name, true);
- try qname.dupNs(event_allocator);
- self.namespace_context.endScope(self.allocator);
- return .{ .element_end = .{ .name = qname } };
- },
- .attribute_start => {
- const attr_entry = try self.pending_event.element_start.attributes.getOrPut(
- event_allocator,
- self.token_reader.token_data.attribute_start.name,
- );
- if (attr_entry.found_existing) {
- return error.DuplicateAttribute;
- }
- // The attribute name will be invalidated after we get
- // the next token, so we have to duplicate it here.
- // This doesn't change the hash of the key, so it's
- // safe to do this.
- attr_entry.key_ptr.* = try event_allocator.dupe(u8, self.token_reader.token_data.attribute_start.name);
- attr_entry.value_ptr.* = .{};
- },
- .attribute_content => {
- const attributes = self.pending_event.element_start.attributes.values();
- try attributes[attributes.len - 1].appendSlice(event_allocator, try self.contentText(self.token_reader.token_data.attribute_content.content));
- },
- .comment_start => {
- if (try self.finalizePendingEvent()) |event| {
- self.pending_token = .comment_start;
- return event;
- }
- self.pending_event = .{ .comment = .{} };
- },
- .comment_content => {
- try self.pending_event.comment.content.appendSlice(event_allocator, self.token_reader.token_data.comment_content.content);
- if (self.token_reader.token_data.comment_content.final) {
- const event = Event{ .comment = .{ .content = self.pending_event.comment.content.items } };
- self.pending_event = .none;
- return event;
- }
- },
- .pi_start => {
- if (try self.finalizePendingEvent()) |event| {
- self.pending_token = .pi_start;
- return event;
- }
- if (options.namespace_aware and mem.indexOfScalar(u8, self.token_reader.token_data.pi_start.target, ':') != null) {
- return error.QNameNotAllowed;
- }
- self.pending_event = .{ .pi = .{
- .target = try event_allocator.dupe(u8, self.token_reader.token_data.pi_start.target),
- } };
- },
- .pi_content => {
- try self.pending_event.pi.content.appendSlice(event_allocator, self.token_reader.token_data.pi_content.content);
- if (self.token_reader.token_data.pi_content.final) {
- const event = Event{ .pi = .{
- .target = self.pending_event.pi.target,
- .content = self.pending_event.pi.content.items,
- } };
- self.pending_event = .none;
- return event;
- }
- },
- }
- }
- }
-
- fn nextToken(self: *Self) !Token {
- if (self.pending_token) |token| {
- self.pending_token = null;
- return token;
- }
- return try self.token_reader.next();
- }
-
- fn finalizePendingEvent(self: *Self) !?Event {
- const event_allocator = self.event_arena.allocator();
- switch (self.pending_event) {
- .none => return null,
- .element_start => |element_start| {
- // Bind all xmlns declarations in the current element
- for (element_start.attributes.keys(), element_start.attributes.values()) |attr_name, attr_value| {
- if (mem.eql(u8, attr_name, "xmlns")) {
- try self.namespace_context.bindDefault(self.allocator, attr_value.items);
- } else if (mem.startsWith(u8, attr_name, "xmlns:")) {
- try self.namespace_context.bindPrefix(self.allocator, attr_name["xmlns:".len..], attr_value.items);
- }
- }
-
- // Convert the element and attribute names to QNames
- const qname = try self.namespace_context.parseName(element_start.name, true);
- var attributes = ArrayListUnmanaged(Event.Attribute){};
- try attributes.ensureTotalCapacity(event_allocator, element_start.attributes.count());
- // When namespaces are enabled, we need to check uniqueness
- // of attribute QNames according to the namespaces spec
- var attr_qnames = if (options.namespace_aware) QNameSet{};
- if (options.namespace_aware) {
- try attr_qnames.ensureTotalCapacity(event_allocator, @intCast(element_start.attributes.count()));
- }
- for (element_start.attributes.keys(), element_start.attributes.values()) |attr_name, attr_value| {
- const attr_qname = try self.namespace_context.parseName(attr_name, false);
- attributes.appendAssumeCapacity(.{ .name = attr_qname, .value = attr_value.items });
- if (options.namespace_aware) {
- const entry = attr_qnames.getOrPutAssumeCapacity(attr_qname);
- if (entry.found_existing) {
- return error.DuplicateAttribute;
- }
- }
- }
-
- self.pending_event = .none;
- return .{ .element_start = .{ .name = qname, .attributes = attributes.items } };
- },
- // Other pending events will have already been handled by
- // looking at the 'final' content event
- else => unreachable,
- }
- }
-
- fn contentText(self: *Self, content: Token.Content) ![]const u8 {
- return switch (content) {
- .text => |text| text,
- .codepoint => |codepoint| text: {
- const len = unicode.utf8Encode(codepoint, &self.codepoint_buf) catch unreachable;
- break :text self.codepoint_buf[0..len];
- },
- .entity => |entity| entities.get(entity) orelse return error.UndeclaredEntityReference,
- };
- }
-
- pub fn nextNode(self: *Self, allocator: Allocator, element_start: Event.ElementStart) Error!OwnedValue(Node.Element) {
- var arena = ArenaAllocator.init(allocator);
- errdefer arena.deinit();
- return .{
- .value = try self.nextElementNode(arena.allocator(), element_start),
- .arena = arena,
- };
- }
-
- fn nextElementNode(self: *Self, allocator: Allocator, element_start: Event.ElementStart) Error!Node.Element {
- const name = try element_start.name.clone(allocator);
- var element_children = ArrayListUnmanaged(Node){};
- try element_children.ensureTotalCapacity(allocator, element_start.attributes.len);
- for (element_start.attributes) |attr| {
- element_children.appendAssumeCapacity(.{ .attribute = .{
- .name = try attr.name.clone(allocator),
- .value = try allocator.dupe(u8, attr.value),
- } });
- }
- var current_content = ArrayListUnmanaged(u8){};
- while (try self.next()) |event| {
- if (event != .element_content and current_content.items.len > 0) {
- try element_children.append(allocator, .{ .text = .{ .content = current_content.items } });
- current_content = .{};
- }
- switch (event) {
- .xml_declaration => unreachable,
- .element_start => |sub_element_start| try element_children.append(allocator, .{
- .element = try self.nextElementNode(allocator, sub_element_start),
- }),
- .element_content => |element_content| try current_content.appendSlice(allocator, element_content.content),
- .element_end => return .{ .name = name, .children = element_children.items },
- .comment => |comment| try element_children.append(allocator, .{ .comment = .{
- .content = try allocator.dupe(u8, comment.content),
- } }),
- .pi => |pi| try element_children.append(allocator, .{ .pi = .{
- .target = try allocator.dupe(u8, pi.target),
- .content = try allocator.dupe(u8, pi.content),
- } }),
- }
- }
- unreachable;
- }
-
- /// Returns an iterator over the remaining children of the current
- /// element.
- ///
- /// Note that, since the returned iterator's `next` function calls the
- /// `next` function of this reader internally, such calls will
- /// invalidate any event returned prior to calling this function.
- pub fn children(self: *Self) Children(Self) {
- return .{ .reader = self, .start_depth = self.element_names.items.len };
- }
- };
-}
-
-fn Children(comptime ReaderType: type) type {
- return struct {
- reader: *ReaderType,
- start_depth: usize,
-
- const Self = @This();
-
- /// Returns the next event.
- ///
- /// This function must not be called after it initially returns null.
- pub fn next(self: Self) ReaderType.Error!?Event {
- return switch (try self.reader.next() orelse return null) {
- .element_end => |element_end| if (self.reader.element_names.items.len >= self.start_depth) .{ .element_end = element_end } else null,
- else => |event| event,
- };
- }
-
- /// Returns an iterator over the remaining children of the current
- /// element.
- ///
- /// This may not be used after `next` returns null.
- pub fn children(self: Self) Self {
- return self.reader.children();
- }
-
- /// Skips the remaining children.
- ///
- /// `next` and `children` must not be used after this.
- pub fn skip(self: Self) ReaderType.Error!void {
- while (try self.next()) |_| {}
- }
- };
-}
-
-test Reader {
- try testValid(.{},
- \\
- \\
- \\
- \\
- \\
- \\ Hello,
- \\
- \\
- \\ Text content goes here.
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = "1.0" } },
- .{ .pi = .{ .target = "some-pi", .content = "" } },
- .{ .comment = .{ .content = " A processing instruction with content follows " } },
- .{ .pi = .{ .target = "some-pi-with-content", .content = "content" } },
- .{ .element_start = .{ .name = .{ .local = "root" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .local = "p" }, .attributes = &.{
- .{ .name = .{ .local = "class" }, .value = "test" },
- } } },
- .{ .element_content = .{ .content = "Hello, " } },
- .{ .element_content = .{ .content = "world!" } },
- .{ .element_end = .{ .name = .{ .local = "p" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .local = "line" } } },
- .{ .element_end = .{ .name = .{ .local = "line" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .pi = .{ .target = "another-pi", .content = "" } },
- .{ .element_content = .{ .content = "\n Text content goes here.\n " } },
- .{ .element_start = .{ .name = .{ .local = "div" } } },
- .{ .element_start = .{ .name = .{ .local = "p" } } },
- .{ .element_content = .{ .content = "&" } },
- .{ .element_end = .{ .name = .{ .local = "p" } } },
- .{ .element_end = .{ .name = .{ .local = "div" } } },
- .{ .element_content = .{ .content = "\n" } },
- .{ .element_end = .{ .name = .{ .local = "root" } } },
- .{ .comment = .{ .content = " Comments are allowed after the end of the root element " } },
- .{ .pi = .{ .target = "comment", .content = "So are PIs " } },
- });
-}
-
-test "tag name matching" {
- try testInvalid(.{}, "", error.MismatchedEndTag);
- try testInvalid(.{}, "", error.MismatchedEndTag);
- try testInvalid(.{}, "Some contentMore content", error.MismatchedEndTag);
-}
-
-test "namespace handling" {
- try testValid(.{},
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- , &.{
- .{ .element_start = .{ .name = .{ .prefix = "a", .ns = "urn:1", .local = "root" }, .attributes = &.{
- .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:1" },
- } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .ns = "urn:2", .local = "child" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns" }, .value = "urn:2" },
- .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "b" }, .value = "urn:3" },
- .{ .name = .{ .local = "attr" }, .value = "value" },
- } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "child" }, .attributes = &.{
- .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:4" },
- .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "attr" }, .value = "value" },
- } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .prefix = "a", .ns = "urn:4", .local = "child" } } },
- .{ .element_end = .{ .name = .{ .prefix = "a", .ns = "urn:4", .local = "child" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_end = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "child" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_end = .{ .name = .{ .ns = "urn:2", .local = "child" } } },
- .{ .element_content = .{ .content = "\n" } },
- .{ .element_end = .{ .name = .{ .prefix = "a", .ns = "urn:1", .local = "root" } } },
- });
- try testInvalid(.{}, "", error.UndeclaredNsPrefix);
- try testInvalid(.{}, "<: />", error.InvalidQName);
- try testInvalid(.{}, "", error.InvalidQName);
- try testInvalid(.{}, "<:a />", error.InvalidQName);
- try testInvalid(.{}, "", error.InvalidQName);
- try testInvalid(.{}, "", error.InvalidQName);
- try testInvalid(.{}, "", error.InvalidQName);
- try testInvalid(.{}, "", error.DuplicateAttribute);
- try testInvalid(.{}, "", error.DuplicateAttribute);
- try testInvalid(.{}, "", error.DuplicateAttribute);
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testValid(.{}, "", &.{
- .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{
- .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "xml" }, .value = "http://www.w3.org/XML/1998/namespace" },
- } } },
- .{ .element_end = .{ .name = .{ .local = "root" } } },
- });
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testInvalid(.{}, "", error.InvalidNsBinding);
- try testInvalid(.{}, "", error.QNameNotAllowed);
-
- try testValid(.{ .namespace_aware = false },
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- , &.{
- .{ .element_start = .{ .name = .{ .local = "a:root" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns:a" }, .value = "urn:1" },
- } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .local = "child" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns" }, .value = "urn:2" },
- .{ .name = .{ .local = "xmlns:b" }, .value = "urn:3" },
- .{ .name = .{ .local = "attr" }, .value = "value" },
- } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .local = "b:child" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns:a" }, .value = "urn:4" },
- .{ .name = .{ .local = "b:attr" }, .value = "value" },
- } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_start = .{ .name = .{ .local = "a:child" } } },
- .{ .element_end = .{ .name = .{ .local = "a:child" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_end = .{ .name = .{ .local = "b:child" } } },
- .{ .element_content = .{ .content = "\n " } },
- .{ .element_end = .{ .name = .{ .local = "child" } } },
- .{ .element_content = .{ .content = "\n" } },
- .{ .element_end = .{ .name = .{ .local = "a:root" } } },
- });
- try testValid(.{ .namespace_aware = false }, "", &.{
- .{ .element_start = .{ .name = .{ .local = "a:root" } } },
- .{ .element_end = .{ .name = .{ .local = "a:root" } } },
- });
- try testValid(.{ .namespace_aware = false }, "<: />", &.{
- .{ .element_start = .{ .name = .{ .local = ":" } } },
- .{ .element_end = .{ .name = .{ .local = ":" } } },
- });
- try testValid(.{ .namespace_aware = false }, "", &.{
- .{ .element_start = .{ .name = .{ .local = "a:" } } },
- .{ .element_end = .{ .name = .{ .local = "a:" } } },
- });
- try testValid(.{ .namespace_aware = false }, "<:a />", &.{
- .{ .element_start = .{ .name = .{ .local = ":a" } } },
- .{ .element_end = .{ .name = .{ .local = ":a" } } },
- });
- try testValid(.{ .namespace_aware = false }, "", &.{
- .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns:" }, .value = "urn:1" },
- } } },
- .{ .element_end = .{ .name = .{ .local = "root" } } },
- });
- try testValid(.{ .namespace_aware = false }, "", &.{
- .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns::" }, .value = "urn:1" },
- } } },
- .{ .element_end = .{ .name = .{ .local = "root" } } },
- });
- try testValid(.{ .namespace_aware = false }, "", &.{
- .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{
- .{ .name = .{ .local = "xmlns:a:b" }, .value = "urn:1" },
- } } },
- .{ .element_end = .{ .name = .{ .local = "root" } } },
- });
- try testInvalid(.{ .namespace_aware = false }, "", error.DuplicateAttribute);
- try testInvalid(.{ .namespace_aware = false }, "", error.DuplicateAttribute);
- try testValid(.{ .namespace_aware = false }, "", &.{
- .{ .element_start = .{ .name = .{ .local = "root" } } },
- .{ .pi = .{ .target = "ns:pi", .content = "" } },
- .{ .element_end = .{ .name = .{ .local = "root" } } },
- });
-}
-
-fn testValid(comptime options: ReaderOptions, input: []const u8, expected_events: []const Event) !void {
- var input_stream = std.io.fixedBufferStream(input);
- var input_reader = reader(testing.allocator, input_stream.reader(), options);
- defer input_reader.deinit();
- var i: usize = 0;
- while (try input_reader.next()) |event| : (i += 1) {
- if (i >= expected_events.len) {
- std.debug.print("Unexpected event after end: {}\n", .{event});
- return error.TestFailed;
- }
- testing.expectEqualDeep(expected_events[i], event) catch |e| {
- std.debug.print("(at index {})\n", .{i});
- return e;
- };
- }
- if (i != expected_events.len) {
- std.debug.print("Expected {} events, found {}\n", .{ expected_events.len, i });
- return error.TestFailed;
- }
-}
-
-fn testInvalid(comptime options: ReaderOptions, input: []const u8, expected_error: anyerror) !void {
- var input_stream = std.io.fixedBufferStream(input);
- var input_reader = reader(testing.allocator, input_stream.reader(), options);
- defer input_reader.deinit();
- while (input_reader.next()) |_| {} else |err| {
- try testing.expectEqual(expected_error, err);
- }
-}
-
-test "nextNode" {
- var input_stream = std.io.fixedBufferStream(
- \\
- \\
- \\
- \\
- \\
- \\ Hello,
- \\
- \\
- \\ Text content goes here.
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- );
- var input_reader = reader(testing.allocator, input_stream.reader(), .{});
- defer input_reader.deinit();
-
- try testing.expectEqualDeep(@as(?Event, .{ .xml_declaration = .{ .version = "1.0" } }), try input_reader.next());
- try testing.expectEqualDeep(@as(?Event, .{ .pi = .{ .target = "some-pi", .content = "" } }), try input_reader.next());
- try testing.expectEqualDeep(@as(?Event, .{ .comment = .{ .content = " A processing instruction with content follows " } }), try input_reader.next());
- try testing.expectEqualDeep(@as(?Event, .{ .pi = .{ .target = "some-pi-with-content", .content = "content" } }), try input_reader.next());
-
- const root_start = try input_reader.next();
- try testing.expect(root_start != null and root_start.? == .element_start);
- var root_node = try input_reader.nextNode(testing.allocator, root_start.?.element_start);
- defer root_node.deinit();
- try testing.expectEqualDeep(Node.Element{ .name = .{ .local = "root" }, .children = &.{
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .local = "p" }, .children = &.{
- .{ .attribute = .{ .name = .{ .local = "class" }, .value = "test" } },
- .{ .text = .{ .content = "Hello, world!" } },
- } } },
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .local = "line" }, .children = &.{} } },
- .{ .text = .{ .content = "\n " } },
- .{ .pi = .{ .target = "another-pi", .content = "" } },
- .{ .text = .{ .content = "\n Text content goes here.\n " } },
- .{ .element = .{ .name = .{ .local = "div" }, .children = &.{
- .{ .element = .{ .name = .{ .local = "p" }, .children = &.{
- .{ .text = .{ .content = "&" } },
- } } },
- } } },
- .{ .text = .{ .content = "\n" } },
- } }, root_node.value);
-
- try testing.expectEqualDeep(@as(?Event, .{ .comment = .{ .content = " Comments are allowed after the end of the root element " } }), try input_reader.next());
- try testing.expectEqualDeep(@as(?Event, .{ .pi = .{ .target = "comment", .content = "So are PIs " } }), try input_reader.next());
- try testing.expect(try input_reader.next() == null);
-}
-
-test "nextNode namespace handling" {
- var input_stream = std.io.fixedBufferStream(
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- );
- var input_reader = reader(testing.allocator, input_stream.reader(), .{});
- defer input_reader.deinit();
-
- const root_start = try input_reader.next();
- try testing.expect(root_start != null and root_start.? == .element_start);
- var root_node = try input_reader.nextNode(testing.allocator, root_start.?.element_start);
- defer root_node.deinit();
- try testing.expectEqualDeep(Node.Element{ .name = .{ .prefix = "a", .ns = "urn:1", .local = "root" }, .children = &.{
- .{ .attribute = .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:1" } },
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .ns = "urn:2", .local = "child" }, .children = &.{
- .{ .attribute = .{ .name = .{ .local = "xmlns" }, .value = "urn:2" } },
- .{ .attribute = .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "b" }, .value = "urn:3" } },
- .{ .attribute = .{ .name = .{ .local = "attr" }, .value = "value" } },
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "child" }, .children = &.{
- .{ .attribute = .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:4" } },
- .{ .attribute = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "attr" }, .value = "value" } },
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .prefix = "a", .ns = "urn:4", .local = "child" } } },
- .{ .text = .{ .content = "\n " } },
- } } },
- .{ .text = .{ .content = "\n " } },
- } } },
- .{ .text = .{ .content = "\n" } },
- } }, root_node.value);
-}
-
-test readDocument {
- var input_stream = std.io.fixedBufferStream(
- \\
- \\
- \\
- \\
- \\
- \\ Hello,
- \\
- \\
- \\ Text content goes here.
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- );
- var document_node = try readDocument(testing.allocator, input_stream.reader(), .{});
- defer document_node.deinit();
-
- try testing.expectEqualDeep(Node.Document{ .version = "1.0", .children = &.{
- .{ .pi = .{ .target = "some-pi", .content = "" } },
- .{ .comment = .{ .content = " A processing instruction with content follows " } },
- .{ .pi = .{ .target = "some-pi-with-content", .content = "content" } },
- .{ .element = .{ .name = .{ .local = "root" }, .children = &.{
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .local = "p" }, .children = &.{
- .{ .attribute = .{ .name = .{ .local = "class" }, .value = "test" } },
- .{ .text = .{ .content = "Hello, world!" } },
- } } },
- .{ .text = .{ .content = "\n " } },
- .{ .element = .{ .name = .{ .local = "line" }, .children = &.{} } },
- .{ .text = .{ .content = "\n " } },
- .{ .pi = .{ .target = "another-pi", .content = "" } },
- .{ .text = .{ .content = "\n Text content goes here.\n " } },
- .{ .element = .{ .name = .{ .local = "div" }, .children = &.{
- .{ .element = .{ .name = .{ .local = "p" }, .children = &.{
- .{ .text = .{ .content = "&" } },
- } } },
- } } },
- .{ .text = .{ .content = "\n" } },
- } } },
- .{ .comment = .{ .content = " Comments are allowed after the end of the root element " } },
- .{ .pi = .{ .target = "comment", .content = "So are PIs " } },
- } }, document_node.value);
-}
-
-test Children {
- var input_stream = std.io.fixedBufferStream(
- \\
- \\ Hello, world!
- \\ Some content.
- \\
- \\
- );
- var input_reader = reader(testing.allocator, input_stream.reader(), .{});
- defer input_reader.deinit();
-
- try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "root" } } }), try input_reader.next());
- const root_children = input_reader.children();
- try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "\n Hello, world!\n " } }), try root_children.next());
- try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "child1" }, .attributes = &.{
- .{ .name = .{ .local = "attr" }, .value = "value" },
- } } }), try root_children.next());
- const child1_children = root_children.children();
- try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "Some content." } }), try child1_children.next());
- try testing.expectEqual(@as(?Event, null), try child1_children.next());
- try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "\n " } }), try root_children.next());
- try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "child2" } } }), try root_children.next());
- const child2_children = root_children.children();
- try testing.expectEqualDeep(@as(?Event, .{ .comment = .{ .content = " Comment " } }), try child2_children.next());
- try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "child3" } } }), try child2_children.next());
- const child3_children = child2_children.children();
- try testing.expectEqual(@as(?Event, null), try child3_children.next());
- try testing.expectEqual(@as(?Event, null), try child2_children.next());
- try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "\n" } }), try root_children.next());
- try testing.expectEqual(@as(?Event, null), try root_children.next());
-}
-
-test "skip children" {
- var input_stream = std.io.fixedBufferStream(
- \\
- \\ Hello, world!
- \\ Some content.
- \\
- \\
- );
- var input_reader = reader(testing.allocator, input_stream.reader(), .{});
- defer input_reader.deinit();
-
- try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "root" } } }), try input_reader.next());
- const root_children = input_reader.children();
- try root_children.skip();
- try testing.expectEqual(@as(?Event, null), try input_reader.next());
-}
diff --git a/src/syntax.zig b/src/syntax.zig
deleted file mode 100644
index 9f16250..0000000
--- a/src/syntax.zig
+++ /dev/null
@@ -1,106 +0,0 @@
-const std = @import("std");
-const unicode = std.unicode;
-
-pub inline fn isChar(c: u21) bool {
- return switch (c) {
- '\t', '\r', '\n', ' '...0xD7FF, 0xE000...0xFFFD, 0x10000...0x10FFFF => true,
- else => false,
- };
-}
-
-pub inline fn isSpace(c: u21) bool {
- return switch (c) {
- ' ', '\t', '\r', '\n' => true,
- else => false,
- };
-}
-
-pub inline fn isDigit(c: u21) bool {
- return switch (c) {
- '0'...'9' => true,
- else => false,
- };
-}
-
-/// Note: only valid if `isDigit` returns true.
-pub inline fn digitValue(c: u21) u4 {
- return @intCast(c - '0');
-}
-
-pub inline fn isHexDigit(c: u21) bool {
- return switch (c) {
- '0'...'9', 'a'...'f', 'A'...'F' => true,
- else => false,
- };
-}
-
-/// Note: only valid if `isHexDigit` returns true.
-pub inline fn hexDigitValue(c: u21) u4 {
- return switch (c) {
- 'a'...'f' => @intCast(c - 'a' + 10),
- 'A'...'F' => @intCast(c - 'A' + 10),
- else => @intCast(c - '0'),
- };
-}
-
-/// Checks if `s` matches `NCName` from the namespaces spec.
-///
-/// Note: only valid if `s` is valid UTF-8.
-pub fn isNcName(s: []const u8) bool {
- var view = unicode.Utf8View.initUnchecked(s);
- var iter = view.iterator();
- const first_c = iter.nextCodepoint() orelse return false;
- if (first_c == ':' or !isNameStartChar(first_c)) {
- return false;
- }
- while (iter.nextCodepoint()) |c| {
- if (c == ':' or !isNameChar(c)) {
- return false;
- }
- }
- return true;
-}
-
-pub inline fn isNameStartChar(c: u21) bool {
- return switch (c) {
- ':',
- 'A'...'Z',
- '_',
- 'a'...'z',
- 0xC0...0xD6,
- 0xD8...0xF6,
- 0xF8...0x2FF,
- 0x370...0x37D,
- 0x37F...0x1FFF,
- 0x200C...0x200D,
- 0x2070...0x218F,
- 0x2C00...0x2FEF,
- 0x3001...0xD7FF,
- 0xF900...0xFDCF,
- 0xFDF0...0xFFFD,
- 0x10000...0xEFFFF,
- => true,
- else => false,
- };
-}
-
-pub inline fn isNameChar(c: u21) bool {
- return if (isNameStartChar(c)) true else switch (c) {
- '-', '.', '0'...'9', 0xB7, 0x0300...0x036F, 0x203F...0x2040 => true,
- else => false,
- };
-}
-
-pub inline fn isEncodingStartChar(c: u21) bool {
- return switch (c) {
- 'A'...'Z', 'a'...'z' => true,
- else => false,
- };
-}
-
-pub inline fn isEncodingChar(c: u21) bool {
- return switch (c) {
- 'A'...'Z', 'a'...'z', '0'...'9', '.', '_', '-' => true,
- else => false,
- };
-}
diff --git a/src/token_reader.zig b/src/token_reader.zig
deleted file mode 100644
index bcf7964..0000000
--- a/src/token_reader.zig
+++ /dev/null
@@ -1,621 +0,0 @@
-const std = @import("std");
-const mem = std.mem;
-const testing = std.testing;
-const unicode = std.unicode;
-const encoding = @import("encoding.zig");
-const Scanner = @import("Scanner.zig");
-
-/// A single XML token.
-///
-/// For efficiency, this is merely an enum specifying the token type. The actual
-/// token data is available in `Token.Data`, in the token reader's `token_data`
-/// field. The `fullToken` function can be used to get a `Token.Full`, which is
-/// a tagged union type and may be easier to consume in certain circumstances.
-pub const Token = enum {
- /// End of file.
- eof,
- /// XML declaration.
- xml_declaration,
- /// Element start tag.
- element_start,
- /// Element content.
- element_content,
- /// Element end tag.
- element_end,
- /// End of an empty element.
- element_end_empty,
- /// Attribute start.
- attribute_start,
- /// Attribute value content.
- attribute_content,
- /// Comment start.
- comment_start,
- /// Comment content.
- comment_content,
- /// Processing instruction (PI) start.
- pi_start,
- /// PI content.
- pi_content,
-
- /// The data associated with a token.
- ///
- /// Even token types which have no associated data are represented here, to
- /// provide some additional safety in safe build modes (where it can be
- /// checked whether the caller is referencing the correct data field).
- pub const Data = union {
- eof: void,
- xml_declaration: XmlDeclaration,
- element_start: ElementStart,
- element_content: ElementContent,
- element_end: ElementEnd,
- element_end_empty: void,
- attribute_start: AttributeStart,
- attribute_content: AttributeContent,
- comment_start: void,
- comment_content: CommentContent,
- pi_start: PiStart,
- pi_content: PiContent,
- };
-
- /// A token type plus data represented as a tagged union.
- pub const Full = union(Token) {
- eof,
- xml_declaration: XmlDeclaration,
- element_start: ElementStart,
- element_content: ElementContent,
- element_end: ElementEnd,
- element_end_empty,
- attribute_start: AttributeStart,
- attribute_content: AttributeContent,
- comment_start,
- comment_content: CommentContent,
- pi_start: PiStart,
- pi_content: PiContent,
- };
-
- pub const XmlDeclaration = struct {
- version: []const u8,
- encoding: ?[]const u8 = null,
- standalone: ?bool = null,
- };
-
- pub const ElementStart = struct {
- name: []const u8,
- };
-
- pub const ElementContent = struct {
- content: Content,
- };
-
- pub const ElementEnd = struct {
- name: []const u8,
- };
-
- pub const AttributeStart = struct {
- name: []const u8,
- };
-
- pub const AttributeContent = struct {
- content: Content,
- final: bool = false,
- };
-
- pub const CommentContent = struct {
- content: []const u8,
- final: bool = false,
- };
-
- pub const PiStart = struct {
- target: []const u8,
- };
-
- pub const PiContent = struct {
- content: []const u8,
- final: bool = false,
- };
-
- /// A bit of content of an element or attribute.
- pub const Content = union(enum) {
- /// Raw text content (does not contain any entities).
- text: []const u8,
- /// A Unicode codepoint.
- codepoint: u21,
- /// An entity reference, such as `&`. The range covers the name (`amp`).
- entity: []const u8,
- };
-};
-
-/// A location in a file.
-pub const Location = struct {
- /// The line number, starting at 1.
- line: usize = 1,
- /// The column number, starting at 1. Columns are counted using Unicode
- /// codepoints.
- column: usize = 1,
- /// Whether the last character seen was a `\r`.
- after_cr: bool = false,
-
- /// Advances the location by a single codepoint.
- pub fn advance(self: *Location, c: u21) void {
- if (c == '\n') {
- self.line += 1;
- self.column = 1;
- self.after_cr = false;
- } else if (c == '\r') {
- if (self.after_cr) {
- self.line += 1;
- self.column = 1;
- }
- self.column += 1;
- self.after_cr = true;
- } else if (self.after_cr) {
- self.line += 1;
- // Plain CR line endings cannot be detected as new lines
- // immediately, since they could be followed by LF. The following
- // character is what completes the line ending interpretation.
- self.column = 2;
- self.after_cr = false;
- } else {
- self.column += 1;
- }
- }
-};
-
-test Location {
- var loc = Location{};
- try expectLocation(loc, 1, 1);
- loc.advance('A');
- try expectLocation(loc, 1, 2);
- loc.advance('よ');
- try expectLocation(loc, 1, 3);
- loc.advance('🥰');
- try expectLocation(loc, 1, 4);
- loc.advance('\n');
- try expectLocation(loc, 2, 1);
- loc.advance('\r');
- loc.advance('\n');
- try expectLocation(loc, 3, 1);
- loc.advance('\r');
- loc.advance('A');
- try expectLocation(loc, 4, 2);
- loc.advance('\r');
- loc.advance('\r');
- loc.advance('A');
- try expectLocation(loc, 6, 2);
-}
-
-fn expectLocation(loc: Location, line: usize, column: usize) !void {
- if (loc.line != line or loc.column != column) {
- std.debug.print("expected {}:{}, found {}:{}", .{ line, column, loc.line, loc.column });
- return error.TestExpectedEqual;
- }
-}
-
-/// A drop-in replacement for `Location` which does not actually store location
-/// information.
-pub const NoOpLocation = struct {
- pub inline fn advance(_: *NoOpLocation, _: u21) void {}
-};
-
-/// Wraps a `std.io.Reader` in a `TokenReader` with the default buffer size
-/// (4096).
-pub fn tokenReader(
- reader: anytype,
- comptime options: TokenReaderOptions,
-) TokenReader(@TypeOf(reader), options) {
- return TokenReader(@TypeOf(reader), options).init(reader, .{});
-}
-
-/// Options for a `TokenReader`.
-pub const TokenReaderOptions = struct {
- /// The type of decoder to use.
- DecoderType: type = encoding.DefaultDecoder,
- /// The size of the internal buffer.
- ///
- /// This limits the byte length of "non-splittable" content, such as
- /// element and attribute names. Longer such content will result in
- /// `error.Overflow`.
- buffer_size: usize = 4096,
- /// Whether to normalize line endings and attribute values according to the
- /// XML specification.
- ///
- /// If this is set to false, no normalization will be done: for example,
- /// the line ending sequence `\r\n` will appear as-is in returned tokens
- /// rather than the normalized `\n`.
- enable_normalization: bool = true,
- /// Whether to keep track of the current location in the document.
- track_location: bool = false,
-};
-
-/// An XML parser which wraps a `std.io.Reader` and returns low-level tokens.
-///
-/// An internal buffer of size `buffer_size` is used to store data read from
-/// the reader, which is referenced by the returned tokens.
-///
-/// This parser offers several advantages over `Scanner` for higher-level
-/// use-cases:
-///
-/// - The returned `Token`s use byte slices rather than positional ranges.
-/// - The `next` function can be used in the typical Zig iterator pattern.
-/// There is no `ok` token which must be ignored, and there is no need to
-/// directly signal the end of input (the `Reader` provides this indication).
-/// - The line ending and attribute value normalization steps required by the
-/// XML specification (minus further attribute value normalization which
-/// depends on DTD information) are performed.
-///
-/// However, due to its use of an internal buffer and transcoding all input to
-/// UTF-8, it is not as efficient as a `Scanner` where these considerations are
-/// important. Additionally, `buffer_size` limits the maximum byte length of
-/// "unsplittable" content, such as element and attribute names (but not
-/// "splittable" content, such as element text content and attribute values).
-pub fn TokenReader(comptime ReaderType: type, comptime options: TokenReaderOptions) type {
- return struct {
- scanner: Scanner,
- reader: ReaderType,
- decoder: options.DecoderType,
- /// The data for the most recently returned token.
- token_data: Token.Data = undefined,
- /// The current location in the file (if enabled).
- location: if (options.track_location) Location else NoOpLocation = .{},
- /// Buffered content read by the reader for the current token.
- ///
- /// Events may reference this buffer via slices. The contents of the
- /// buffer (up until `scanner.pos`) are always valid UTF-8.
- buffer: [options.buffer_size]u8 = undefined,
- /// Whether the last codepoint read was a carriage return (`\r`).
- ///
- /// This is relevant for line break normalization.
- after_cr: if (options.enable_normalization) bool else void = if (options.enable_normalization) false,
-
- const Self = @This();
-
- pub const Error = error{
- InvalidEncoding,
- InvalidPiTarget,
- Overflow,
- UnexpectedEndOfInput,
- } || ReaderType.Error || options.DecoderType.Error || Scanner.Error;
-
- const max_encoded_codepoint_len = @max(options.DecoderType.max_encoded_codepoint_len, 4);
-
- pub fn init(reader: ReaderType, decoder: options.DecoderType) Self {
- return .{
- .scanner = Scanner{},
- .reader = reader,
- .decoder = decoder,
- };
- }
-
- /// Returns the full token (including data) from the most recent call to
- /// `next`. `token` must be the token returned from the last call to
- /// `next`.
- pub fn fullToken(self: *const Self, token: Token) Token.Full {
- return switch (token) {
- inline else => |tag| @unionInit(Token.Full, @tagName(tag), @field(self.token_data, @tagName(tag))),
- };
- }
-
- /// Returns the next token from the input.
- ///
- /// The slices in the `token_data` stored during this call are only
- /// valid until the next call to `next`.
- pub fn next(self: *Self) Error!Token {
- if (self.scanner.pos > 0) {
- // If the scanner position is > 0, that means we emitted an event
- // on the last call to next, and should try to reset the
- // position again in an effort to not run out of buffer space
- // (ideally, the scanner should be resettable after every token,
- // but we do not depend on this).
- if (self.scanner.resetPos()) |token| {
- if (token != .ok) {
- return try self.bufToken(token);
- }
- } else |_| {
- // Failure to reset isn't fatal (yet); we can still try to
- // complete the token below
- }
- }
-
- while (true) {
- if (self.scanner.pos + max_encoded_codepoint_len >= self.buffer.len) {
- if (self.scanner.resetPos()) |token| {
- if (token != .ok) {
- return try self.bufToken(token);
- }
- } else |_| {
- // Failure to reset here still isn't fatal, since we
- // may end up getting shorter codepoints which manage
- // to complete the current token.
- }
- }
-
- const c = try self.nextCodepoint();
- if (!c.present) {
- try self.scanner.endInput();
- self.token_data = .{ .eof = {} };
- return .eof;
- }
- const token = try self.scanner.next(c.codepoint, c.byte_length);
- if (token != .ok) {
- return try self.bufToken(token);
- }
- }
- }
-
- const nextCodepoint = if (options.enable_normalization) nextCodepointNormalized else nextCodepointRaw;
-
- fn nextCodepointNormalized(self: *Self) !encoding.ReadResult {
- var c = try self.nextCodepointRaw();
- if (!c.present) return c;
- if (self.after_cr) {
- self.after_cr = false;
- if (c.codepoint == '\n') {
- // \n after \r is ignored because \r was already processed
- // as a line ending
- c = try self.nextCodepointRaw();
- if (!c.present) return c;
- }
- }
- if (c.codepoint == '\r') {
- self.after_cr = true;
- c.codepoint = '\n';
- self.buffer[self.scanner.pos] = '\n';
- }
- if (self.scanner.state == .attribute_content and
- (c.codepoint == '\t' or c.codepoint == '\r' or c.codepoint == '\n'))
- {
- c.codepoint = ' ';
- self.buffer[self.scanner.pos] = ' ';
- }
- return c;
- }
-
- fn nextCodepointRaw(self: *Self) !encoding.ReadResult {
- const c = try self.decoder.readCodepoint(self.reader, self.buffer[self.scanner.pos..]);
- if (c.present) self.location.advance(c.codepoint);
- return c;
- }
-
- fn bufToken(self: *Self, token: Scanner.Token) !Token {
- switch (token) {
- .ok => unreachable,
- .xml_declaration => {
- self.token_data = .{ .xml_declaration = .{
- .version = self.bufRange(self.scanner.token_data.xml_declaration.version),
- .encoding = if (self.scanner.token_data.xml_declaration.encoding) |enc| self.bufRange(enc) else null,
- .standalone = self.scanner.token_data.xml_declaration.standalone,
- } };
- if (self.token_data.xml_declaration.encoding) |declared_encoding| {
- try self.decoder.adaptTo(declared_encoding);
- }
- return .xml_declaration;
- },
- .element_start => {
- self.token_data = .{ .element_start = .{
- .name = self.bufRange(self.scanner.token_data.element_start.name),
- } };
- return .element_start;
- },
- .element_content => {
- self.token_data = .{ .element_content = .{
- .content = self.bufContent(self.scanner.token_data.element_content.content),
- } };
- return .element_content;
- },
- .element_end => {
- self.token_data = .{ .element_end = .{
- .name = self.bufRange(self.scanner.token_data.element_end.name),
- } };
- return .element_end;
- },
- .element_end_empty => {
- self.token_data = .{ .element_end_empty = {} };
- return .element_end_empty;
- },
- .attribute_start => {
- self.token_data = .{ .attribute_start = .{
- .name = self.bufRange(self.scanner.token_data.attribute_start.name),
- } };
- return .attribute_start;
- },
- .attribute_content => {
- self.token_data = .{ .attribute_content = .{
- .content = self.bufContent(self.scanner.token_data.attribute_content.content),
- .final = self.scanner.token_data.attribute_content.final,
- } };
- return .attribute_content;
- },
- .comment_start => {
- self.token_data = .{ .comment_start = {} };
- return .comment_start;
- },
- .comment_content => {
- self.token_data = .{ .comment_content = .{
- .content = self.bufRange(self.scanner.token_data.comment_content.content),
- .final = self.scanner.token_data.comment_content.final,
- } };
- return .comment_content;
- },
- .pi_start => {
- const target = self.bufRange(self.scanner.token_data.pi_start.target);
- if (std.ascii.eqlIgnoreCase(target, "xml")) {
- return error.InvalidPiTarget;
- }
- self.token_data = .{ .pi_start = .{
- .target = target,
- } };
- return .pi_start;
- },
- .pi_content => {
- self.token_data = .{ .pi_content = .{
- .content = self.bufRange(self.scanner.token_data.pi_content.content),
- .final = self.scanner.token_data.pi_content.final,
- } };
- return .pi_content;
- },
- }
- }
-
- inline fn bufContent(self: *const Self, content: Scanner.Token.Content) Token.Content {
- return switch (content) {
- .text => |text| .{ .text = self.bufRange(text) },
- .codepoint => |codepoint| .{ .codepoint = codepoint },
- .entity => |entity| .{ .entity = self.bufRange(entity) },
- };
- }
-
- inline fn bufRange(self: *const Self, range: Scanner.Range) []const u8 {
- return self.buffer[range.start..range.end];
- }
- };
-}
-
-test TokenReader {
- try testValid(.{},
- \\
- \\
- \\
- \\
- \\
- \\ Hello,
- \\
- \\
- \\ Text content goes here.
- \\
- \\
- \\
- \\
- \\
- \\
- \\
- , &.{
- .{ .xml_declaration = .{ .version = "1.0" } },
- .{ .pi_start = .{ .target = "some-pi" } },
- .{ .pi_content = .{ .content = "", .final = true } },
- .comment_start,
- .{ .comment_content = .{ .content = " A processing instruction with content follows ", .final = true } },
- .{ .pi_start = .{ .target = "some-pi-with-content" } },
- .{ .pi_content = .{ .content = "content", .final = true } },
- .{ .element_start = .{ .name = "root" } },
- .{ .element_content = .{ .content = .{ .text = "\n " } } },
- .{ .element_start = .{ .name = "p" } },
- .{ .attribute_start = .{ .name = "class" } },
- .{ .attribute_content = .{ .content = .{ .text = "test" }, .final = true } },
- .{ .element_content = .{ .content = .{ .text = "Hello, " } } },
- .{ .element_content = .{ .content = .{ .text = "world!" } } },
- .{ .element_end = .{ .name = "p" } },
- .{ .element_content = .{ .content = .{ .text = "\n " } } },
- .{ .element_start = .{ .name = "line" } },
- .element_end_empty,
- .{ .element_content = .{ .content = .{ .text = "\n " } } },
- .{ .pi_start = .{ .target = "another-pi" } },
- .{ .pi_content = .{ .content = "", .final = true } },
- .{ .element_content = .{ .content = .{ .text = "\n Text content goes here.\n " } } },
- .{ .element_start = .{ .name = "div" } },
- .{ .element_start = .{ .name = "p" } },
- .{ .element_content = .{ .content = .{ .entity = "amp" } } },
- .{ .element_end = .{ .name = "p" } },
- .{ .element_end = .{ .name = "div" } },
- .{ .element_content = .{ .content = .{ .text = "\n" } } },
- .{ .element_end = .{ .name = "root" } },
- .comment_start,
- .{ .comment_content = .{ .content = " Comments are allowed after the end of the root element ", .final = true } },
- .{ .pi_start = .{ .target = "comment" } },
- .{ .pi_content = .{ .content = "So are PIs ", .final = true } },
- });
-}
-
-test "normalization" {
- try testValid(.{}, "Line 1\rLine 2\r\nLine 3\nLine 4\n\rLine 6\r\n\rLine 8", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .element_content = .{ .content = .{ .text = "Line 1\nLine 2\nLine 3\nLine 4\n\nLine 6\n\nLine 8" } } },
- .{ .element_end = .{ .name = "root" } },
- });
- try testValid(.{}, "", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .attribute_start = .{ .name = "attr" } },
- .{ .attribute_content = .{
- .content = .{ .text = " Line 1 Line 2 Line 3 Line 4 More content Line 6 Line 8 " },
- .final = true,
- } },
- .element_end_empty,
- });
- try testValid(.{ .enable_normalization = false }, "Line 1\rLine 2\r\nLine 3\nLine 4\n\rLine 6\r\n\rLine 8", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .element_content = .{ .content = .{ .text = "Line 1\rLine 2\r\nLine 3\nLine 4\n\rLine 6\r\n\rLine 8" } } },
- .{ .element_end = .{ .name = "root" } },
- });
- try testValid(.{ .enable_normalization = false }, "", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .attribute_start = .{ .name = "attr" } },
- .{ .attribute_content = .{
- .content = .{ .text = " Line 1\rLine 2\r\nLine 3\nLine 4\t\tMore content\n\rLine 6\r\n\rLine 8 " },
- .final = true,
- } },
- .element_end_empty,
- });
-}
-
-test "PI target" {
- try testValid(.{}, "", &.{
- .{ .xml_declaration = .{ .version = "1.0" } },
- .{ .element_start = .{ .name = "root" } },
- .{ .pi_start = .{ .target = "some-pi" } },
- .{ .pi_content = .{ .content = "", .final = true } },
- .{ .element_end = .{ .name = "root" } },
- });
- try testValid(.{}, "", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .pi_start = .{ .target = "x" } },
- .{ .pi_content = .{ .content = "2", .final = true } },
- .{ .element_end = .{ .name = "root" } },
- });
- try testValid(.{}, "", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .pi_start = .{ .target = "xm" } },
- .{ .pi_content = .{ .content = "2", .final = true } },
- .{ .element_end = .{ .name = "root" } },
- });
- try testValid(.{}, "", &.{
- .{ .element_start = .{ .name = "root" } },
- .{ .pi_start = .{ .target = "xml2" } },
- .{ .pi_content = .{ .content = "2", .final = true } },
- .{ .element_end = .{ .name = "root" } },
- });
- try testInvalid(.{}, "", error.InvalidPiTarget);
- try testInvalid(.{}, "", error.InvalidPiTarget);
- try testInvalid(.{}, "", error.InvalidPiTarget);
- try testInvalid(.{}, "", error.InvalidPiTarget);
-}
-
-fn testValid(comptime options: TokenReaderOptions, input: []const u8, expected_tokens: []const Token.Full) !void {
- var input_stream = std.io.fixedBufferStream(input);
- var input_reader = tokenReader(input_stream.reader(), options);
- var i: usize = 0;
- while (true) : (i += 1) {
- const token = try input_reader.next();
- if (token == .eof) break;
- if (i >= expected_tokens.len) {
- std.debug.print("Unexpected token after end: {}\n", .{token});
- return error.TestFailed;
- }
- testing.expectEqualDeep(expected_tokens[i], input_reader.fullToken(token)) catch |e| {
- std.debug.print("(at index {})\n", .{i});
- return e;
- };
- }
- if (i != expected_tokens.len) {
- std.debug.print("Expected {} tokens, found {}\n", .{ expected_tokens.len, i });
- return error.TestFailed;
- }
-}
-
-fn testInvalid(comptime options: TokenReaderOptions, input: []const u8, expected_error: anyerror) !void {
- var input_stream = std.io.fixedBufferStream(input);
- var input_reader = tokenReader(input_stream.reader(), options);
- while (input_reader.next()) |token| {
- if (token == .eof) return error.TestExpectedError;
- } else |err| {
- try testing.expectEqual(expected_error, err);
- }
-}
diff --git a/src/writer.zig b/src/writer.zig
deleted file mode 100644
index 8871139..0000000
--- a/src/writer.zig
+++ /dev/null
@@ -1,264 +0,0 @@
-const std = @import("std");
-const fmt = std.fmt;
-const testing = std.testing;
-const ArrayListUnmanaged = std.ArrayListUnmanaged;
-const Event = @import("reader.zig").Event;
-const QName = @import("reader.zig").QName;
-
-/// Returns a `Writer` wrapping a `std.io.Writer`.
-pub fn writer(w: anytype) Writer(@TypeOf(w)) {
- return .{ .w = w };
-}
-
-/// A streaming XML writer wrapping a `std.io.Writer`.
-///
-/// This writer exposes a selection of functions to write XML content with
-/// proper escaping where possible.
-///
-/// Some write functions come in sets to allow streaming longer contents rather
-/// than writing them all in one go: for example, `writeAttribute` is useful for
-/// writing an entire attribute name-value pair in one shot, but if the attribute
-/// value is potentially quite long, the sequence of `writeAttributeStart`,
-/// followed by an arbitrary (even zero) number of `writeAttributeContent`,
-/// followed by `writeAttributeEnd`, can be used as a lower-level alternative.
-///
-/// One interesting lower-level function is `writeElementStartEnd`, which is used
-/// to tell the writer to finish the current element start tag (all attributes
-/// have been written), in preparation for writing other content. The other
-/// functions (such as `writeElementContent`) will call this themselves if the
-/// writer is in the middle of a start tag, but calling this function directly
-/// could be useful if the user plans to write directly to the underlying
-/// writer.
-///
-/// Additionally, this writer makes no attempt at being able to write XML in
-/// arbitrary styles. For example, the quote character is not configurable, and
-/// there is no function for writing CDATA sections.
-///
-/// # Safety
-///
-/// There are caveats to the well-formedness of the resulting output:
-///
-/// 1. There is no protection against calling the various write functions out of
-/// order. For example, calling `writeElementEnd` without a corresponding
-/// `writeElementStart` will result in non-well-formed XML.
-/// 2. Processing instructions (PIs) and comments do not support escaping their
-/// content, so passing content to the corresponding write functions which
-/// contains illegal sequences for those constructs will result in
-/// unexpected outcomes. For example, calling `writeComment` with a value
-/// containing `-->` will result in the writer happily writing out the raw
-/// `-->` in the text of the comment, which will close the comment and write
-/// the rest of the provided text as raw XML (followed by the writer's
-/// inserted `-->`).
-/// 3. There are no validations that the names of elements and attributes match
-/// the allowed syntax for names. Likewise, there are no validations that the
-/// `version` and `encoding` passed to `writeXmlDeclaration` match the
-/// allowed syntax for those values.
-///
-/// As such, it is not safe to use all functionality of this writer with
-/// arbitrary user-provided data. What _is_ safe, however, is the more common
-/// case of using this writer with only attribute values and element content
-/// containing user-provided data, since those can always be escaped properly.
-pub fn Writer(comptime WriterType: type) type {
- return struct {
- w: WriterType,
- in_element_start: bool = false,
-
- const Self = @This();
-
- pub const Error = WriterType.Error;
-
- pub fn writeXmlDeclaration(self: *Self, version: []const u8, encoding: ?[]const u8, standalone: ?bool) Error!void {
- try self.w.print("");
- }
-
- pub fn writeElementStart(self: *Self, name: QName) Error!void {
- if (self.in_element_start) {
- try self.writeElementStartEnd();
- }
- try self.w.print("<{}", .{fmtQName(name)});
- self.in_element_start = true;
- }
-
- pub fn writeElementStartEnd(self: *Self) Error!void {
- try self.w.writeByte('>');
- self.in_element_start = false;
- }
-
- pub fn writeElementContent(self: *Self, content: []const u8) Error!void {
- if (self.in_element_start) {
- try self.writeElementStartEnd();
- }
- try self.w.print("{}", .{fmtElementContent(content)});
- }
-
- pub fn writeElementEnd(self: *Self, name: QName) Error!void {
- if (self.in_element_start) {
- try self.w.writeAll(" />");
- self.in_element_start = false;
- } else {
- try self.w.print("{}>", .{fmtQName(name)});
- }
- }
-
- pub fn writeAttribute(self: *Self, name: QName, content: []const u8) Error!void {
- try self.writeAttributeStart(name);
- try self.writeAttributeContent(content);
- try self.writeAttributeEnd();
- }
-
- pub fn writeAttributeStart(self: *Self, name: QName) Error!void {
- try self.w.print(" {}=\"", .{fmtQName(name)});
- }
-
- pub fn writeAttributeContent(self: *Self, content: []const u8) Error!void {
- try self.w.print("{}", .{fmtAttributeContent(content)});
- }
-
- pub fn writeAttributeEnd(self: *Self) Error!void {
- try self.w.writeByte('"');
- }
-
- pub fn writeComment(self: *Self, content: []const u8) Error!void {
- try self.writeCommentStart();
- try self.writeCommentContent(content);
- try self.writeCommentEnd();
- }
-
- pub fn writeCommentStart(self: *Self) Error!void {
- if (self.in_element_start) {
- try self.writeElementStartEnd();
- }
- try self.w.writeAll("");
- }
-
- pub fn writePi(self: *Self, target: []const u8, content: []const u8) Error!void {
- try self.writePiStart(target);
- try self.writePiContent(content);
- try self.writePiEnd();
- }
-
- pub fn writePiStart(self: *Self, target: []const u8) Error!void {
- if (self.in_element_start) {
- try self.writeElementStartEnd();
- }
- try self.w.print("{} ", .{target});
- }
-
- pub fn writePiContent(self: *Self, content: []const u8) Error!void {
- try self.w.writeAll(content);
- }
-
- pub fn writePiEnd(self: *Self) Error!void {
- try self.w.writeAll("?>");
- }
- };
-}
-
-test Writer {
- var output = ArrayListUnmanaged(u8){};
- defer output.deinit(testing.allocator);
- var xml_writer = writer(output.writer(testing.allocator));
-
- const xmlns_ns = "http://www.w3.org/2000/xmlns/";
- try xml_writer.writeXmlDeclaration("1.0", "UTF-8", true);
- // The ns part of the QName is not used when writing, but may factor in to
- // future (optional) safety checks
- try xml_writer.writeElementStart(.{ .prefix = "test", .ns = "http://example.com/ns/test", .local = "root" });
- try xml_writer.writeAttribute(.{ .prefix = "xmlns", .ns = xmlns_ns, .local = "test" }, "http://example.com/ns/test");
- try xml_writer.writeComment(" Hello, world! ");
- try xml_writer.writeElementContent("Some text & some other text. ");
- try xml_writer.writeElementContent("Another .");
- try xml_writer.writeElementStart(.{ .local = "sub" });
- try xml_writer.writeAttribute(.{ .local = "escaped" }, "&<>\"'");
- try xml_writer.writeElementEnd(.{ .local = "sub" });
- try xml_writer.writeElementEnd(.{ .prefix = "test", .ns = "http://example.com/ns/test", .local = "root" });
-
- try testing.expectEqualStrings(
- \\
- ++
- \\
- ++
- \\
- ++
- \\Some text & some other text. Another <sentence>.
- ++
- \\
- ++
- \\
- , output.items);
-}
-
-/// Returns a `std.fmt.Formatter` for escaped attribute content.
-pub fn fmtAttributeContent(data: []const u8) fmt.Formatter(formatAttributeContent) {
- return .{ .data = data };
-}
-
-fn formatAttributeContent(
- data: []const u8,
- comptime _: []const u8,
- _: fmt.FormatOptions,
- w: anytype,
-) !void {
- for (data) |b| switch (b) {
- '\t' => try w.writeAll(" "),
- '\n' => try w.writeAll("
"),
- '\r' => try w.writeAll("
"),
- '"' => try w.writeAll("""),
- '&' => try w.writeAll("&"),
- '<' => try w.writeAll("<"),
- else => try w.writeByte(b),
- };
-}
-
-/// Returns a `std.fmt.Formatter` for escaped element content.
-pub fn fmtElementContent(data: []const u8) fmt.Formatter(formatElementContent) {
- return .{ .data = data };
-}
-
-fn formatElementContent(
- data: []const u8,
- comptime _: []const u8,
- _: fmt.FormatOptions,
- w: anytype,
-) !void {
- for (data) |b| switch (b) {
- '\r' => try w.writeAll("
"),
- '&' => try w.writeAll("&"),
- '<' => try w.writeAll("<"),
- else => try w.writeByte(b),
- };
-}
-
-/// Returns a `std.fmt.Formatter` for a QName (formats as `prefix:local` or
-/// just `local` if no prefix).
-pub fn fmtQName(data: QName) fmt.Formatter(formatQName) {
- return .{ .data = data };
-}
-
-fn formatQName(
- data: QName,
- comptime _: []const u8,
- _: fmt.FormatOptions,
- w: anytype,
-) !void {
- if (data.prefix) |prefix| {
- try w.writeAll(prefix);
- try w.writeByte(':');
- }
- try w.writeAll(data.local);
-}
diff --git a/src/xml.zig b/src/xml.zig
index 5856a7b..bfe4569 100644
--- a/src/xml.zig
+++ b/src/xml.zig
@@ -1,53 +1,476 @@
-//! An XML library, currently supporting reading XML.
-//!
-//! Most applications will want to start with `Reader` and investigate the
-//! other parser options if they want to avoid dynamic memory allocation or
-//! want better performance at the expense of ease of use.
-//!
-//! There are three parsers available, with increasing levels of abstraction,
-//! ease of use, and standard conformance. The documentation for each parser
-//! provides more detailed information on its functionality.
-//!
-//! 1. `Scanner` - the lowest-level parser. A state machine that accepts
-//! Unicode codepoints one by one and returns "tokens" referencing ranges of
-//! input data.
-//! 2. `TokenReader` - a mid-level parser that improves on `Scanner` by
-//! buffering input so that returned tokens can use UTF-8-encoded byte
-//! slices rather than ranges. It also uses a `std.io.Reader` and a decoder
-//! (see `encoding`) rather than forcing the user to pass codepoints
-//! directly.
-//! 3. `Reader` - a general-purpose streaming parser which can handle
-//! namespaces. Helper functions are available to parse some or all of a
-//! document into a `Node`, which acts as a minimal DOM abstraction.
-
const std = @import("std");
-const testing = std.testing;
+const Allocator = std.mem.Allocator;
+const assert = std.debug.assert;
+const expectEqual = std.testing.expectEqual;
+const expectEqualStrings = std.testing.expectEqualStrings;
+
+pub const Location = struct {
+ line: usize,
+ column: usize,
+
+ pub const start: Location = .{ .line = 1, .column = 1 };
+
+ pub fn update(loc: *Location, s: []const u8) void {
+ var pos: usize = 0;
+ while (std.mem.indexOfAnyPos(u8, s, pos, "\r\n")) |nl_pos| {
+ loc.line += 1;
+ loc.column = 1;
+ if (s[nl_pos] == '\r' and nl_pos + 1 < s.len and s[nl_pos + 1] == '\n') {
+ pos = nl_pos + 2;
+ } else {
+ pos = nl_pos + 1;
+ }
+ }
+ loc.column += s.len - pos;
+ }
+};
+
+pub const QName = struct {
+ ns: []const u8,
+ local: []const u8,
+};
+
+pub const PrefixedQName = struct {
+ prefix: []const u8,
+ ns: []const u8,
+ local: []const u8,
+};
+
+pub const predefined_entities = std.StaticStringMap([]const u8).initComptime(.{
+ .{ "lt", "<" },
+ .{ "gt", ">" },
+ .{ "amp", "&" },
+ .{ "apos", "'" },
+ .{ "quot", "\"" },
+});
+
+pub const ns_xml = "http://www.w3.org/XML/1998/namespace";
+pub const ns_xmlns = "http://www.w3.org/2000/xmlns/";
+pub const predefined_namespace_uris = std.StaticStringMap([]const u8).initComptime(.{
+ .{ "xml", ns_xml },
+ .{ "xmlns", ns_xmlns },
+});
+
+pub const Reader = @import("Reader.zig");
+
+pub fn GenericReader(comptime SourceError: type) type {
+ return struct {
+ reader: Reader,
+
+ /// See `Reader.deinit`.
+ pub inline fn deinit(reader: *@This()) void {
+ reader.reader.deinit();
+ }
+
+ pub const ReadError = Reader.ReadError || SourceError;
+
+ /// See `Reader.read`.
+ pub inline fn read(reader: *@This()) ReadError!Reader.Node {
+ return @errorCast(reader.reader.read());
+ }
+
+ /// See `Reader.readElementText`.
+ pub inline fn readElementText(reader: *@This()) (ReadError || Allocator.Error)![]const u8 {
+ return @errorCast(reader.reader.readElementText());
+ }
+
+ pub inline fn readElementTextAlloc(reader: *@This(), gpa: Allocator) (ReadError || Allocator.Error)![]u8 {
+ return @errorCast(reader.reader.readElementTextAlloc(gpa));
+ }
+
+ /// See `Reader.readElementTextWrite`.
+ pub inline fn readElementTextWrite(reader: *@This(), writer: anytype) (ReadError || @TypeOf(writer).Error)!void {
+ return @errorCast(reader.reader.readElementTextWrite(writer.any()));
+ }
+
+ /// See `Reader.skipProlog`.
+ pub inline fn skipProlog(reader: *@This()) ReadError!void {
+ return @errorCast(reader.reader.skipProlog());
+ }
+
+ /// See `Reader.skipElement`.
+ pub inline fn skipElement(reader: *@This()) ReadError!void {
+ return @errorCast(reader.reader.skipElement());
+ }
+
+ /// See `Reader.location`.
+ pub inline fn location(reader: @This()) Location {
+ return reader.reader.location();
+ }
+
+ /// See `Reader.errorCode`.
+ pub inline fn errorCode(reader: @This()) Reader.ErrorCode {
+ return reader.reader.errorCode();
+ }
+
+ /// See `Reader.errorLocation`.
+ pub inline fn errorLocation(reader: @This()) Location {
+ return reader.reader.errorLocation();
+ }
+
+ /// See `Reader.xmlDeclarationVersion`.
+ pub inline fn xmlDeclarationVersion(reader: @This()) []const u8 {
+ return reader.reader.xmlDeclarationVersion();
+ }
+
+ /// See `Reader.xmlDeclarationEncoding`.
+ pub inline fn xmlDeclarationEncoding(reader: @This()) ?[]const u8 {
+ return reader.reader.xmlDeclarationEncoding();
+ }
+
+ /// See `Reader.xmlDeclarationStandalone`.
+ pub inline fn xmlDeclarationStandalone(reader: @This()) ?bool {
+ return reader.reader.xmlDeclarationStandalone();
+ }
+
+ /// See `Reader.elementName`.
+ pub inline fn elementName(reader: @This()) []const u8 {
+ return reader.reader.elementName();
+ }
+
+ /// See `Reader.elementNameNs`.
+ pub inline fn elementNameNs(reader: @This()) PrefixedQName {
+ return reader.reader.elementNameNs();
+ }
+
+ /// See `Reader.attributeCount`.
+ pub inline fn attributeCount(reader: @This()) usize {
+ return reader.reader.attributeCount();
+ }
+
+ /// See `Reader.attributeName`.
+ pub inline fn attributeName(reader: @This(), n: usize) []const u8 {
+ return reader.reader.attributeName(n);
+ }
+
+ /// See `Reader.attributeNameNs`.
+ pub inline fn attributeNameNs(reader: @This(), n: usize) PrefixedQName {
+ return reader.reader.attributeNameNs(n);
+ }
+
+ /// See `Reader.attributeValue`.
+ pub inline fn attributeValue(reader: *@This(), n: usize) Allocator.Error![]const u8 {
+ return reader.reader.attributeValue(n);
+ }
+
+ /// See `Reader.attributeValueAlloc`.
+ pub inline fn attributeValueAlloc(reader: @This(), gpa: Allocator, n: usize) Allocator.Error![]u8 {
+ return reader.reader.attributeValueAlloc(gpa, n);
+ }
+
+ /// See `Reader.attributeValueWrite`.
+ pub inline fn attributeValueWrite(reader: @This(), n: usize, writer: anytype) @TypeOf(writer).Error!void {
+ return @errorCast(reader.reader.attributeValueWrite(n, writer.any()));
+ }
+
+ /// See `Reader.attributeValueRaw`.
+ pub inline fn attributeValueRaw(reader: @This(), n: usize) []const u8 {
+ return reader.reader.attributeValueRaw(n);
+ }
+
+ /// See `Reader.attributeLocation`.
+ pub inline fn attributeLocation(reader: @This(), n: usize) Location {
+ return reader.reader.attributeLocation(n);
+ }
+
+ /// See `Reader.attributeIndex`.
+ pub inline fn attributeIndex(reader: @This(), name: []const u8) ?usize {
+ return reader.reader.attributeIndex(name);
+ }
+
+ /// See `Reader.attributeIndexNs`.
+ pub inline fn attributeIndexNs(reader: @This(), ns: []const u8, local: []const u8) ?usize {
+ return reader.reader.attributeIndexNs(ns, local);
+ }
+
+ /// See `Reader.comment`.
+ pub inline fn comment(reader: *@This()) Allocator.Error![]const u8 {
+ return reader.reader.comment();
+ }
+
+ /// See `Reader.commentWrite`.
+ pub inline fn commentWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void {
+ return @errorCast(reader.reader.commentWrite(writer.any()));
+ }
+
+ /// See `Reader.commentRaw`.
+ pub inline fn commentRaw(reader: @This()) []const u8 {
+ return reader.reader.commentRaw();
+ }
+
+ /// See `Reader.piTarget`.
+ pub inline fn piTarget(reader: @This()) []const u8 {
+ return reader.reader.piTarget();
+ }
+
+ /// See `Reader.piData`.
+ pub inline fn piData(reader: *@This()) Allocator.Error![]const u8 {
+ return reader.reader.piData();
+ }
+
+ /// See `Reader.piDataWrite`.
+ pub inline fn piDataWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void {
+ return @errorCast(reader.reader.piDataWrite(writer.any()));
+ }
+
+ /// See `Reader.piDataRaw`.
+ pub inline fn piDataRaw(reader: @This()) []const u8 {
+ return reader.reader.piDataRaw();
+ }
+
+ /// See `Reader.text`.
+ pub inline fn text(reader: *@This()) Allocator.Error![]const u8 {
+ return reader.reader.text();
+ }
+
+ /// See `Reader.textWrite`.
+ pub inline fn textWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void {
+ return @errorCast(reader.reader.textWrite(writer.any()));
+ }
+
+ /// See `Reader.textRaw`.
+ pub inline fn textRaw(reader: @This()) []const u8 {
+ return reader.reader.textRaw();
+ }
+
+ /// See `Reader.cdataWrite`.
+ pub inline fn cdataWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void {
+ return @errorCast(reader.reader.cdataWrite(writer.any()));
+ }
+
+ /// See `Reader.cdata`.
+ pub inline fn cdata(reader: *@This()) Allocator.Error![]const u8 {
+ return reader.reader.cdata();
+ }
-pub const encoding = @import("encoding.zig");
+ /// See `Reader.cdataRaw`.
+ pub inline fn cdataRaw(reader: @This()) []const u8 {
+ return reader.reader.cdataRaw();
+ }
-pub const Scanner = @import("Scanner.zig");
+ /// See `Reader.entityReferenceName`.
+ pub inline fn entityReferenceName(reader: @This()) []const u8 {
+ return reader.reader.entityReferenceName();
+ }
-pub const tokenReader = @import("token_reader.zig").tokenReader;
-pub const TokenReader = @import("token_reader.zig").TokenReader;
-pub const TokenReaderOptions = @import("token_reader.zig").TokenReaderOptions;
-pub const Token = @import("token_reader.zig").Token;
+ /// See `Reader.characterReferenceChar`.
+ pub inline fn characterReferenceChar(reader: @This()) u21 {
+ return reader.reader.characterReferenceChar();
+ }
-pub const reader = @import("reader.zig").reader;
-pub const readDocument = @import("reader.zig").readDocument;
-pub const Reader = @import("reader.zig").Reader;
-pub const ReaderOptions = @import("reader.zig").ReaderOptions;
-pub const QName = @import("reader.zig").QName;
-pub const Event = @import("reader.zig").Event;
+ /// See `Reader.characterReferenceName`.
+ pub inline fn characterReferenceName(reader: @This()) []const u8 {
+ return reader.reader.characterReferenceName();
+ }
-pub const Node = @import("node.zig").Node;
-pub const OwnedValue = @import("node.zig").OwnedValue;
+ /// See `Reader.namespaceUri`.
+ pub inline fn namespaceUri(reader: @This(), prefix: []const u8) []const u8 {
+ return reader.reader.namespaceUri(prefix);
+ }
-pub const writer = @import("writer.zig").writer;
-pub const Writer = @import("writer.zig").Writer;
-pub const fmtAttributeContent = @import("writer.zig").fmtAttributeContent;
-pub const fmtElementContent = @import("writer.zig").fmtElementContent;
-pub const fmtQName = @import("writer.zig").fmtQName;
+ /// Returns the underlying raw `Reader`.
+ pub inline fn raw(reader: *@This()) *Reader {
+ return &reader.reader;
+ }
+ };
+}
+
+pub const StaticDocument = struct {
+ data: []const u8,
+ pos: usize,
+
+ pub const Error = error{};
+
+ pub fn init(data: []const u8) StaticDocument {
+ return .{ .data = data, .pos = 0 };
+ }
+
+ pub fn reader(doc: *StaticDocument, gpa: Allocator, options: Reader.Options) GenericReader(Error) {
+ return .{ .reader = Reader.init(gpa, doc.source(), options) };
+ }
+
+ pub fn source(doc: *StaticDocument) Reader.Source {
+ return .{
+ .context = doc,
+ .moveFn = &move,
+ };
+ }
+
+ fn move(context: *const anyopaque, advance: usize, len: usize) anyerror![]const u8 {
+ const doc: *StaticDocument = @alignCast(@constCast(@ptrCast(context)));
+ doc.pos += advance;
+ const rest_doc = doc.data[doc.pos..];
+ return rest_doc[0..@min(len, rest_doc.len)];
+ }
+};
+
+pub fn StreamingDocument(comptime ReaderType: type) type {
+ return struct {
+ stream: ReaderType,
+ buf: []u8,
+ pos: usize,
+ avail: usize,
+ gpa: Allocator,
+
+ pub const Error = ReaderType.Error || Allocator.Error;
+
+ pub fn init(gpa: Allocator, stream: ReaderType) @This() {
+ return .{
+ .stream = stream,
+ .buf = &.{},
+ .pos = 0,
+ .avail = 0,
+ .gpa = gpa,
+ };
+ }
+
+ pub fn deinit(doc: *@This()) void {
+ doc.gpa.free(doc.buf);
+ doc.* = undefined;
+ }
+
+ pub fn reader(doc: *@This(), gpa: Allocator, options: Reader.Options) GenericReader(Error) {
+ return .{ .reader = Reader.init(gpa, doc.source(), options) };
+ }
+
+ pub fn source(doc: *@This()) Reader.Source {
+ return .{
+ .context = doc,
+ .moveFn = &move,
+ };
+ }
+
+ fn move(context: *const anyopaque, advance: usize, len: usize) anyerror![]const u8 {
+ const doc: *@This() = @alignCast(@constCast(@ptrCast(context)));
+ doc.pos += advance;
+ if (len <= doc.avail - doc.pos) return doc.buf[doc.pos..][0..len];
+ doc.discardRead();
+ try doc.fillBuffer(len);
+ return doc.buf[0..@min(len, doc.avail)];
+ }
+
+ fn discardRead(doc: *@This()) void {
+ doc.avail -= doc.pos;
+ std.mem.copyForwards(u8, doc.buf[0..doc.avail], doc.buf[doc.pos..][0..doc.avail]);
+ doc.pos = 0;
+ }
+
+ const min_buf_len = 4096;
+
+ fn fillBuffer(doc: *@This(), target_len: usize) !void {
+ if (target_len > doc.buf.len) {
+ const new_buf_len = @max(min_buf_len, std.math.ceilPowerOfTwoAssert(usize, target_len));
+ doc.buf = try doc.gpa.realloc(doc.buf, new_buf_len);
+ }
+ doc.avail += try doc.stream.read(doc.buf[doc.avail..]);
+ }
+ };
+}
+
+pub fn streamingDocument(gpa: Allocator, reader: anytype) StreamingDocument(@TypeOf(reader)) {
+ return StreamingDocument(@TypeOf(reader)).init(gpa, reader);
+}
+
+test streamingDocument {
+ var fbs = std.io.fixedBufferStream(
+ \\<?xml version="1.0"?>
+ \\<root>Hello, world!</root>
+ \\
+ );
+ var doc = streamingDocument(std.testing.allocator, fbs.reader());
+ defer doc.deinit();
+ var reader = doc.reader(std.testing.allocator, .{});
+ defer reader.deinit();
+
+ try expectEqual(.xml_declaration, try reader.read());
+ try expectEqualStrings("1.0", reader.xmlDeclarationVersion());
+
+ try expectEqual(.element_start, try reader.read());
+ try expectEqualStrings("root", reader.elementName());
+
+ try expectEqual(.text, try reader.read());
+ try expectEqualStrings("Hello, world!", reader.textRaw());
+
+ try expectEqual(.element_end, try reader.read());
+ try expectEqualStrings("root", reader.elementName());
+
+ try expectEqual(.eof, try reader.read());
+}
+
+pub const Writer = @import("Writer.zig");
+
+pub fn GenericWriter(comptime SinkError: type) type {
+ return struct {
+ writer: Writer,
+
+ pub const WriteError = Writer.WriteError || SinkError;
+
+ pub inline fn bom(writer: *@This()) WriteError!void {
+ return @errorCast(writer.writer.bom());
+ }
+
+ pub inline fn xmlDeclaration(writer: *@This(), encoding: ?[]const u8, standalone: ?bool) WriteError!void {
+ return @errorCast(writer.writer.xmlDeclaration(encoding, standalone));
+ }
+
+ pub inline fn elementStart(writer: *@This(), name: []const u8) WriteError!void {
+ return @errorCast(writer.writer.elementStart(name));
+ }
+
+ pub inline fn elementEnd(writer: *@This(), name: []const u8) WriteError!void {
+ return @errorCast(writer.writer.elementEnd(name));
+ }
+
+ pub inline fn elementEndEmpty(writer: *@This()) WriteError!void {
+ return @errorCast(writer.writer.elementEndEmpty());
+ }
+
+ pub inline fn attribute(writer: *@This(), name: []const u8, value: []const u8) WriteError!void {
+ return @errorCast(writer.writer.attribute(name, value));
+ }
+
+ pub inline fn pi(writer: *@This(), target: []const u8, data: []const u8) WriteError!void {
+ return @errorCast(writer.writer.pi(target, data));
+ }
+
+ pub inline fn text(writer: *@This(), s: []const u8) WriteError!void {
+ return @errorCast(writer.writer.text(s));
+ }
+ };
+}
+
+pub fn StreamingOutput(comptime WriterType: type) type {
+ return struct {
+ stream: WriterType,
+
+ pub const Error = WriterType.Error;
+
+ pub fn writer(out: *const @This(), options: Writer.Options) GenericWriter(Error) {
+ return .{ .writer = Writer.init(out.sink(), options) };
+ }
+
+ pub fn sink(out: *const @This()) Writer.Sink {
+ return .{
+ .context = out,
+ .writeFn = &write,
+ };
+ }
+
+ fn write(context: *const anyopaque, data: []const u8) anyerror!void {
+ const out: *const @This() = @alignCast(@ptrCast(context));
+ var pos: usize = 0;
+ while (pos < data.len) {
+ pos += try out.stream.write(data[pos..]);
+ }
+ }
+ };
+}
+
+pub fn streamingOutput(writer: anytype) StreamingOutput(@TypeOf(writer)) {
+ return .{ .stream = writer };
+}
test {
- testing.refAllDecls(@This());
+ _ = Reader;
+ _ = Writer;
}
diff --git a/test/xmlconf.zig b/test/xmlconf.zig
deleted file mode 100644
index 4e17e69..0000000
--- a/test/xmlconf.zig
+++ /dev/null
@@ -1,471 +0,0 @@
-//! A test runner for the W3C XML conformance test suite:
-//! https://www.w3.org/XML/Test/
-
-const std = @import("std");
-const xml = @import("xml");
-const fs = std.fs;
-const io = std.io;
-const mem = std.mem;
-const process = std.process;
-const Allocator = mem.Allocator;
-const ArrayListUnmanaged = std.ArrayListUnmanaged;
-
-const usage =
- \\Usage: xmlconf [options] files...
- \\
- \\The provided files are expected to be XML documents containing a root
- \\TESTCASES element containing TESTs.
- \\
- \\Options:
- \\ -h, --help show help
- \\ -v, --verbose enable verbose output
- \\
-;
-
-const max_test_data_bytes = 2 * 1024 * 1024; // 2MB
-
-const Suite = struct {
- profile: ?[]const u8,
- tests: []const Test,
-};
-
-const Test = struct {
- id: []const u8,
- type: Type,
- version: ?[]const u8,
- edition: ?[]const u8,
- entities: Entities,
- namespace: bool,
- sections: []const u8,
- description: []const u8,
- input: []const u8,
- output: ?[]const u8,
-
- const Type = enum {
- valid,
- invalid,
- @"not-wf",
- @"error",
-
- fn parse(value: []const u8) !Type {
- inline for (std.meta.fields(Type)) |field| {
- if (mem.eql(u8, value, field.name)) {
- return @enumFromInt(field.value);
- }
- }
- return error.InvalidTest;
- }
- };
-
- const Entities = enum {
- both,
- none,
- parameter,
- general,
-
- fn parse(value: []const u8) !Entities {
- inline for (std.meta.fields(Entities)) |field| {
- if (mem.eql(u8, value, field.name)) {
- return @enumFromInt(field.value);
- }
- }
- return error.InvalidTest;
- }
- };
-};
-
-fn Context(comptime OutType: type) type {
- return struct {
- allocator: Allocator,
- verbose: bool,
- tty_config: io.tty.Config,
- out: OutType,
- passed: ArrayListUnmanaged(Test) = .{},
- failed: ArrayListUnmanaged(Test) = .{},
- skipped: ArrayListUnmanaged(Test) = .{},
-
- const Self = @This();
-
- fn msg(self: Self, comptime format: []const u8, args: anytype) !void {
- try self.out.print(format ++ "\n", args);
- }
-
- fn pass(self: *Self, @"test": Test) !void {
- try self.passed.append(self.allocator, @"test");
- if (self.verbose) {
- try self.tty_config.setColor(self.out, .green);
- try self.out.print("PASS: {s} ({s})\n", .{ @"test".id, @"test".sections });
- try self.tty_config.setColor(self.out, .reset);
- }
- }
-
- fn fail(self: *Self, @"test": Test, reason: []const u8) !void {
- try self.failed.append(self.allocator, @"test");
- try self.tty_config.setColor(self.out, .red);
- try self.out.print("FAIL: {s} ({s}): {s}\n", .{ @"test".id, @"test".sections, reason });
- try self.tty_config.setColor(self.out, .reset);
- }
-
- fn skip(self: *Self, @"test": Test, reason: []const u8) !void {
- try self.skipped.append(self.allocator, @"test");
- if (self.verbose) {
- try self.tty_config.setColor(self.out, .yellow);
- try self.out.print("SKIP: {s} ({s}): {s}\n", .{ @"test".id, @"test".sections, reason });
- try self.tty_config.setColor(self.out, .reset);
- }
- }
- };
-}
-
-fn context(allocator: Allocator, verbose: bool, tty_config: io.tty.Config, out: anytype) Context(@TypeOf(out)) {
- return .{ .allocator = allocator, .verbose = verbose, .tty_config = tty_config, .out = out };
-}
-
-pub fn main() !void {
- var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
- defer arena.deinit();
- const allocator = arena.allocator();
-
- var args_iter = try process.argsWithAllocator(allocator);
- defer args_iter.deinit();
- _ = args_iter.skip();
-
- const stderr = io.getStdErr().writer();
-
- var allow_options = true;
- var verbose = false;
- var suites = ArrayListUnmanaged(Suite){};
- while (args_iter.next()) |arg| {
- if (allow_options and mem.startsWith(u8, arg, "-")) {
- if (std.mem.eql(u8, arg, "-h") or std.mem.eql(u8, arg, "--help")) {
- try stderr.writeAll(usage);
- process.exit(0);
- } else if (std.mem.eql(u8, arg, "-v") or std.mem.eql(u8, arg, "--verbose")) {
- verbose = true;
- } else if (std.mem.eql(u8, arg, "--")) {
- allow_options = false;
- } else {
- try stderr.print("unrecognized option: {s}", .{arg});
- process.exit(1);
- }
- } else {
- var suite_dir = try fs.cwd().openDir(fs.path.dirname(arg) orelse ".", .{});
- defer suite_dir.close();
- var suite_file = try suite_dir.openFile(fs.path.basename(arg), .{});
- defer suite_file.close();
-
- var buf_reader = io.bufferedReader(suite_file.reader());
- var suite_reader = xml.reader(allocator, buf_reader.reader(), xml.encoding.DefaultDecoder{}, .{});
- defer suite_reader.deinit();
- try suites.append(allocator, try readSuite(allocator, suite_dir, &suite_reader));
- }
- }
-
- if (suites.items.len == 0) {
- try stderr.writeAll("expected at least one test suite file");
- process.exit(1);
- }
-
- const stdout = io.getStdOut();
- const tty_config = io.tty.detectConfig(stdout);
- var stdout_buf = io.bufferedWriter(stdout.writer());
- var ctx = context(allocator, verbose, tty_config, stdout_buf.writer());
-
- for (suites.items) |suite| {
- try runSuite(suite, &ctx);
- }
-
- try ctx.msg("DONE: {} passed, {} failed, {} skipped", .{
- ctx.passed.items.len,
- ctx.failed.items.len,
- ctx.skipped.items.len,
- });
- try stdout_buf.flush();
-}
-
-fn readSuite(allocator: Allocator, suite_dir: fs.Dir, suite_reader: anytype) !Suite {
- var profile: ?[]const u8 = null;
- var tests = ArrayListUnmanaged(Test){};
-
- while (try suite_reader.next()) |event| {
- switch (event) {
- .element_start => |element_start| if (element_start.name.is(null, "TESTCASES")) {
- for (element_start.attributes) |attr| {
- if (attr.name.is(null, "PROFILE")) {
- profile = try allocator.dupe(u8, attr.value);
- }
- }
- } else if (element_start.name.is(null, "TEST")) {
- try tests.append(allocator, try readTest(allocator, suite_dir, element_start, suite_reader.children()));
- } else {
- try suite_reader.children().skip();
- },
- else => {},
- }
- }
-
- return .{
- .profile = profile,
- .tests = tests.items,
- };
-}
-
-fn readTest(allocator: Allocator, suite_dir: fs.Dir, test_start: xml.Event.ElementStart, test_reader: anytype) !Test {
- var id: ?[]const u8 = null;
- var @"type": ?Test.Type = null;
- var version: ?[]const u8 = null;
- var edition: ?[]const u8 = null;
- var entities = Test.Entities.none;
- var namespace = true;
- var sections: ?[]const u8 = null;
- var description = ArrayListUnmanaged(u8){};
- var input: ?[]const u8 = null;
- var output: ?[]const u8 = null;
-
- for (test_start.attributes) |attr| {
- if (attr.name.is(null, "ID")) {
- id = try allocator.dupe(u8, attr.value);
- } else if (attr.name.is(null, "TYPE")) {
- @"type" = try Test.Type.parse(attr.value);
- } else if (attr.name.is(null, "VERSION")) {
- version = try allocator.dupe(u8, attr.value);
- } else if (attr.name.is(null, "EDITION")) {
- edition = try allocator.dupe(u8, attr.value);
- } else if (attr.name.is(null, "ENTITIES")) {
- entities = try Test.Entities.parse(attr.value);
- } else if (attr.name.is(null, "NAMESPACE")) {
- namespace = mem.eql(u8, attr.value, "yes");
- } else if (attr.name.is(null, "SECTIONS")) {
- sections = try allocator.dupe(u8, attr.value);
- } else if (attr.name.is(null, "URI")) {
- input = try suite_dir.readFileAlloc(allocator, attr.value, max_test_data_bytes);
- } else if (attr.name.is(null, "OUTPUT")) {
- output = try suite_dir.readFileAlloc(allocator, attr.value, max_test_data_bytes);
- }
- }
-
- while (try test_reader.next()) |event| {
- switch (event) {
- .element_content => |element_content| try description.appendSlice(allocator, element_content.content),
- else => {},
- }
- }
-
- return .{
- .id = id orelse return error.InvalidTest,
- .type = @"type" orelse return error.InvalidTest,
- .version = version,
- .edition = edition,
- .entities = entities,
- .namespace = namespace,
- .sections = sections orelse return error.InvalidTest,
- .description = description.items,
- .input = input orelse return error.InvalidTest,
- .output = output,
- };
-}
-
-fn runSuite(suite: Suite, ctx: anytype) !void {
- try ctx.msg("START: {s}", .{suite.profile orelse "untitled"});
- var suite_ctx = context(ctx.allocator, ctx.verbose, ctx.tty_config, ctx.out);
- for (suite.tests) |@"test"| {
- try runTest(@"test", &suite_ctx);
- }
- try ctx.msg("DONE: {s}: passed={} failed={} skipped={}", .{
- suite.profile orelse "untitled",
- suite_ctx.passed.items.len,
- suite_ctx.failed.items.len,
- suite_ctx.skipped.items.len,
- });
- try ctx.passed.appendSlice(ctx.allocator, suite_ctx.passed.items);
- try ctx.failed.appendSlice(ctx.allocator, suite_ctx.failed.items);
- try ctx.skipped.appendSlice(ctx.allocator, suite_ctx.skipped.items);
-}
-
-fn runTest(@"test": Test, ctx: anytype) !void {
- if (@"test".version) |version| {
- if (!mem.eql(u8, version, "1.0")) {
- return try ctx.skip(@"test", "only XML 1.0 is supported");
- }
- }
- if (@"test".edition) |edition| {
- // This check will technically be incorrect if a 15th edition is
- // released at some point, which seems highly unlikely
- if (mem.indexOfScalar(u8, edition, '5') == null) {
- return try ctx.skip(@"test", "only the fifth edition of XML 1.0 is supported");
- }
- }
-
- switch (@"test".type) {
- .valid, .invalid => {
- var input_stream = io.fixedBufferStream(@"test".input);
- // TODO: making namespace_aware a comptime option makes this possibly more difficult than it should be
- if (@"test".namespace) {
- var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{});
- defer input_reader.deinit();
- try runTestValid(@"test", &input_reader, ctx);
- } else {
- var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{
- .namespace_aware = false,
- });
- defer input_reader.deinit();
- try runTestValid(@"test", &input_reader, ctx);
- }
- },
- .@"not-wf" => {
- var input_stream = io.fixedBufferStream(@"test".input);
- if (@"test".namespace) {
- var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{});
- defer input_reader.deinit();
- try runTestNonWf(@"test", &input_reader, ctx);
- } else {
- var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{
- .namespace_aware = false,
- });
- defer input_reader.deinit();
- try runTestNonWf(@"test", &input_reader, ctx);
- }
- },
- .@"error" => return try ctx.skip(@"test", "TODO: not sure how to run error tests"),
- }
-}
-
-fn runTestValid(@"test": Test, input_reader: anytype, ctx: anytype) !void {
- var buf = ArrayListUnmanaged(u8){};
- defer buf.deinit(ctx.allocator);
- while (input_reader.next()) |maybe_event| {
- if (maybe_event) |event| {
- try writeCanonical(ctx.allocator, &buf, event);
- } else {
- if (@"test".output) |output| {
- if (!mem.eql(u8, buf.items, output)) {
- return try ctx.fail(@"test", "expected output does not match");
- }
- }
- return try ctx.pass(@"test");
- }
- } else |e| switch (e) {
- error.DoctypeNotSupported => return try ctx.skip(@"test", "doctype not supported"),
- error.CannotUndeclareNsPrefix,
- error.DuplicateAttribute,
- error.InvalidCharacterReference,
- error.InvalidEncoding,
- error.InvalidNsBinding,
- error.InvalidPiTarget,
- error.InvalidQName,
- error.InvalidUtf8,
- error.InvalidUtf16,
- error.MismatchedEndTag,
- error.SyntaxError,
- error.UndeclaredEntityReference,
- error.UndeclaredNsPrefix,
- error.UnexpectedEndOfInput,
- error.QNameNotAllowed,
- => return try ctx.fail(@"test", @errorName(e)),
- else => |other_e| return other_e,
- }
-}
-
-fn runTestNonWf(@"test": Test, input_reader: anytype, ctx: anytype) !void {
- while (input_reader.next()) |event| {
- if (event == null) {
- return try ctx.fail(@"test", "expected error, found none");
- }
- } else |e| switch (e) {
- error.DoctypeNotSupported => return try ctx.skip(@"test", "doctype not supported"),
- error.CannotUndeclareNsPrefix,
- error.DuplicateAttribute,
- error.InvalidCharacterReference,
- error.InvalidEncoding,
- error.InvalidNsBinding,
- error.InvalidPiTarget,
- error.InvalidQName,
- error.InvalidUtf8,
- error.InvalidUtf16,
- error.MismatchedEndTag,
- error.SyntaxError,
- error.UndeclaredEntityReference,
- error.UndeclaredNsPrefix,
- error.UnexpectedEndOfInput,
- error.QNameNotAllowed,
- => return try ctx.pass(@"test"),
- else => |other_e| return other_e,
- }
-}
-
-fn writeCanonical(allocator: Allocator, buf: *ArrayListUnmanaged(u8), event: xml.Event) !void {
- switch (event) {
- .xml_declaration, .comment => {},
- .element_start => |element_start| {
- try buf.append(allocator, '<');
- try writeQName(allocator, buf, element_start.name);
- const attrs = try allocator.dupe(xml.Event.Attribute, element_start.attributes);
- defer allocator.free(attrs);
- std.sort.heap(xml.Event.Attribute, attrs, {}, attrLessThan);
- for (attrs) |attr| {
- try buf.append(allocator, ' ');
- try writeQName(allocator, buf, attr.name);
- try buf.appendSlice(allocator, "=\"");
- try writeContent(allocator, buf, attr.value);
- try buf.append(allocator, '"');
- }
- try buf.append(allocator, '>');
- },
- .element_content => |element_content| {
- try writeContent(allocator, buf, element_content.content);
- },
- .element_end => |element_end| {
- try buf.appendSlice(allocator, "</");
- try writeQName(allocator, buf, element_end.name);
- try buf.append(allocator, '>');
- },
- .pi => |pi| {
- try buf.appendSlice(allocator, "<?");
- try buf.appendSlice(allocator, pi.target);
- try buf.append(allocator, ' ');
- try buf.appendSlice(allocator, pi.content);
- try buf.appendSlice(allocator, "?>");
- },
- }
-}
-
-fn writeQName(allocator: Allocator, buf: *ArrayListUnmanaged(u8), qname: xml.QName) !void {
- if (qname.prefix) |prefix| {
- try buf.appendSlice(allocator, prefix);
- try buf.append(allocator, ':');
- }
- try buf.appendSlice(allocator, qname.local);
-}
-
-fn writeContent(allocator: Allocator, buf: *ArrayListUnmanaged(u8), content: []const u8) !void {
- for (content) |c| {
- switch (c) {
- '&' => try buf.appendSlice(allocator, "&amp;"),
- '<' => try buf.appendSlice(allocator, "&lt;"),
- '>' => try buf.appendSlice(allocator, "&gt;"),
- '"' => try buf.appendSlice(allocator, "&quot;"),
- '\t' => try buf.appendSlice(allocator, "&#9;"),
- '\n' => try buf.appendSlice(allocator, "&#10;"),
- '\r' => try buf.appendSlice(allocator, "&#13;"),
- else => try buf.append(allocator, c),
- }
- }
-}
-
-fn attrLessThan(_: void, lhs: xml.Event.Attribute, rhs: xml.Event.Attribute) bool {
- // This is a pretty stupid implementation, but it should work for all
- // reasonable test cases
- var lhs_buf: [1024]u8 = undefined;
- const lhs_name = if (lhs.name.ns) |ns|
- std.fmt.bufPrint(&lhs_buf, "{s}:{s}", .{ ns, lhs.name.local }) catch @panic("attribute name too long")
- else
- lhs.name.local;
-
- var rhs_buf: [1024]u8 = undefined;
- const rhs_name = if (rhs.name.ns) |ns|
- std.fmt.bufPrint(&rhs_buf, "{s}:{s}", .{ ns, rhs.name.local }) catch @panic("attribute name too long")
- else
- rhs.name.local;
-
- return mem.lessThan(u8, lhs_name, rhs_name);
-}
diff --git a/xmlconf/build.zig b/xmlconf/build.zig
new file mode 100644
index 0000000..cc991f4
--- /dev/null
+++ b/xmlconf/build.zig
@@ -0,0 +1,44 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void {
+ const target = b.standardTargetOptions(.{});
+ const optimize = b.standardOptimizeOption(.{});
+
+ const xml = b.dependency("xml", .{
+ .target = target,
+ .optimize = optimize,
+ });
+
+ const xmlconf_exe = b.addExecutable(.{
+ .name = "xmlconf",
+ .root_source_file = b.path("src/xmlconf.zig"),
+ .target = target,
+ .optimize = optimize,
+ });
+ xmlconf_exe.root_module.addImport("xml", xml.module("xml"));
+ b.installArtifact(xmlconf_exe);
+
+ const xmlts = b.dependency("xmlts", .{});
+ const xmlts_run = b.addRunArtifact(xmlconf_exe);
+ // Since we can't process DTDs yet, we need to manually specify the test
+ // suite root files individually.
+ const suite_paths: []const []const u8 = &.{
+ "eduni/errata-2e/errata2e.xml",
+ "eduni/errata-3e/errata3e.xml",
+ "eduni/errata-4e/errata4e.xml",
+ "ibm/ibm_oasis_invalid.xml",
+ "ibm/ibm_oasis_not-wf.xml",
+ "ibm/ibm_oasis_valid.xml",
+ "japanese/japanese.xml",
+ "oasis/oasis.xml",
+ // The sun test suite files are not structured in a way we can handle
+ // without DTD support.
+ "xmltest/xmltest.xml",
+ };
+ for (suite_paths) |path| {
+ xmlts_run.addFileArg(xmlts.path(path));
+ }
+
+ const test_step = b.step("test", "Run the tests");
+ test_step.dependOn(&xmlts_run.step);
+}
diff --git a/xmlconf/build.zig.zon b/xmlconf/build.zig.zon
new file mode 100644
index 0000000..02735b6
--- /dev/null
+++ b/xmlconf/build.zig.zon
@@ -0,0 +1,18 @@
+.{
+ .name = "xmlconf",
+ .version = "0.0.0",
+ .paths = .{
+ "build.zig",
+ "build.zig.zon",
+ "src",
+ },
+ .dependencies = .{
+ .xml = .{
+ .path = "..",
+ },
+ .xmlts = .{
+ .url = "https://www.w3.org/XML/Test/xmlts20130923.tar.gz",
+ .hash = "1220322f729089d5371fce0b0777edb9946cc54a389aa372c879d9c0843d862c4bbe",
+ },
+ },
+}
diff --git a/xmlconf/src/xmlconf.zig b/xmlconf/src/xmlconf.zig
new file mode 100644
index 0000000..c9a4faa
--- /dev/null
+++ b/xmlconf/src/xmlconf.zig
@@ -0,0 +1,456 @@
+const std = @import("std");
+const Allocator = std.mem.Allocator;
+const log = std.log;
+const xml = @import("xml");
+
+// Command-line help text printed for -h/--help.
+const usage =
+    \\Usage: xmlconf [options] files...
+    \\
+    \\Runs the provided xmlconf test suites.
+    \\
+    \\Options:
+    \\  -h, --help     show help
+    \\  -v, --verbose  increase verbosity
+    \\
+;
+
+// TTY configuration used for colored log output on stderr.
+var log_tty_config: std.io.tty.Config = undefined; // Will be initialized immediately in main
+// Runtime log filter; each -v/--verbose flag raises it one step.
+var log_level: std.log.Level = .warn;
+
+// Route all std.log output through our colored, level-filtered logger.
+pub const std_options: std.Options = .{
+    .logFn = logImpl,
+};
+
+/// Custom std.log implementation: filters by the runtime `log_level` and
+/// writes a colored "level(scope): " prefix to stderr before the message.
+pub fn logImpl(
+    comptime level: std.log.Level,
+    comptime scope: @Type(.enum_literal),
+    comptime format: []const u8,
+    args: anytype,
+) void {
+    // Drop messages more verbose than the current runtime level.
+    if (@intFromEnum(level) > @intFromEnum(log_level)) return;
+
+    const prefix = comptime if (scope == .default)
+        level.asText() ++ ": "
+    else
+        level.asText() ++ "(" ++ @tagName(scope) ++ "): ";
+    const prefix_color: std.io.tty.Color = switch (level) {
+        .err => .bright_red,
+        .warn => .bright_yellow,
+        .info => .bright_blue,
+        .debug => .bright_magenta,
+    };
+    // Hold the stderr lock so concurrent log lines don't interleave.
+    std.debug.lockStdErr();
+    defer std.debug.unlockStdErr();
+    const stderr = std.io.getStdErr().writer();
+    log_tty_config.setColor(stderr, prefix_color) catch return;
+    stderr.writeAll(prefix) catch return;
+    log_tty_config.setColor(stderr, .reset) catch return;
+    stderr.print(format ++ "\n", args) catch return;
+}
+
+/// Entry point: parses command-line arguments, runs every named suite file,
+/// prints a summary, and exits nonzero when anything failed or errored.
+pub fn main() !void {
+    log_tty_config = std.io.tty.detectConfig(std.io.getStdErr());
+
+    // Arena for argument parsing and suite path storage; freed all at once.
+    var arena_instance = std.heap.ArenaAllocator.init(std.heap.page_allocator);
+    defer arena_instance.deinit();
+    const arena = arena_instance.allocator();
+
+    var suite_paths = std.ArrayList([]const u8).init(arena);
+
+    var arg_iter: ArgIterator = .{ .args = try std.process.argsWithAllocator(arena) };
+    _ = arg_iter.next(); // skip argv[0]
+    while (arg_iter.next()) |arg| switch (arg) {
+        .option => |option| if (option.is('h', "help")) {
+            try std.io.getStdOut().writeAll(usage);
+            std.process.exit(0);
+        } else if (option.is('v', "verbose")) {
+            // Each -v bumps verbosity one step, capped at debug.
+            log_level = switch (log_level) {
+                .err => .warn,
+                .warn => .info,
+                .info, .debug => .debug,
+            };
+        } else {
+            fatal("unrecognized option: {}", .{option});
+        },
+        .param => |param| try suite_paths.append(try arena.dupe(u8, param)),
+        .unexpected_value => |unexpected_value| fatal("unexpected value to --{s}: {s}", .{
+            unexpected_value.option,
+            unexpected_value.value,
+        }),
+    };
+
+    // Leak-checking allocator for the actual test runs.
+    var gpa_instance: std.heap.GeneralPurposeAllocator(.{}) = .{};
+    defer _ = gpa_instance.deinit();
+    const gpa = gpa_instance.allocator();
+
+    var results: Results = .{};
+    for (suite_paths.items) |suite_path| {
+        runFile(gpa, suite_path, &results) catch |err|
+            results.err("running suite {s}: {}", .{ suite_path, err });
+    }
+    std.debug.print("{} passed, {} failed, {} skipped\n", .{ results.passed, results.failed, results.skipped });
+    std.process.exit(if (results.ok()) 0 else 1);
+}
+
+/// Logs an error message and terminates the process with a nonzero status.
+fn fatal(comptime format: []const u8, args: anytype) noreturn {
+    log.err(format, args);
+    std.process.exit(1);
+}
+
+/// Aggregated pass/fail/skip counters for a whole test run, plus a flag for
+/// suite-level errors that prevented tests from running at all.
+const Results = struct {
+    passed: usize = 0,
+    failed: usize = 0,
+    skipped: usize = 0,
+    // Set when a suite could not be run (I/O or structural error).
+    run_error: bool = false,
+
+    /// The run is successful when nothing failed and no suite-level error
+    /// occurred; skips do not count against success.
+    fn ok(self: Results) bool {
+        return !(self.failed != 0 or self.run_error);
+    }
+
+    /// Records a passing test (logged at debug verbosity only).
+    fn pass(self: *Results, id: []const u8) void {
+        log.debug("pass: {s}", .{id});
+        self.passed += 1;
+    }
+
+    /// Records a failing test with a formatted reason.
+    fn fail(self: *Results, id: []const u8, comptime fmt: []const u8, args: anytype) void {
+        log.err("fail: {s}: " ++ fmt, .{id} ++ args);
+        self.failed += 1;
+    }
+
+    /// Records a skipped test with a formatted reason.
+    fn skip(self: *Results, id: []const u8, comptime fmt: []const u8, args: anytype) void {
+        log.info("skip: {s}: " ++ fmt, .{id} ++ args);
+        self.skipped += 1;
+    }
+
+    /// Records a suite-level error; the run can no longer succeed.
+    fn err(self: *Results, comptime fmt: []const u8, args: anytype) void {
+        log.err(fmt, args);
+        self.run_error = true;
+    }
+};
+
+// Maximum size accepted for any suite catalog or test input/output file.
+const max_file_size = 2 * 1024 * 1024;
+
+/// Loads the test suite catalog at `path` and runs the suites it declares.
+/// The catalog's directory is used to resolve the relative URIs of the
+/// individual test files.
+fn runFile(gpa: Allocator, path: []const u8, results: *Results) !void {
+    var dir = try std.fs.cwd().openDir(std.fs.path.dirname(path) orelse ".", .{});
+    defer dir.close();
+    const data = try dir.readFileAlloc(gpa, std.fs.path.basename(path), max_file_size);
+    defer gpa.free(data);
+    var doc = xml.StaticDocument.init(data);
+    var reader = doc.reader(gpa, .{});
+    defer reader.deinit();
+
+    // The root element of a suite catalog must be TESTCASES.
+    try reader.skipProlog();
+    if (!std.mem.eql(u8, "TESTCASES", reader.elementName())) return error.InvalidTest;
+    try runSuite(gpa, dir, reader.raw(), results);
+}
+
+/// Runs one TESTCASES element: logs its PROFILE (if any) and dispatches each
+/// child, which must be either a nested TESTCASES suite or a TEST element.
+/// The reader must be positioned at the start of the TESTCASES element.
+fn runSuite(gpa: Allocator, dir: std.fs.Dir, reader: *xml.Reader, results: *Results) !void {
+    if (reader.attributeIndex("PROFILE")) |profile_attr| {
+        log.info("suite: {s}", .{try reader.attributeValue(profile_attr)});
+    }
+
+    while (true) {
+        switch (try reader.read()) {
+            .element_start => if (std.mem.eql(u8, reader.elementName(), "TESTCASES")) {
+                // Suites may be nested arbitrarily deep; recurse.
+                try runSuite(gpa, dir, reader, results);
+            } else if (std.mem.eql(u8, reader.elementName(), "TEST")) {
+                try runTest(gpa, dir, reader, results);
+            } else {
+                return error.InvalidTest;
+            },
+            // End of this TESTCASES element: done with the suite.
+            .element_end => break,
+            else => {},
+        }
+    }
+}
+
+/// Runs a single TEST element: reads its attributes, loads the input file
+/// (and the optional expected canonical OUTPUT file), then dispatches on the
+/// test TYPE. The reader must be positioned at the TEST element start; all
+/// attributes are read before the element is consumed with `skipElement`.
+fn runTest(gpa: Allocator, dir: std.fs.Dir, reader: *xml.Reader, results: *Results) !void {
+    const @"type" = type: {
+        const index = reader.attributeIndex("TYPE") orelse return error.InvalidTest;
+        break :type std.meta.stringToEnum(TestType, try reader.attributeValue(index)) orelse return error.InvalidTest;
+    };
+    // The ID is used in messages after the reader has moved on, so copy it.
+    const id = id: {
+        const index = reader.attributeIndex("ID") orelse return error.InvalidTest;
+        break :id try reader.attributeValueAlloc(gpa, index);
+    };
+    defer gpa.free(id);
+    // VERSION is a space-separated list; skip unless it includes XML 1.0.
+    if (reader.attributeIndex("VERSION")) |index| check_version: {
+        const versions = try reader.attributeValue(index);
+        var iter = std.mem.splitScalar(u8, versions, ' ');
+        while (iter.next()) |version| {
+            if (std.mem.eql(u8, version, "1.0")) break :check_version;
+        }
+        return results.skip(id, "only XML 1.0 is supported", .{});
+    }
+    // EDITION is a space-separated list; skip unless it includes edition 5.
+    if (reader.attributeIndex("EDITION")) |index| check_edition: {
+        const editions = try reader.attributeValue(index);
+        var iter = std.mem.splitScalar(u8, editions, ' ');
+        while (iter.next()) |edition| {
+            if (std.mem.eql(u8, edition, "5")) break :check_edition;
+        }
+        return results.skip(id, "only the fifth edition of XML 1.0 is supported", .{});
+    }
+    // NAMESPACE defaults to "yes" when the attribute is absent.
+    const namespace = namespace: {
+        const index = reader.attributeIndex("NAMESPACE") orelse break :namespace .yes;
+        break :namespace std.meta.stringToEnum(enum { yes, no }, try reader.attributeValue(index)) orelse return error.InvalidTest;
+    };
+    const input = input: {
+        const index = reader.attributeIndex("URI") orelse return error.InvalidTest;
+        const path = try reader.attributeValue(index);
+        break :input dir.readFileAlloc(gpa, path, max_file_size) catch |err|
+            return results.err("{s}: reading input file: {s}: {}", .{ id, path, err });
+    };
+    defer gpa.free(input);
+    const output = output: {
+        const index = reader.attributeIndex("OUTPUT") orelse break :output null;
+        const path = try reader.attributeValue(index);
+        break :output dir.readFileAlloc(gpa, path, max_file_size) catch |err|
+            return results.err("{s}: reading output file: {s}: {}", .{ id, path, err });
+    };
+    defer if (output) |o| gpa.free(o);
+    // All attributes have been read; consume the rest of the TEST element.
+    try reader.skipElement();
+
+    // A UTF-16 byte order mark means we cannot parse the input at all.
+    if (std.mem.startsWith(u8, input, "\xFE\xFF") or
+        std.mem.startsWith(u8, input, "\xFF\xFE"))
+    {
+        return results.skip(id, "UTF-16 unsupported", .{});
+    }
+
+    const options: TestOptions = .{
+        .namespace = namespace == .yes,
+    };
+    switch (@"type") {
+        // We don't validate, so "valid" and "invalid" documents are treated
+        // the same way: both must simply parse as well-formed XML.
+        .valid, .invalid => try runTestParseable(gpa, id, input, output, options, results),
+        .@"not-wf" => try runTestUnparseable(gpa, id, input, options, results),
+        .@"error" => results.skip(id, "not sure how to run error tests", .{}),
+    }
+}
+
+/// Reader configuration derived from the TEST element's attributes.
+const TestOptions = struct {
+    // Whether to parse with namespace awareness (NAMESPACE attribute).
+    namespace: bool,
+};
+
+/// Runs a test whose input must parse successfully ("valid"/"invalid" types;
+/// we don't validate, so both just need to be well-formed). While parsing,
+/// the document's canonical form is accumulated; when an expected `output`
+/// is provided, the canonical form must match it exactly.
+fn runTestParseable(
+    gpa: Allocator,
+    id: []const u8,
+    input: []const u8,
+    output: ?[]const u8,
+    options: TestOptions,
+    results: *Results,
+) !void {
+    var doc = xml.StaticDocument.init(input);
+    var reader = doc.reader(gpa, .{
+        .namespace_aware = options.namespace,
+    });
+    defer reader.deinit();
+
+    // Canonical form of the document, built up node by node below.
+    var canonical_buf = std.ArrayList(u8).init(gpa);
+    defer canonical_buf.deinit();
+    var canonical_output = xml.streamingOutput(canonical_buf.writer());
+    var canonical = canonical_output.writer(.{});
+
+    while (true) {
+        const node = reader.read() catch |err| switch (err) {
+            error.MalformedXml => {
+                switch (reader.errorCode()) {
+                    // Tests requiring unsupported features are skipped, not failed.
+                    .doctype_unsupported => return results.skip(id, "doctype unsupported", .{}),
+                    .xml_declaration_encoding_unsupported => return results.skip(id, "encoding unsupported", .{}),
+                    else => |code| {
+                        const loc = reader.errorLocation();
+                        return results.fail(id, "malformed: {}:{}: {}", .{ loc.line, loc.column, code });
+                    },
+                }
+            },
+            error.OutOfMemory => return error.OutOfMemory,
+        };
+        switch (node) {
+            .eof => break,
+            .xml_declaration, .comment => {}, // ignored in canonical form
+            .element_start => {
+                try canonical.elementStart(reader.elementName());
+
+                // Canonical form requires attributes sorted by name
+                // (byte-wise), so sort indexes rather than emitting in
+                // document order.
+                const sorted_attrs = try gpa.alloc(usize, reader.attributeCount());
+                defer gpa.free(sorted_attrs);
+                for (0..reader.attributeCount()) |i| sorted_attrs[i] = i;
+                std.sort.pdq(usize, sorted_attrs, reader, struct {
+                    fn lessThan(r: @TypeOf(reader), lhs: usize, rhs: usize) bool {
+                        return std.mem.lessThan(u8, r.attributeName(lhs), r.attributeName(rhs));
+                    }
+                }.lessThan);
+                for (sorted_attrs) |i| {
+                    try canonical.attribute(reader.attributeName(i), try reader.attributeValue(i));
+                }
+            },
+            .element_end => {
+                try canonical.elementEnd(reader.elementName());
+            },
+            .pi => {
+                try canonical.pi(reader.piTarget(), try reader.piData());
+            },
+            .text => {
+                try canonical.text(try reader.text());
+            },
+            .cdata => {
+                // CDATA sections are emitted as plain text in canonical form.
+                try canonical.text(try reader.cdata());
+            },
+            .character_reference => {
+                // A code point is at most 4 bytes of UTF-8; the reader only
+                // yields valid scalar values, so encoding cannot fail.
+                var buf: [4]u8 = undefined;
+                const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable;
+                try canonical.text(buf[0..len]);
+            },
+            .entity_reference => {
+                // Without DTD support, only the predefined entities can occur.
+                const value = xml.predefined_entities.get(reader.entityReferenceName()) orelse unreachable;
+                try canonical.text(value);
+            },
+        }
+    }
+
+    if (output) |expected_canonical| {
+        if (!std.mem.eql(u8, canonical_buf.items, expected_canonical)) {
+            // Note: "actual:" is followed by a newline to match the
+            // formatting of "expected:" above (previously it was missing,
+            // garbling the failure dump).
+            return results.fail(
+                id,
+                "canonical output does not match\n\nexpected:\n{s}\n\nactual:\n{s}",
+                .{ expected_canonical, canonical_buf.items },
+            );
+        }
+    }
+    return results.pass(id);
+}
+
+/// Runs a "not-wf" test: the input must FAIL to parse with MalformedXml.
+/// Reaching eof without an error means the parser accepted a document it
+/// should have rejected, which is a test failure.
+fn runTestUnparseable(
+    gpa: Allocator,
+    id: []const u8,
+    input: []const u8,
+    options: TestOptions,
+    results: *Results,
+) !void {
+    var doc = xml.StaticDocument.init(input);
+    var reader = doc.reader(gpa, .{
+        .namespace_aware = options.namespace,
+    });
+    defer reader.deinit();
+
+    while (true) {
+        const node = reader.read() catch |err| switch (err) {
+            error.MalformedXml => switch (reader.errorCode()) {
+                // An error caused by an unsupported feature doesn't prove the
+                // document is malformed, so skip rather than pass.
+                .doctype_unsupported => return results.skip(id, "doctype unsupported", .{}),
+                .xml_declaration_encoding_unsupported => return results.skip(id, "encoding unsupported", .{}),
+                else => return results.pass(id),
+            },
+            error.OutOfMemory => return error.OutOfMemory,
+        };
+        if (node == .eof) return results.fail(id, "expected to fail to parse", .{});
+    }
+}
+
+/// Test categories from the xmlconf catalog's TYPE attribute.
+const TestType = enum {
+    valid,
+    invalid,
+    @"not-wf",
+    @"error",
+};
+
+// Inspired by https://github.com/judofyr/parg
+/// Command-line argument parser supporting short option bundles ("-vv"),
+/// long options with attached values ("--opt=value"), and "--" to terminate
+/// option parsing.
+const ArgIterator = struct {
+    args: std.process.ArgIterator,
+    // Parser state carried between calls to `next`.
+    state: union(enum) {
+        normal,
+        // Remaining characters of a bundled short-option group (e.g. "v"
+        // after the first 'v' of "-vv" has been returned).
+        short: []const u8,
+        // A "--option=value" argument whose value has not yet been consumed
+        // via `optionValue`.
+        long: struct {
+            option: []const u8,
+            value: []const u8,
+        },
+        // After "--", everything is treated as a positional parameter.
+        params_only,
+    } = .normal,
+
+    const Arg = union(enum) {
+        option: union(enum) {
+            short: u8,
+            long: []const u8,
+
+            /// Returns true if this option matches the given short char or
+            /// long name (either may be null to match nothing).
+            fn is(option: @This(), short: ?u8, long: ?[]const u8) bool {
+                return switch (option) {
+                    .short => |c| short == c,
+                    .long => |s| std.mem.eql(u8, long orelse return false, s),
+                };
+            }
+
+            pub fn format(option: @This(), comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+                switch (option) {
+                    .short => |c| try writer.print("-{c}", .{c}),
+                    .long => |s| try writer.print("--{s}", .{s}),
+                }
+            }
+        },
+        param: []const u8,
+        // A value attached via "--option=value" that the caller never asked
+        // for with `optionValue`.
+        unexpected_value: struct {
+            option: []const u8,
+            value: []const u8,
+        },
+    };
+
+    fn deinit(iter: *ArgIterator) void {
+        iter.args.deinit();
+        iter.* = undefined;
+    }
+
+    /// Returns the next option or parameter, or null when arguments are
+    /// exhausted.
+    fn next(iter: *ArgIterator) ?Arg {
+        switch (iter.state) {
+            .normal => {
+                const arg = iter.args.next() orelse return null;
+                if (std.mem.eql(u8, arg, "--")) {
+                    iter.state = .params_only;
+                    return .{ .param = iter.args.next() orelse return null };
+                } else if (std.mem.startsWith(u8, arg, "--")) {
+                    if (std.mem.indexOfScalar(u8, arg, '=')) |equals_index| {
+                        const option = arg["--".len..equals_index];
+                        iter.state = .{ .long = .{
+                            .option = option,
+                            .value = arg[equals_index + 1 ..],
+                        } };
+                        return .{ .option = .{ .long = option } };
+                    } else {
+                        return .{ .option = .{ .long = arg["--".len..] } };
+                    }
+                } else if (std.mem.startsWith(u8, arg, "-") and arg.len > 1) {
+                    if (arg.len > 2) {
+                        iter.state = .{ .short = arg["-".len + 1 ..] };
+                    }
+                    return .{ .option = .{ .short = arg["-".len] } };
+                } else {
+                    return .{ .param = arg };
+                }
+            },
+            .short => |rest| {
+                // Fix: reset to .normal once the final bundled character is
+                // consumed. Previously the state kept pointing at the last
+                // character when rest.len == 1, so e.g. "-vv" made `next`
+                // return 'v' forever (infinite loop in the caller).
+                if (rest.len > 1) {
+                    iter.state = .{ .short = rest[1..] };
+                } else {
+                    iter.state = .normal;
+                }
+                return .{ .option = .{ .short = rest[0] } };
+            },
+            .long => |long| {
+                // Fix: clear the stale .long state so an unconsumed
+                // "--opt=value" is reported once rather than on every
+                // subsequent call.
+                iter.state = .normal;
+                return .{ .unexpected_value = .{
+                    .option = long.option,
+                    .value = long.value,
+                } };
+            },
+            .params_only => return .{ .param = iter.args.next() orelse return null },
+        }
+    }
+
+    /// Consumes and returns the value for the option just returned by `next`:
+    /// the attached value ("--opt=value", "-ovalue") if any, otherwise the
+    /// following argument. Must not be called after "--".
+    fn optionValue(iter: *ArgIterator) ?[]const u8 {
+        switch (iter.state) {
+            .normal => return iter.args.next(),
+            .short => |rest| {
+                iter.state = .normal;
+                return rest;
+            },
+            .long => |long| {
+                iter.state = .normal;
+                return long.value;
+            },
+            .params_only => unreachable,
+        }
+    }
+};