diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..98d0681 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,11 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +indent_style = space +insert_final_newline = true +trim_trailing_whitespace = true + +[*.xml] +indent_size = 2 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a3ee6e5..cd6e3f8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - zig-version: [0.12.0, 0.13.0, master] + zig-version: [0.13.0, master] steps: - name: Checkout uses: actions/checkout@v3 diff --git a/.gitignore b/.gitignore index f33e3bd..d8c8979 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,2 @@ -bench/*.xml -callgrind.out.* -core* -fuzz/outputs -test/xmlconf .zig-cache -zig-cache zig-out diff --git a/README.md b/README.md index 3abe6ee..88a5af4 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # zig-xml -zig-xml is an XML library for Zig, currently supporting Zig 0.12.0, 0.13.0, and -the latest master at the time of writing. +zig-xml is an XML library for Zig, currently supporting Zig 0.13.0 and the +latest master at the time of writing. See the documentation in the code for more information about the available APIs (start in `xml.zig`). Autodocs are also published to GitHub Pages: @@ -12,185 +12,26 @@ The library aims to confirm with the following standards: - [XML 1.0 Fifth Edition](https://www.w3.org/TR/2008/REC-xml-20081126/) - [XML Namespaces 1.0 Third Edition](https://www.w3.org/TR/2009/REC-xml-names-20091208/) -Other standards (such as XML 1.1 or XML 1.0 prior to the fifth edition) are only -supported insofar as they are compatible with the above standards. 
In practice, -this should not make much difference, since XML 1.1 is rarely used, and the -differences between XML 1.0 editions are minor (the XML 1.0 fifth edition -standard allows many more characters in names than previous editions, subsuming -the -[only non-harmful feature of XML 1.1](http://www.ibiblio.org/xml/books/effectivexml/chapters/03.html)). - -## Feature overview - -Key for the list: +Currently, DTDs (DOCTYPE) are not supported, nor is any non-UTF-8 encoding. -- ✅ Supported -- 🚧 Partially supported -- ❌ Unsupported, but planned -- ❓️ Unsupported, maybe planned (long-term) -- 👎️ Unsupported, not planned - -Features: - -- ✅ Streaming parser (three options are available, `Reader` is the most - general-purpose but also the slowest) - - ✅ Core XML 1.0 language minus `DOCTYPE` - - ✅ Well-formedness checks not involving DTD (varying degrees of lesser - support in `TokenReader` and `Scanner`) - - ✅ End-of-line and attribute value normalization (in `Reader` and - `TokenReader` only, optional) - - ✅ Namespace support (in `Reader` only, optional) - - 🚧 Detailed errors - - 🚧 Source location tracking - - ❌ `DOCTYPE` (just parsing, not doing anything with it) - (https://github.com/ianprime0509/zig-xml/issues/9) - - ❓️ Non-validating `DOCTYPE` handling (entity expansion, further attribute - value normalization for non-`CDATA` types) (no external DTD content) - - ❓️ Hooks for loading external DTD content - - ❓️ XML 1.1 - - 👎️ Validation -- 🚧 DOM parser (current `Node` abstraction is limited and read-only) -- ✅ Unicode - - ✅ UTF-8 - - ✅ UTF-16 - - ✅ UTF-8 vs UTF-16 auto-detection (`DefaultDecoder`) - - ❌ US-ASCII (this is for support of US-ASCII as its own encoding; note that - all ASCII can be treated as UTF-8) - - ❌ ISO 8859-1 - - ❓️ Other encodings besides these - - ✅ User-definable additional encodings (meaning even though this library - doesn't provide other encodings out of the box, you can write them yourself) -- 🚧 XML writer 
(https://github.com/ianprime0509/zig-xml/issues/10) -- 👎️ XPath, XML Schema, other XML-related stuff +Other standards (such as XML 1.1 or XML 1.0 prior to the fifth edition) are only +supported insofar as they are compatible with the above standards. ## Examples -See the `examples` directory (these examples are not very good right now but -they do show how to use most of the library). - -Another ("real-world") example can be found in the zig-gobject project: -https://github.com/ianprime0509/zig-gobject/blob/main/src/gir.zig +A basic example of usage can be found in the `examples` directory, and can be +built using `zig build install-examples`. ## Tests -There are several tests in the project itself using the standard Zig test -system. These tests can be run using `zig build test`. - -There is also a runner for the -[W3C XML Conformance Test Suite](https://www.w3.org/XML/Test/) under -`test/xmlconf.zig`. To build this runner as a standalone executable, run -`zig build install-xmlconf`. If you download the 20130923 version of the test -suite and place the `xmlconf` directory under `test`, you can also use -`zig build run-xmlconf` to run all the test suites the runner can currently -understand. The test suite files are not contained directly in this repository -due to unclear licensing and file size (16MB uncompressed). - -At the time of writing, the library passes all the conformance tests it is able -to run (353 of them); the other tests are skipped because they involve doctype -in one way or another or are for XML standards which aren't supported (XML 1.1, -editions of XML 1.0 besides the fifth edition). - -## Fuzzing - -This library has some basic support for fuzz testing, taking its basic method -from the article -[Fuzzing Zig Code Using AFL++](https://www.ryanliptak.com/blog/fuzzing-zig-code/). -To start fuzzing, you will need -[AFL++](https://github.com/AFLplusplus/AFLplusplus), specifically -`afl-clang-lto` and `afl-fuzz`, in your path. 
Then, you can run -`zig build fuzz`. To resume a prior fuzzing session, pass `-Dresume=true`. - -You can also run `zig build install-fuzz` to just build the fuzz executable and -then run it with `afl-fuzz` separately. - -Finally, if any crashes are identified during fuzzing, they can be replayed by -feeding the crash input back to `zig build fuzz-reproduce`, which will yield an -error trace for further debugging. - -## Benchmarking and performance - -**TL;DR:** `Reader` and `TokenReader` are relatively slow compared to other -popular libraries. `Scanner` is faster (on a similar level as yxml), but -comparatively doesn't do very much. - -There is a benchmarking setup in the `bench` directory. The benchmark is for -parsing through an entire XML file without doing any additional processing. The -XML file is loaded completely into memory first, then the parser is executed on -it until it completes. - -Below are some benchmarking results as of August 14, 2023, using Zig -`0.12.0-dev.906+2d7d037c4`, as performed on my laptop. The results were obtained -by executing [poop](https://github.com/andrewrk/poop) on the benchmark -implementations. - -### GTK 4 GIR - -This is a 5.7MB XML file containing GObject introspection metadata for GTK 4. In -the output below, libxml2 is used as the baseline. The three benchmarks -`reader`, `token_reader`, and `scanner` test the three APIs provided by this -library, and the mxml and yxml libraries are also included for comparison. 
- -``` -Benchmark 1 (78 runs): zig-out/bin/libxml2 Gtk-4.0.gir - measurement mean ± σ min … max outliers delta - wall_time 64.2ms ± 1.87ms 55.5ms … 70.1ms 4 ( 5%) 0% - peak_rss 14.6MB ± 76.4KB 14.4MB … 14.7MB 0 ( 0%) 0% - cpu_cycles 196M ± 1.03M 194M … 200M 3 ( 4%) 0% - instructions 409M ± 43.1 409M … 409M 0 ( 0%) 0% - cache_references 5.44M ± 325K 5.08M … 6.97M 5 ( 6%) 0% - cache_misses 66.0K ± 5.36K 55.0K … 91.0K 3 ( 4%) 0% - branch_misses 874K ± 3.80K 868K … 890K 1 ( 1%) 0% - -Benchmark 2 (30 runs): zig-out/bin/reader Gtk-4.0.gir - measurement mean ± σ min … max outliers delta - wall_time 170ms ± 1.59ms 167ms … 173ms 0 ( 0%) 💩+164.2% ± 1.2% - peak_rss 7.29MB ± 73.8KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.0% ± 0.2% - cpu_cycles 583M ± 2.88M 579M … 590M 0 ( 0%) 💩+196.9% ± 0.4% - instructions 1.38G ± 32.2 1.38G … 1.38G 0 ( 0%) 💩+237.2% ± 0.0% - cache_references 751K ± 135K 580K … 1.12M 0 ( 0%) ⚡- 86.2% ± 2.2% - cache_misses 17.5K ± 5.41K 12.9K … 34.5K 3 (10%) ⚡- 73.5% ± 3.5% - branch_misses 1.06M ± 10.9K 1.05M … 1.11M 2 ( 7%) 💩+ 21.5% ± 0.3% - -Benchmark 3 (38 runs): zig-out/bin/token_reader Gtk-4.0.gir - measurement mean ± σ min … max outliers delta - wall_time 135ms ± 1.59ms 132ms … 138ms 0 ( 0%) 💩+110.4% ± 1.1% - peak_rss 7.31MB ± 54.2KB 7.21MB … 7.34MB 8 (21%) ⚡- 49.8% ± 0.2% - cpu_cycles 462M ± 2.20M 459M … 467M 0 ( 0%) 💩+135.5% ± 0.3% - instructions 1.14G ± 21.0 1.14G … 1.14G 0 ( 0%) 💩+179.9% ± 0.0% - cache_references 237K ± 7.40K 225K … 255K 0 ( 0%) ⚡- 95.6% ± 1.9% - cache_misses 10.1K ± 1.29K 8.16K … 13.2K 0 ( 0%) ⚡- 84.8% ± 2.7% - branch_misses 815K ± 919 813K … 816K 3 ( 8%) ⚡- 6.8% ± 0.1% - -Benchmark 4 (103 runs): zig-out/bin/scanner Gtk-4.0.gir - measurement mean ± σ min … max outliers delta - wall_time 48.6ms ± 1.82ms 45.8ms … 55.2ms 4 ( 4%) ⚡- 24.3% ± 0.8% - peak_rss 7.27MB ± 87.8KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.1% ± 0.2% - cpu_cycles 152M ± 3.48M 151M … 177M 5 ( 5%) ⚡- 22.4% ± 0.4% - instructions 472M ± 19.9 472M … 472M 0 ( 0%) 💩+ 15.6% ± 0.0% - 
cache_references 209K ± 1.80K 207K … 222K 4 ( 4%) ⚡- 96.2% ± 1.2% - cache_misses 7.95K ± 179 7.59K … 8.50K 0 ( 0%) ⚡- 88.0% ± 1.6% - branch_misses 511K ± 874 510K … 518K 13 (13%) ⚡- 41.6% ± 0.1% - -Benchmark 5 (63 runs): zig-out/bin/mxml Gtk-4.0.gir - measurement mean ± σ min … max outliers delta - wall_time 80.2ms ± 2.44ms 76.0ms … 87.9ms 3 ( 5%) 💩+ 24.9% ± 1.1% - peak_rss 7.44MB ± 56.3KB 7.34MB … 7.47MB 15 (24%) ⚡- 48.9% ± 0.2% - cpu_cycles 262M ± 2.95M 258M … 281M 1 ( 2%) 💩+ 33.4% ± 0.4% - instructions 762M ± 56.7K 762M … 762M 3 ( 5%) 💩+ 86.4% ± 0.0% - cache_references 401K ± 473K 272K … 3.08M 10 (16%) ⚡- 92.6% ± 2.4% - cache_misses 14.2K ± 2.62K 12.0K … 31.1K 2 ( 3%) ⚡- 78.5% ± 2.2% - branch_misses 1.02M ± 99.5K 998K … 1.79M 4 ( 6%) 💩+ 16.3% ± 2.5% +The library has several tests of its own, which can be run using `zig build test`. -Benchmark 6 (196 runs): zig-out/bin/yxml Gtk-4.0.gir - measurement mean ± σ min … max outliers delta - wall_time 25.4ms ± 1.03ms 23.9ms … 34.3ms 3 ( 2%) ⚡- 60.4% ± 0.5% - peak_rss 7.29MB ± 77.0KB 7.08MB … 7.34MB 0 ( 0%) ⚡- 50.0% ± 0.1% - cpu_cycles 71.0M ± 1.03M 70.5M … 84.2M 5 ( 3%) ⚡- 63.8% ± 0.1% - instructions 236M ± 20.1 236M … 236M 0 ( 0%) ⚡- 42.2% ± 0.0% - cache_references 202K ± 805 201K … 210K 7 ( 4%) ⚡- 96.3% ± 0.8% - cache_misses 8.00K ± 215 7.64K … 9.57K 4 ( 2%) ⚡- 87.9% ± 1.1% - branch_misses 239K ± 787 238K … 248K 21 (11%) ⚡- 72.7% ± 0.1% -``` +The `xmlconf` directory additionally contains a runner for the [W3C XML +Conformance Test Suite](https://www.w3.org/XML/Test/). Running `zig build test` +in that directory will fetch the test suite distribution tarball and run the +tests within. Due to features missing in the current parser implementation (DTD +support), many tests are currently skipped. At the time of writing, 250 tests +pass, and 924 are skipped due to unsupported features. 
## License diff --git a/bench/build.zig b/bench/build.zig index 2be01c6..dd8058d 100644 --- a/bench/build.zig +++ b/bench/build.zig @@ -5,14 +5,6 @@ const Step = Build.Step; pub fn build(b: *Build) !void { const xml = b.dependency("xml", .{}).module("xml"); - const bench_scanner = addBench(b, "scanner"); - bench_scanner.root_module.addImport("xml", xml); - bench_scanner.linkLibC(); - - const bench_token_reader = addBench(b, "token_reader"); - bench_token_reader.root_module.addImport("xml", xml); - bench_token_reader.linkLibC(); - const bench_reader = addBench(b, "reader"); bench_reader.root_module.addImport("xml", xml); bench_reader.linkLibC(); diff --git a/bench/build.zig.zon b/bench/build.zig.zon index 50bfb4f..bfdc142 100644 --- a/bench/build.zig.zon +++ b/bench/build.zig.zon @@ -12,8 +12,8 @@ .path = "..", }, .libxml2 = .{ - .url = "git+https://github.com/ianprime0509/zig-libxml2#9a88110c7ea7a541cb6ead6a648c69a8fc929141", - .hash = "1220b556b7c193580caa53db7e95ad31c0ac589af8adcd894594b93dd1f7875b9405", + .url = "git+https://github.com/ianprime0509/zig-libxml2?ref=main#6cebb963e0ad5789825eb2333a4d21fab8f35a92", + .hash = "12200f672ceb8df0c715a7018e5c53ad434db17f900c620e6238f178cc9a9d80b88e", }, .mxml = .{ .url = "git+https://github.com/michaelrsweet/mxml.git#809204a3051607f54b57e2950f3a5520d79ae383", diff --git a/bench/src/reader.zig b/bench/src/reader.zig index 8a82fbb..dedbae3 100644 --- a/bench/src/reader.zig +++ b/bench/src/reader.zig @@ -4,10 +4,8 @@ const xml = @import("xml"); pub const main = @import("common.zig").main; pub fn runBench(data: []const u8) !void { - var data_stream = std.io.fixedBufferStream(data); - var reader = xml.reader(std.heap.c_allocator, data_stream.reader(), .{ - .DecoderType = xml.encoding.Utf8Decoder, - }); + var doc = xml.StaticDocument.init(data); + var reader = doc.reader(std.heap.c_allocator, .{}); defer reader.deinit(); - while (try reader.next()) |_| {} + while (try reader.read() != .eof) {} } diff --git 
a/bench/src/scanner.zig b/bench/src/scanner.zig deleted file mode 100644 index 933f168..0000000 --- a/bench/src/scanner.zig +++ /dev/null @@ -1,16 +0,0 @@ -const std = @import("std"); -const xml = @import("xml"); - -pub const main = @import("common.zig").main; - -pub fn runBench(data: []const u8) !void { - var scanner = xml.Scanner{}; - var data_stream = std.io.fixedBufferStream(data); - var decoder = xml.encoding.Utf8Decoder{}; - var buf: [4]u8 = undefined; - while (true) { - const c = try decoder.readCodepoint(data_stream.reader(), &buf); - if (!c.present) break; - _ = try scanner.next(c.codepoint, c.byte_length); - } -} diff --git a/bench/src/token_reader.zig b/bench/src/token_reader.zig deleted file mode 100644 index 8858949..0000000 --- a/bench/src/token_reader.zig +++ /dev/null @@ -1,15 +0,0 @@ -const std = @import("std"); -const xml = @import("xml"); - -pub const main = @import("common.zig").main; - -pub fn runBench(data: []const u8) !void { - var data_stream = std.io.fixedBufferStream(data); - var token_reader = xml.tokenReader(data_stream.reader(), .{ - .DecoderType = xml.encoding.Utf8Decoder, - }); - while (true) { - const token = try token_reader.next(); - if (token == .eof) break; - } -} diff --git a/build.zig b/build.zig index bfe58f4..55fdf6b 100644 --- a/build.zig +++ b/build.zig @@ -8,173 +8,40 @@ pub fn build(b: *Build) void { const xml = b.addModule("xml", .{ .root_source_file = b.path("src/xml.zig"), - }); - - addTests(b, target, optimize, xml); - addDocs(b, target); - addExamples(b, target, optimize, xml); - addFuzz(b, target, xml); -} - -fn addTests(b: *Build, target: Build.ResolvedTarget, optimize: Mode, xml: *Build.Module) void { - const main_tests = b.addTest(.{ - .root_source_file = b.path("src/xml.zig"), .target = target, .optimize = optimize, }); - const run_main_tests = b.addRunArtifact(main_tests); - - const test_step = b.step("test", "Run library tests"); - test_step.dependOn(&run_main_tests.step); - - const xmlconf_exe = 
b.addExecutable(.{ - .name = "xmlconf", - .root_source_file = b.path("test/xmlconf.zig"), + const test_step = b.step("test", "Run the tests"); + const xml_test = b.addTest(.{ + .root_source_file = b.path("src/xml.zig"), .target = target, - .optimize = optimize, - }); - xmlconf_exe.root_module.addImport("xml", xml); - - const install_xmlconf_step = b.step("install-xmlconf", "Install xmlconf test runner"); - install_xmlconf_step.dependOn(&b.addInstallArtifact(xmlconf_exe, .{}).step); - - const run_xmlconf_exe = b.addRunArtifact(xmlconf_exe); - if (b.args) |args| { - run_xmlconf_exe.addArgs(args); - } - // Since we can't yet handle doctypes, the test files need to be specified - // individually - run_xmlconf_exe.addArgs(&.{ - "test/xmlconf/eduni/errata-2e/errata2e.xml", - "test/xmlconf/eduni/errata-3e/errata3e.xml", - "test/xmlconf/eduni/errata-4e/errata4e.xml", - "test/xmlconf/eduni/misc/ht-bh.xml", - "test/xmlconf/eduni/namespaces/1.0/rmt-ns10.xml", - "test/xmlconf/eduni/namespaces/1.1/rmt-ns11.xml", - "test/xmlconf/eduni/namespaces/errata-1e/errata1e.xml", - "test/xmlconf/eduni/xml-1.1/xml11.xml", - "test/xmlconf/ibm/ibm_oasis_invalid.xml", - "test/xmlconf/ibm/ibm_oasis_not-wf.xml", - "test/xmlconf/ibm/ibm_oasis_valid.xml", - "test/xmlconf/japanese/japanese.xml", - "test/xmlconf/oasis/oasis.xml", - // The test case files in the sun directory do not have an enclosing - // TESTCASES element, and only work when directly substituted as entity - // content, so they cannot be used at this time. 
- "test/xmlconf/xmltest/xmltest.xml", }); + const xml_test_run = b.addRunArtifact(xml_test); + test_step.dependOn(&xml_test_run.step); - const run_xmlconf_step = b.step("run-xmlconf", "Run xmlconf test cases"); - run_xmlconf_step.dependOn(&run_xmlconf_exe.step); -} - -fn addDocs(b: *Build, target: Build.ResolvedTarget) void { - const obj = b.addObject(.{ + const docs_step = b.step("docs", "Build the documentation"); + const xml_docs = b.addObject(.{ .name = "xml", .root_source_file = b.path("src/xml.zig"), .target = target, .optimize = .Debug, }); - const docs_path = obj.getEmittedDocs(); - - const install_docs = b.addInstallDirectory(.{ - .source_dir = docs_path, + const xml_docs_copy = b.addInstallDirectory(.{ + .source_dir = xml_docs.getEmittedDocs(), .install_dir = .prefix, .install_subdir = "docs", }); + docs_step.dependOn(&xml_docs_copy.step); - const docs_step = b.step("docs", "Generate documentation"); - docs_step.dependOn(&install_docs.step); -} - -fn addExamples(b: *Build, target: Build.ResolvedTarget, optimize: Mode, xml: *Build.Module) void { - const install_examples_step = b.step("install-examples", "Install examples"); - - const scan_exe = b.addExecutable(.{ - .name = "scan", - .root_source_file = b.path("examples/scan.zig"), - .target = target, - .optimize = optimize, - }); - scan_exe.root_module.addImport("xml", xml); - install_examples_step.dependOn(&b.addInstallArtifact(scan_exe, .{}).step); - - const run_scan_exe = b.addRunArtifact(scan_exe); - if (b.args) |args| { - run_scan_exe.addArgs(args); - } - - const run_scan_step = b.step("run-example-scan", "Run scan example"); - run_scan_step.dependOn(&run_scan_exe.step); - - const read_exe = b.addExecutable(.{ - .name = "read", - .root_source_file = b.path("examples/read.zig"), + const install_examples_step = b.step("install-examples", "Build and install the example programs"); + const example_reader_exe = b.addExecutable(.{ + .name = "example-reader", + .root_source_file = 
b.path("examples/reader.zig"), .target = target, .optimize = optimize, }); - read_exe.root_module.addImport("xml", xml); - install_examples_step.dependOn(&b.addInstallArtifact(read_exe, .{}).step); - - const run_read_exe = b.addRunArtifact(read_exe); - if (b.args) |args| { - run_read_exe.addArgs(args); - } - - const run_read_step = b.step("run-example-read", "Run read example"); - run_read_step.dependOn(&run_read_exe.step); -} - -fn addFuzz(b: *Build, target: Build.ResolvedTarget, xml: *Build.Module) void { - // Thanks to https://www.ryanliptak.com/blog/fuzzing-zig-code/ for the basis of this! - const fuzz_lib = b.addStaticLibrary(.{ - .name = "fuzz", - .root_source_file = b.path("fuzz/main.zig"), - .target = target, - .optimize = .Debug, - }); - fuzz_lib.want_lto = true; - fuzz_lib.bundle_compiler_rt = true; - fuzz_lib.root_module.addImport("xml", xml); - - const fuzz_compile = b.addSystemCommand(&.{ "afl-clang-lto", "-o" }); - const fuzz_exe = fuzz_compile.addOutputFileArg("fuzz"); - fuzz_compile.addArtifactArg(fuzz_lib); - const fuzz_install = b.addInstallBinFile(fuzz_exe, "fuzz"); - - const run_fuzz_compile_step = b.step("install-fuzz", "Build executable for fuzz testing using afl-clang-lto"); - run_fuzz_compile_step.dependOn(&fuzz_install.step); - - const run_fuzz = b.addSystemCommand(&.{"afl-fuzz"}); - run_fuzz.addArg("-i"); - if (b.option(bool, "resume", "Resume fuzzing rather than starting a new run") orelse false) { - run_fuzz.addArg("-"); - } else { - run_fuzz.addArg(b.pathJoin(&.{ "fuzz", "inputs" })); - } - run_fuzz.addArgs(&.{ "-o", b.pathJoin(&.{ "fuzz", "outputs" }) }); - const dictionaries = &[_][]const u8{ "xml.dict", "xml_UTF_16.dict", "xml_UTF_16BE.dict", "xml_UTF_16LE.dict" }; - for (dictionaries) |dictionary| { - run_fuzz.addArgs(&.{ "-x", b.pathJoin(&.{ "fuzz", "dictionaries", dictionary }) }); - } - run_fuzz.addFileArg(fuzz_exe); - const run_fuzz_step = b.step("fuzz", "Execute afl-fuzz with the fuzz testing executable"); - 
run_fuzz_step.dependOn(&run_fuzz.step); - - const fuzz_reproduce_exe = b.addExecutable(.{ - .name = "fuzz-reproduce", - .root_source_file = b.path("fuzz/main.zig"), - .target = target, - .optimize = .Debug, - }); - fuzz_reproduce_exe.root_module.addImport("xml", xml); - - const run_fuzz_reproduce_exe = b.addRunArtifact(fuzz_reproduce_exe); - if (b.args) |args| { - run_fuzz_reproduce_exe.addArgs(args); - } - - const run_fuzz_reproduce_step = b.step("fuzz-reproduce", "Reproduce crash found by fuzzing"); - run_fuzz_reproduce_step.dependOn(&run_fuzz_reproduce_exe.step); + example_reader_exe.root_module.addImport("xml", xml); + const example_reader_install = b.addInstallArtifact(example_reader_exe, .{}); + install_examples_step.dependOn(&example_reader_install.step); } diff --git a/build.zig.zon b/build.zig.zon index 8d81938..6eb80a1 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -1,5 +1,5 @@ .{ - .name = "zig-xml", + .name = "xml", .version = "0.1.0", .paths = .{ "src", diff --git a/examples/read.zig b/examples/read.zig deleted file mode 100644 index 3fb77f3..0000000 --- a/examples/read.zig +++ /dev/null @@ -1,46 +0,0 @@ -const std = @import("std"); -const xml = @import("xml"); - -pub fn main() !void { - var gpa = std.heap.GeneralPurposeAllocator(.{}){}; - defer _ = gpa.deinit(); - const allocator = gpa.allocator(); - - const args = try std.process.argsAlloc(allocator); - defer std.process.argsFree(allocator, args); - if (args.len != 2) { - return error.InvalidArguments; - } - const input_path = args[1]; - - const stdout_raw = std.io.getStdOut().writer(); - var stdout_buffered_writer = std.io.bufferedWriter(stdout_raw); - const stdout = stdout_buffered_writer.writer(); - - const input_file = try std.fs.cwd().openFile(input_path, .{}); - defer input_file.close(); - var input_buffered_reader = std.io.bufferedReader(input_file.reader()); - var reader = xml.reader(allocator, input_buffered_reader.reader(), .{}); - defer reader.deinit(); - - while (try reader.next()) 
|event| { - try printEvent(stdout, event); - } - try stdout_buffered_writer.flush(); -} - -fn printEvent(out: anytype, event: xml.Event) !void { - switch (event) { - .xml_declaration => |xml_declaration| try out.print(" |element_start| { - try out.print("<{?s}({?s}):{s}\n", .{ element_start.name.prefix, element_start.name.ns, element_start.name.local }); - for (element_start.attributes) |attr| { - try out.print(" @{?s}({?s}):{s}={s}\n", .{ attr.name.prefix, attr.name.ns, attr.name.local, attr.value }); - } - }, - .element_content => |element_content| try out.print(" {s}\n", .{element_content.content}), - .element_end => |element_end| try out.print("/{?s}({?s}):{s}\n", .{ element_end.name.prefix, element_end.name.ns, element_end.name.local }), - .comment => |comment| try out.print(" diff --git a/fuzz/inputs/valid-utf16be.xml b/fuzz/inputs/valid-utf16be.xml deleted file mode 100644 index 027c1f4..0000000 Binary files a/fuzz/inputs/valid-utf16be.xml and /dev/null differ diff --git a/fuzz/inputs/valid-utf16le.xml b/fuzz/inputs/valid-utf16le.xml deleted file mode 100644 index 958ccc7..0000000 Binary files a/fuzz/inputs/valid-utf16le.xml and /dev/null differ diff --git a/fuzz/inputs/valid.xml b/fuzz/inputs/valid.xml deleted file mode 100644 index a243053..0000000 --- a/fuzz/inputs/valid.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - Hello, world! - - Hello, world! 
- diff --git a/fuzz/main.zig b/fuzz/main.zig deleted file mode 100644 index 903422e..0000000 --- a/fuzz/main.zig +++ /dev/null @@ -1,30 +0,0 @@ -const std = @import("std"); -const xml = @import("xml"); - -fn cMain() callconv(.C) void { - main(); -} - -comptime { - @export(cMain, .{ .name = "main" }); -} - -pub fn main() void { - var gpa = std.heap.GeneralPurposeAllocator(.{}){}; - defer std.debug.assert(gpa.deinit() == .ok); - const allocator = gpa.allocator(); - - var stdin_buf = std.io.bufferedReader(std.io.getStdIn().reader()); - var reader = xml.reader(allocator, stdin_buf.reader(), .{}); - defer reader.deinit(); - - var stdout_buf = std.io.bufferedWriter(std.io.getStdOut().writer()); - const stdout = stdout_buf.writer(); - const stderr = std.io.getStdErr().writer(); - while (reader.next() catch |e| { - stderr.print("Error at {}: {}\n", .{ reader.token_reader.scanner.pos, e }) catch {}; - return; - }) |event| { - stdout.print("{} {}\n", .{ reader.token_reader.scanner.pos, event }) catch {}; - } -} diff --git a/fuzz/src/fuzz.zig b/fuzz/src/fuzz.zig new file mode 100644 index 0000000..86f553b --- /dev/null +++ b/fuzz/src/fuzz.zig @@ -0,0 +1,26 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; +const assert = std.debug.assert; +const xml = @import("xml"); + +export fn zig_fuzz_init() void {} + +export fn zig_fuzz_test(buf: [*]u8, len: isize) void { + var gpa_state: std.heap.GeneralPurposeAllocator(.{}) = .{}; + defer assert(gpa_state.deinit() == .ok); + const gpa = gpa_state.allocator(); + fuzz(gpa, buf[0..@intCast(len)]) catch @panic("OOM"); +} + +fn fuzz(gpa: Allocator, input: []const u8) !void { + var doc = xml.StaticDocument.init(input); + var reader = doc.reader(gpa, .{}); + defer reader.deinit(); + while (true) { + const node = reader.read() catch |err| switch (err) { + error.MalformedXml => break, + error.OutOfMemory => return error.OutOfMemory, + }; + if (node == .eof) break; + } +} diff --git a/src/Reader.zig b/src/Reader.zig new file 
mode 100644 index 0000000..bce392f --- /dev/null +++ b/src/Reader.zig @@ -0,0 +1,2216 @@ +//! A streaming XML parser, aiming to conform to the [XML 1.0 (Fifth +//! Edition)](https://www.w3.org/TR/2008/REC-xml-20081126) and [Namespaces in +//! XML 1.0 (Third Edition)](https://www.w3.org/TR/2009/REC-xml-names-20091208/) +//! specifications. +//! +//! This is the core, type-erased reader implementation. Generally, users will +//! not use this directly, but will use `xml.GenericReader`, which is a thin +//! wrapper around this type providing type safety for returned errors. +//! +//! A reader gets its raw data from a `Source`, which acts as a forward-only +//! window of an XML document. In a simple case (`xml.StaticDocument`), this +//! may just be slices of a document loaded completely in memory, but the same +//! interface works just as well for a document streamed from a byte reader +//! (`xml.StreamingDocument`). +//! +//! Calling `read` returns the next `Node` in the document, and other reader +//! functions specific to each node type can be used to obtain more information +//! about the current node. The convention is that functions associated with a +//! specific node type have names starting with the node type (and `attribute` +//! functions can only be called on an `element_start` node). +//! +//! Some reader functions end in `Ns`, providing namespace-aware functionality. +//! These functions must only be called on a reader configured to be +//! namespace-aware (namespace awareness is on by default in `Options`). 
+ +const std = @import("std"); +const Allocator = std.mem.Allocator; +const assert = std.debug.assert; +const expectError = std.testing.expectError; +const expectEqual = std.testing.expectEqual; +const expectEqualDeep = std.testing.expectEqualDeep; +const expectEqualStrings = std.testing.expectEqualStrings; + +const Location = @import("xml.zig").Location; +const StaticDocument = @import("xml.zig").StaticDocument; +const QName = @import("xml.zig").QName; +const PrefixedQName = @import("xml.zig").PrefixedQName; +const predefined_entities = @import("xml.zig").predefined_entities; +const predefined_namespace_uris = @import("xml.zig").predefined_namespace_uris; +const ns_xml = @import("xml.zig").ns_xml; +const ns_xmlns = @import("xml.zig").ns_xmlns; + +options: Options, + +state: State, +/// An array of buffer spans relevant to the current node. +/// The layout of the spans depends on the node type: +/// - `eof` - none +/// - `xml_declaration` - "xml" (NAME VALUE)... +/// - `element_start` - NAME (NAME VALUE)... +/// - `element_end` - NAME +/// - `comment` - COMMENT +/// - `pi` - TARGET DATA +/// - `text` - none +/// - `cdata` - CDATA +/// - `character_reference` - REF +/// - `entity_reference` - REF +spans: std.ArrayListUnmanaged(BufSpan), +/// A map of attribute names to indexes. +/// The keys are slices into `buf`. +attributes: std.StringArrayHashMapUnmanaged(usize), +/// A map of attribute qnames to indexes. +/// The key `ns` and `local` values are slices into `buf`. +q_attributes: std.ArrayHashMapUnmanaged(QName, usize, QNameContext, true), +/// String data for the current element nesting context. +/// Each element start node appends the name of the element to this buffer, and +/// the element name is followed by any namespace prefixes and URIs declared on +/// the element so they can be referenced by `ns_prefixes`. +strings: std.ArrayListUnmanaged(u8), +/// The start indexes of the element names in `strings`. 
+element_names: std.ArrayListUnmanaged(StringIndex), +/// The namespace prefixes declared by the current nesting context of elements. +ns_prefixes: std.ArrayListUnmanaged(std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex)), +/// The Unicode code point associated with the current character reference. +character: u21, + +source: Source, +/// The source location of the beginning of `buf`. +loc: Location, +/// Buffered data read from `source`. +buf: []const u8, +/// The current position of the reader in `buf`. +pos: usize, + +/// The last node returned by `read` (that is, the current node). +node: ?Node, +/// The current error code (only valid if `read` returned `error.MalformedXml`). +error_code: ErrorCode, +/// The position of the current error in `buf`. +error_pos: usize, + +scratch: std.ArrayListUnmanaged(u8), + +gpa: Allocator, + +const Reader = @This(); + +pub const Options = struct { + /// Whether the reader should handle namespaces in element and attribute + /// names. The `Ns`-suffixed functions of `Reader` may only be used when + /// this is enabled. + namespace_aware: bool = true, + /// Whether the reader should track the source location (line and column) + /// of nodes in the document. The `location` functions of `Reader` may only + /// be used when this is enabled. + location_aware: bool = true, + /// Whether the reader may assume that its input data is valid UTF-8. 
+ assume_valid_utf8: bool = false, +}; + +/// The kinds of node that `read` can return. +pub const Node = enum { + eof, + xml_declaration, + element_start, + element_end, + comment, + pi, + text, + cdata, + character_reference, + entity_reference, +}; + +/// Identifies why a document was rejected. +/// Retrieved with `errorCode` after `read` fails with `error.MalformedXml`. +pub const ErrorCode = enum { + xml_declaration_attribute_unsupported, + xml_declaration_version_missing, + xml_declaration_version_unsupported, + xml_declaration_encoding_unsupported, + xml_declaration_standalone_malformed, + doctype_unsupported, + directive_unknown, + attribute_missing_space, + attribute_duplicate, + attribute_prefix_undeclared, + attribute_illegal_character, + element_end_mismatched, + element_end_unclosed, + comment_malformed, + comment_unclosed, + pi_unclosed, + pi_target_disallowed, + pi_missing_space, + text_cdata_end_disallowed, + cdata_unclosed, + entity_reference_unclosed, + entity_reference_undefined, + character_reference_unclosed, + character_reference_malformed, + name_malformed, + namespace_prefix_unbound, + namespace_binding_illegal, + namespace_prefix_illegal, + unexpected_character, + unexpected_eof, + expected_equals, + expected_quote, + missing_end_quote, + invalid_utf8, + illegal_character, +}; + +/// Type-erased input for the reader: `move` forwards its arguments to `moveFn` together with `context`. +/// NOTE(review): the exact meaning of `advance` and `len` is defined by the `moveFn` implementation, which is not visible here. +pub const Source = struct { + context: *const anyopaque, + moveFn: *const fn (context: *const anyopaque, advance: usize, len: usize) anyerror![]const u8, + + pub fn move(source: Source, advance: usize, len: usize) anyerror![]const u8 { + return source.moveFn(source.context, advance, len); + } +}; + +/// Position of the reader within the document; `read` dispatches on this value. +const State = enum { + invalid, + start, + after_xml_declaration, + after_doctype, + in_root, + empty_element, + empty_root, + after_root, + eof, +}; + +/// Creates a reader over `source` which allocates with `gpa`. +/// Only fields are initialized here; `source` is not consulted until later calls. +/// `loc` is left undefined unless `options.location_aware` is set. +pub fn init(gpa: Allocator, source: Source, options: Options) Reader { + return .{ + .options = options, + + .state = .start, + .spans = .{}, + .attributes = .{}, + .q_attributes = .{}, + .strings = .{}, + .element_names = .{}, + .ns_prefixes = .{}, + .character = undefined, + + .source = source, + .loc = if (options.location_aware) Location.start else undefined, + .buf = &.{}, + .pos = 
0, + + .node = null, + .error_code = undefined, + .error_pos = undefined, + + .scratch = .{}, + + .gpa = gpa, + }; +} + +/// Frees all memory owned by `reader` using `reader.gpa` and sets `reader.*` to undefined. +pub fn deinit(reader: *Reader) void { + reader.spans.deinit(reader.gpa); + reader.attributes.deinit(reader.gpa); + reader.q_attributes.deinit(reader.gpa); + reader.strings.deinit(reader.gpa); + reader.element_names.deinit(reader.gpa); + for (reader.ns_prefixes.items) |*map| map.deinit(reader.gpa); + reader.ns_prefixes.deinit(reader.gpa); + reader.scratch.deinit(reader.gpa); + reader.* = undefined; +} + +/// Returns the location of the node. +/// Asserts that the reader is location-aware and there is a current node (`read` was called and did not return an error). +pub fn location(reader: Reader) Location { + assert(reader.options.location_aware and reader.node != null); + return reader.loc; +} + +test location { + var doc = StaticDocument.init( + \\ + \\ Hello, world! + \\ + ); + var reader = doc.reader(std.testing.allocator, .{}); + defer reader.deinit(); + + try expectEqual(.element_start, try reader.read()); + try expectEqualDeep(Location{ .line = 1, .column = 1 }, reader.location()); + + try expectEqual(.text, try reader.read()); + try expectEqualDeep(Location{ .line = 1, .column = 7 }, reader.location()); + + try expectEqual(.element_start, try reader.read()); + try expectEqualDeep(Location{ .line = 2, .column = 3 }, reader.location()); + + try expectEqual(.text, try reader.read()); + try expectEqualDeep(Location{ .line = 2, .column = 8 }, reader.location()); + + try expectEqual(.element_end, try reader.read()); + try expectEqualDeep(Location{ .line = 2, .column = 21 }, reader.location()); + + try expectEqual(.text, try reader.read()); + try expectEqualDeep(Location{ .line = 2, .column = 27 }, reader.location()); + + try expectEqual(.element_end, try reader.read()); + try expectEqualDeep(Location{ .line = 3, .column = 1 }, reader.location()); +} + +/// Returns the error code associated with the error. 
/// Asserts that `error.MalformedXml` was returned by the last call to `read`.
pub fn errorCode(reader: Reader) ErrorCode {
    assert(reader.state == .invalid);
    return reader.error_code;
}

test errorCode {
    var doc = StaticDocument.init(
        \\
        \\ <123>Hello, world!
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());
    try expectError(error.MalformedXml, reader.read());
    try expectEqual(.name_malformed, reader.errorCode());
}

/// Returns the location where the error occurred.
/// The location is computed by advancing the reader's current location over
/// the buffered bytes that precede the error position.
/// Asserts that the reader is location-aware and `error.MalformedXml` was returned by the last call to `read`.
pub fn errorLocation(reader: Reader) Location {
    // `init` leaves `reader.loc` undefined when location tracking is disabled,
    // so enforce the documented precondition rather than reading an undefined value.
    assert(reader.options.location_aware);
    assert(reader.state == .invalid);
    var loc = reader.loc;
    loc.update(reader.buf[0..reader.error_pos]);
    return loc;
}

test errorLocation {
    var doc = StaticDocument.init(
        \\
        \\ <123>Hello, world!
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());
    try expectError(error.MalformedXml, reader.read());
    try expectEqualDeep(Location{ .line = 2, .column = 4 }, reader.errorLocation());
}

/// Returns the version declared in the XML declaration.
/// The version is read from the first (index 0) attribute of the declaration.
/// Asserts that the current node is `Node.xml_declaration`.
pub fn xmlDeclarationVersion(reader: Reader) []const u8 {
    assert(reader.node == .xml_declaration);
    return reader.attributeValueUnchecked(0);
}

test xmlDeclarationVersion {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.xml_declaration, try reader.read());
    try expectEqualStrings("1.0", reader.xmlDeclarationVersion());
}

/// Returns the encoding declared in the XML declaration.
/// Returns `null` when the declaration has no `encoding` attribute.
/// Asserts that the current node is `Node.xml_declaration`.
pub fn xmlDeclarationEncoding(reader: Reader) ?[]const u8 {
    assert(reader.node == .xml_declaration);
    return if (reader.attributes.get("encoding")) |index|
        reader.attributeValueUnchecked(index)
    else
        null;
}

test xmlDeclarationEncoding {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.xml_declaration, try reader.read());
    try expectEqualStrings("UTF-8", reader.xmlDeclarationEncoding().?);
}

/// Reports the `standalone` setting of the XML declaration.
/// Returns `null` when the declaration has no `standalone` attribute;
/// otherwise returns whether its value is exactly `yes`.
/// Asserts that the current node is `Node.xml_declaration`.
pub fn xmlDeclarationStandalone(reader: Reader) ?bool {
    assert(reader.node == .xml_declaration);
    const index = reader.attributes.get("standalone") orelse return null;
    const value = reader.attributeValueUnchecked(index);
    return std.mem.eql(u8, value, "yes");
}

test xmlDeclarationStandalone {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.xml_declaration, try reader.read());
    try expectEqual(true, reader.xmlDeclarationStandalone());
}

/// Returns the element's name exactly as written in the source.
/// Asserts that the current node is `Node.element_start` or `Node.element_end`.
pub fn elementName(reader: Reader) []const u8 {
    const node = reader.node;
    assert(node == .element_start or node == .element_end);
    return reader.elementNameUnchecked();
}

test elementName {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("root", reader.elementName());
    try expectEqual(.element_end, try reader.read());
    try expectEqualStrings("root", reader.elementName());
}

/// Returns the name of the element as a `PrefixedQName`.
/// The prefix is resolved against the namespace bindings currently in scope.
/// Asserts that the current node is `Node.element_start` or `Node.element_end` and that `reader` is namespace-aware.
pub fn elementNameNs(reader: Reader) PrefixedQName {
    assert(reader.options.namespace_aware);
    const qualified_name = reader.elementName();
    return reader.parseQName(qualified_name);
}

test elementNameNs {
    var doc = StaticDocument.init(
        \\
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("", reader.elementNameNs().prefix);
    try expectEqualStrings("https://example.com/ns", reader.elementNameNs().ns);
    try expectEqualStrings("root", reader.elementNameNs().local);

    try expectEqual(.text, try reader.read());

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("a", reader.elementNameNs().prefix);
    try expectEqualStrings("https://example.com/ns2", reader.elementNameNs().ns);
    try expectEqualStrings("a", reader.elementNameNs().local);

    try expectEqual(.element_end, try reader.read());
    try expectEqualStrings("a", reader.elementNameNs().prefix);
    try expectEqualStrings("https://example.com/ns2", reader.elementNameNs().ns);
    try expectEqualStrings("a", reader.elementNameNs().local);

    try expectEqual(.text, try reader.read());

    try expectEqual(.element_end, try reader.read());
    try expectEqualStrings("", reader.elementNameNs().prefix);
    try expectEqualStrings("https://example.com/ns", reader.elementNameNs().ns);
    try expectEqualStrings("root", reader.elementNameNs().local);
}

/// Element name as buffered text; the name occupies the first recorded span.
/// Performs no check of the current node kind.
fn elementNameUnchecked(reader: Reader) []const u8 {
    const name_span = reader.spans.items[0];
    return reader.bufSlice(name_span);
}

/// Buffer offset at which the element name begins.
fn elementNamePos(reader: Reader) usize {
    const name_span = reader.spans.items[0];
    return name_span.start;
}

/// Returns the number of attributes of the element.
/// Asserts that the current node is `Node.element_start`.
pub fn attributeCount(reader: Reader) usize {
    assert(reader.node == .element_start);
    const count = reader.attributeCountUnchecked();
    return count;
}

test attributeCount {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(3, reader.attributeCount());
}

/// Attribute count derived from the span list: one span for the element
/// name, then a (name, value) pair of spans per attribute.
fn attributeCountUnchecked(reader: Reader) usize {
    const attribute_spans = reader.spans.items.len - 1;
    return @divExact(attribute_spans, 2);
}

/// Returns the name of the `n`th attribute of the element, as written in the source.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeName(reader: Reader, n: usize) []const u8 {
    assert(reader.node == .element_start);
    assert(n < reader.attributeCount());
    return reader.attributeNameUnchecked(n);
}

test attributeName {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("a", reader.attributeName(0));
    try expectEqualStrings("b", reader.attributeName(1));
    try expectEqualStrings("c", reader.attributeName(2));
}

/// Returns the name of the `n`th attribute of the element as a `PrefixedQName`.
/// If the reader is not namespace-aware, only the `local` part will be non-empty.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeNameNs(reader: Reader, n: usize) PrefixedQName {
    const name = reader.attributeName(n);
    // Without namespace processing the whole name is treated as the local part.
    if (!reader.options.namespace_aware) {
        return .{ .prefix = "", .ns = "", .local = name };
    }
    return reader.parseQName(name);
}

test attributeNameNs {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());

    try expectEqualStrings("xmlns", reader.attributeNameNs(0).prefix);
    try expectEqualStrings("http://www.w3.org/2000/xmlns/", reader.attributeNameNs(0).ns);
    try expectEqualStrings("pre", reader.attributeNameNs(0).local);

    try expectEqualStrings("", reader.attributeNameNs(1).prefix);
    try expectEqualStrings("", reader.attributeNameNs(1).ns);
    try expectEqualStrings("a", reader.attributeNameNs(1).local);

    try expectEqualStrings("pre", reader.attributeNameNs(2).prefix);
    try expectEqualStrings("https://example.com/ns", reader.attributeNameNs(2).ns);
    try expectEqualStrings("b", reader.attributeNameNs(2).local);
}

/// Name of attribute `n` as buffered text; performs no bounds or node-kind checks.
/// Attribute `n` keeps its name in span `2n + 1`.
fn attributeNameUnchecked(reader: Reader, n: usize) []const u8 {
    const name_span = reader.spans.items[n * 2 + 1];
    return reader.bufSlice(name_span);
}

/// Buffer offset at which the name of attribute `n` begins.
fn attributeNamePos(reader: Reader, n: usize) usize {
    const name_span = reader.spans.items[n * 2 + 1];
    return name_span.start;
}

/// Returns the value of the `n`th attribute of the element.
/// This function may incur allocations if the attribute value contains entity or character
/// references, or CR, LF, or TAB characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValue(reader: *Reader, n: usize) Allocator.Error![]const u8 {
    const raw = reader.attributeValueRaw(n);
    // Fast path: nothing to expand or normalize, so the source text is the value.
    const needs_normalization = std.mem.indexOfAny(u8, raw, "&\t\r\n") != null;
    if (!needs_normalization) return raw;

    // Slow path: build the normalized value in the reader-owned scratch buffer.
    reader.scratch.clearRetainingCapacity();
    const scratch_writer = reader.scratch.writer(reader.gpa);
    reader.attributeValueWrite(n, scratch_writer.any()) catch |err| {
        if (err == error.OutOfMemory) return error.OutOfMemory;
        // The scratch writer can only fail by running out of memory.
        unreachable;
    };
    return reader.scratch.items;
}

test attributeValue {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("1", try reader.attributeValue(0));
    try expectEqualStrings("2", try reader.attributeValue(1));
    try expectEqualStrings("1 & 2", try reader.attributeValue(2));
}

/// Returns the value of the `n`th attribute of the element.
/// The value is allocated with `gpa` and owned by the caller.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValueAlloc(reader: Reader, gpa: Allocator, n: usize) Allocator.Error![]u8 {
    var buf = std.ArrayList(u8).init(gpa);
    // Releases the partial value on error; after `toOwnedSlice` the list is empty.
    defer buf.deinit();
    const buf_writer = buf.writer();
    reader.attributeValueWrite(n, buf_writer.any()) catch |err| switch (err) {
        error.OutOfMemory => return error.OutOfMemory,
        // The list writer can only fail by running out of memory.
        else => unreachable,
    };
    return buf.toOwnedSlice();
}

test attributeValueAlloc {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());

    const attr0 = try reader.attributeValueAlloc(std.testing.allocator, 0);
    defer std.testing.allocator.free(attr0);
    try expectEqualStrings("1", attr0);
    const attr1 = try reader.attributeValueAlloc(std.testing.allocator, 1);
    defer std.testing.allocator.free(attr1);
    try expectEqualStrings("2", attr1);
    const attr2 = try reader.attributeValueAlloc(std.testing.allocator, 2);
    defer std.testing.allocator.free(attr2);
    try expectEqualStrings("1 & 2", attr2);
}

/// Writes the value of the `n`th attribute of the element to `writer`.
/// Character references are replaced by the UTF-8 encoding of the referenced
/// character, entity references by their predefined replacement text, and each
/// TAB, LF, CR, or CR LF sequence by a single space.
/// Any error returned comes from `writer`.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValueWrite(reader: Reader, n: usize, writer: std.io.AnyWriter) anyerror!void {
    const raw = reader.attributeValueRaw(n);
    var pos: usize = 0;
    while (std.mem.indexOfAnyPos(u8, raw, pos, "&\t\r\n")) |split_pos| {
        // Copy the unremarkable run preceding the next special character.
        try writer.writeAll(raw[pos..split_pos]);
        pos = split_pos;
        switch (raw[pos]) {
            '&' => {
                // The `unreachable`s in this branch assume the references are
                // well-formed and refer to defined entities.
                const entity_end = std.mem.indexOfScalarPos(u8, raw, pos, ';') orelse unreachable;
                if (raw[pos + "&".len] == '#') {
                    const c = if (raw[pos + "&#".len] == 'x')
                        std.fmt.parseInt(u21, raw[pos + "&#x".len .. entity_end], 16) catch unreachable
                    else
                        std.fmt.parseInt(u21, raw[pos + "&#".len .. entity_end], 10) catch unreachable;
                    var buf: [4]u8 = undefined;
                    const len = std.unicode.utf8Encode(c, &buf) catch unreachable;
                    try writer.writeAll(buf[0..len]);
                } else {
                    try writer.writeAll(predefined_entities.get(raw[pos + "&".len .. entity_end]) orelse unreachable);
                }
                pos = entity_end + 1;
            },
            '\t', '\n' => {
                try writer.writeByte(' ');
                pos += 1;
            },
            '\r' => {
                // CR LF counts as a single line break, hence a single space.
                try writer.writeByte(' ');
                if (pos + 1 < raw.len and raw[pos + 1] == '\n') {
                    pos += 2;
                } else {
                    pos += 1;
                }
            },
            else => unreachable,
        }
    }
    try writer.writeAll(raw[pos..]);
}

test attributeValueWrite {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    // `attributeValueWrite` takes a `std.io.AnyWriter`; keep the concrete writer
    // in a named constant so the type-erased view does not point at a temporary.
    const buf_writer = buf.writer();

    try reader.attributeValueWrite(0, buf_writer.any());
    try expectEqualStrings("1", buf.items);

    buf.clearRetainingCapacity();
    try reader.attributeValueWrite(1, buf_writer.any());
    try expectEqualStrings("2", buf.items);

    buf.clearRetainingCapacity();
    try reader.attributeValueWrite(2, buf_writer.any());
    try expectEqualStrings("1 & 2", buf.items);
}

/// Returns the raw value of the `n`th attribute of the element, as it appears in the source.
/// Asserts that the current node is `Node.element_start` and `n` is less than `reader.attributeCount()`.
pub fn attributeValueRaw(reader: Reader, n: usize) []const u8 {
    assert(reader.node == .element_start);
    assert(n < reader.attributeCount());
    return reader.attributeValueUnchecked(n);
}

test attributeValueRaw {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("1", reader.attributeValueRaw(0));
    try expectEqualStrings("2", reader.attributeValueRaw(1));
    try expectEqualStrings("1 & 2", reader.attributeValueRaw(2));
}

/// Raw value of attribute `n` as buffered text; performs no bounds or node-kind checks.
/// Attribute `n` keeps its value in span `2n + 2`.
fn attributeValueUnchecked(reader: Reader, n: usize) []const u8 {
    const value_span = reader.spans.items[n * 2 + 2];
    return reader.bufSlice(value_span);
}

/// Buffer offset at which the value of attribute `n` begins.
fn attributeValuePos(reader: Reader, n: usize) usize {
    const value_span = reader.spans.items[n * 2 + 2];
    return value_span.start;
}

/// Buffer offset at which the value of attribute `n` ends.
fn attributeValueEndPos(reader: Reader, n: usize) usize {
    const value_span = reader.spans.items[n * 2 + 2];
    return value_span.end;
}

/// Returns the location of the `n`th attribute of the element, measured at the start of its name.
/// Asserts that the reader is location-aware, the current node is `Node.element_start`, and `n` is less than `reader.attributeCount()`.
pub fn attributeLocation(reader: Reader, n: usize) Location {
    assert(reader.options.location_aware);
    assert(reader.node == .element_start);
    assert(n < reader.attributeCount());
    // Advance a copy of the current location over everything buffered before the name.
    const name_pos = reader.attributeNamePos(n);
    var loc = reader.loc;
    loc.update(reader.buf[0..name_pos]);
    return loc;
}

/// Returns the index of the attribute named `name`.
/// Asserts that the current node is `Node.element_start`.
pub fn attributeIndex(reader: Reader, name: []const u8) ?usize {
    assert(reader.node == .element_start);
    const index = reader.attributes.get(name);
    return index;
}

test attributeIndex {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(0, reader.attributeIndex("one"));
    try expectEqual(1, reader.attributeIndex("two"));
    try expectEqual(2, reader.attributeIndex("three"));
    try expectEqual(null, reader.attributeIndex("four"));
}

/// Looks up an attribute by namespace URI `ns` and local name `local`.
/// Returns its index, or `null` if the element has no such attribute.
/// Asserts that the current node is `Node.element_start` and `reader` is namespace-aware.
pub fn attributeIndexNs(reader: Reader, ns: []const u8, local: []const u8) ?usize {
    assert(reader.node == .element_start);
    assert(reader.options.namespace_aware);
    return reader.q_attributes.get(.{ .ns = ns, .local = local });
}

test attributeIndexNs {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(0, reader.attributeIndexNs("", "xmlns"));
    try expectEqual(1, reader.attributeIndexNs("http://www.w3.org/2000/xmlns/", "foo"));
    try expectEqual(2, reader.attributeIndexNs("", "one"));
    try expectEqual(3, reader.attributeIndexNs("http://example.com/foo", "two"));
    try expectEqual(null, reader.attributeIndexNs("http://example.com", "one"));
    try expectEqual(null, reader.attributeIndexNs("", "three"));
}

/// Returns the text of the comment.
/// This function may incur allocations if the comment text contains CR
/// characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.comment`.
pub fn comment(reader: *Reader) Allocator.Error![]const u8 {
    return reader.newlineNormalizedScratch(reader.commentRaw());
}

test comment {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.comment, try reader.read());
    try expectEqualStrings(" Hello, world! ", try reader.comment());
}

/// Writes the text of the comment to `writer`, with CR and CR LF sequences
/// normalized to LF.
/// Any error returned comes from `writer`.
/// Asserts that the current node is `Node.comment`.
pub fn commentWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
    try writeNewlineNormalized(reader.commentRaw(), writer);
}

test commentWrite {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.comment, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    // `commentWrite` takes a `std.io.AnyWriter`; keep the concrete writer in a
    // named constant so the type-erased view does not point at a temporary.
    const buf_writer = buf.writer();
    try reader.commentWrite(buf_writer.any());
    try expectEqualStrings(" Hello, world! ", buf.items);
}

/// Returns the raw text of the comment, as it appears in the source.
/// Asserts that the current node is `Node.comment`.
pub fn commentRaw(reader: Reader) []const u8 {
    assert(reader.node == .comment);
    return reader.commentUnchecked();
}

test commentRaw {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.comment, try reader.read());
    try expectEqualStrings(" Hello, world! ", reader.commentRaw());
}

/// Comment text as buffered; the text occupies the first recorded span.
/// Performs no check of the current node kind.
fn commentUnchecked(reader: Reader) []const u8 {
    return reader.bufSlice(reader.spans.items[0]);
}

/// Buffer offset at which the comment text begins.
fn commentPos(reader: Reader) usize {
    return reader.spans.items[0].start;
}

/// Returns the target of the PI.
/// Asserts that the current node is `Node.pi`.
pub fn piTarget(reader: Reader) []const u8 {
    assert(reader.node == .pi);
    const target = reader.piTargetUnchecked();
    return target;
}

test piTarget {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.pi, try reader.read());
    try expectEqualStrings("pi-target", reader.piTarget());
}

/// PI target as buffered text; the target occupies the first recorded span.
/// Performs no check of the current node kind.
fn piTargetUnchecked(reader: Reader) []const u8 {
    const target_span = reader.spans.items[0];
    return reader.bufSlice(target_span);
}

/// Buffer offset at which the PI target begins.
fn piTargetPos(reader: Reader) usize {
    const target_span = reader.spans.items[0];
    return target_span.start;
}

/// Buffer offset at which the PI target ends.
fn piTargetEndPos(reader: Reader) usize {
    const target_span = reader.spans.items[0];
    return target_span.end;
}

/// Returns the data of the PI with line endings normalized.
/// Allocation only happens when the data contains CR characters, which the
/// spec requires to be normalized.
/// The result is owned by `reader` and stays valid only until the next call
/// to another function on `reader`.
/// Asserts that the current node is `Node.pi`.
pub fn piData(reader: *Reader) Allocator.Error![]const u8 {
    const raw = reader.piDataRaw();
    return reader.newlineNormalizedScratch(raw);
}

test piData {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.pi, try reader.read());
    try expectEqualStrings("pi-data", try reader.piData());
}

/// Writes the data of the PI to `writer`.
/// Asserts that the current node is `Node.pi`.
pub fn piDataWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
    try writeNewlineNormalized(reader.piDataRaw(), writer);
}

test piDataWrite {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.pi, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    // `piDataWrite` takes a `std.io.AnyWriter`; keep the concrete writer in a
    // named constant so the type-erased view does not point at a temporary.
    const buf_writer = buf.writer();
    try reader.piDataWrite(buf_writer.any());
    try expectEqualStrings("pi-data", buf.items);
}

/// Returns the raw data of the PI, as it appears in the source.
/// Asserts that the current node is `Node.pi`.
pub fn piDataRaw(reader: Reader) []const u8 {
    assert(reader.node == .pi);
    return reader.piDataUnchecked();
}

test piDataRaw {
    var doc = StaticDocument.init(
        \\
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.pi, try reader.read());
    try expectEqualStrings("pi-data", reader.piDataRaw());
}

/// PI data as buffered text; the data occupies the second recorded span.
/// Performs no check of the current node kind.
fn piDataUnchecked(reader: Reader) []const u8 {
    return reader.bufSlice(reader.spans.items[1]);
}

/// Buffer offset at which the PI data begins.
fn piDataPos(reader: Reader) usize {
    return reader.spans.items[1].start;
}

/// Buffer offset at which the PI data ends.
fn piDataEndPos(reader: Reader) usize {
    return reader.spans.items[1].end;
}

/// Returns the text.
/// This function may incur allocations if the text contains CR
/// characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.text`.
pub fn text(reader: *Reader) Allocator.Error![]const u8 {
    return reader.newlineNormalizedScratch(reader.textRaw());
}

test text {
    var doc = StaticDocument.init(
        \\Hello, world!
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());
    try expectEqualStrings("Hello, world!", try reader.text());
}

/// Writes the text to `writer`, with CR and CR LF sequences normalized to LF.
/// Any error returned comes from `writer`.
/// Asserts that the current node is `Node.text`.
pub fn textWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
    try writeNewlineNormalized(reader.textRaw(), writer);
}

test textWrite {
    var doc = StaticDocument.init(
        \\Hello, world!
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    // Same `std.io.AnyWriter` requirement as in `test piDataWrite`.
    const buf_writer = buf.writer();
    try reader.textWrite(buf_writer.any());
    try expectEqualStrings("Hello, world!", buf.items);
}

/// Returns the raw text, as it appears in the source.
/// Asserts that the current node is `Node.text`.
pub fn textRaw(reader: Reader) []const u8 {
    assert(reader.node == .text);
    return reader.textUnchecked();
}

test textRaw {
    var doc = StaticDocument.init(
        \\Hello, world!
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.text, try reader.read());
    try expectEqualStrings("Hello, world!", reader.textRaw());
}

/// Text as buffered: everything from the start of the buffer up to the
/// current position. Performs no check of the current node kind.
fn textUnchecked(reader: Reader) []const u8 {
    return reader.buf[0..reader.pos];
}

/// Buffer offset at which the text begins; always 0, since the text slice
/// starts at the beginning of the buffer.
fn textPos(reader: Reader) usize {
    _ = reader;
    return 0;
}

/// Returns the text of the CDATA section.
/// This function may incur allocations if the text contains CR
/// characters which must be normalized according to the spec.
/// The returned value is owned by `reader` and is only valid until the next call to another
/// function on `reader`.
/// Asserts that the current node is `Node.cdata`.
pub fn cdata(reader: *Reader) Allocator.Error![]const u8 {
    return reader.newlineNormalizedScratch(reader.cdataRaw());
}

test cdata {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.cdata, try reader.read());
    try expectEqualStrings("Hello, world!", try reader.cdata());
}

/// Writes the text of the CDATA section to `writer`, with CR and CR LF
/// sequences normalized to LF.
/// Any error returned comes from `writer`.
/// Asserts that the current node is `Node.cdata`.
pub fn cdataWrite(reader: Reader, writer: std.io.AnyWriter) anyerror!void {
    try writeNewlineNormalized(reader.cdataRaw(), writer);
}

test cdataWrite {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.cdata, try reader.read());

    var buf = std.ArrayList(u8).init(std.testing.allocator);
    defer buf.deinit();
    // `cdataWrite` takes a `std.io.AnyWriter`; keep the concrete writer in a
    // named constant so the type-erased view does not point at a temporary.
    const buf_writer = buf.writer();
    try reader.cdataWrite(buf_writer.any());
    try expectEqualStrings("Hello, world!", buf.items);
}

/// Returns the raw text of the CDATA section, as it appears in the source.
/// Asserts that the current node is `Node.cdata`.
pub fn cdataRaw(reader: Reader) []const u8 {
    assert(reader.node == .cdata);
    return reader.cdataUnchecked();
}

test cdataRaw {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.cdata, try reader.read());
    try expectEqualStrings("Hello, world!", reader.cdataRaw());
}

/// CDATA text as buffered; the text occupies the first recorded span.
/// Performs no check of the current node kind.
fn cdataUnchecked(reader: Reader) []const u8 {
    return reader.bufSlice(reader.spans.items[0]);
}

/// Buffer offset at which the CDATA text begins.
fn cdataPos(reader: Reader) usize {
    return reader.spans.items[0].start;
}

/// Returns the name of the referenced entity.
/// Asserts that the current node is `Node.entity_reference`.
pub fn entityReferenceName(reader: Reader) []const u8 {
    assert(reader.node == .entity_reference);
    const name = reader.entityReferenceNameUnchecked();
    return name;
}

test entityReferenceName {
    var doc = StaticDocument.init(
        \\&
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.entity_reference, try reader.read());
    try expectEqualStrings("amp", reader.entityReferenceName());
}

/// Entity name as buffered text; the name occupies the first recorded span.
/// Performs no check of the current node kind.
fn entityReferenceNameUnchecked(reader: Reader) []const u8 {
    const name_span = reader.spans.items[0];
    return reader.bufSlice(name_span);
}

/// Buffer offset at which the entity name begins.
fn entityReferenceNamePos(reader: Reader) usize {
    const name_span = reader.spans.items[0];
    return name_span.start;
}

/// Returns the Unicode codepoint denoted by the character reference.
/// Asserts that the current node is `Node.character_reference`.
pub fn characterReferenceChar(reader: Reader) u21 {
    assert(reader.node == .character_reference);
    const codepoint = reader.character;
    return codepoint;
}

test characterReferenceChar {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.character_reference, try reader.read());
    try expectEqual(0x20, reader.characterReferenceChar());
}

/// Returns the "name" of the referenced character, as it appears in the source.
/// Asserts that the current node is `Node.character_reference`.
pub fn characterReferenceName(reader: Reader) []const u8 {
    assert(reader.node == .character_reference);
    const name = reader.characterReferenceNameUnchecked();
    return name;
}

test characterReferenceName {
    var doc = StaticDocument.init(
        \\
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();
    try expectEqual(.element_start, try reader.read());
    try expectEqual(.character_reference, try reader.read());
    try expectEqualStrings("x20", reader.characterReferenceName());
}

/// Character reference "name" as buffered text; it occupies the first recorded span.
/// Performs no check of the current node kind.
fn characterReferenceNameUnchecked(reader: Reader) []const u8 {
    const name_span = reader.spans.items[0];
    return reader.bufSlice(name_span);
}

/// Buffer offset at which the character reference "name" begins.
fn characterReferenceNamePos(reader: Reader) usize {
    const name_span = reader.spans.items[0];
    return name_span.start;
}

/// Returns `raw` with line endings normalized.
/// When `raw` contains no CR it is returned as-is; otherwise the normalized
/// copy is built in the reader-owned scratch buffer, replacing its previous contents.
fn newlineNormalizedScratch(reader: *Reader, raw: []const u8) Allocator.Error![]const u8 {
    const has_cr = std.mem.indexOfScalar(u8, raw, '\r') != null;
    if (!has_cr) return raw;

    reader.scratch.clearRetainingCapacity();
    const scratch_writer = reader.scratch.writer(reader.gpa);
    writeNewlineNormalized(raw, scratch_writer.any()) catch |err| {
        if (err == error.OutOfMemory) return error.OutOfMemory;
        // The scratch writer can only fail by running out of memory.
        unreachable;
    };
    return reader.scratch.items;
}

/// Writes `raw` to `writer`, replacing every CR LF pair and every lone CR with a single LF.
fn writeNewlineNormalized(raw: []const u8, writer: std.io.AnyWriter) anyerror!void {
    var rest = raw;
    while (std.mem.indexOfScalar(u8, rest, '\r')) |cr| {
        try writer.writeAll(rest[0..cr]);
        try writer.writeByte('\n');
        // Swallow the LF of a CR LF pair along with the CR.
        const consumed: usize = if (std.mem.startsWith(u8, rest[cr..], "\r\n")) 2 else 1;
        rest = rest[cr + consumed ..];
    }
    try writer.writeAll(rest);
}

/// Returns the namespace URI bound to `prefix`, or an empty string if none.
/// If the reader is not namespace-aware, always returns an empty string.
+pub fn namespaceUri(reader: Reader, prefix: []const u8) []const u8 { + if (!reader.options.namespace_aware) return ""; + if (predefined_namespace_uris.get(prefix)) |uri| return uri; + // Walk the prefix maps from last to first so that later bindings shadow earlier ones. + var i = reader.ns_prefixes.items.len; + const index = while (i > 0) { + i -= 1; + if (reader.ns_prefixes.items[i].getAdapted(prefix, StringIndexAdapter{ + .strings = reader.strings.items, + })) |uri| break uri; + } else return ""; + return reader.string(index); +} + +test namespaceUri { + var doc = StaticDocument.init( + \\ + \\ + \\ + ); + var reader = doc.reader(std.testing.allocator, .{}); + defer reader.deinit(); + + try expectEqual(.element_start, try reader.read()); + try expectEqualStrings("https://example.com/default", reader.namespaceUri("")); + try expectEqualStrings("https://example.com/other", reader.namespaceUri("other")); + try expectEqualStrings("", reader.namespaceUri("child")); + + try expectEqual(.text, try reader.read()); + try expectEqualStrings("https://example.com/default", reader.namespaceUri("")); + try expectEqualStrings("https://example.com/other", reader.namespaceUri("other")); + try expectEqualStrings("", reader.namespaceUri("child")); + + try expectEqual(.element_start, try reader.read()); + try expectEqualStrings("https://example.com/default", reader.namespaceUri("")); + try expectEqualStrings("https://example.com/other", reader.namespaceUri("other")); + try expectEqualStrings("https://example.com/child", reader.namespaceUri("child")); + + try expectEqual(.element_end, try reader.read()); + try expectEqualStrings("https://example.com/default", reader.namespaceUri("")); + try expectEqualStrings("https://example.com/other", reader.namespaceUri("other")); + try expectEqualStrings("https://example.com/child", reader.namespaceUri("child")); + + try expectEqual(.text, try reader.read()); + try expectEqualStrings("https://example.com/default", reader.namespaceUri("")); + try expectEqualStrings("https://example.com/other", reader.namespaceUri("other")); + try 
expectEqualStrings("", reader.namespaceUri("child")); + + try expectEqual(.element_end, try reader.read()); + try expectEqualStrings("https://example.com/default", reader.namespaceUri("")); + try expectEqualStrings("https://example.com/other", reader.namespaceUri("other")); + try expectEqualStrings("", reader.namespaceUri("child")); +} + +/// Splits `name` at its first `:` into a prefix and a local part and resolves the prefix with `namespaceUri`. +/// A name without a colon gets an empty prefix and is used whole as the local part. +fn parseQName(reader: Reader, name: []const u8) PrefixedQName { + const prefix, const local = if (std.mem.indexOfScalar(u8, name, ':')) |colon_pos| + .{ name[0..colon_pos], name[colon_pos + 1 ..] } + else + .{ "", name }; + return .{ + .prefix = prefix, + .ns = reader.namespaceUri(prefix), + .local = local, + }; +} + +/// NOTE(review): `read` itself is declared `anyerror`; presumably this set names the errors that do not originate in the `Source` - confirm against callers. +pub const ReadError = error{MalformedXml} || Allocator.Error; + +/// Reads and returns the next node in the document. +/// On any error the current node is cleared, so node accessors must not be used afterwards. +/// Once the reader is in the invalid state, every further call returns `error.MalformedXml`. +pub fn read(reader: *Reader) anyerror!Node { + errdefer reader.node = null; + const node: Node = while (true) { + switch (reader.state) { + .invalid => return error.MalformedXml, + .start => { + try reader.shift(); + try reader.skipBom(); + if (try reader.readMatch(" { + try reader.skipSpace(); + if (try reader.readMatch(" { + try reader.skipSpace(); + if (reader.pos == reader.buf.len) { + return reader.fatal(.unexpected_eof, reader.pos); + } else if (try reader.readMatch(" { + try reader.shift(); + if (reader.pos == reader.buf.len) { + return reader.fatal(.unexpected_eof, reader.pos); + } else if (try reader.readMatch("&#")) { + try reader.readCharacterReference(); + if (!try reader.readMatch(";")) return reader.fatal(.character_reference_unclosed, reader.pos); + try reader.checkCharacterReference(); + break .character_reference; + } else if (try reader.readMatch("&")) { + try reader.readName(); + if (!try reader.readMatch(";")) return reader.fatal(.entity_reference_unclosed, reader.pos); + try reader.checkEntityReference(); + break .entity_reference; + } else if (try reader.readMatch("")) return reader.fatal(.element_end_unclosed, reader.pos); + try reader.checkElementEnd(); + if 
(reader.element_names.items.len == 1) reader.state = .after_root; + break .element_end; + } else if (try reader.readMatch("<")) { + try reader.readName(); + if (try reader.readElementStartContent()) { + reader.state = .empty_element; + } + try reader.checkElementStart(); + break .element_start; + } else { + try reader.readText(); + try reader.checkText(); + break .text; + } + }, + .empty_element => { + reader.state = .in_root; + break .element_end; + }, + .empty_root => { + reader.state = .after_root; + break .element_end; + }, + .after_root => { + try reader.skipSpace(); + if (reader.pos == reader.buf.len) { + reader.state = .eof; + continue; + } else if (try reader.readMatch(" break .eof, + } + }; + reader.node = node; + return node; +} + +/// Reads and returns the text content of the element and its children. +/// The text is accumulated in the reader's scratch buffer, which is cleared first. +/// The current node after returning is the end of the element. +/// The returned value is owned by `reader` and is only valid until the next call to another +/// function on `reader`. +/// Asserts that the current node is `Node.element_start`. +pub fn readElementText(reader: *Reader) anyerror![]const u8 { + reader.scratch.clearRetainingCapacity(); + const writer = reader.scratch.writer(reader.gpa); + try reader.readElementTextWrite(writer.any()); + return reader.scratch.items; +} + +test readElementText { + var doc = StaticDocument.init( + \\Hello, world! + ); + var reader = doc.reader(std.testing.allocator, .{}); + defer reader.deinit(); + + try expectEqual(.element_start, try reader.read()); + try expectEqualStrings("root", reader.elementName()); + try expectEqualStrings("Hello, world!", try reader.readElementText()); + try expectEqualStrings("root", reader.elementName()); + try expectEqual(.eof, try reader.read()); +} + +/// Reads and returns the text content of the element and its children. +/// The current node after returning is the end of the element. +/// Asserts that the current node is `Node.element_start`. 
pub fn readElementTextAlloc(reader: *Reader, gpa: Allocator) anyerror![]u8 {
    var buf = std.ArrayList(u8).init(gpa);
    defer buf.deinit();
    // Keep the writer in a named constant: `any()` stores a pointer into it.
    const buf_writer = buf.writer();
    // Propagate every failure. `readElementTextWrite` calls `reader.read()`,
    // which fails with `error.MalformedXml` on bad input, so treating all
    // errors other than OutOfMemory as unreachable would be incorrect.
    try reader.readElementTextWrite(buf_writer.any());
    // The caller owns the returned slice and frees it with `gpa`.
    return buf.toOwnedSlice();
}

test readElementTextAlloc {
    // NOTE(review): the document literal lost its markup in extraction; this
    // fixture is reconstructed from the assertions below - confirm against
    // upstream.
    var doc = StaticDocument.init(
        \\<root>Hello, world!</root>
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("root", reader.elementName());
    const element_text = try reader.readElementTextAlloc(std.testing.allocator);
    defer std.testing.allocator.free(element_text);
    try expectEqualStrings("Hello, world!", element_text);
    try expectEqualStrings("root", reader.elementName());
    try expectEqual(.eof, try reader.read());
}

/// Reads the text content of the element and its children and writes it to
/// `writer`.
/// The current node after returning is the end of the element.
/// Asserts that the current node is `Node.element_start`.
pub fn readElementTextWrite(reader: *Reader, writer: std.io.AnyWriter) anyerror!void {
    assert(reader.node == .element_start);
    // Remember the nesting depth of the starting element so that only its own
    // end tag (not a child's) terminates the loop.
    const depth = reader.element_names.items.len;
    while (true) {
        switch (try reader.read()) {
            .xml_declaration, .eof => unreachable,
            // Markup that contributes no text is skipped.
            .element_start, .comment, .pi => {},
            .element_end => if (reader.element_names.items.len == depth) return,
            .text => try reader.textWrite(writer),
            .cdata => try reader.cdataWrite(writer),
            .character_reference => {
                // Emit the referenced character as UTF-8 (at most 4 bytes).
                var buf: [4]u8 = undefined;
                const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable;
                try writer.writeAll(buf[0..len]);
            },
            .entity_reference => {
                // `read` has already rejected undefined entities via
                // `checkEntityReference`, so the lookup cannot miss.
                const expanded = predefined_entities.get(reader.entityReferenceName()) orelse unreachable;
                try writer.writeAll(expanded);
            },
        }
    }
}

/// Reads and discards all document content until the start of the root element,
/// which is the current node after this function returns successfully.
/// Asserts that the start of the root element has not yet been read.
pub fn skipProlog(reader: *Reader) anyerror!void {
    assert(reader.state == .start or reader.state == .after_xml_declaration or reader.state == .after_doctype);
    while (true) {
        if (try reader.read() == .element_start) return;
    }
}

test skipProlog {
    // NOTE(review): the five-line document literal was lost in extraction;
    // this fixture is reconstructed to satisfy the assertions below (nothing
    // but the root element may follow the prolog) - confirm against upstream.
    var doc = StaticDocument.init(
        \\<?xml version="1.0"?>
        \\<!-- Irrelevant comment -->
        \\<?some-pi?>
        \\<!-- Another irrelevant comment -->
        \\<root/>
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try reader.skipProlog();
    try expectEqualStrings("root", reader.elementName());
    try expectEqual(.element_end, try reader.read());
    try expectEqualStrings("root", reader.elementName());
    try expectEqual(.eof, try reader.read());
}

/// Reads and discards all document content until the end of the containing
/// element, which is the current node after this function returns successfully.
/// Asserts that the reader is currently inside an element (not before or after
/// the root element).
pub fn skipElement(reader: *Reader) anyerror!void {
    assert(reader.state == .in_root or reader.state == .empty_element or reader.state == .empty_root);
    // Only an end tag at the starting depth terminates the skip; end tags of
    // nested children are read while the name stack is still deeper.
    const depth = reader.element_names.items.len;
    while (true) {
        if (try reader.read() == .element_end and reader.element_names.items.len == depth) return;
    }
}

test skipElement {
    // NOTE(review): the document literal lost its markup in extraction; this
    // fixture is reconstructed from the assertions below - confirm against
    // upstream.
    var doc = StaticDocument.init(
        \\<root>
        \\  <sub>Hello, world!</sub>
        \\  <!-- Some comment -->
        \\  <sub2><sub3/></sub2>
        \\</root>
    );
    var reader = doc.reader(std.testing.allocator, .{});
    defer reader.deinit();

    try expectEqual(.element_start, try reader.read());
    try expectEqualStrings("root", reader.elementName());
    try reader.skipElement();
    try expectEqualStrings("root", reader.elementName());
    try expectEqual(.eof, try reader.read());
}

/// Reads the `name="value"` pairs of an XML declaration up to and including
/// the closing `?>`.
fn readXmlDeclarationContent(reader: *Reader) !void {
    while (true) {
        try reader.readSpace();
        if (try reader.readMatch("?>")) return;
        try reader.readPair();
    }
}

/// Validates the pseudo-attributes of an XML declaration: `version` must come
/// first, optionally followed by `encoding` and then `standalone`.
fn checkXmlDeclaration(reader: *Reader) !void {
    try reader.checkAttributes();
    var state: enum {
        start,
        after_version,
        after_encoding,
        end,
    } = .start;
    for (0..reader.attributeCountUnchecked()) |i| {
        const name = reader.attributeNameUnchecked(i);
        const value = reader.attributeValueUnchecked(i);
        switch (state) {
            .start => if (std.mem.eql(u8, name, "version")) {
                try reader.checkXmlVersion(value, i);
                state = .after_version;
            } else {
                return reader.fatal(.xml_declaration_version_missing, 0);
            },
            .after_version => if (std.mem.eql(u8, name, "encoding")) {
                try reader.checkXmlEncoding(value, i);
                state = .after_encoding;
            } else if (std.mem.eql(u8, name, "standalone")) {
                try reader.checkXmlStandalone(value, i);
                state = .end;
            } else {
                return reader.fatal(.xml_declaration_attribute_unsupported, reader.attributeNamePos(i));
            },
            .after_encoding => if (std.mem.eql(u8, name, "standalone")) {
                try reader.checkXmlStandalone(value, i);
                state = .end;
            } else {
                return reader.fatal(.xml_declaration_attribute_unsupported, reader.attributeNamePos(i));
            },
            .end => return reader.fatal(.xml_declaration_attribute_unsupported, reader.attributeNamePos(i)),
        }
    }
    // A declaration with no pseudo-attributes at all is missing its version.
    if (state == .start) {
        return reader.fatal(.xml_declaration_version_missing, 0);
    }
}

/// Accepts version strings of the form `1.` followed by one or more decimal
/// digits (XML 1.0 `VersionNum ::= '1.' [0-9]+`).
fn checkXmlVersion(reader: *Reader, version: []const u8, n_attr: usize) !void {
    // At least one digit must follow the "1." prefix.
    if (!std.mem.startsWith(u8, version, "1.") or version.len == "1.".len) {
        return reader.fatal(.xml_declaration_version_unsupported, reader.attributeValuePos(n_attr));
    }
    for (version["1.".len..]) |c| {
        switch (c) {
            '0'...'9' => {},
            else => return reader.fatal(.xml_declaration_version_unsupported, reader.attributeValuePos(n_attr)),
        }
    }
}

/// Accepts only UTF-8 (compared case-insensitively) as the declared encoding.
fn checkXmlEncoding(reader: *Reader, encoding: []const u8, n_attr: usize) !void {
    if (!std.ascii.eqlIgnoreCase(encoding, "utf-8")) {
        return reader.fatal(.xml_declaration_encoding_unsupported, reader.attributeValuePos(n_attr));
    }
}

/// Accepts only `yes` or `no` as the value of `standalone`.
fn checkXmlStandalone(reader: *Reader, standalone: []const u8, n_attr: usize) !void {
    if (!std.mem.eql(u8, standalone, "yes") and !std.mem.eql(u8, standalone, "no")) {
        return reader.fatal(.xml_declaration_standalone_malformed, reader.attributeValuePos(n_attr));
    }
}

/// Reads the attributes of a start tag up to and including its terminator.
/// Returns true if the tag ended with `/>` (an empty element) and false if it
/// ended with `>`.
fn readElementStartContent(reader: *Reader) !bool {
    while (true) {
        try reader.readSpace();
        if (try reader.readMatch("/>")) {
            return true;
        } else if (try reader.readMatch(">")) {
            return false;
        } else {
            try reader.readPair();
        }
    }
}

/// Validates a just-read start tag, pushes its name onto the element name
/// stack and, when namespace-aware, opens a new prefix scope and validates
/// the element's qualified name.
fn checkElementStart(reader: *Reader) !void {
    const element_name = reader.elementNameUnchecked();
    const element_name_pos = reader.elementNamePos();
    try reader.checkName(element_name, element_name_pos);
    try reader.checkAttributes();

    const element_name_index = try reader.addString(element_name);
    try reader.element_names.append(reader.gpa, element_name_index);

    if (reader.options.namespace_aware) {
        try reader.ns_prefixes.append(reader.gpa, .{});
        try reader.checkAttributesNs();
        if (std.mem.indexOfScalar(u8, element_name, ':')) |colon_pos| {
            const prefix = element_name[0..colon_pos];
            if (std.mem.eql(u8, prefix, "xmlns")) return reader.fatal(.namespace_prefix_illegal, element_name_pos);
            try reader.checkNcName(prefix, element_name_pos);
            const local = element_name[colon_pos + 1 ..];
            try reader.checkNcName(local, element_name_pos);
            if (reader.namespaceUri(prefix).len == 0) return reader.fatal(.namespace_prefix_unbound, element_name_pos);
        }
    }
}

/// Validates every attribute that was read for the current node (name syntax,
/// separating whitespace, uniqueness, value content) and indexes the
/// attributes by name in `reader.attributes`.
fn checkAttributes(reader: *Reader) !void {
    const n_attributes = reader.attributeCountUnchecked();
    try reader.attributes.ensureUnusedCapacity(reader.gpa, n_attributes);
    for (0..n_attributes) |i| {
        const name_pos = reader.attributeNamePos(i);
        // A name that starts right after the previous value's closing quote
        // means there was no whitespace between the two attributes.
        if (i > 0 and name_pos == reader.attributeValueEndPos(i - 1) + 1) {
            return reader.fatal(.attribute_missing_space, name_pos);
        }

        const name = reader.attributeNameUnchecked(i);
        try reader.checkName(name, name_pos);

        const gop = reader.attributes.getOrPutAssumeCapacity(name);
        if (gop.found_existing) return reader.fatal(.attribute_duplicate, name_pos);
        gop.value_ptr.* = i;

        try reader.checkAttributeValue(i);
    }
}

/// Validates the raw value of attribute `n`: it must contain only legal
/// characters, no `<`, and only well-formed character references and
/// predefined entity references.
fn checkAttributeValue(reader: *Reader, n: usize) !void {
    const s = reader.attributeValueUnchecked(n);
    const pos = reader.attributeValuePos(n);
    try reader.validateUtf8(s, pos);
    var i: usize = 0;
    while (i < s.len) : (i += 1) {
        switch (s[i]) {
            '\t',
            '\n',
            '\r',
            0x20...('&' - 1),
            ('&' + 1)...('<' - 1),
            ('<' + 1)...0xEE,
            0xF0...0xFF,
            => {},
            0xEF => {
                // We already validated for correct UTF-8, so we know 2 bytes follow.
                // The Unicode codepoints U+FFFE and U+FFFF are not allowed as characters:
                // U+FFFE: EF BF BE
                // U+FFFF: EF BF BF
                if (s[i + 1] == 0xBF and (s[i + 2] == 0xBE or s[i + 2] == 0xBF)) {
                    return reader.fatal(.illegal_character, pos + i);
                }
            },
            '<' => return reader.fatal(.attribute_illegal_character, pos + i),
            '&' => {
                if (std.mem.startsWith(u8, s[i + "&".len ..], "#")) {
                    const end = std.mem.indexOfScalarPos(u8, s, i, ';') orelse return reader.fatal(.character_reference_unclosed, pos + i);
                    const ref = s[i + "&#".len .. end];
                    // `std.fmt.parseInt` tolerates a leading sign and `_`
                    // separators, neither of which may appear in a character
                    // reference, so reject anything that is not alphanumeric
                    // before parsing. This matches `readCharacterReference`.
                    for (ref) |b| {
                        if (!std.ascii.isAlphanumeric(b)) return reader.fatal(.character_reference_malformed, pos + i);
                    }
                    const c = if (std.mem.startsWith(u8, ref, "x"))
                        std.fmt.parseInt(u21, ref["x".len..], 16) catch return reader.fatal(.character_reference_malformed, pos + i)
                    else
                        std.fmt.parseInt(u21, ref, 10) catch return reader.fatal(.character_reference_malformed, pos + i);
                    if (!isChar(c)) return reader.fatal(.character_reference_malformed, pos + i);
                    // Resume after the closing `;`, as for entity references.
                    i = end;
                } else {
                    const end = std.mem.indexOfScalarPos(u8, s, i, ';') orelse return reader.fatal(.entity_reference_unclosed, pos + i);
                    const ref = s[i + "&".len .. end];
                    if (!predefined_entities.has(ref)) return reader.fatal(.entity_reference_undefined, pos + i);
                    i = end;
                }
            },
            else => return reader.fatal(.illegal_character, pos + i),
        }
    }
}

/// Namespace processing for the attributes of the current start tag: first
/// records the `xmlns` / `xmlns:prefix` bindings in the innermost prefix
/// scope, then resolves every attribute to a qualified name and indexes it in
/// `reader.q_attributes`.
fn checkAttributesNs(reader: *Reader) !void {
    const n_attributes = reader.attributeCountUnchecked();
    try reader.q_attributes.ensureUnusedCapacity(reader.gpa, n_attributes);
    const prefix_bindings = &reader.ns_prefixes.items[reader.ns_prefixes.items.len - 1];

    // Pass 1: collect namespace declarations so that prefixes declared on
    // this element are visible to all of its attributes regardless of order.
    for (0..n_attributes) |i| {
        const name = reader.attributeNameUnchecked(i);
        const pos = reader.attributeNamePos(i);
        if (std.mem.eql(u8, name, "xmlns")) {
            const value = reader.attributeValueUnchecked(i);
            const uri_index = try reader.addAttributeValueString(value);
            const uri = reader.string(uri_index);
            // Neither reserved namespace may be declared as the default.
            if (std.mem.eql(u8, uri, ns_xml) or std.mem.eql(u8, uri, ns_xmlns)) {
                return reader.fatal(.namespace_binding_illegal, pos);
            }
            try prefix_bindings.putNoClobber(reader.gpa, .empty, uri_index);
        } else if (std.mem.startsWith(u8, name, "xmlns:")) {
            const prefix = name["xmlns:".len..];
            if (std.mem.eql(u8, prefix, "xmlns")) return reader.fatal(.namespace_binding_illegal, pos);
            try reader.checkNcName(prefix, pos);
            const prefix_index = try reader.addString(prefix);
            const value = reader.attributeValueUnchecked(i);
            if (value.len == 0) return reader.fatal(.attribute_prefix_undeclared, pos);
            const uri_index = try reader.addAttributeValueString(value);
            const uri = reader.string(uri_index);
            // The `xml` prefix may only be bound to the XML namespace, and no
            // other prefix may be bound to it, so the two conditions must
            // agree. The left side must test the prefix, not the URI.
            if (std.mem.eql(u8, prefix, "xml") != std.mem.eql(u8, uri, ns_xml)) return reader.fatal(.namespace_binding_illegal, pos);
            if (std.mem.eql(u8, uri, ns_xmlns)) return reader.fatal(.namespace_binding_illegal, pos);
            try prefix_bindings.putNoClobber(reader.gpa, prefix_index, uri_index);
        }
    }

    // Pass 2: resolve each attribute name against the bindings now in scope.
    for (0..n_attributes) |i| {
        const name = reader.attributeNameUnchecked(i);
        const pos = reader.attributeNamePos(i);
        const colon_pos = std.mem.indexOfScalar(u8, name, ':') orelse {
            // Unprefixed attributes are in no namespace.
            reader.q_attributes.putAssumeCapacityNoClobber(.{ .ns = "", .local = name }, i);
            continue;
        };
        const prefix = name[0..colon_pos];
        try reader.checkNcName(prefix, pos);
        const local = name[colon_pos + 1 ..];
        try reader.checkNcName(local, pos);
        const uri = reader.namespaceUri(prefix);
        if (uri.len == 0) return reader.fatal(.namespace_prefix_unbound, pos);
        const gop = reader.q_attributes.getOrPutAssumeCapacity(.{ .ns = uri, .local = local });
        if (gop.found_existing) return reader.fatal(.attribute_duplicate, pos);
        gop.value_ptr.* = i;
    }
}

/// Appends the normalized form of an already-validated raw attribute value to
/// `reader.strings` (preceded by a 0 separator) and returns its index.
/// Whitespace characters become spaces (CR LF becomes a single space) and
/// character/entity references are expanded.
fn addAttributeValueString(reader: *Reader, raw_value: []const u8) !StringIndex {
    try reader.strings.append(reader.gpa, 0);
    const start = reader.strings.items.len;
    var i: usize = 0;
    while (i < raw_value.len) : (i += 1) {
        switch (raw_value[i]) {
            '\t', '\n' => try reader.strings.append(reader.gpa, ' '),
            '\r' => {
                try reader.strings.append(reader.gpa, ' ');
                if (i + 1 < raw_value.len and raw_value[i + 1] == '\n') i += 1;
            },
            '&' => {
                // `checkAttributeValue` has already validated every
                // reference, hence the `unreachable`s below.
                const entity_end = std.mem.indexOfScalarPos(u8, raw_value, i, ';') orelse unreachable;
                if (raw_value[i + "&".len] == '#') {
                    const c = if (raw_value[i + "&#".len] == 'x')
                        std.fmt.parseInt(u21, raw_value[i + "&#x".len .. entity_end], 16) catch unreachable
                    else
                        std.fmt.parseInt(u21, raw_value[i + "&#".len .. entity_end], 10) catch unreachable;
                    try reader.strings.ensureUnusedCapacity(reader.gpa, 4);
                    // Encode into the reserved space past the current end of
                    // the list. Encoding into `items` would overwrite the
                    // start of the string table instead of appending.
                    const encoded_len = std.unicode.utf8Encode(c, reader.strings.unusedCapacitySlice()) catch unreachable;
                    reader.strings.items.len += encoded_len;
                } else {
                    const expansion = predefined_entities.get(raw_value[i + "&".len .. entity_end]) orelse unreachable;
                    try reader.strings.appendSlice(reader.gpa, expansion);
                }
                i = entity_end;
            },
            else => |b| try reader.strings.append(reader.gpa, b),
        }
    }
    return @enumFromInt(start);
}

/// Checks that the name in the end tag matches the innermost open element.
fn checkElementEnd(reader: *Reader) !void {
    const element_name = reader.string(reader.element_names.getLast());
    if (!std.mem.eql(u8, reader.elementNameUnchecked(), element_name)) {
        return reader.fatal(.element_end_mismatched, reader.elementNamePos());
    }
}

/// Reads comment content up to and including the closing `-->`, recording the
/// content span. A `--` not followed by `>` is reported as malformed.
fn readCommentContent(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        // NOTE(review): when no `--` is found the search resumes at the old
        // end of the buffer after `more()`, so a terminator split across that
        // boundary looks like it would be missed - confirm against the
        // `source.move` contract.
        reader.pos = std.mem.indexOfPos(u8, reader.buf, reader.pos, "--") orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            if (!std.mem.startsWith(u8, reader.buf[reader.pos + "--".len ..], ">")) {
                return reader.fatal(.comment_malformed, reader.pos);
            }
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += "-->".len;
            return;
        }
        try reader.more();
        // No new data arrived: the input ended inside the comment.
        if (reader.pos == reader.buf.len) return reader.fatal(.comment_unclosed, reader.pos);
    }
}

/// Checks that the comment content consists only of legal characters.
fn checkComment(reader: *Reader) !void {
    try reader.checkChars(reader.commentUnchecked(), reader.commentPos());
}

/// Reads processing instruction data (after the target) up to and including
/// the closing `?>`, recording the data span.
fn readPiContent(reader: *Reader) !void {
    try reader.readSpace();
    const start = reader.pos;
    while (true) {
        // NOTE(review): same buffer-boundary concern as in
        // `readCommentContent` for a `?>` split across reads - confirm.
        reader.pos = std.mem.indexOfPos(u8, reader.buf, reader.pos, "?>") orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += "?>".len;
            return;
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.pi_unclosed, reader.pos);
    }
}

/// Validates a processing instruction: the target must be a name other than
/// `xml` (in any case), must not contain `:` when namespace-aware, and must be
/// separated from non-empty data by whitespace.
fn checkPi(reader: *Reader) !void {
    const target = reader.piTargetUnchecked();
    if (std.ascii.eqlIgnoreCase(target, "xml")) {
        return reader.fatal(.pi_target_disallowed, reader.piTargetPos());
    }
    try reader.checkName(target, reader.piTargetPos());
    if (reader.options.namespace_aware and std.mem.indexOfScalar(u8, target, ':') != null) {
        return reader.fatal(.name_malformed, reader.piTargetPos());
    }
    if (reader.piTargetEndPos() == reader.piDataPos() and reader.piDataEndPos() > reader.piDataPos()) {
        return reader.fatal(.pi_missing_space, reader.piDataPos());
    }
    try reader.checkChars(reader.piDataUnchecked(), reader.piDataPos());
}

/// Advances over text content until `&`, `<` or the end of the buffered data,
/// taking care not to stop in the middle of a codepoint, a CR LF pair, or a
/// possible `]]>`.
fn readText(reader: *Reader) !void {
    while (reader.pos < reader.buf.len) {
        const b = reader.buf[reader.pos];
        if (b == '&' or b == '<') return;
        // We don't care about validating UTF-8 strictly here.
        // We just don't want to end in the possible middle of a codepoint.
        const nb: usize = if (b < 0x80) {
            reader.pos += 1;
            continue;
        } else if (b < 0xE0)
            2
        else if (b < 0xF0)
            3
        else
            4;
        if (reader.pos + nb > reader.buf.len) try reader.more();
        reader.pos = @min(reader.pos + nb, reader.buf.len);
    }
    // We don't want to end on a CR right before an LF, or CRLF normalization will not be possible.
    if (reader.pos > 0 and reader.buf[reader.pos - 1] == '\r') {
        try reader.more();
        if (reader.pos < reader.buf.len and reader.buf[reader.pos] == '\n') {
            reader.pos += 1;
        }
        return;
    }
    // We also don't want to end in the middle of ']]>' which checkText needs to reject.
    if (reader.pos > 0 and reader.buf[reader.pos - 1] == ']') {
        try reader.more();
        if (std.mem.startsWith(u8, reader.buf[reader.pos..], "]>")) {
            reader.pos += "]>".len;
        }
        return;
    }
}

/// Validates text content: valid UTF-8, only legal characters, and no `]]>`.
fn checkText(reader: *Reader) !void {
    const s = reader.textUnchecked();
    const pos = reader.textPos();
    try reader.validateUtf8(s, pos);
    for (s, 0..) |c, i| {
        switch (c) {
            '\t',
            '\n',
            '\r',
            0x20...(']' - 1),
            (']' + 1)...0xEE,
            0xF0...0xFF,
            => {},
            ']' => {
                if (std.mem.startsWith(u8, s[i + 1 ..], "]>")) {
                    return reader.fatal(.text_cdata_end_disallowed, pos + i);
                }
            },
            0xEF => {
                // We already validated for correct UTF-8, so we know 2 bytes follow.
                // The Unicode codepoints U+FFFE and U+FFFF are not allowed as characters:
                // U+FFFE: EF BF BE
                // U+FFFF: EF BF BF
                if (s[i + 1] == 0xBF and (s[i + 2] == 0xBE or s[i + 2] == 0xBF)) {
                    return reader.fatal(.illegal_character, pos + i);
                }
            },
            else => return reader.fatal(.illegal_character, pos + i),
        }
    }
}

/// Reads CDATA section content up to and including the closing `]]>`,
/// recording the content span.
fn readCdata(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        // NOTE(review): same buffer-boundary concern as in
        // `readCommentContent` for a `]]>` split across reads - confirm.
        reader.pos = std.mem.indexOfPos(u8, reader.buf, reader.pos, "]]>") orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += "]]>".len;
            return;
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.cdata_unclosed, reader.pos);
    }
}

/// Checks that the CDATA content consists only of legal characters.
fn checkCdata(reader: *Reader) !void {
    try reader.checkChars(reader.cdataUnchecked(), reader.cdataPos());
}

/// Checks that the entity reference names one of the predefined entities.
fn checkEntityReference(reader: *Reader) !void {
    if (!predefined_entities.has(reader.entityReferenceNameUnchecked())) {
        return reader.fatal(.entity_reference_undefined, reader.entityReferenceNamePos());
    }
}

/// Reads the run of ASCII alphanumeric characters following `&#` and records
/// it as a span. Validation happens in `checkCharacterReference`.
fn readCharacterReference(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                '0'...'9', 'A'...'Z', 'a'...'z' => reader.pos += 1,
                else => {
                    try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
                    return;
                },
            }
        }
        try reader.more();
        // No new data: the reference runs up to the end of input.
        if (reader.pos == reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            return;
        }
    }
}

/// Parses the character reference (decimal, or hexadecimal after `x`), checks
/// that it denotes a legal XML character, and stores it in `reader.character`.
fn checkCharacterReference(reader: *Reader) !void {
    const ref = reader.characterReferenceNameUnchecked();
    const pos = reader.characterReferenceNamePos();
    const c = if (std.mem.startsWith(u8, ref, "x"))
        std.fmt.parseInt(u21, ref["x".len..], 16) catch return reader.fatal(.character_reference_malformed, pos)
    else
        std.fmt.parseInt(u21, ref, 10) catch return reader.fatal(.character_reference_malformed, pos);
    if (!isChar(c)) return reader.fatal(.character_reference_malformed, pos);
    reader.character = c;
}

/// Reads a run of bytes that may belong to a name (ASCII name characters and
/// any non-ASCII byte) and records it as a span. Strict validation happens
/// later in `checkName`.
fn readName(reader: *Reader) !void {
    const start = reader.pos;
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                'A'...'Z', 'a'...'z', '0'...'9', ':', '_', '-', '.', 0x80...0xFF => reader.pos += 1,
                else => {
                    try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
                    return;
                },
            }
        }
        try reader.more();
        if (reader.pos == reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            return;
        }
    }
}

/// Reads one `name = "value"` pair, recording a span for the name and one for
/// the value.
fn readPair(reader: *Reader) !void {
    try reader.readName();
    try reader.readSpace();
    if (!try reader.readMatch("=")) return reader.fatal(.expected_equals, reader.pos);
    try reader.readSpace();
    try reader.readQuotedValue();
}

/// Reads a value delimited by matching single or double quotes and records
/// the span between the quotes.
fn readQuotedValue(reader: *Reader) !void {
    const quote = quote: {
        if (reader.pos == reader.buf.len) {
            try reader.more();
            if (reader.pos == reader.buf.len) return reader.fatal(.expected_quote, reader.pos);
        }
        break :quote switch (reader.buf[reader.pos]) {
            '"', '\'' => |c| c,
            else => return reader.fatal(.expected_quote, reader.pos),
        };
    };
    reader.pos += 1;
    const start = reader.pos;
    while (true) {
        reader.pos = std.mem.indexOfScalarPos(u8, reader.buf, reader.pos, quote) orelse reader.buf.len;
        if (reader.pos < reader.buf.len) {
            try reader.spans.append(reader.gpa, .{ .start = start, .end = reader.pos });
            reader.pos += 1;
            return;
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return reader.fatal(.missing_end_quote, reader.pos);
    }
}

/// Consumes `needle` if the input at the current position starts with it and
/// returns whether it did. Requests more data first if too little is buffered.
fn readMatch(reader: *Reader, needle: []const u8) !bool {
    if (reader.pos + needle.len > reader.buf.len) {
        try reader.more();
        if (reader.pos + needle.len > reader.buf.len) return false;
    }
    if (std.mem.eql(u8, reader.buf[reader.pos..][0..needle.len], needle)) {
        reader.pos += needle.len;
        return true;
    }
    return false;
}

/// Advances past XML whitespace, growing the buffer as needed (unlike
/// `skipSpace`, previously read data stays in the buffer).
fn readSpace(reader: *Reader) !void {
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                ' ', '\t', '\r', '\n' => reader.pos += 1,
                else => return,
            }
        }
        try reader.more();
        if (reader.pos == reader.buf.len) return;
    }
}

/// Checks that `s` is a non-empty XML name: a name start character followed
/// by name characters.
fn checkName(reader: *Reader, s: []const u8, pos: usize) !void {
    const view = try reader.viewUtf8(s, pos);
    var iter = view.iterator();
    if (!isNameStartChar(iter.nextCodepoint() orelse return reader.fatal(.name_malformed, pos))) {
        return reader.fatal(.name_malformed, pos);
    }
    while (iter.nextCodepoint()) |c| {
        if (!isNameChar(c)) return reader.fatal(.name_malformed, pos);
    }
}

/// Checks that `s` is usable as one half of a qualified name: non-empty, no
/// `:`, and beginning with a name start character. Callers in this file run
/// `checkName` on the full name first, so the remaining characters are not
/// re-checked here.
fn checkNcName(reader: *Reader, s: []const u8, pos: usize) !void {
    if (s.len == 0 or std.mem.indexOfScalar(u8, s, ':') != null) {
        return reader.fatal(.name_malformed, pos);
    }
    // Decode the first codepoint rather than testing the first byte: a UTF-8
    // lead byte is not a codepoint, and testing it directly rejects valid
    // names whose lead byte is 0xD7 (for example Hebrew letters).
    const view = try reader.viewUtf8(s, pos);
    var iter = view.iterator();
    const first = iter.nextCodepoint() orelse return reader.fatal(.name_malformed, pos);
    if (!isNameStartChar(first)) return reader.fatal(.name_malformed, pos);
}

/// Returns whether `c` may start a name (XML 1.0 `NameStartChar`).
fn isNameStartChar(c: u21) bool {
    return switch (c) {
        ':',
        'A'...'Z',
        '_',
        'a'...'z',
        0xC0...0xD6,
        0xD8...0xF6,
        0xF8...0x2FF,
        0x370...0x37D,
        0x37F...0x1FFF,
        0x200C...0x200D,
        0x2070...0x218F,
        0x2C00...0x2FEF,
        0x3001...0xD7FF,
        0xF900...0xFDCF,
        0xFDF0...0xFFFD,
        0x10000...0xEFFFF,
        => true,
        else => false,
    };
}

/// Returns whether `c` may appear in a name (XML 1.0 `NameChar`).
fn isNameChar(c: u21) bool {
    return isNameStartChar(c) or switch (c) {
        '-',
        '.',
        '0'...'9',
        0xB7,
        0x0300...0x036F,
        0x203F...0x2040,
        => true,
        else => false,
    };
}

/// Checks that `s` is valid UTF-8 made up only of legal XML characters.
fn checkChars(reader: *Reader, s: []const u8, pos: usize) !void {
    try reader.validateUtf8(s, pos);
    for (s, 0..) |c, i| {
        switch (c) {
            '\t', '\n', '\r', 0x20...0xEE, 0xF0...0xFF => {},
            0xEF => {
                // We already validated for correct UTF-8, so we know 2 bytes follow.
                // The Unicode codepoints U+FFFE and U+FFFF are not allowed as characters:
                // U+FFFE: EF BF BE
                // U+FFFF: EF BF BF
                if (s[i + 1] == 0xBF and (s[i + 2] == 0xBE or s[i + 2] == 0xBF)) {
                    return reader.fatal(.illegal_character, pos + i);
                }
            },
            else => return reader.fatal(.illegal_character, pos + i),
        }
    }
}

/// Returns whether `c` is a legal XML character (XML 1.0 `Char`).
fn isChar(c: u21) bool {
    return switch (c) {
        0x9,
        0xA,
        0xD,
        0x20...0xD7FF,
        0xE000...0xFFFD,
        0x10000...0x10FFFF,
        => true,
        else => false,
    };
}

/// Skips a UTF-8 byte order mark at the current position, if present.
fn skipBom(reader: *Reader) !void {
    const bom = "\u{FEFF}";
    if (std.mem.startsWith(u8, reader.buf[reader.pos..], bom)) {
        reader.pos += bom.len;
        try reader.shift();
    }
}

/// Skips XML whitespace, discarding it from the buffer via `shift` (unlike
/// `readSpace`, which keeps earlier data buffered).
fn skipSpace(reader: *Reader) !void {
    while (true) {
        while (reader.pos < reader.buf.len) {
            switch (reader.buf[reader.pos]) {
                ' ', '\t', '\r', '\n' => reader.pos += 1,
                else => {
                    try reader.shift();
                    return;
                },
            }
        }
        try reader.shift();
        // After a shift `pos` is 0, so this holds only when no data is left.
        if (reader.pos == reader.buf.len) return;
    }
}

/// Fails with a located `invalid_utf8` error if `s` is not valid UTF-8.
/// Does nothing when the reader was told to assume valid UTF-8.
fn validateUtf8(reader: *Reader, s: []const u8, pos: usize) !void {
    if (reader.options.assume_valid_utf8) return;
    if (!std.unicode.utf8ValidateSlice(s)) return reader.fatalInvalidUtf8(s, pos);
}

/// Returns a UTF-8 view of `s`, validating it unless the reader was told to
/// assume valid UTF-8.
fn viewUtf8(reader: *Reader, s: []const u8, pos: usize) !std.unicode.Utf8View {
    if (reader.options.assume_valid_utf8) return std.unicode.Utf8View.initUnchecked(s);
    return std.unicode.Utf8View.init(s) catch reader.fatalInvalidUtf8(s, pos);
}

/// Records an `invalid_utf8` error located at the first invalid sequence in
/// `s` (which starts at buffer position `pos`).
fn fatalInvalidUtf8(reader: *Reader, s: []const u8, pos: usize) error{MalformedXml} {
    // We need to backtrack and redo the UTF-8 validation to set the correct
    // error location; the standard "validate UTF-8" function doesn't provide
    // an index for the invalid data.
    var invalid_pos: usize = 0;
    // Bounded by `s.len` so that this can never index past the end of `s`.
    while (invalid_pos < s.len) {
        const cp_len = std.unicode.utf8ByteSequenceLength(s[invalid_pos]) catch break;
        if (invalid_pos + cp_len > s.len) break;
        if (!std.unicode.utf8ValidateSlice(s[invalid_pos..][0..cp_len])) break;
        invalid_pos += cp_len;
    }
    return reader.fatal(.invalid_utf8, pos + invalid_pos);
}

const base_read_size = 4096;

/// Discards everything before the current position, refills the buffer, and
/// resets all per-node state. If the node just processed was an element end,
/// also pops that element's name and (when namespace-aware) prefix scope.
fn shift(reader: *Reader) !void {
    if (reader.options.location_aware) {
        reader.loc.update(reader.buf[0..reader.pos]);
    }

    reader.buf = try reader.source.move(reader.pos, base_read_size);
    reader.pos = 0;
    reader.spans.clearRetainingCapacity();
    reader.attributes.clearRetainingCapacity();
    reader.q_attributes.clearRetainingCapacity();

    if (reader.node == .element_end) {
        if (reader.options.namespace_aware) {
            var prefix_bindings = reader.ns_prefixes.pop();
            prefix_bindings.deinit(reader.gpa);
        }
        // Strings are appended in element order, so truncating at the popped
        // element's name start also drops everything added for it afterwards.
        const element_name_start = reader.element_names.pop();
        reader.strings.shrinkRetainingCapacity(@intFromEnum(element_name_start));
    }
}

/// Requests a buffer twice the current size without discarding any data, so
/// existing positions and spans remain valid.
fn more(reader: *Reader) !void {
    reader.buf = try reader.source.move(0, reader.buf.len * 2);
}

/// Records the error code and position, puts the reader into the `invalid`
/// state, and returns `error.MalformedXml` for the caller to propagate.
fn fatal(reader: *Reader, error_code: ErrorCode, error_pos: usize) error{MalformedXml} {
    reader.state = .invalid;
    reader.error_code = error_code;
    reader.error_pos = error_pos;
    return error.MalformedXml;
}

/// Hash map context keyed by `QName`, comparing namespace and local name by
/// content.
const QNameContext = struct {
    pub fn hash(ctx: @This(), qname: QName) u32 {
        _ = ctx;
        var w = std.hash.Wyhash.init(0);
        w.update(qname.ns);
        w.update(qname.local);
        return @truncate(w.final());
    }

    pub fn eql(ctx: @This(), a: QName, b: QName, b_index: usize) bool {
        _ = ctx;
        _ = b_index;
        return std.mem.eql(u8, a.ns, b.ns) and std.mem.eql(u8, a.local, b.local);
    }
};

/// A half-open range `[start, end)` of positions in `reader.buf`.
const BufSpan = struct {
    start: usize,
    end: usize,
};

/// Returns the part of the buffer covered by `span`.
fn bufSlice(reader: Reader, span: BufSpan) []const u8 {
    return reader.buf[span.start..span.end];
}

/// Index of the first byte of a 0-terminated string in `reader.strings`.
const StringIndex = enum(usize) { empty = 0, _ };

/// Adapter for looking up a `StringIndex`-keyed map by string content.
const StringIndexAdapter = struct {
    strings: []const u8,

    pub fn hash(ctx: @This(), key: []const u8) u32 {
        _ = ctx;
        return @truncate(std.hash.Wyhash.hash(0, key));
    }

    pub fn eql(ctx: @This(), a: []const u8, b: StringIndex, b_index: usize) bool {
        _ = b_index;
        const b_val = std.mem.sliceTo(ctx.strings[@intFromEnum(b)..], 0);
        return std.mem.eql(u8, a, b_val);
    }
};

/// Appends a 0 separator followed by `s` to `reader.strings` and returns the
/// index at which `s` starts.
fn addString(reader: *Reader, s: []const u8) !StringIndex {
    try reader.strings.ensureUnusedCapacity(reader.gpa, s.len + 1);
    reader.strings.appendAssumeCapacity(0);
    const start = reader.strings.items.len;
    reader.strings.appendSliceAssumeCapacity(s);
    return @enumFromInt(start);
}

/// Returns the string starting at `index`, up to the next 0 byte or the end
/// of `reader.strings`.
fn string(reader: Reader, index: StringIndex) []const u8 {
    return std.mem.sliceTo(reader.strings.items[@intFromEnum(index)..], 0);
}
diff --git a/src/Scanner.zig b/src/Scanner.zig deleted file mode 100644 index 3939ea9..0000000 --- a/src/Scanner.zig +++ /dev/null @@ -1,2045 +0,0 @@ -//! A simple, low-level streaming XML parser. -//! -//! The design of the parser is strongly inspired by -//! [Yxml](https://dev.yorhel.nl/yxml). Codepoints are fed to the parser one by one -//! using the `next` function, then the `endInput` function should be used to -//! check that the parser is in a valid state for the end of input (e.g. not in -//! the middle of parsing an element). The tokens returned by the parser -//! reference the input data using `pos` ranges (the meaning of `pos` depends -//! on the meaning of the `len` passed to `next`). -//! -//! A higher-level parser which wants to do anything useful with the returned -//! tokens will need to store the input text fed to the `next` function in some -//! sort of buffer. If the document is stored entirely in memory, this buffer -//! could be the document content itself. If the document is being read in a -//! streaming manner, however, then an auxiliary buffer will be needed. To -//! avoid requiring such higher-level APIs to maintain an unbounded input -//! 
buffer, the `resetPos` function exists to reset `pos` to 0, if possible. -//! The approach taken by `TokenReader` is to call `resetPos` after every -//! token, and after reaching a state where space for a further codepoint is -//! not guaranteed. With this approach, the length of the buffer bounds the -//! maximum size of "unsplittable" content, such as element and attribute -//! names, but not "splittable" content such as element text content and -//! attribute values. -//! -//! Intentional (permanent) limitations (which can be addressed by -//! higher-level APIs, such as `Reader`): -//! -//! - Does not validate that corresponding open and close tags match. -//! - Does not validate that attribute names are not duplicated. -//! - Does not do any special handling of namespaces. -//! - Does not perform any sort of processing on text content or attribute -//! values (including normalization, expansion of entities, etc.). -//! - However, note that entity and character references in text content and -//! attribute values _are_ validated for correct syntax, although their -//! content is not (they may reference non-existent entities). -//! - Does not process DTDs in any way besides parsing them (TODO: see below). -//! -//! Unintentional (temporary) limitations (which will be removed over time): -//! -//! - Does not support `DOCTYPE` at all (using one will result in an error). -//! - Not extensively tested/fuzzed. - -/// The data for the most recently returned token. -token_data: Token.Data = undefined, -/// The current state of the scanner. -state: State = .start, -/// Data associated with the current state of the scanner. -state_data: State.Data = undefined, -/// The current position in the input. -/// -/// The meaning of this position is determined by the meaning of the `len` -/// value passed to `next`, which is determined by the user. 
For example, a -/// user with a byte slice or reader would probably want to pass `len` as the -/// number of bytes making up the codepoint, which would make `pos` a byte -/// offset. -pos: usize = 0, -/// The current element nesting depth. -depth: usize = 0, -/// Whether the root element has been seen already. -seen_root_element: bool = false, - -const std = @import("std"); -const testing = std.testing; -const unicode = std.unicode; -const syntax = @import("syntax.zig"); - -const Scanner = @This(); - -/// A range of byte positions in the input. -pub const Range = struct { - /// The start of the range (inclusive). - start: usize, - /// The end of the range (exclusive). - end: usize, - - pub fn isEmpty(self: Range) bool { - return self.start == self.end; - } - - pub fn format(self: Range, _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { - try writer.print("{}..{}", .{ self.start, self.end }); - } -}; - -/// A single XML token. -/// -/// The choice of tokens is designed to allow the buffer position to be reset as -/// often as reasonably possible ("forgetting" any range information before the -/// reset), supported by the following design decisions: -/// -/// - Tokens contain only the immediately necessary context: for example, the -/// `attribute_content` token does not store any information about the -/// attribute name, since it may have been processed many resets ago (if the -/// attribute content is very long). -/// - Multiple `content` tokens may be returned for a single enclosing context -/// (e.g. element or attribute) if the buffer is reset in the middle of -/// content or there are other necessary intervening factors, such as CDATA -/// in the middle of normal (non-CDATA) element content. -/// -/// For efficiency (avoiding copying when passing around tokens), this is -/// merely an enum specifying the token type. The actual token data is available -/// in `Token.Data`, in the scanner's `token_data` field. 
The `fullToken` -/// function can be used to get a `Token.Full`, which is a tagged union type and -/// may be easier to consume in certain circumstances. -pub const Token = enum { - /// Continue processing: no new token to report yet. - ok, - /// XML declaration. - xml_declaration, - /// Element start tag. - element_start, - /// Element content. - element_content, - /// Element end tag. - element_end, - /// End of an empty element. - element_end_empty, - /// Attribute start. - attribute_start, - /// Attribute value content. - attribute_content, - /// Comment start. - comment_start, - /// Comment content. - comment_content, - /// Processing instruction (PI) start. - pi_start, - /// PI content. - pi_content, - - /// The data associated with a token. - /// - /// Even token types which have no associated data are represented here, to - /// provide some additional safety in safe build modes (where it can be - /// checked whether the caller is referencing the correct data field). - pub const Data = union { - ok: void, - xml_declaration: XmlDeclaration, - element_start: ElementStart, - element_content: ElementContent, - element_end: ElementEnd, - element_end_empty: void, - attribute_start: AttributeStart, - attribute_content: AttributeContent, - comment_start: void, - comment_content: CommentContent, - pi_start: PiStart, - pi_content: PiContent, - }; - - /// A token type plus data represented as a tagged union. 
- pub const Full = union(Token) { - ok, - xml_declaration: XmlDeclaration, - element_start: ElementStart, - element_content: ElementContent, - element_end: ElementEnd, - element_end_empty, - attribute_start: AttributeStart, - attribute_content: AttributeContent, - comment_start, - comment_content: CommentContent, - pi_start: PiStart, - pi_content: PiContent, - }; - - pub const XmlDeclaration = struct { - version: Range, - encoding: ?Range = null, - standalone: ?bool = null, - }; - - pub const ElementStart = struct { - name: Range, - }; - - pub const ElementContent = struct { - content: Content, - }; - - pub const ElementEnd = struct { - name: Range, - }; - - pub const AttributeStart = struct { - name: Range, - }; - - pub const AttributeContent = struct { - content: Content, - final: bool = false, - }; - - pub const CommentContent = struct { - content: Range, - final: bool = false, - }; - - pub const PiStart = struct { - target: Range, - }; - - pub const PiContent = struct { - content: Range, - final: bool = false, - }; - - /// A bit of content of an element or attribute. - pub const Content = union(enum) { - /// Raw text content (does not contain any entities). - text: Range, - /// A Unicode codepoint. - codepoint: u21, - /// An entity reference, such as `&`. The range covers the name (`amp`). - entity: Range, - }; -}; - -/// Returns the full token (including data) from the most recent call to `next` -/// or `resetPos`. `token` must be the token returned from the last call to one -/// of those functions. -/// -/// --- -/// -/// API note: the use of `self: *const Scanner` rather than `self: Scanner` is -/// important to elimiate a potential footgun with the following code: -/// -/// ``` -/// const full_token = scanner.fullToken(try scanner.next(c, len)); -/// ``` -/// -/// If `self: Scanner` is used, then Zig will evaluate `scanner` in its current -/// state (for the expression `scanner.fullToken`) before calling -/// `scanner.next`. 
This leads to the result being incorrect, since the `scanner` -/// used for the `fullToken` call will have the old token data. -pub fn fullToken(self: *const Scanner, token: Token) Token.Full { - return switch (token) { - inline else => |tag| @unionInit(Token.Full, @tagName(tag), @field(self.token_data, @tagName(tag))), - }; -} - -/// The possible states of the parser. -/// -/// The parser is designed as a state machine. A state may need to hold -/// associated data to allow the necessary information to be included in a -/// future token. One shortcut used to avoid creating many unnecessary -/// additional states is to store a `left` byte slice tracking expected bytes -/// remaining in a state (the slice is always pointing to static strings, so -/// there are no lifetime considerations): for example, the word "version" in -/// an XML declaration is parsed in the xml_decl_version_name state, and -/// successive bytes are validated using the `left` slice (e.g. after parsing -/// "v", left is "ersion", so that when we handle the next character, we can -/// fail parsing if it is not "e", and then set `left` to "rsion", and so on). -pub const State = enum { - /// Start of document. - start, - /// Start of document after BOM. - start_after_bom, - - /// Same as unknown_start, but also allows the XML declaration. 
- start_unknown_start, - /// Start of a PI or XML declaration after ' if (c == 0xFEFF) { - self.state = .start_after_bom; - return .ok; - } else if (c == '<') { - self.state = .start_unknown_start; - return .ok; - } else if (syntax.isSpace(c)) { - self.state = .start_after_xml_decl; - return .ok; - }, - - .start_after_bom => if (c == '<') { - self.state = .start_unknown_start; - return .ok; - } else if (syntax.isSpace(c)) { - self.state = .start_after_xml_decl; - return .ok; - }, - - .start_unknown_start => if (syntax.isNameStartChar(c)) { - self.state = .element_start_name; - self.state_data.start = self.pos; - return .ok; - } else if (c == '?') { - self.state = .pi_or_xml_decl_start; - self.state_data.start = self.pos + len; - self.state_data.left = "xml"; - return .ok; - } else if (c == '!') { - self.state = .unknown_start_bang; - return .ok; - }, - - .pi_or_xml_decl_start => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .pi_or_xml_decl_start_after_xml; - // self.state_data.start = self.state_data.start; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - } else if (syntax.isNameStartChar(c) or (syntax.isNameChar(c) and self.pos > self.state_data.start)) { - self.state = .pi_target; - // self.state_data.start = self.state_data.start; - return .ok; - } else if (syntax.isSpace(c) and self.pos > self.state_data.start) { - const target = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .pi_after_target; - self.token_data = .{ .pi_start = .{ .target = target } }; - return .pi_start; - } else if (c == '?' 
and self.pos > self.state_data.start) { - const target = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .pi_maybe_end; - self.state_data.start = self.pos; - self.state_data.end = self.pos; - self.token_data = .{ .pi_start = .{ .target = target } }; - return .pi_start; - }, - - .pi_or_xml_decl_start_after_xml => if (syntax.isSpace(c)) { - self.state = .xml_decl; - return .ok; - } else if (syntax.isNameChar(c)) { - self.state = .pi_target; - // self.state_data.start = self.state_data.start; - return .ok; - }, - - .xml_decl => if (syntax.isSpace(c)) { - return .ok; - } else if (c == 'v') { - self.state = .xml_decl_version_name; - self.state_data.left = "ersion"; - return .ok; - }, - - .xml_decl_version_name => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .xml_decl_after_version_name; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - }, - - .xml_decl_after_version_name => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '=') { - self.state = .xml_decl_after_version_equals; - return .ok; - }, - - .xml_decl_after_version_equals => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '"' or c == '\'') { - self.state = .xml_decl_version_value_start; - self.state_data.start = self.pos + len; - self.state_data.quote = @intCast(c); - self.state_data.left = "1."; - return .ok; - }, - - .xml_decl_version_value_start => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .xml_decl_version_value; - // self.state_data.start = self.state_data.start; - // self.state_data.quote = self.state_data.quote; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - }, - - .xml_decl_version_value => if (c == self.state_data.quote and self.pos > self.state_data.start + "1.".len) { - self.state = .xml_decl_after_version_value; - self.state_data.version = .{ .start = self.state_data.start, .end = self.pos }; - return 
.ok; - } else if (syntax.isDigit(c)) { - return .ok; - }, - - .xml_decl_after_version_value => if (syntax.isSpace(c)) { - self.state = .xml_decl_after_version; - // self.state_data.version = self.state_data.version; - return .ok; - } else if (c == '?') { - const version = self.state_data.version; - self.state = .xml_decl_end; - self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = null, .standalone = null } }; - return .xml_declaration; - }, - - .xml_decl_after_version => if (syntax.isSpace(c)) { - return .ok; - } else if (c == 'e') { - self.state = .xml_decl_encoding_name; - // self.state_data.version = self.state_data.version; - self.state_data.left = "ncoding"; - return .ok; - } else if (c == 's') { - self.state = .xml_decl_standalone_name; - // self.state_data.version = self.state_data.version; - self.state_data.encoding = null; - self.state_data.left = "tandalone"; - return .ok; - } else if (c == '?') { - const version = self.state_data.version; - self.state = .xml_decl_end; - self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = null, .standalone = null } }; - return .xml_declaration; - }, - - .xml_decl_encoding_name => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .xml_decl_after_encoding_name; - // self.state_data.version = self.state_data.version; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - }, - - .xml_decl_after_encoding_name => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '=') { - self.state = .xml_decl_after_encoding_equals; - // self.state_data.version = self.state_data.version; - return .ok; - }, - - .xml_decl_after_encoding_equals => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '"' or c == '\'') { - self.state = .xml_decl_encoding_value_start; - // self.state_data.version = self.state_data.version; - self.state_data.start = self.pos + len; - self.state_data.quote = @as(u8, @intCast(c)); - return .ok; - 
}, - - .xml_decl_encoding_value_start => if (syntax.isEncodingStartChar(c)) { - self.state = .xml_decl_encoding_value; - // self.state_data.version = self.state_data.version; - // self.state_data.start = self.state_data.start; - // self.state_data.quote = self.state_data.quote; - return .ok; - }, - - .xml_decl_encoding_value => if (c == self.state_data.quote) { - self.state = .xml_decl_after_encoding_value; - // self.state_data.version = self.state_data.version; - self.state_data.encoding = .{ .start = self.state_data.start, .end = self.pos }; - return .ok; - } else if (syntax.isEncodingChar(c)) { - return .ok; - }, - - .xml_decl_after_encoding_value => if (syntax.isSpace(c)) { - self.state = .xml_decl_after_encoding; - // self.state_data.version = self.state_data.version; - // self.state_data.encoding = self.state_data.encoding; - return .ok; - } else if (c == '?') { - const version = self.state_data.version; - const encoding = self.state_data.encoding; - self.state = .xml_decl_end; - self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = null } }; - return .xml_declaration; - }, - - .xml_decl_after_encoding => if (syntax.isSpace(c)) { - return .ok; - } else if (c == 's') { - self.state = .xml_decl_standalone_name; - // self.state_data.version = self.state_data.version; - // self.state_data.encoding = self.state_data.encoding; - self.state_data.left = "tandalone"; - return .ok; - } else if (c == '?') { - const version = self.state_data.version; - const encoding = self.state_data.encoding; - self.state = .xml_decl_end; - self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = null } }; - return .xml_declaration; - }, - - .xml_decl_standalone_name => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .xml_decl_after_standalone_name; - // self.state_data.version = self.state_data.version; - // self.state_data.encoding = 
self.state_data.encoding; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - }, - - .xml_decl_after_standalone_name => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '=') { - self.state = .xml_decl_after_standalone_equals; - // self.state_data.version = self.state_data.version; - // self.state_data.encoding = self.state_data.encoding; - return .ok; - }, - - .xml_decl_after_standalone_equals => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '"' or c == '\'') { - self.state = .xml_decl_standalone_value_start; - // self.state_data.version = self.state_data.version; - // self.state_data.encoding = self.state_data.encoding; - self.state_data.quote = @as(u8, @intCast(c)); - return .ok; - }, - - .xml_decl_standalone_value_start => if (c == 'y') { - const version = self.state_data.version; - const encoding = self.state_data.encoding; - self.state = .xml_decl_standalone_value; - // self.state_data.quote = self.state_data.quote; - self.state_data.left = "es"; - self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = true } }; - return .xml_declaration; - } else if (c == 'n') { - const version = self.state_data.version; - const encoding = self.state_data.encoding; - self.state = .xml_decl_standalone_value; - // self.state_data.quote = self.state_data.quote; - self.state_data.left = "o"; - self.token_data = .{ .xml_declaration = .{ .version = version, .encoding = encoding, .standalone = false } }; - return .xml_declaration; - }, - - .xml_decl_standalone_value => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .xml_decl_standalone_value_end; - // self.state_data.quote = self.state_data.quote; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - }, - - .xml_decl_standalone_value_end => if (c == self.state_data.quote) { - self.state = .xml_decl_after_standalone; - return .ok; - }, - - .xml_decl_after_standalone => 
if (syntax.isSpace(c)) { - return .ok; - } else if (c == '?') { - self.state = .xml_decl_end; - return .ok; - }, - - .xml_decl_end => if (c == '>') { - self.state = .start_after_xml_decl; - return .ok; - }, - - .start_after_xml_decl => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '<') { - self.state = .unknown_start; - return .ok; - }, - - .doctype_start => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - return error.DoctypeNotSupported; - } else { - self.state_data.left = self.state_data.left[1..]; - return .ok; - } - }, - - .document_content => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '<') { - self.state = .unknown_start; - return .ok; - }, - - .unknown_start => if (syntax.isNameStartChar(c) and !self.seen_root_element) { - self.state = .element_start_name; - self.state_data.start = self.pos; - return .ok; - } else if (c == '/' and self.depth > 0) { - self.state = .element_end; - return .ok; - } else if (c == '!') { - self.state = .unknown_start_bang; - return .ok; - } else if (c == '?') { - self.state = .pi; - return .ok; - }, - - .unknown_start_bang => if (c == '-') { - self.state = .comment_before_start; - return .ok; - } else if (self.depth > 0 and c == '[') { - // Textual content is not allowed outside the root element. 
- self.state = .cdata_before_start; - self.state_data.left = "CDATA["; - return .ok; - } else if (self.depth == 0 and !self.seen_root_element and c == 'D') { - self.state = .doctype_start; - self.state_data.left = "OCTYPE "; - return .ok; - }, - - .comment_before_start => if (c == '-') { - self.state = .comment; - self.state_data.start = self.pos + len; - self.token_data = .{ .comment_start = {} }; - return .comment_start; - }, - - .comment => if (c == '-') { - self.state = .comment_maybe_before_end; - // self.state_data.start = self.state_data.start; - self.state_data.end = self.pos; - return .ok; - } else if (syntax.isChar(c)) { - return .ok; - }, - - .comment_maybe_before_end => if (c == '-') { - const content = Range{ .start = self.state_data.start, .end = self.state_data.end }; - self.state = .comment_before_end; - self.token_data = .{ .comment_content = .{ .content = content, .final = true } }; - return .comment_content; - } else if (syntax.isChar(c)) { - self.state = .comment; - // self.state_data.start = self.state_data.start; - return .ok; - }, - - .comment_before_end => if (c == '>') { - if (self.depth == 0) { - self.state = .document_content; - } else { - self.state = .content; - self.state_data.start = self.pos + len; - } - return .ok; - }, - - .pi => if (syntax.isNameStartChar(c)) { - self.state = .pi_target; - self.state_data.start = self.pos; - return .ok; - }, - - .pi_target => if (syntax.isNameChar(c)) { - return .ok; - } else if (syntax.isSpace(c)) { - const target = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .pi_after_target; - self.token_data = .{ .pi_start = .{ .target = target } }; - return .pi_start; - } else if (c == '?') { - const target = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .pi_maybe_end; - self.state_data.start = self.pos; - self.state_data.end = self.pos; - self.token_data = .{ .pi_start = .{ .target = target } }; - return .pi_start; - }, - - .pi_after_target => if 
(syntax.isSpace(c)) { - return .ok; - } else if (syntax.isChar(c)) { - self.state = .pi_content; - self.state_data.start = self.pos; - return .ok; - } else if (c == '?') { - self.state = .pi_maybe_end; - self.state_data.start = self.pos; - self.state_data.end = self.pos; - return .ok; - }, - - .pi_content => if (c == '?') { - self.state = .pi_maybe_end; - // self.state_data.start = self.state_data.start; - self.state_data.end = self.pos; - return .ok; - } else if (syntax.isChar(c)) { - return .ok; - }, - - .pi_maybe_end => if (c == '>') { - const content = Range{ .start = self.state_data.start, .end = self.state_data.end }; - if (self.depth == 0) { - self.state = .document_content; - } else { - self.state = .content; - self.state_data.start = self.pos + len; - } - self.token_data = .{ .pi_content = .{ .content = content, .final = true } }; - return .pi_content; - } else if (syntax.isChar(c)) { - self.state = .pi_content; - // self.state_data.start = self.state_data.start; - return .ok; - }, - - .cdata_before_start => if (c == self.state_data.left[0]) { - if (self.state_data.left.len == 1) { - self.state = .cdata; - self.state_data.start = self.pos + len; - } else { - self.state_data.left = self.state_data.left[1..]; - } - return .ok; - }, - - .cdata => if (c == ']') { - self.state = .cdata_maybe_before_end; - // self.state_data.start = self.state_data.start; - self.state_data.end = self.pos; - return .ok; - } else if (syntax.isChar(c)) { - return .ok; - }, - - .cdata_maybe_before_end => if (c == ']') { - self.state = .cdata_maybe_end; - // self.state_data.start = self.state_data.start; - // self.state_data.end = self.state_data.end; - return .ok; - } else if (syntax.isChar(c)) { - self.state = .cdata; - // self.state_data.start = self.state_data.start; - return .ok; - }, - - .cdata_maybe_end => if (c == ']') { - // For every ']' after two have been encountered, the end - // position is incremented so only the final ']]>' marks the end of - // CDATA. 
- self.state_data.end += 1; - return .ok; - } else if (c == '>') { - const text = Range{ .start = self.state_data.start, .end = self.state_data.end }; - self.state = .content; - self.state_data.start = self.pos + len; - self.token_data = .{ .element_content = .{ .content = .{ .text = text } } }; - return .element_content; - } else if (syntax.isChar(c)) { - self.state = .cdata; - // self.state_data.start = self.state_data.start; - return .ok; - }, - - .element_start_name => if (syntax.isNameChar(c)) { - return .ok; - } else if (syntax.isSpace(c)) { - self.depth += 1; - const name = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .element_start_after_name; - self.token_data = .{ .element_start = .{ .name = name } }; - return .element_start; - } else if (c == '/') { - self.depth += 1; - const name = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .element_start_empty; - self.token_data = .{ .element_start = .{ .name = name } }; - return .element_start; - } else if (c == '>') { - self.depth += 1; - const name = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .content; - self.state_data.start = self.pos + len; - self.token_data = .{ .element_start = .{ .name = name } }; - return .element_start; - }, - - .element_start_after_name => if (syntax.isSpace(c)) { - return .ok; - } else if (syntax.isNameStartChar(c)) { - self.state = .attribute_name; - self.state_data.start = self.pos; - return .ok; - } else if (c == '/') { - self.state = .element_start_empty; - return .ok; - } else if (c == '>') { - self.state = .content; - self.state_data.start = self.pos + len; - return .ok; - }, - - .element_start_empty => if (c == '>') { - self.depth -= 1; - if (self.depth == 0) { - self.seen_root_element = true; - } - if (self.depth == 0) { - self.state = .document_content; - } else { - self.state = .content; - self.state_data.start = self.pos + len; - } - self.token_data = .{ .element_end_empty = {} }; - return 
.element_end_empty; - }, - - .attribute_name => if (syntax.isNameChar(c)) { - return .ok; - } else if (syntax.isSpace(c)) { - const name = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .attribute_after_name; - self.token_data = .{ .attribute_start = .{ .name = name } }; - return .attribute_start; - } else if (c == '=') { - const name = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .attribute_after_equals; - self.token_data = .{ .attribute_start = .{ .name = name } }; - return .attribute_start; - }, - - .attribute_after_name => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '=') { - self.state = .attribute_after_equals; - return .ok; - }, - - .attribute_after_equals => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '"' or c == '\'') { - self.state = .attribute_content; - self.state_data.start = self.pos + len; - self.state_data.quote = @as(u8, @intCast(c)); - return .ok; - }, - - .attribute_content => if (c == self.state_data.quote) { - const text = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .attribute_after_content; - self.token_data = .{ .attribute_content = .{ .content = .{ .text = text }, .final = true } }; - return .attribute_content; - } else if (c == '&') { - const text = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .attribute_content_ref_start; - // self.state_data.quote = self.state_data.quote; - if (text.isEmpty()) { - // We do not want to emit an empty text content token between entities - return .ok; - } else { - self.token_data = .{ .attribute_content = .{ .content = .{ .text = text } } }; - return .attribute_content; - } - } else if (c != '<' and syntax.isChar(c)) { - return .ok; - }, - - .attribute_content_ref_start => if (syntax.isNameStartChar(c)) { - self.state = .attribute_content_entity_ref_name; - self.state_data.start = self.pos; - // self.state_data.quote = self.state_data.quote; - return .ok; - } else if (c == 
'#') { - self.state = .attribute_content_char_ref_start; - // self.state_data.quote = self.state_data.quote; - return .ok; - }, - - .attribute_content_entity_ref_name => if (syntax.isNameChar(c)) { - return .ok; - } else if (c == ';') { - const entity = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .attribute_content; - self.state_data.start = self.pos + len; - // self.state_data.quote = self.state_data.quote; - self.token_data = .{ .attribute_content = .{ .content = .{ .entity = entity } } }; - return .attribute_content; - }, - - .attribute_content_char_ref_start => if (syntax.isDigit(c)) { - self.state = .attribute_content_char_ref; - self.state_data.hex = false; - self.state_data.value = syntax.digitValue(c); - // self.state_data.quote = self.state_data.quote; - return .ok; - } else if (c == 'x') { - self.state = .attribute_content_char_ref; - self.state_data.hex = true; - self.state_data.value = 0; - // self.state_data.quote = self.state_data.quote; - return .ok; - }, - - .attribute_content_char_ref => if (!self.state_data.hex and syntax.isDigit(c)) { - const value = 10 * @as(u32, self.state_data.value) + syntax.digitValue(c); - if (value > std.math.maxInt(u21)) { - return error.InvalidCharacterReference; - } - self.state_data.value = @as(u21, @intCast(value)); - return .ok; - } else if (self.state_data.hex and syntax.isHexDigit(c)) { - const value = 16 * @as(u32, self.state_data.value) + syntax.hexDigitValue(c); - if (value > std.math.maxInt(u21)) { - return error.InvalidCharacterReference; - } - self.state_data.value = @as(u21, @intCast(value)); - return .ok; - } else if (c == ';') { - const codepoint = self.state_data.value; - if (!syntax.isChar(codepoint)) { - return error.InvalidCharacterReference; - } - self.state = .attribute_content; - self.state_data.start = self.pos + len; - // self.state_data.quote = self.state_data.quote; - self.token_data = .{ .attribute_content = .{ .content = .{ .codepoint = codepoint } } }; - return 
.attribute_content; - }, - - .attribute_after_content => if (syntax.isSpace(c)) { - self.state = .element_start_after_name; - return .ok; - } else if (c == '/') { - self.state = .element_start_empty; - return .ok; - } else if (c == '>') { - self.state = .content; - self.state_data.start = self.pos + len; - return .ok; - }, - - .element_end => if (syntax.isNameStartChar(c)) { - self.state = .element_end_name; - self.state_data.start = self.pos; - return .ok; - }, - - .element_end_name => if (syntax.isNameChar(c)) { - return .ok; - } else if (syntax.isSpace(c)) { - self.depth -= 1; - const name = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .element_end_after_name; - self.token_data = .{ .element_end = .{ .name = name } }; - return .element_end; - } else if (c == '>') { - self.depth -= 1; - if (self.depth == 0) { - self.seen_root_element = true; - } - const name = Range{ .start = self.state_data.start, .end = self.pos }; - if (self.depth == 0) { - self.state = .document_content; - } else { - self.state = .content; - self.state_data.start = self.pos + len; - } - self.token_data = .{ .element_end = .{ .name = name } }; - return .element_end; - }, - - .element_end_after_name => if (syntax.isSpace(c)) { - return .ok; - } else if (c == '>') { - if (self.depth == 0) { - self.seen_root_element = true; - } - if (self.depth == 0) { - self.state = .document_content; - } else { - self.state = .content; - self.state_data.start = self.pos + len; - } - return .ok; - }, - - inline .content, - .content_cdata_maybe_before_end, - .content_cdata_maybe_end, - => |state| if (c == ']') { - switch (state) { - .content => { - self.state = .content_cdata_maybe_before_end; - // self.state_data.start = self.state_data.start; - }, - .content_cdata_maybe_before_end => { - self.state = .content_cdata_maybe_end; - // self.state_data.start = self.state_data.start; - }, - else => {}, - } - return .ok; - } else if (state == .content_cdata_maybe_end and c == ']') { - return 
.ok; - } else if (state == .content_cdata_maybe_end and c == '>') { - return error.SyntaxError; - } else if (c == '<') { - const text = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .unknown_start; - if (text.isEmpty()) { - // Do not report empty text content between elements, e.g. - // (there is no text content between or - // within e1 and e2). - return .ok; - } else { - self.token_data = .{ .element_content = .{ .content = .{ .text = text } } }; - return .element_content; - } - } else if (c == '&') { - const text = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .content_ref_start; - if (text.isEmpty()) { - return .ok; - } else { - self.token_data = .{ .element_content = .{ .content = .{ .text = text } } }; - return .element_content; - } - } else if (syntax.isChar(c)) { - if (state != .content) { - self.state = .content; - // self.state_data.start = self.state_data.start; - } - return .ok; - }, - - .content_ref_start => if (syntax.isNameStartChar(c)) { - self.state = .content_entity_ref_name; - self.state_data.start = self.pos; - return .ok; - } else if (c == '#') { - self.state = .content_char_ref_start; - return .ok; - }, - - .content_entity_ref_name => if (syntax.isNameChar(c)) { - return .ok; - } else if (c == ';') { - const entity = Range{ .start = self.state_data.start, .end = self.pos }; - self.state = .content; - self.state_data.start = self.pos + len; - self.token_data = .{ .element_content = .{ .content = .{ .entity = entity } } }; - return .element_content; - }, - - .content_char_ref_start => if (syntax.isDigit(c)) { - self.state = .content_char_ref; - self.state_data.hex = false; - self.state_data.value = syntax.digitValue(c); - return .ok; - } else if (c == 'x') { - self.state = .content_char_ref; - self.state_data.hex = true; - self.state_data.value = 0; - return .ok; - }, - - .content_char_ref => if (!self.state_data.hex and syntax.isDigit(c)) { - const value = 10 * @as(u32, 
self.state_data.value) + syntax.digitValue(c); - if (value > std.math.maxInt(u21)) { - return error.InvalidCharacterReference; - } - self.state_data.value = @as(u21, @intCast(value)); - return .ok; - } else if (self.state_data.hex and syntax.isHexDigit(c)) { - const value = 16 * @as(u32, self.state_data.value) + syntax.hexDigitValue(c); - if (value > std.math.maxInt(u21)) { - return error.InvalidCharacterReference; - } - self.state_data.value = @as(u21, @intCast(value)); - return .ok; - } else if (c == ';') { - const codepoint = self.state_data.value; - if (!syntax.isChar(codepoint)) { - return error.InvalidCharacterReference; - } - self.state = .content; - self.state_data.start = self.pos + len; - self.token_data = .{ .element_content = .{ .content = .{ .codepoint = codepoint } } }; - return .element_content; - }, - - .@"error" => return error.SyntaxError, - } - - return error.SyntaxError; -} - -/// Signals that there is no further input to scan, and returns an error if -/// the scanner is not in a valid state to handle this (for example, if this -/// is called while in the middle of element content). -pub fn endInput(self: *Scanner) error{UnexpectedEndOfInput}!void { - if (self.state != .document_content or !self.seen_root_element) { - return error.UnexpectedEndOfInput; - } -} - -test Scanner { - try testValid( - \\ - \\ - \\ - \\ - \\ - \\

Hello,

- \\ - \\ - \\ Text content goes here. - \\

&

- \\
- \\ - \\ - \\ - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 } } }, - .{ .pi_start = .{ .target = .{ .start = 24, .end = 31 } } }, // some-pi - .{ .pi_content = .{ .content = .{ .start = 31, .end = 31 }, .final = true } }, - .comment_start, - .{ .comment_content = .{ .content = .{ .start = 38, .end = 85 }, .final = true } }, - .{ .pi_start = .{ .target = .{ .start = 91, .end = 111 } } }, // some-pi-with-content - .{ .pi_content = .{ .content = .{ .start = 112, .end = 119 }, .final = true } }, - .{ .element_start = .{ .name = .{ .start = 123, .end = 127 } } }, // root - .{ .element_content = .{ .content = .{ .text = .{ .start = 128, .end = 131 } } } }, - .{ .element_start = .{ .name = .{ .start = 132, .end = 133 } } }, // p - .{ .attribute_start = .{ .name = .{ .start = 134, .end = 139 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 141, .end = 145 } }, .final = true } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 147, .end = 154 } } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 163, .end = 169 } } } }, - .{ .element_end = .{ .name = .{ .start = 174, .end = 175 } } }, // /p - .{ .element_content = .{ .content = .{ .text = .{ .start = 176, .end = 179 } } } }, - .{ .element_start = .{ .name = .{ .start = 180, .end = 184 } } }, // line - .element_end_empty, - .{ .element_content = .{ .content = .{ .text = .{ .start = 187, .end = 190 } } } }, - .{ .pi_start = .{ .target = .{ .start = 192, .end = 202 } } }, // another-pi - .{ .pi_content = .{ .content = .{ .start = 202, .end = 202 }, .final = true } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 204, .end = 233 } } } }, - .{ .element_start = .{ .name = .{ .start = 234, .end = 237 } } }, // div - .{ .element_start = .{ .name = .{ .start = 239, .end = 240 } } }, // p - .{ .element_content = .{ .content = .{ .entity = .{ .start = 242, .end = 245 } } } }, - .{ .element_end = .{ .name = .{ .start = 248, .end = 249 
} } }, // /p - .{ .element_end = .{ .name = .{ .start = 252, .end = 255 } } }, // /div - .{ .element_content = .{ .content = .{ .text = .{ .start = 256, .end = 257 } } } }, - .{ .element_end = .{ .name = .{ .start = 259, .end = 263 } } }, // /root - .comment_start, - .{ .comment_content = .{ .content = .{ .start = 269, .end = 325 }, .final = true } }, - .{ .pi_start = .{ .target = .{ .start = 332, .end = 339 } } }, // comment - .{ .pi_content = .{ .content = .{ .start = 340, .end = 351 }, .final = true } }, - }); -} - -test "BOM" { - try testValid("\u{FEFF}", &.{ - .{ .element_start = .{ .name = .{ .start = 4, .end = 11 } } }, - .element_end_empty, - }); -} - -test "empty root element" { - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .element_end_empty, - }); - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .element_end_empty, - }); -} - -test "root element with no content" { - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_end = .{ .name = .{ .start = 11, .end = 18 } } }, - }); -} - -test "element content" { - try testValid("Hello, world!", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 22 } } } }, - .{ .element_end = .{ .name = .{ .start = 24, .end = 31 } } }, - }); -} - -test "element nesting" { - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 5 } } }, - .{ .element_start = .{ .name = .{ .start = 7, .end = 10 } } }, - .{ .element_start = .{ .name = .{ .start = 12, .end = 17 } } }, - .element_end_empty, - .{ .element_end = .{ .name = .{ .start = 21, .end = 24 } } }, - .{ .element_end = .{ .name = .{ .start = 27, .end = 31 } } }, - }); - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 5 } } }, - .{ .element_start = .{ .name = .{ .start = 10, .end = 13 } } }, - .{ .element_start = 
.{ .name = .{ .start = 16, .end = 21 } } }, - .element_end_empty, - .{ .element_end = .{ .name = .{ .start = 26, .end = 29 } } }, - .{ .element_end = .{ .name = .{ .start = 33, .end = 37 } } }, - }); - try testInvalid("", error.SyntaxError, 14); - try testInvalid("", error.SyntaxError, 16); - try testIncomplete(""); - try testIncomplete(""); -} - -test "XML declaration" { - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 } } }, - .{ .element_start = .{ .name = .{ .start = 23, .end = 27 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 } } }, - .{ .element_start = .{ .name = .{ .start = 25, .end = 29 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 } } }, - .{ .element_start = .{ .name = .{ .start = 23, .end = 27 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 20 } } }, - .{ .element_start = .{ .name = .{ .start = 25, .end = 29 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } }, - .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 }, .encoding = .{ .start = 34, .end = 39 } } }, - .{ .element_start = .{ .name = .{ .start = 44, .end = 48 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } }, - .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, 
.encoding = .{ .start = 30, .end = 35 } } }, - .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 } } }, - .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .standalone = true } }, - .{ .element_start = .{ .name = .{ .start = 40, .end = 44 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .standalone = false } }, - .{ .element_start = .{ .name = .{ .start = 39, .end = 43 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 }, .standalone = true } }, - .{ .element_start = .{ .name = .{ .start = 44, .end = 48 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 15, .end = 18 }, .encoding = .{ .start = 30, .end = 35 }, .standalone = true } }, - .{ .element_start = .{ .name = .{ .start = 57, .end = 61 } } }, - .element_end_empty, - }); - try testValid( - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = .{ .start = 17, .end = 20 }, .encoding = .{ .start = 34, .end = 39 }, .standalone = true } }, - .{ .element_start = .{ .name = .{ .start = 63, .end = 67 } } }, - .element_end_empty, - }); - try testInvalid("", error.SyntaxError, 19); - try testInvalid("", error.SyntaxError, 36); -} - -test "doctype" { - try testInvalid("", error.DoctypeNotSupported, 9); - try testInvalid("", error.DoctypeNotSupported, 30); - try testInvalid("", error.SyntaxError, 10); - try testInvalid("", error.SyntaxError, 8); -} - -test "CDATA" { - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ 
.element_content = .{ .content = .{ .text = .{ .start = 18, .end = 20 } } } }, - .{ .element_end = .{ .name = .{ .start = 25, .end = 32 } } }, - }); - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 18, .end = 21 } } } }, - .{ .element_end = .{ .name = .{ .start = 26, .end = 33 } } }, - }); - try testValid("]]]]]]]>", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 18, .end = 27 } } } }, - .{ .element_end = .{ .name = .{ .start = 32, .end = 39 } } }, - }); -} - -test "references" { - try testValid( - \\<Hi!!> - , &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .attribute_start = .{ .name = .{ .start = 9, .end = 18 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 20, .end = 25 } } } }, - .{ .attribute_content = .{ .content = .{ .codepoint = 0x2C } } }, - .{ .attribute_content = .{ .content = .{ .codepoint = 32 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 36, .end = 42 } } } }, - .{ .attribute_content = .{ .content = .{ .entity = .{ .start = 43, .end = 46 } } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 47, .end = 56 } }, .final = true } }, - .{ .element_content = .{ .content = .{ .entity = .{ .start = 59, .end = 61 } } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 62, .end = 64 } } } }, - .{ .element_content = .{ .content = .{ .codepoint = 33 } } }, - .{ .element_content = .{ .content = .{ .codepoint = 0x21 } } }, - .{ .element_content = .{ .content = .{ .entity = .{ .start = 76, .end = 78 } } } }, - .{ .element_end = .{ .name = .{ .start = 81, .end = 88 } } }, - }); -} - -test "PI at document start" { - try testValid("", &.{ - .{ .pi_start = .{ .target = .{ .start = 2, .end = 9 } } }, - .{ .pi_content = .{ .content = .{ .start = 9, .end = 9 }, .final = true } }, - .{ 
.element_start = .{ .name = .{ .start = 12, .end = 16 } } }, - .element_end_empty, - }); - try testValid("", &.{ - .{ .pi_start = .{ .target = .{ .start = 2, .end = 4 } } }, - .{ .pi_content = .{ .content = .{ .start = 4, .end = 4 }, .final = true } }, - .{ .element_start = .{ .name = .{ .start = 7, .end = 11 } } }, - .element_end_empty, - }); - try testValid("", &.{ - .{ .pi_start = .{ .target = .{ .start = 2, .end = 6 } } }, - .{ .pi_content = .{ .content = .{ .start = 6, .end = 6 }, .final = true } }, - .{ .element_start = .{ .name = .{ .start = 9, .end = 13 } } }, - .element_end_empty, - }); -} - -test "invalid top-level text" { - try testInvalid("Hello, world!", error.SyntaxError, 0); - try testInvalid( - \\ - \\Hello, world! - , error.SyntaxError, 22); - try testInvalid( - \\ - \\Hello, world! - , error.SyntaxError, 9); -} - -test "invalid XML declaration" { - try testInvalid("", error.SyntaxError, 5); - try testInvalid("", error.SyntaxError, 2); - try testInvalid("", error.SyntaxError, 37); - try testInvalid("", error.SyntaxError, 15); - try testInvalid("", error.SyntaxError, 17); - try testInvalid("", error.SyntaxError, 16); - try testInvalid("", error.SyntaxError, 15); - try testInvalid("", error.SyntaxError, 30); - try testInvalid("", error.SyntaxError, 30); - try testInvalid("", error.SyntaxError, 34); - try testInvalid("", error.SyntaxError, 33); - try testInvalid("&", error.SyntaxError, 10); - try testInvalid("&", error.SyntaxError, 13); - try testInvalid("&#ABC;", error.SyntaxError, 11); - try testInvalid(" C;", error.SyntaxError, 13); - try testInvalid("&#xxx;", error.SyntaxError, 12); - try testInvalid("", error.InvalidCharacterReference, 12); - try testInvalid("", error.InvalidCharacterReference, 18); - try testInvalid("", error.InvalidCharacterReference, 16); - try testInvalid("", error.InvalidCharacterReference, 18); - try testInvalid("", error.SyntaxError, 16); - try testInvalid("", error.SyntaxError, 19); - try testInvalid("", 
error.SyntaxError, 17); - try testInvalid("", error.SyntaxError, 19); - try testInvalid("", error.SyntaxError, 18); - try testInvalid("", error.InvalidCharacterReference, 18); - try testInvalid("", error.InvalidCharacterReference, 24); - try testInvalid("", error.InvalidCharacterReference, 22); - try testInvalid("", error.InvalidCharacterReference, 24); -} - -test "invalid content" { - try testInvalid("Illegal: ]]>", error.SyntaxError, 20); - try testInvalid("Also illegal: ]]]>", error.SyntaxError, 26); - try testValid("]]>", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 11 } } } }, - .{ .element_content = .{ .content = .{ .entity = .{ .start = 12, .end = 14 } } } }, - .{ .element_end = .{ .name = .{ .start = 17, .end = 24 } } }, - }); - try testValid("[lol]
[lmao]
", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 14 } } } }, - .{ .element_start = .{ .name = .{ .start = 15, .end = 17 } } }, - .element_end_empty, - .{ .element_content = .{ .content = .{ .text = .{ .start = 19, .end = 25 } } } }, - .{ .element_end = .{ .name = .{ .start = 27, .end = 34 } } }, - }); -} - -test "attributes" { - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .attribute_start = .{ .name = .{ .start = 9, .end = 14 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 16, .end = 17 } }, .final = true } }, - .{ .attribute_start = .{ .name = .{ .start = 19, .end = 24 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 26, .end = 27 } }, .final = true } }, - .element_end_empty, - }); - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .attribute_start = .{ .name = .{ .start = 9, .end = 14 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 16, .end = 17 } }, .final = true } }, - .{ .attribute_start = .{ .name = .{ .start = 19, .end = 24 } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 26, .end = 27 } }, .final = true } }, - .element_end_empty, - }); - try testInvalid("", error.SyntaxError, 18); - try testInvalid("", error.SyntaxError, 13); - - try testInvalid("", error.SyntaxError, 15); - try testValid("", &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .attribute_start = .{ .name = .{ .start = 9, .end = 13 } } }, - .{ .attribute_content = .{ .content = .{ .entity = .{ .start = 16, .end = 18 } } } }, - .{ .attribute_content = .{ .content = .{ .entity = .{ .start = 20, .end = 22 } } } }, - .{ .attribute_content = .{ .content = .{ .text = .{ .start = 23, .end = 23 } }, .final = true } }, - .element_end_empty, - }); -} - -test "missing root element" { - try testIncomplete(""); - try 
testIncomplete(""); -} - -test "incomplete document" { - try testIncomplete("<"); - try testIncomplete(""); - try testIncomplete(" .ok, - - // States which contain positional information but cannot immediately - // be emitted as a token cannot be reset - .pi_or_xml_decl_start, - .pi_or_xml_decl_start_after_xml, - - .xml_decl_version_value_start, - .xml_decl_version_value, - .xml_decl_after_version_value, - .xml_decl_after_version, - .xml_decl_encoding_name, - .xml_decl_after_encoding_name, - .xml_decl_after_encoding_equals, - .xml_decl_encoding_value_start, - .xml_decl_encoding_value, - .xml_decl_after_encoding_value, - .xml_decl_after_encoding, - .xml_decl_standalone_name, - .xml_decl_after_standalone_name, - .xml_decl_after_standalone_equals, - .xml_decl_standalone_value_start, - - // None of the "maybe_end" states can be reset because we don't know if - // the resulting content token should include the possible ending - // characters until we read further to unambiguously determine whether - // the state is ending. 
- .comment_maybe_before_end, - - .pi_target, - .pi_maybe_end, - - .cdata_maybe_before_end, - .cdata_maybe_end, - - .element_start_name, - - .attribute_name, - .attribute_content_entity_ref_name, - - .element_end_name, - - .content_entity_ref_name, - => return error.CannotReset, - - // Some states (specifically, content states) can be reset by emitting - // a token with the content seen so far - .comment => token: { - const range = Range{ .start = self.state_data.start, .end = self.pos }; - self.state_data.start = 0; - if (range.isEmpty()) { - break :token .ok; - } else { - self.token_data = .{ .comment_content = .{ .content = range } }; - break :token .comment_content; - } - }, - - .pi_content => token: { - const range = Range{ .start = self.state_data.start, .end = self.pos }; - self.state_data.start = 0; - if (range.isEmpty()) { - break :token .ok; - } else { - self.token_data = .{ .pi_content = .{ .content = range } }; - break :token .pi_content; - } - }, - - .cdata => token: { - const range = Range{ .start = self.state_data.start, .end = self.pos }; - self.state_data.start = 0; - if (range.isEmpty()) { - break :token .ok; - } else { - self.token_data = .{ .element_content = .{ .content = .{ .text = range } } }; - break :token .element_content; - } - }, - - .attribute_content => token: { - const range = Range{ .start = self.state_data.start, .end = self.pos }; - self.state_data.start = 0; - if (range.isEmpty()) { - break :token .ok; - } else { - self.token_data = .{ .attribute_content = .{ .content = .{ .text = range } } }; - break :token .attribute_content; - } - }, - - .content, .content_cdata_maybe_before_end, .content_cdata_maybe_end => token: { - const range = Range{ .start = self.state_data.start, .end = self.pos }; - self.state_data.start = 0; - if (range.isEmpty()) { - break :token .ok; - } else { - self.token_data = .{ .element_content = .{ .content = .{ .text = range } } }; - break :token .element_content; - } - }, - }; - self.pos = 0; - return token; 
-} - -test resetPos { - var scanner = Scanner{}; - var tokens = std.ArrayListUnmanaged(Token.Full){}; - defer tokens.deinit(testing.allocator); - - for ("Hello,") |c| { - switch (try scanner.next(c, 1)) { - .ok => {}, - else => |token| try tokens.append(testing.allocator, scanner.fullToken(token)), - } - } - try tokens.append(testing.allocator, scanner.fullToken(try scanner.resetPos())); - for (" world!") |c| { - switch (try scanner.next(c, 1)) { - .ok => {}, - else => |token| try tokens.append(testing.allocator, scanner.fullToken(token)), - } - } - - try testing.expectEqualSlices(Token.Full, &.{ - .{ .element_start = .{ .name = .{ .start = 1, .end = 8 } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 9, .end = 15 } } } }, - .{ .element_content = .{ .content = .{ .text = .{ .start = 0, .end = 7 } } } }, - .{ .element_end = .{ .name = .{ .start = 9, .end = 16 } } }, - }, tokens.items); -} - -test "resetPos inside element reference name" { - var scanner = Scanner{}; - - for ("Hello, world &am") |c| { - _ = try scanner.next(c, 1); - } - try testing.expectError(error.CannotReset, scanner.resetPos()); -} diff --git a/src/Writer.zig b/src/Writer.zig new file mode 100644 index 0000000..45a64ad --- /dev/null +++ b/src/Writer.zig @@ -0,0 +1,198 @@ +const std = @import("std"); +const assert = std.debug.assert; + +options: Options, + +state: State, +indent_level: u32, + +sink: Sink, + +const Writer = @This(); + +pub const Options = struct { + indent: []const u8 = "", +}; + +pub const Sink = struct { + context: *const anyopaque, + writeFn: *const fn (context: *const anyopaque, data: []const u8) anyerror!void, + + pub fn write(sink: *Sink, data: []const u8) anyerror!void { + return sink.writeFn(sink.context, data); + } +}; + +const State = enum { + start, + after_bom, + after_xml_declaration, + element_start, + after_structure_end, + text, + end, +}; + +pub fn init(sink: Sink, options: Options) Writer { + return .{ + .options = options, + + .state = .start, + 
.indent_level = 0, + + .sink = sink, + }; +} + +pub const WriteError = error{}; + +pub fn bom(writer: *Writer) anyerror!void { + assert(writer.state == .start); + try writer.raw("\u{FEFF}"); + writer.state = .after_bom; +} + +pub fn xmlDeclaration(writer: *Writer, encoding: ?[]const u8, standalone: ?bool) anyerror!void { + assert(writer.state == .start or writer.state == .after_bom); + try writer.raw(""); + if (writer.options.indent.len > 0) try writer.newLineAndIndent(); + writer.state = .after_xml_declaration; +} + +pub fn elementStart(writer: *Writer, name: []const u8) anyerror!void { + switch (writer.state) { + .start, .after_bom, .after_xml_declaration, .text => {}, + .element_start => { + try writer.raw(">"); + try writer.newLineAndIndent(); + }, + .after_structure_end => { + try writer.newLineAndIndent(); + }, + .end => unreachable, + } + try writer.raw("<"); + try writer.raw(name); + writer.state = .element_start; + writer.indent_level += 1; +} + +pub fn elementEnd(writer: *Writer, name: []const u8) anyerror!void { + writer.indent_level -= 1; + switch (writer.state) { + .text => {}, + .element_start => { + try writer.raw(">"); + try writer.newLineAndIndent(); + }, + .after_structure_end => { + try writer.newLineAndIndent(); + }, + .start, .after_bom, .after_xml_declaration, .end => unreachable, + } + try writer.raw(""); + writer.state = if (writer.indent_level > 0) .after_structure_end else .end; +} + +pub fn elementEndEmpty(writer: *Writer) anyerror!void { + assert(writer.state == .element_start); + try writer.raw("/>"); + writer.state = .after_structure_end; + writer.indent_level -= 1; +} + +pub fn attribute(writer: *Writer, name: []const u8, value: []const u8) anyerror!void { + assert(writer.state == .element_start); + try writer.raw(" "); + try writer.raw(name); + try writer.raw("=\""); + try writer.attributeText(value); + try writer.raw("\""); +} + +fn attributeText(writer: *Writer, s: []const u8) anyerror!void { + var pos: usize = 0; + while 
(std.mem.indexOfAnyPos(u8, s, pos, "\r\n\t&<\"")) |esc_pos| { + try writer.raw(s[pos..esc_pos]); + try writer.raw(switch (s[esc_pos]) { + '\r' => " ", + '\n' => " ", + '\t' => " ", + '&' => "&", + '<' => "<", + '"' => """, + else => unreachable, + }); + pos = esc_pos + 1; + } + try writer.raw(s[pos..]); +} + +pub fn pi(writer: *Writer, target: []const u8, data: []const u8) anyerror!void { + switch (writer.state) { + .start, .after_bom, .after_xml_declaration, .text, .end => {}, + .element_start => { + try writer.raw(">"); + try writer.newLineAndIndent(); + }, + .after_structure_end => { + try writer.newLineAndIndent(); + }, + } + try writer.raw(""); + writer.state = .after_structure_end; +} + +pub fn text(writer: *Writer, s: []const u8) anyerror!void { + switch (writer.state) { + .after_structure_end, .text => {}, + .element_start => try writer.raw(">"), + .start, .after_bom, .after_xml_declaration, .end => unreachable, + } + var pos: usize = 0; + while (std.mem.indexOfAnyPos(u8, s, pos, "\r&<")) |esc_pos| { + try writer.raw(s[pos..esc_pos]); + try writer.raw(switch (s[esc_pos]) { + '\r' => " ", + '&' => "&", + '<' => "<", + else => unreachable, + }); + pos = esc_pos + 1; + } + try writer.raw(s[pos..]); + writer.state = .text; +} + +fn newLineAndIndent(writer: *Writer) anyerror!void { + if (writer.options.indent.len == 0) return; + + try writer.raw("\n"); + var n: usize = 0; + while (n < writer.indent_level) : (n += 1) { + try writer.raw(writer.options.indent); + } +} + +fn raw(writer: *Writer, s: []const u8) anyerror!void { + try writer.sink.write(s); +} diff --git a/src/compat.zig b/src/compat.zig deleted file mode 100644 index 79a65f0..0000000 --- a/src/compat.zig +++ /dev/null @@ -1,17 +0,0 @@ -//! Compatibility wrappers for APIs changed since Zig 0.12. 
- -const std = @import("std"); - -pub fn ComptimeStringMapType(comptime V: type) type { - return if (@hasDecl(std, "ComptimeStringMap")) - type - else - std.StaticStringMap(V); -} - -pub fn ComptimeStringMap(comptime V: type, comptime kvs_list: anytype) ComptimeStringMapType(V) { - return if (@hasDecl(std, "ComptimeStringMap")) - std.ComptimeStringMap(V, kvs_list) - else - std.StaticStringMap(V).initComptime(kvs_list); -} diff --git a/src/encoding.zig b/src/encoding.zig deleted file mode 100644 index df06d4e..0000000 --- a/src/encoding.zig +++ /dev/null @@ -1,451 +0,0 @@ -//! Various encoding-related utilities. -//! -//! The central "interface" of this file is `Decoder`, which decodes XML -//! content into Unicode codepoints for further processing. It consists -//! of an error type `Error` and several declarations: -//! -//! - `const max_encoded_codepoint_len` - the maximum number of bytes a -//! single Unicode codepoint may occupy in encoded form. -//! - `fn readCodepoint(self: *Decoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error))!ReadResult` - -//! reads a single codepoint from a `std.io.GenericReader` and writes its UTF-8 -//! encoding to `buf`. Should return `error.UnexpectedEndOfInput` if a full -//! codepoint cannot be read, `error.Overflow` if the UTF-8-encoded form cannot -//! be written to `buf`; other decoder-specific errors can also be used. -//! - `fn adaptTo(self: *Decoder, encoding: []const u8) error{InvalidEncoding}!void` - -//! accepts a UTF-8-encoded encoding name and returns an error if the desired -//! encoding cannot be handled by the decoder. This is intended to support -//! `Decoder` implementations which adapt to the encoding declared by an XML -//! document. 
- -const std = @import("std"); -const ascii = std.ascii; -const testing = std.testing; -const unicode = std.unicode; -const Allocator = std.mem.Allocator; -const ArrayListUnmanaged = std.ArrayListUnmanaged; -const BoundedArray = std.BoundedArray; - -/// The result of reading a single codepoint successfully. -pub const ReadResult = packed struct(u32) { - /// The codepoint read. - codepoint: u21, - /// The length of the codepoint encoded in UTF-8. - byte_length: u10, - /// If https://github.com/ziglang/zig/issues/104 is implemented, a much - /// better API would be to make `ReadResult` a `packed struct(u31)` instead - /// and use `?ReadResult` elsewhere. But, for now, this indicates whether - /// `codepoint` and `byte_length` are present, so that the whole thing fits - /// in a `u32` rather than unnecessarily taking up 8 bytes. - present: bool = true, - - pub const none: ReadResult = .{ - .codepoint = 0, - .byte_length = 0, - .present = false, - }; -}; - -/// A decoder which handles UTF-8 or UTF-16, using a BOM to detect UTF-16 -/// endianness. -/// -/// This is the bare minimum encoding support required of a standard-compliant -/// XML parser. -pub const DefaultDecoder = struct { - state: union(enum) { - start, - utf8: Utf8Decoder, - utf16_le: Utf16Decoder(.little), - utf16_be: Utf16Decoder(.big), - } = .start, - - pub const Error = Utf8Decoder.Error || Utf16Decoder(.little).Error || Utf16Decoder(.big).Error; - - pub const max_encoded_codepoint_len = 4; - const bom = 0xFEFF; - const bom_byte_length = unicode.utf8CodepointSequenceLength(bom) catch unreachable; - - pub fn readCodepoint(self: *DefaultDecoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult { - switch (self.state) { - .start => {}, - inline else => |*inner| return inner.readCodepoint(reader, buf), - } - // If attempting to match the UTF-16 BOM fails for whatever reason, we - // will assume we are reading UTF-8. 
- self.state = .{ .utf8 = .{} }; - const b = reader.readByte() catch |e| switch (e) { - error.EndOfStream => return error.UnexpectedEndOfInput, - else => |other| return other, - }; - switch (b) { - 0xFE => { - const b2 = reader.readByte() catch |e| switch (e) { - error.EndOfStream => return error.InvalidUtf8, - else => |other| return other, - }; - if (b2 != 0xFF) return error.InvalidUtf8; - self.state = .{ .utf16_be = .{} }; - if (bom_byte_length > buf.len) return error.Overflow; - _ = unicode.utf8Encode(bom, buf) catch unreachable; - return .{ .codepoint = bom, .byte_length = bom_byte_length }; - }, - 0xFF => { - const b2 = reader.readByte() catch |e| switch (e) { - error.EndOfStream => return error.InvalidUtf8, - else => |other| return other, - }; - if (b2 != 0xFE) return error.InvalidUtf8; - self.state = .{ .utf16_le = .{} }; - if (bom_byte_length > buf.len) return error.Overflow; - _ = unicode.utf8Encode(bom, buf) catch unreachable; - return .{ .codepoint = bom, .byte_length = bom_byte_length }; - }, - else => { - // The rest of this branch is copied from Utf8Decoder - const byte_length = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8; - if (byte_length > buf.len) return error.Overflow; - buf[0] = b; - if (byte_length == 1) return .{ .codepoint = b, .byte_length = 1 }; - reader.readNoEof(buf[1..byte_length]) catch |e| switch (e) { - error.EndOfStream => return error.UnexpectedEndOfInput, - else => |other| return other, - }; - const codepoint = switch (byte_length) { - 2 => unicode.utf8Decode2(buf[0..2]), - 3 => unicode.utf8Decode3(buf[0..3]), - 4 => unicode.utf8Decode4(buf[0..4]), - else => unreachable, - } catch return error.InvalidUtf8; - return .{ .codepoint = codepoint, .byte_length = byte_length }; - }, - } - } - - pub fn adaptTo(self: *DefaultDecoder, encoding: []const u8) error{InvalidEncoding}!void { - switch (self.state) { - .start => {}, - inline else => |*decoder| try decoder.adaptTo(encoding), - } - } -}; - -test DefaultDecoder { - 
// UTF-8 no BOM - { - const input = "Hü日😀"; - var decoder = try testDecode(DefaultDecoder, input, &.{ - 'H', - 'ü', - '日', - '😀', - }); - try decoder.adaptTo("utf-8"); - try decoder.adaptTo("UTF-8"); - } - - // UTF-8 BOM - { - const input = "\u{FEFF}Hü日😀"; - var decoder = try testDecode(DefaultDecoder, input, &.{ - 0xFEFF, - 'H', - 'ü', - '日', - '😀', - }); - try decoder.adaptTo("utf-8"); - try decoder.adaptTo("UTF-8"); - } - - // Invalid UTF-8 BOM - { - const input = "\xEF\x00\x00H"; - var decoder = try testDecode(DefaultDecoder, input, &.{ - error.InvalidUtf8, - 'H', - }); - try decoder.adaptTo("utf-8"); - try decoder.adaptTo("UTF-8"); - } - - // UTF-16BE BOM - { - const input = "\xFE\xFF" ++ // U+FEFF - "\x00H" ++ - "\x00\xFC" ++ // ü - "\x65\xE5" ++ // 日 - "\xD8\x3D\xDE\x00"; // 😀 - var decoder = try testDecode(DefaultDecoder, input, &.{ - 0xFEFF, - 'H', - 'ü', - '日', - '😀', - }); - try decoder.adaptTo("utf-16"); - try decoder.adaptTo("UTF-16"); - try decoder.adaptTo("utf-16be"); - try decoder.adaptTo("UTF-16BE"); - } - - // Invalid UTF-16BE BOM - { - const input = "\xFE\x00H"; - var decoder = try testDecode(DefaultDecoder, input, &.{ - error.InvalidUtf8, - 'H', - }); - try decoder.adaptTo("utf-8"); - try decoder.adaptTo("UTF-8"); - } - - // UTF-16LE BOM - { - const input = "\xFF\xFE" ++ // U+FEFF - "H\x00" ++ - "\xFC\x00" ++ // ü - "\xE5\x65" ++ // 日 - "\x3D\xD8\x00\xDE"; // 😀 - var decoder = try testDecode(DefaultDecoder, input, &.{ - 0xFEFF, - 'H', - 'ü', - '日', - '😀', - }); - try decoder.adaptTo("utf-16"); - try decoder.adaptTo("UTF-16"); - try decoder.adaptTo("utf-16le"); - try decoder.adaptTo("UTF-16LE"); - } - - // Invalid UTF-16LE BOM - { - const input = "\xFF\xFFH"; - var decoder = try testDecode(DefaultDecoder, input, &.{ - error.InvalidUtf8, - 'H', - }); - try decoder.adaptTo("utf-8"); - try decoder.adaptTo("UTF-8"); - } -} - -/// A decoder which handles only UTF-8. 
-pub const Utf8Decoder = struct { - pub const max_encoded_codepoint_len = 4; - - pub const Error = error{ InvalidUtf8, Overflow, UnexpectedEndOfInput }; - - pub fn readCodepoint(_: *Utf8Decoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult { - const b = reader.readByte() catch |e| switch (e) { - error.EndOfStream => return ReadResult.none, - else => |other| return other, - }; - const byte_length = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8; - if (byte_length > buf.len) return error.Overflow; - buf[0] = b; - if (byte_length == 1) return .{ .codepoint = b, .byte_length = 1 }; - reader.readNoEof(buf[1..byte_length]) catch |e| switch (e) { - error.EndOfStream => return error.UnexpectedEndOfInput, - else => |other| return other, - }; - const codepoint = switch (byte_length) { - 2 => unicode.utf8Decode2(buf[0..2]), - 3 => unicode.utf8Decode3(buf[0..3]), - 4 => unicode.utf8Decode4(buf[0..4]), - else => unreachable, - } catch return error.InvalidUtf8; - return .{ .codepoint = codepoint, .byte_length = byte_length }; - } - - pub fn adaptTo(_: *Utf8Decoder, encoding: []const u8) error{InvalidEncoding}!void { - if (!ascii.eqlIgnoreCase(encoding, "utf-8")) { - return error.InvalidEncoding; - } - } -}; - -test Utf8Decoder { - const input = - // 1-byte encodings - "\x00\x01 ABC abc 123" ++ - // 2-byte encodings - "éèçñåβΘ" ++ - // 3-byte encodings - "日本語AESTHETIC" ++ - // 4-byte encodings - "😳😂❤️👩‍👩‍👧‍👧" ++ - // Overlong encodings - "\xC0\x80\xE0\x80\x80\xF0\x80\x80\x80" ++ - // Out of bounds codepoint - "\xF7\xBF\xBF\xBF" ++ - // Surrogate halves - "\xED\xA0\x80\xED\xBF\xBF"; - _ = try testDecode(Utf8Decoder, input, &.{ - '\x00', - '\x01', - ' ', - 'A', - 'B', - 'C', - ' ', - 'a', - 'b', - 'c', - ' ', - '1', - '2', - '3', - 'é', - 'è', - 'ç', - 'ñ', - 'å', - 'β', - 'Θ', - '日', - '本', - '語', - 'A', - 'E', - 'S', - 'T', - 'H', - 'E', - 'T', - 'I', - 'C', - '😳', - '😂', - '❤', - '\u{FE0F}', // variation selector-16 - '👩', - 
'\u{200D}', // zero-width joiner - '👩', - '\u{200D}', // zero-width joiner - '👧', - '\u{200D}', // zero-width joiner - '👧', - error.InvalidUtf8, // 2-byte U+0000 - error.InvalidUtf8, // 3-byte U+0000 - error.InvalidUtf8, // 4-byte U+0000 - error.InvalidUtf8, // attempted U+1FFFFF - error.InvalidUtf8, // U+D800 - error.InvalidUtf8, // U+DFFF - }); -} - -/// A decoder which handles only UTF-16 of a given endianness. -pub fn Utf16Decoder(comptime endian: std.builtin.Endian) type { - return struct { - const Self = @This(); - - pub const Error = error{ InvalidUtf16, Overflow, UnexpectedEndOfInput }; - - pub const max_encoded_codepoint_len = 4; - - pub fn readCodepoint(_: *Self, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult { - var u_buf: [2]u8 = undefined; - const u_len = try reader.readAll(&u_buf); - switch (u_len) { - 0 => return ReadResult.none, - 1 => return error.UnexpectedEndOfInput, - else => {}, - } - const u = std.mem.readInt(u16, &u_buf, endian); - const code_unit_length = unicode.utf16CodeUnitSequenceLength(u) catch return error.InvalidUtf16; - const codepoint = switch (code_unit_length) { - 1 => u, - 2 => codepoint: { - const low = reader.readInt(u16, endian) catch |e| switch (e) { - error.EndOfStream => return error.UnexpectedEndOfInput, - else => |other| return other, - }; - break :codepoint unicode.utf16DecodeSurrogatePair(&.{ u, low }) catch return error.InvalidUtf16; - }, - else => unreachable, - }; - const byte_length = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable; - if (byte_length > buf.len) return error.Overflow; - _ = unicode.utf8Encode(codepoint, buf) catch unreachable; - return .{ .codepoint = codepoint, .byte_length = byte_length }; - } - - pub fn adaptTo(_: *Self, encoding: []const u8) error{InvalidEncoding}!void { - if (!(ascii.eqlIgnoreCase(encoding, "utf-16") or - (endian == .big and ascii.eqlIgnoreCase(encoding, "utf-16be")) or - (endian == .little and ascii.eqlIgnoreCase(encoding, 
"utf-16le")))) - { - return error.InvalidEncoding; - } - } - }; -} - -test Utf16Decoder { - // little-endian - { - const input = "\x00\x00" ++ // U+0000 - "A\x00" ++ // A - "b\x00" ++ // b - "5\x00" ++ // 5 - "\xE5\x65" ++ // 日 - "\x3D\xD8\x33\xDE" ++ // 😳 - "\x00\xD8\x00\x00" ++ // unpaired high surrogate followed by U+0000 - "\xFF\xDF" // unpaired low surrogate - ; - _ = try testDecode(Utf16Decoder(.little), input, &.{ - '\x00', - 'A', - 'b', - '5', - '日', - '😳', - error.InvalidUtf16, - error.InvalidUtf16, - }); - } - - // big-endian - { - const input = "\x00\x00" ++ // U+0000 - "\x00A" ++ // A - "\x00b" ++ // b - "\x005" ++ // 5 - "\x65\xE5" ++ // 日 - "\xD8\x3D\xDE\x33" ++ // 😳 - "\xD8\x00\x00\x00" ++ // unpaired high surrogate followed by U+0000 - "\xDF\xFF" // unpaired low surrogate - ; - _ = try testDecode(Utf16Decoder(.big), input, &.{ - '\x00', - 'A', - 'b', - '5', - '日', - '😳', - error.InvalidUtf16, - error.InvalidUtf16, - }); - } -} - -fn testDecode(comptime Decoder: type, input: []const u8, expected: []const (Decoder.Error!u21)) !Decoder { - var decoder: Decoder = .{}; - var decoded = ArrayListUnmanaged(Decoder.Error!u21){}; - defer decoded.deinit(testing.allocator); - var input_stream = std.io.fixedBufferStream(input); - var buf: [4]u8 = undefined; - while (true) { - if (decoder.readCodepoint(input_stream.reader(), &buf)) |c| { - if (!c.present) break; - try decoded.append(testing.allocator, c.codepoint); - } else |err| { - try decoded.append(testing.allocator, err); - } - } - - try testing.expectEqualDeep(expected, decoded.items); - - return decoder; -} diff --git a/src/node.zig b/src/node.zig deleted file mode 100644 index d27f61a..0000000 --- a/src/node.zig +++ /dev/null @@ -1,60 +0,0 @@ -const std = @import("std"); -const mem = std.mem; -const ArenaAllocator = std.heap.ArenaAllocator; -const QName = @import("reader.zig").QName; - -/// A node value along with an `ArenaAllocator` used to allocate all memory -/// backing it. 
-pub fn OwnedValue(comptime T: type) type { - return struct { - value: T, - arena: ArenaAllocator, - - const Self = @This(); - - pub fn deinit(self: *Self) void { - self.arena.deinit(); - self.* = undefined; - } - }; -} - -/// A node in an XML document. -pub const Node = union(enum) { - document: Document, - element: Element, - attribute: Attribute, - comment: Comment, - pi: Pi, - text: Text, - - pub const Document = struct { - version: []const u8 = "1.0", - encoding: ?[]const u8 = null, - standalone: ?bool = null, - children: []const Node, - }; - - pub const Element = struct { - name: QName, - children: []const Node = &.{}, - }; - - pub const Attribute = struct { - name: QName, - value: []const u8, - }; - - pub const Comment = struct { - content: []const u8, - }; - - pub const Pi = struct { - target: []const u8, - content: []const u8, - }; - - pub const Text = struct { - content: []const u8, - }; -}; diff --git a/src/reader.zig b/src/reader.zig deleted file mode 100644 index d01a137..0000000 --- a/src/reader.zig +++ /dev/null @@ -1,1149 +0,0 @@ -const std = @import("std"); -const fmt = std.fmt; -const mem = std.mem; -const testing = std.testing; -const unicode = std.unicode; -const Allocator = mem.Allocator; -const ArenaAllocator = std.heap.ArenaAllocator; -const ArrayListUnmanaged = std.ArrayListUnmanaged; -const ComptimeStringMap = @import("compat.zig").ComptimeStringMap; -const StringArrayHashMapUnmanaged = std.StringArrayHashMapUnmanaged; -const StringHashMapUnmanaged = std.StringHashMapUnmanaged; -const encoding = @import("encoding.zig"); -const syntax = @import("syntax.zig"); -const Node = @import("node.zig").Node; -const OwnedValue = @import("node.zig").OwnedValue; -const Scanner = @import("Scanner.zig"); -const Token = @import("token_reader.zig").Token; -const TokenReader = @import("token_reader.zig").TokenReader; - -const max_encoded_codepoint_len = 4; - -/// A qualified name. 
-pub const QName = struct { - prefix: ?[]const u8 = null, - ns: ?[]const u8 = null, - local: []const u8, - - /// Returns whether this name has the given namespace and local name. - pub fn is(self: QName, ns: ?[]const u8, local: []const u8) bool { - if (self.ns) |self_ns| { - if (!mem.eql(u8, self_ns, ns orelse return false)) { - return false; - } - } else if (ns != null) { - return false; - } - return mem.eql(u8, self.local, local); - } - - test is { - try testing.expect((QName{ .local = "abc" }).is(null, "abc")); - try testing.expect((QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns/", "abc")); - try testing.expect(!(QName{ .local = "abc" }).is(null, "def")); - try testing.expect(!(QName{ .local = "abc" }).is("http://example.com/ns/", "abc")); - try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is(null, "abc")); - try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns2/", "abc")); - try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns/", "def")); - try testing.expect(!(QName{ .ns = "http://example.com/ns/", .local = "abc" }).is("http://example.com/ns2/", "def")); - } - - fn clone(self: QName, allocator: Allocator) !QName { - const prefix = if (self.prefix) |prefix| try allocator.dupe(u8, prefix) else null; - errdefer if (prefix) |p| allocator.free(p); - const ns = if (self.ns) |ns| try allocator.dupe(u8, ns) else null; - errdefer if (ns) |n| allocator.free(n); - const local = try allocator.dupe(u8, self.local); - return .{ .prefix = prefix, .ns = ns, .local = local }; - } - - /// Duplicates the `ns` value, if any. - /// - /// This is to allow the `QName` to outlive the closure of its containing - /// scope. 
- inline fn dupNs(self: *QName, allocator: Allocator) !void { - if (self.ns) |*ns| { - ns.* = try allocator.dupe(u8, ns.*); - } - } -}; - -/// A hash map `Context` which compares namespace URIs and local names (that is, -/// name identity according to the XML namespaces spec, since the prefix does -/// not contribute to the identity of a QName). -const QNameContext = struct { - const Self = @This(); - - pub fn hash(_: Self, name: QName) u64 { - var h = std.hash.Wyhash.init(0); - if (name.ns) |ns| { - h.update(ns); - } - h.update(name.local); - return h.final(); - } - - pub fn eql(_: Self, name1: QName, name2: QName) bool { - return name1.is(name2.ns, name2.local); - } -}; - -const QNameSet = std.HashMapUnmanaged(QName, void, QNameContext, std.hash_map.default_max_load_percentage); - -/// An event emitted by a reader. -pub const Event = union(enum) { - xml_declaration: XmlDeclaration, - element_start: ElementStart, - element_content: ElementContent, - element_end: ElementEnd, - comment: Comment, - pi: Pi, - - pub const XmlDeclaration = struct { - version: []const u8, - encoding: ?[]const u8 = null, - standalone: ?bool = null, - }; - - pub const ElementStart = struct { - name: QName, - attributes: []const Attribute = &.{}, - }; - - pub const Attribute = struct { - name: QName, - value: []const u8, - }; - - pub const ElementContent = struct { - content: []const u8, - }; - - pub const ElementEnd = struct { - name: QName, - }; - - pub const Comment = struct { - content: []const u8, - }; - - pub const Pi = struct { - target: []const u8, - content: []const u8, - }; -}; - -/// A map of predefined XML entities to their replacement text. -/// -/// Until DTDs are understood and parsed, these are the only named entities -/// supported by this parser. 
-const entities = ComptimeStringMap([]const u8, .{ - .{ "amp", "&" }, - .{ "lt", "<" }, - .{ "gt", ">" }, - .{ "apos", "'" }, - .{ "quot", "\"" }, -}); - -const xml_ns = "http://www.w3.org/XML/1998/namespace"; -const xmlns_ns = "http://www.w3.org/2000/xmlns/"; - -const predefined_ns_prefixes = ComptimeStringMap([]const u8, .{ - .{ "xml", xml_ns }, - .{ "xmlns", xmlns_ns }, -}); - -/// A context for namespace information in a document. -/// -/// The context maintains a hierarchy of namespace scopes. Initially, there is -/// no active scope (corresponding to the beginning of a document, before the -/// start of the root element). -pub const NamespaceContext = struct { - scopes: ArrayListUnmanaged(StringHashMapUnmanaged([]const u8)) = .{}, - - pub const Error = error{ - CannotUndeclareNsPrefix, - InvalidNsBinding, - InvalidQName, - UndeclaredNsPrefix, - QNameNotAllowed, - }; - - pub fn deinit(self: *NamespaceContext, allocator: Allocator) void { - while (self.scopes.items.len > 0) { - self.endScope(allocator); - } - self.scopes.deinit(allocator); - self.* = undefined; - } - - /// Starts a new scope. - pub fn startScope(self: *NamespaceContext, allocator: Allocator) !void { - try self.scopes.append(allocator, .{}); - } - - /// Ends the current scope. - /// - /// Only valid if there is a current scope. - pub fn endScope(self: *NamespaceContext, allocator: Allocator) void { - var bindings = self.scopes.pop(); - var iter = bindings.iterator(); - while (iter.next()) |entry| { - allocator.free(entry.key_ptr.*); - allocator.free(entry.value_ptr.*); - } - bindings.deinit(allocator); - } - - /// Binds the default namespace in the current scope. - /// - /// Only valid if there is a current scope. - pub fn bindDefault(self: *NamespaceContext, allocator: Allocator, uri: []const u8) !void { - if (mem.eql(u8, uri, xml_ns) or mem.eql(u8, uri, xmlns_ns)) { - return error.InvalidNsBinding; - } - try self.bindInner(allocator, "", uri); - } - - /// Binds a prefix in the current scope. 
- /// - /// Only valid if there is a current scope. - pub fn bindPrefix(self: *NamespaceContext, allocator: Allocator, prefix: []const u8, uri: []const u8) !void { - if (!syntax.isNcName(prefix)) { - return error.InvalidQName; - } - if (mem.eql(u8, prefix, "xml") and !mem.eql(u8, uri, xml_ns)) { - return error.InvalidNsBinding; - } - if (mem.eql(u8, uri, xml_ns) and !mem.eql(u8, prefix, "xml")) { - return error.InvalidNsBinding; - } - if (mem.eql(u8, prefix, "xmlns")) { - return error.InvalidNsBinding; - } - if (mem.eql(u8, uri, xmlns_ns) and !mem.eql(u8, prefix, "xmlns")) { - return error.InvalidNsBinding; - } - try self.bindInner(allocator, prefix, uri); - } - - fn bindInner(self: *NamespaceContext, allocator: Allocator, prefix: []const u8, uri: []const u8) !void { - // TODO: validate that uri is a valid URI reference - if (prefix.len != 0 and uri.len == 0) { - return error.CannotUndeclareNsPrefix; - } - var bindings = &self.scopes.items[self.scopes.items.len - 1]; - const key = try allocator.dupe(u8, prefix); - errdefer allocator.free(key); - const value = try allocator.dupe(u8, uri); - errdefer allocator.free(value); - // We cannot clobber an existing prefix in this scope because that - // would imply a duplicate attribute name, which is validated earlier. - try bindings.putNoClobber(allocator, key, value); - } - - /// Returns the URI, if any, bound to the given prefix. - pub fn getUri(self: NamespaceContext, prefix: []const u8) ?[]const u8 { - if (predefined_ns_prefixes.get(prefix)) |uri| { - return uri; - } - return for (0..self.scopes.items.len) |i| { - if (self.scopes.items[self.scopes.items.len - i - 1].get(prefix)) |uri| { - break if (uri.len > 0) uri else null; - } - } else null; - } - - /// Parses a possibly prefixed name and returns the corresponding `QName`. - /// - /// `use_default_ns` specifies if the default namespace (if any) should be - /// implied for the given name if it is unprefixed. 
This is appropriate for - /// element names but not attribute names, per the namespaces spec. - pub fn parseName(self: NamespaceContext, name: []const u8, use_default_ns: bool) !QName { - if (mem.indexOfScalar(u8, name, ':')) |sep_pos| { - const prefix = name[0..sep_pos]; - const local = name[sep_pos + 1 ..]; - if (!syntax.isNcName(prefix) or !syntax.isNcName(local)) { - return error.InvalidQName; - } - const ns = self.getUri(prefix) orelse return error.UndeclaredNsPrefix; - return .{ .prefix = prefix, .ns = ns, .local = local }; - } else if (use_default_ns) { - return .{ .ns = self.getUri(""), .local = name }; - } else { - return .{ .local = name }; - } - } -}; - -/// A drop-in replacement for `NamespaceContext` which doesn't actually do any -/// namespace processing. -pub const NoOpNamespaceContext = struct { - pub const Error = error{}; - - pub inline fn deinit(_: *NoOpNamespaceContext, _: Allocator) void {} - - pub inline fn startScope(_: *NoOpNamespaceContext, _: Allocator) !void {} - - pub inline fn endScope(_: *NoOpNamespaceContext, _: Allocator) void {} - - pub inline fn bindDefault(_: *NoOpNamespaceContext, _: Allocator, _: []const u8) !void {} - - pub inline fn bindPrefix(_: *NoOpNamespaceContext, _: Allocator, _: []const u8, _: []const u8) !void {} - - pub inline fn getUri(_: NoOpNamespaceContext, _: []const u8) ?[]const u8 { - return null; - } - - pub inline fn parseName(_: NoOpNamespaceContext, name: []const u8, _: bool) !QName { - return .{ .local = name }; - } -}; - -/// Returns a `Reader` wrapping a `std.io.Reader`. -pub fn reader( - allocator: Allocator, - r: anytype, - comptime options: ReaderOptions, -) Reader(@TypeOf(r), options) { - return Reader(@TypeOf(r), options).init(allocator, r, .{}); -} - -/// Reads a full XML document from a `std.io.Reader`. 
-pub fn readDocument( - allocator: Allocator, - r: anytype, - comptime options: ReaderOptions, -) !OwnedValue(Node.Document) { - var arena = ArenaAllocator.init(allocator); - errdefer arena.deinit(); - const node_allocator = arena.allocator(); - - var decl_version: []const u8 = "1.0"; - var decl_encoding: ?[]const u8 = null; - var decl_standalone: ?bool = null; - var children = ArrayListUnmanaged(Node){}; - - var xml_reader = reader(allocator, r, options); - defer xml_reader.deinit(); - while (try xml_reader.next()) |event| { - switch (event) { - .xml_declaration => |xml_declaration| { - decl_version = try node_allocator.dupe(u8, xml_declaration.version); - if (xml_declaration.encoding) |e| { - decl_encoding = try node_allocator.dupe(u8, e); - } - decl_standalone = xml_declaration.standalone; - }, - .element_start => |element_start| try children.append(node_allocator, .{ - .element = try xml_reader.nextElementNode(node_allocator, element_start), - }), - .comment => |comment| try children.append(node_allocator, .{ .comment = .{ - .content = try node_allocator.dupe(u8, comment.content), - } }), - .pi => |pi| try children.append(node_allocator, .{ .pi = .{ - .target = try node_allocator.dupe(u8, pi.target), - .content = try node_allocator.dupe(u8, pi.content), - } }), - else => unreachable, - } - } - - return .{ - .value = .{ - .version = decl_version, - .encoding = decl_encoding, - .standalone = decl_standalone, - .children = children.items, - }, - .arena = arena, - }; -} - -/// Options for a `Reader`. -pub const ReaderOptions = struct { - /// The type of decoder to use. - DecoderType: type = encoding.DefaultDecoder, - /// The size of the internal buffer. - /// - /// This limits the byte length of "non-splittable" content, such as - /// element and attribute names. Longer such content will result in - /// `error.Overflow`. - buffer_size: usize = 4096, - /// Whether to normalize line endings and attribute values according to the - /// XML specification. 
- /// - /// If this is set to false, no normalization will be done: for example, - /// the line ending sequence `\r\n` will appear as-is in returned events - /// rather than the normalized `\n`. - enable_normalization: bool = true, - /// Whether namespace information should be processed. - /// - /// If this is false, then `QName`s in the returned events will have only - /// their `local` field populated, containing the full name of the element - /// or attribute. - namespace_aware: bool = true, - /// Whether to keep track of the current location in the document. - track_location: bool = false, -}; - -/// A streaming, pull-based XML parser wrapping a `std.io.Reader`. -/// -/// This parser behaves similarly to Go's `encoding/xml` package. It is a -/// higher-level abstraction over a `TokenReader` which uses an internal -/// allocator to keep track of additional context. It performs additional -/// well-formedness checks which the lower-level parsers cannot perform due to -/// their design, such as ensuring element start and end tags match and -/// attribute names are not duplicated. It is also able to process namespace -/// information. -/// -/// Since this parser wraps a `TokenReader`, the caveats on the `buffer_size` -/// bounding the length of "non-splittable" content which are outlined in its -/// documentation apply here as well. -pub fn Reader(comptime ReaderType: type, comptime options: ReaderOptions) type { - return struct { - token_reader: TokenReaderType, - /// A stack of element names enclosing the current context. - element_names: ArrayListUnmanaged([]u8) = .{}, - /// The namespace context of the reader. - namespace_context: NamespaceContextType = .{}, - /// A pending token which has been read but has not yet been handled as - /// part of an event. - pending_token: ?Token = null, - /// A buffer for storing encoded Unicode codepoint data. 
- codepoint_buf: [max_encoded_codepoint_len]u8 = undefined, - /// A "buffer" for handling the contents of the next pending event. - pending_event: union(enum) { - none, - element_start: struct { - name: []const u8, - attributes: StringArrayHashMapUnmanaged(ArrayListUnmanaged(u8)) = .{}, - }, - comment: struct { content: ArrayListUnmanaged(u8) = .{} }, - pi: struct { target: []const u8, content: ArrayListUnmanaged(u8) = .{} }, - } = .none, - /// An arena to store memory for `pending_event` (and the event after - /// it's returned). - event_arena: ArenaAllocator, - allocator: Allocator, - - const Self = @This(); - const TokenReaderType = TokenReader(ReaderType, .{ - .DecoderType = options.DecoderType, - .buffer_size = options.buffer_size, - .enable_normalization = options.enable_normalization, - .track_location = options.track_location, - }); - const NamespaceContextType = if (options.namespace_aware) NamespaceContext else NoOpNamespaceContext; - - pub const Error = error{ - DuplicateAttribute, - MismatchedEndTag, - UndeclaredEntityReference, - } || Allocator.Error || TokenReaderType.Error || NamespaceContextType.Error; - - pub fn init(allocator: Allocator, r: ReaderType, decoder: options.DecoderType) Self { - return .{ - .token_reader = TokenReaderType.init(r, decoder), - .event_arena = ArenaAllocator.init(allocator), - .allocator = allocator, - }; - } - - pub fn deinit(self: *Self) void { - for (self.element_names.items) |name| { - self.allocator.free(name); - } - self.element_names.deinit(self.allocator); - self.namespace_context.deinit(self.allocator); - self.event_arena.deinit(); - self.* = undefined; - } - - /// Returns the next event from the input. - /// - /// The returned event is only valid until the next reader operation. 
- pub fn next(self: *Self) Error!?Event { - _ = self.event_arena.reset(.retain_capacity); - const event_allocator = self.event_arena.allocator(); - while (true) { - switch (try self.nextToken()) { - .eof => return null, - .xml_declaration => return .{ .xml_declaration = .{ - .version = self.token_reader.token_data.xml_declaration.version, - .encoding = self.token_reader.token_data.xml_declaration.encoding, - .standalone = self.token_reader.token_data.xml_declaration.standalone, - } }, - .element_start => { - if (try self.finalizePendingEvent()) |event| { - self.pending_token = .element_start; - return event; - } - const name = try self.allocator.dupe(u8, self.token_reader.token_data.element_start.name); - errdefer self.allocator.free(name); - try self.element_names.append(self.allocator, name); - errdefer _ = self.element_names.pop(); - try self.namespace_context.startScope(self.allocator); - self.pending_event = .{ .element_start = .{ .name = name } }; - }, - .element_content => { - if (try self.finalizePendingEvent()) |event| { - self.pending_token = .element_content; - return event; - } - return .{ .element_content = .{ - .content = try self.contentText(self.token_reader.token_data.element_content.content), - } }; - }, - .element_end => { - if (try self.finalizePendingEvent()) |event| { - self.pending_token = .element_end; - return event; - } - const expected_name = self.element_names.pop(); - defer self.allocator.free(expected_name); - if (!mem.eql(u8, expected_name, self.token_reader.token_data.element_end.name)) { - return error.MismatchedEndTag; - } - var qname = try self.namespace_context.parseName(self.token_reader.token_data.element_end.name, true); - try qname.dupNs(event_allocator); - self.namespace_context.endScope(self.allocator); - return .{ .element_end = .{ .name = qname } }; - }, - .element_end_empty => { - if (try self.finalizePendingEvent()) |event| { - self.pending_token = .element_end_empty; - return event; - } - const name = 
self.element_names.pop(); - defer self.allocator.free(name); - const dup_name = try event_allocator.dupe(u8, name); - var qname = try self.namespace_context.parseName(dup_name, true); - try qname.dupNs(event_allocator); - self.namespace_context.endScope(self.allocator); - return .{ .element_end = .{ .name = qname } }; - }, - .attribute_start => { - const attr_entry = try self.pending_event.element_start.attributes.getOrPut( - event_allocator, - self.token_reader.token_data.attribute_start.name, - ); - if (attr_entry.found_existing) { - return error.DuplicateAttribute; - } - // The attribute name will be invalidated after we get - // the next token, so we have to duplicate it here. - // This doesn't change the hash of the key, so it's - // safe to do this. - attr_entry.key_ptr.* = try event_allocator.dupe(u8, self.token_reader.token_data.attribute_start.name); - attr_entry.value_ptr.* = .{}; - }, - .attribute_content => { - const attributes = self.pending_event.element_start.attributes.values(); - try attributes[attributes.len - 1].appendSlice(event_allocator, try self.contentText(self.token_reader.token_data.attribute_content.content)); - }, - .comment_start => { - if (try self.finalizePendingEvent()) |event| { - self.pending_token = .comment_start; - return event; - } - self.pending_event = .{ .comment = .{} }; - }, - .comment_content => { - try self.pending_event.comment.content.appendSlice(event_allocator, self.token_reader.token_data.comment_content.content); - if (self.token_reader.token_data.comment_content.final) { - const event = Event{ .comment = .{ .content = self.pending_event.comment.content.items } }; - self.pending_event = .none; - return event; - } - }, - .pi_start => { - if (try self.finalizePendingEvent()) |event| { - self.pending_token = .pi_start; - return event; - } - if (options.namespace_aware and mem.indexOfScalar(u8, self.token_reader.token_data.pi_start.target, ':') != null) { - return error.QNameNotAllowed; - } - self.pending_event = .{ 
.pi = .{ - .target = try event_allocator.dupe(u8, self.token_reader.token_data.pi_start.target), - } }; - }, - .pi_content => { - try self.pending_event.pi.content.appendSlice(event_allocator, self.token_reader.token_data.pi_content.content); - if (self.token_reader.token_data.pi_content.final) { - const event = Event{ .pi = .{ - .target = self.pending_event.pi.target, - .content = self.pending_event.pi.content.items, - } }; - self.pending_event = .none; - return event; - } - }, - } - } - } - - fn nextToken(self: *Self) !Token { - if (self.pending_token) |token| { - self.pending_token = null; - return token; - } - return try self.token_reader.next(); - } - - fn finalizePendingEvent(self: *Self) !?Event { - const event_allocator = self.event_arena.allocator(); - switch (self.pending_event) { - .none => return null, - .element_start => |element_start| { - // Bind all xmlns declarations in the current element - for (element_start.attributes.keys(), element_start.attributes.values()) |attr_name, attr_value| { - if (mem.eql(u8, attr_name, "xmlns")) { - try self.namespace_context.bindDefault(self.allocator, attr_value.items); - } else if (mem.startsWith(u8, attr_name, "xmlns:")) { - try self.namespace_context.bindPrefix(self.allocator, attr_name["xmlns:".len..], attr_value.items); - } - } - - // Convert the element and attribute names to QNames - const qname = try self.namespace_context.parseName(element_start.name, true); - var attributes = ArrayListUnmanaged(Event.Attribute){}; - try attributes.ensureTotalCapacity(event_allocator, element_start.attributes.count()); - // When namespaces are enabled, we need to check uniqueness - // of attribute QNames according to the namespaces spec - var attr_qnames = if (options.namespace_aware) QNameSet{}; - if (options.namespace_aware) { - try attr_qnames.ensureTotalCapacity(event_allocator, @intCast(element_start.attributes.count())); - } - for (element_start.attributes.keys(), element_start.attributes.values()) |attr_name, 
attr_value| { - const attr_qname = try self.namespace_context.parseName(attr_name, false); - attributes.appendAssumeCapacity(.{ .name = attr_qname, .value = attr_value.items }); - if (options.namespace_aware) { - const entry = attr_qnames.getOrPutAssumeCapacity(attr_qname); - if (entry.found_existing) { - return error.DuplicateAttribute; - } - } - } - - self.pending_event = .none; - return .{ .element_start = .{ .name = qname, .attributes = attributes.items } }; - }, - // Other pending events will have already been handled by - // looking at the 'final' content event - else => unreachable, - } - } - - fn contentText(self: *Self, content: Token.Content) ![]const u8 { - return switch (content) { - .text => |text| text, - .codepoint => |codepoint| text: { - const len = unicode.utf8Encode(codepoint, &self.codepoint_buf) catch unreachable; - break :text self.codepoint_buf[0..len]; - }, - .entity => |entity| entities.get(entity) orelse return error.UndeclaredEntityReference, - }; - } - - pub fn nextNode(self: *Self, allocator: Allocator, element_start: Event.ElementStart) Error!OwnedValue(Node.Element) { - var arena = ArenaAllocator.init(allocator); - errdefer arena.deinit(); - return .{ - .value = try self.nextElementNode(arena.allocator(), element_start), - .arena = arena, - }; - } - - fn nextElementNode(self: *Self, allocator: Allocator, element_start: Event.ElementStart) Error!Node.Element { - const name = try element_start.name.clone(allocator); - var element_children = ArrayListUnmanaged(Node){}; - try element_children.ensureTotalCapacity(allocator, element_start.attributes.len); - for (element_start.attributes) |attr| { - element_children.appendAssumeCapacity(.{ .attribute = .{ - .name = try attr.name.clone(allocator), - .value = try allocator.dupe(u8, attr.value), - } }); - } - var current_content = ArrayListUnmanaged(u8){}; - while (try self.next()) |event| { - if (event != .element_content and current_content.items.len > 0) { - try 
element_children.append(allocator, .{ .text = .{ .content = current_content.items } }); - current_content = .{}; - } - switch (event) { - .xml_declaration => unreachable, - .element_start => |sub_element_start| try element_children.append(allocator, .{ - .element = try self.nextElementNode(allocator, sub_element_start), - }), - .element_content => |element_content| try current_content.appendSlice(allocator, element_content.content), - .element_end => return .{ .name = name, .children = element_children.items }, - .comment => |comment| try element_children.append(allocator, .{ .comment = .{ - .content = try allocator.dupe(u8, comment.content), - } }), - .pi => |pi| try element_children.append(allocator, .{ .pi = .{ - .target = try allocator.dupe(u8, pi.target), - .content = try allocator.dupe(u8, pi.content), - } }), - } - } - unreachable; - } - - /// Returns an iterator over the remaining children of the current - /// element. - /// - /// Note that, since the returned iterator's `next` function calls the - /// `next` function of this reader internally, such calls will - /// invalidate any event returned prior to calling this function. - pub fn children(self: *Self) Children(Self) { - return .{ .reader = self, .start_depth = self.element_names.items.len }; - } - }; -} - -fn Children(comptime ReaderType: type) type { - return struct { - reader: *ReaderType, - start_depth: usize, - - const Self = @This(); - - /// Returns the next event. - /// - /// This function must not be called after it initially returns null. - pub fn next(self: Self) ReaderType.Error!?Event { - return switch (try self.reader.next() orelse return null) { - .element_end => |element_end| if (self.reader.element_names.items.len >= self.start_depth) .{ .element_end = element_end } else null, - else => |event| event, - }; - } - - /// Returns an iterator over the remaining children of the current - /// element. - /// - /// This may not be used after `next` returns null. 
- pub fn children(self: Self) Self { - return self.reader.children(); - } - - /// Skips the remaining children. - /// - /// `next` and `children` must not be used after this. - pub fn skip(self: Self) ReaderType.Error!void { - while (try self.next()) |_| {} - } - }; -} - -test Reader { - try testValid(.{}, - \\ - \\ - \\ - \\ - \\ - \\

Hello,

- \\ - \\ - \\ Text content goes here. - \\

&

- \\
- \\ - \\ - \\ - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = "1.0" } }, - .{ .pi = .{ .target = "some-pi", .content = "" } }, - .{ .comment = .{ .content = " A processing instruction with content follows " } }, - .{ .pi = .{ .target = "some-pi-with-content", .content = "content" } }, - .{ .element_start = .{ .name = .{ .local = "root" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .local = "p" }, .attributes = &.{ - .{ .name = .{ .local = "class" }, .value = "test" }, - } } }, - .{ .element_content = .{ .content = "Hello, " } }, - .{ .element_content = .{ .content = "world!" } }, - .{ .element_end = .{ .name = .{ .local = "p" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .local = "line" } } }, - .{ .element_end = .{ .name = .{ .local = "line" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .pi = .{ .target = "another-pi", .content = "" } }, - .{ .element_content = .{ .content = "\n Text content goes here.\n " } }, - .{ .element_start = .{ .name = .{ .local = "div" } } }, - .{ .element_start = .{ .name = .{ .local = "p" } } }, - .{ .element_content = .{ .content = "&" } }, - .{ .element_end = .{ .name = .{ .local = "p" } } }, - .{ .element_end = .{ .name = .{ .local = "div" } } }, - .{ .element_content = .{ .content = "\n" } }, - .{ .element_end = .{ .name = .{ .local = "root" } } }, - .{ .comment = .{ .content = " Comments are allowed after the end of the root element " } }, - .{ .pi = .{ .target = "comment", .content = "So are PIs " } }, - }); -} - -test "tag name matching" { - try testInvalid(.{}, "", error.MismatchedEndTag); - try testInvalid(.{}, "", error.MismatchedEndTag); - try testInvalid(.{}, "Some contentMore content", error.MismatchedEndTag); -} - -test "namespace handling" { - try testValid(.{}, - \\ - \\ - \\ - \\ - \\ - \\ - \\ - , &.{ - .{ .element_start = .{ .name = .{ .prefix = "a", .ns = "urn:1", .local = "root" }, .attributes = &.{ - 
.{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:1" }, - } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .ns = "urn:2", .local = "child" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns" }, .value = "urn:2" }, - .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "b" }, .value = "urn:3" }, - .{ .name = .{ .local = "attr" }, .value = "value" }, - } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "child" }, .attributes = &.{ - .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:4" }, - .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "attr" }, .value = "value" }, - } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .prefix = "a", .ns = "urn:4", .local = "child" } } }, - .{ .element_end = .{ .name = .{ .prefix = "a", .ns = "urn:4", .local = "child" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_end = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "child" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_end = .{ .name = .{ .ns = "urn:2", .local = "child" } } }, - .{ .element_content = .{ .content = "\n" } }, - .{ .element_end = .{ .name = .{ .prefix = "a", .ns = "urn:1", .local = "root" } } }, - }); - try testInvalid(.{}, "", error.UndeclaredNsPrefix); - try testInvalid(.{}, "<: />", error.InvalidQName); - try testInvalid(.{}, "", error.InvalidQName); - try testInvalid(.{}, "<:a />", error.InvalidQName); - try testInvalid(.{}, "", error.InvalidQName); - try testInvalid(.{}, "", error.InvalidQName); - try testInvalid(.{}, "", error.InvalidQName); - try testInvalid(.{}, "", error.DuplicateAttribute); - try testInvalid(.{}, "", error.DuplicateAttribute); - try testInvalid(.{}, "
", error.DuplicateAttribute); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testValid(.{}, "", &.{ - .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{ - .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "xml" }, .value = "http://www.w3.org/XML/1998/namespace" }, - } } }, - .{ .element_end = .{ .name = .{ .local = "root" } } }, - }); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testInvalid(.{}, "", error.InvalidNsBinding); - try testInvalid(.{}, "", error.QNameNotAllowed); - - try testValid(.{ .namespace_aware = false }, - \\ - \\ - \\ - \\ - \\ - \\ - \\ - , &.{ - .{ .element_start = .{ .name = .{ .local = "a:root" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns:a" }, .value = "urn:1" }, - } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .local = "child" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns" }, .value = "urn:2" }, - .{ .name = .{ .local = "xmlns:b" }, .value = "urn:3" }, - .{ .name = .{ .local = "attr" }, .value = "value" }, - } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .local = "b:child" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns:a" }, .value = "urn:4" }, - .{ .name = .{ .local = "b:attr" }, .value = "value" }, - } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_start = .{ .name = .{ .local = "a:child" } } }, - .{ .element_end = .{ .name = .{ .local = "a:child" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_end = .{ .name = .{ .local = "b:child" } } }, - .{ .element_content = .{ .content = "\n " } }, - .{ .element_end = .{ .name = .{ .local = "child" } } }, - .{ .element_content = .{ .content = "\n" } }, - .{ .element_end = .{ .name = .{ .local 
= "a:root" } } }, - }); - try testValid(.{ .namespace_aware = false }, "", &.{ - .{ .element_start = .{ .name = .{ .local = "a:root" } } }, - .{ .element_end = .{ .name = .{ .local = "a:root" } } }, - }); - try testValid(.{ .namespace_aware = false }, "<: />", &.{ - .{ .element_start = .{ .name = .{ .local = ":" } } }, - .{ .element_end = .{ .name = .{ .local = ":" } } }, - }); - try testValid(.{ .namespace_aware = false }, "", &.{ - .{ .element_start = .{ .name = .{ .local = "a:" } } }, - .{ .element_end = .{ .name = .{ .local = "a:" } } }, - }); - try testValid(.{ .namespace_aware = false }, "<:a />", &.{ - .{ .element_start = .{ .name = .{ .local = ":a" } } }, - .{ .element_end = .{ .name = .{ .local = ":a" } } }, - }); - try testValid(.{ .namespace_aware = false }, "", &.{ - .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns:" }, .value = "urn:1" }, - } } }, - .{ .element_end = .{ .name = .{ .local = "root" } } }, - }); - try testValid(.{ .namespace_aware = false }, "", &.{ - .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns::" }, .value = "urn:1" }, - } } }, - .{ .element_end = .{ .name = .{ .local = "root" } } }, - }); - try testValid(.{ .namespace_aware = false }, "", &.{ - .{ .element_start = .{ .name = .{ .local = "root" }, .attributes = &.{ - .{ .name = .{ .local = "xmlns:a:b" }, .value = "urn:1" }, - } } }, - .{ .element_end = .{ .name = .{ .local = "root" } } }, - }); - try testInvalid(.{ .namespace_aware = false }, "", error.DuplicateAttribute); - try testInvalid(.{ .namespace_aware = false }, "", error.DuplicateAttribute); - try testValid(.{ .namespace_aware = false }, "", &.{ - .{ .element_start = .{ .name = .{ .local = "root" } } }, - .{ .pi = .{ .target = "ns:pi", .content = "" } }, - .{ .element_end = .{ .name = .{ .local = "root" } } }, - }); -} - -fn testValid(comptime options: ReaderOptions, input: []const u8, expected_events: []const Event) 
!void { - var input_stream = std.io.fixedBufferStream(input); - var input_reader = reader(testing.allocator, input_stream.reader(), options); - defer input_reader.deinit(); - var i: usize = 0; - while (try input_reader.next()) |event| : (i += 1) { - if (i >= expected_events.len) { - std.debug.print("Unexpected event after end: {}\n", .{event}); - return error.TestFailed; - } - testing.expectEqualDeep(expected_events[i], event) catch |e| { - std.debug.print("(at index {})\n", .{i}); - return e; - }; - } - if (i != expected_events.len) { - std.debug.print("Expected {} events, found {}\n", .{ expected_events.len, i }); - return error.TestFailed; - } -} - -fn testInvalid(comptime options: ReaderOptions, input: []const u8, expected_error: anyerror) !void { - var input_stream = std.io.fixedBufferStream(input); - var input_reader = reader(testing.allocator, input_stream.reader(), options); - defer input_reader.deinit(); - while (input_reader.next()) |_| {} else |err| { - try testing.expectEqual(expected_error, err); - } -} - -test "nextNode" { - var input_stream = std.io.fixedBufferStream( - \\ - \\ - \\ - \\ - \\ - \\

Hello,

- \\ - \\ - \\ Text content goes here. - \\

&

- \\
- \\ - \\ - \\ - \\ - \\ - ); - var input_reader = reader(testing.allocator, input_stream.reader(), .{}); - defer input_reader.deinit(); - - try testing.expectEqualDeep(@as(?Event, .{ .xml_declaration = .{ .version = "1.0" } }), try input_reader.next()); - try testing.expectEqualDeep(@as(?Event, .{ .pi = .{ .target = "some-pi", .content = "" } }), try input_reader.next()); - try testing.expectEqualDeep(@as(?Event, .{ .comment = .{ .content = " A processing instruction with content follows " } }), try input_reader.next()); - try testing.expectEqualDeep(@as(?Event, .{ .pi = .{ .target = "some-pi-with-content", .content = "content" } }), try input_reader.next()); - - const root_start = try input_reader.next(); - try testing.expect(root_start != null and root_start.? == .element_start); - var root_node = try input_reader.nextNode(testing.allocator, root_start.?.element_start); - defer root_node.deinit(); - try testing.expectEqualDeep(Node.Element{ .name = .{ .local = "root" }, .children = &.{ - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .local = "p" }, .children = &.{ - .{ .attribute = .{ .name = .{ .local = "class" }, .value = "test" } }, - .{ .text = .{ .content = "Hello, world!" 
} }, - } } }, - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .local = "line" }, .children = &.{} } }, - .{ .text = .{ .content = "\n " } }, - .{ .pi = .{ .target = "another-pi", .content = "" } }, - .{ .text = .{ .content = "\n Text content goes here.\n " } }, - .{ .element = .{ .name = .{ .local = "div" }, .children = &.{ - .{ .element = .{ .name = .{ .local = "p" }, .children = &.{ - .{ .text = .{ .content = "&" } }, - } } }, - } } }, - .{ .text = .{ .content = "\n" } }, - } }, root_node.value); - - try testing.expectEqualDeep(@as(?Event, .{ .comment = .{ .content = " Comments are allowed after the end of the root element " } }), try input_reader.next()); - try testing.expectEqualDeep(@as(?Event, .{ .pi = .{ .target = "comment", .content = "So are PIs " } }), try input_reader.next()); - try testing.expect(try input_reader.next() == null); -} - -test "nextNode namespace handling" { - var input_stream = std.io.fixedBufferStream( - \\ - \\ - \\ - \\ - \\ - \\ - \\ - ); - var input_reader = reader(testing.allocator, input_stream.reader(), .{}); - defer input_reader.deinit(); - - const root_start = try input_reader.next(); - try testing.expect(root_start != null and root_start.? 
== .element_start); - var root_node = try input_reader.nextNode(testing.allocator, root_start.?.element_start); - defer root_node.deinit(); - try testing.expectEqualDeep(Node.Element{ .name = .{ .prefix = "a", .ns = "urn:1", .local = "root" }, .children = &.{ - .{ .attribute = .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:1" } }, - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .ns = "urn:2", .local = "child" }, .children = &.{ - .{ .attribute = .{ .name = .{ .local = "xmlns" }, .value = "urn:2" } }, - .{ .attribute = .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "b" }, .value = "urn:3" } }, - .{ .attribute = .{ .name = .{ .local = "attr" }, .value = "value" } }, - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "child" }, .children = &.{ - .{ .attribute = .{ .name = .{ .prefix = "xmlns", .ns = xmlns_ns, .local = "a" }, .value = "urn:4" } }, - .{ .attribute = .{ .name = .{ .prefix = "b", .ns = "urn:3", .local = "attr" }, .value = "value" } }, - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .prefix = "a", .ns = "urn:4", .local = "child" } } }, - .{ .text = .{ .content = "\n " } }, - } } }, - .{ .text = .{ .content = "\n " } }, - } } }, - .{ .text = .{ .content = "\n" } }, - } }, root_node.value); -} - -test readDocument { - var input_stream = std.io.fixedBufferStream( - \\ - \\ - \\ - \\ - \\ - \\

Hello,

- \\ - \\ - \\ Text content goes here. - \\

&

- \\
- \\ - \\ - \\ - \\ - \\ - ); - var document_node = try readDocument(testing.allocator, input_stream.reader(), .{}); - defer document_node.deinit(); - - try testing.expectEqualDeep(Node.Document{ .version = "1.0", .children = &.{ - .{ .pi = .{ .target = "some-pi", .content = "" } }, - .{ .comment = .{ .content = " A processing instruction with content follows " } }, - .{ .pi = .{ .target = "some-pi-with-content", .content = "content" } }, - .{ .element = .{ .name = .{ .local = "root" }, .children = &.{ - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .local = "p" }, .children = &.{ - .{ .attribute = .{ .name = .{ .local = "class" }, .value = "test" } }, - .{ .text = .{ .content = "Hello, world!" } }, - } } }, - .{ .text = .{ .content = "\n " } }, - .{ .element = .{ .name = .{ .local = "line" }, .children = &.{} } }, - .{ .text = .{ .content = "\n " } }, - .{ .pi = .{ .target = "another-pi", .content = "" } }, - .{ .text = .{ .content = "\n Text content goes here.\n " } }, - .{ .element = .{ .name = .{ .local = "div" }, .children = &.{ - .{ .element = .{ .name = .{ .local = "p" }, .children = &.{ - .{ .text = .{ .content = "&" } }, - } } }, - } } }, - .{ .text = .{ .content = "\n" } }, - } } }, - .{ .comment = .{ .content = " Comments are allowed after the end of the root element " } }, - .{ .pi = .{ .target = "comment", .content = "So are PIs " } }, - } }, document_node.value); -} - -test Children { - var input_stream = std.io.fixedBufferStream( - \\ - \\ Hello, world! - \\ Some content. 
- \\ - \\ - ); - var input_reader = reader(testing.allocator, input_stream.reader(), .{}); - defer input_reader.deinit(); - - try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "root" } } }), try input_reader.next()); - const root_children = input_reader.children(); - try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "\n Hello, world!\n " } }), try root_children.next()); - try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "child1" }, .attributes = &.{ - .{ .name = .{ .local = "attr" }, .value = "value" }, - } } }), try root_children.next()); - const child1_children = root_children.children(); - try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "Some content." } }), try child1_children.next()); - try testing.expectEqual(@as(?Event, null), try child1_children.next()); - try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "\n " } }), try root_children.next()); - try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "child2" } } }), try root_children.next()); - const child2_children = root_children.children(); - try testing.expectEqualDeep(@as(?Event, .{ .comment = .{ .content = " Comment " } }), try child2_children.next()); - try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "child3" } } }), try child2_children.next()); - const child3_children = child2_children.children(); - try testing.expectEqual(@as(?Event, null), try child3_children.next()); - try testing.expectEqual(@as(?Event, null), try child2_children.next()); - try testing.expectEqualDeep(@as(?Event, .{ .element_content = .{ .content = "\n" } }), try root_children.next()); - try testing.expectEqual(@as(?Event, null), try root_children.next()); -} - -test "skip children" { - var input_stream = std.io.fixedBufferStream( - \\ - \\ Hello, world! - \\ Some content. 
- \\ - \\ - ); - var input_reader = reader(testing.allocator, input_stream.reader(), .{}); - defer input_reader.deinit(); - - try testing.expectEqualDeep(@as(?Event, .{ .element_start = .{ .name = .{ .local = "root" } } }), try input_reader.next()); - const root_children = input_reader.children(); - try root_children.skip(); - try testing.expectEqual(@as(?Event, null), try input_reader.next()); -} diff --git a/src/syntax.zig b/src/syntax.zig deleted file mode 100644 index 9f16250..0000000 --- a/src/syntax.zig +++ /dev/null @@ -1,106 +0,0 @@ -const std = @import("std"); -const unicode = std.unicode; - -pub inline fn isChar(c: u21) bool { - return switch (c) { - '\t', '\r', '\n', ' '...0xD7FF, 0xE000...0xFFFD, 0x10000...0x10FFFF => true, - else => false, - }; -} - -pub inline fn isSpace(c: u21) bool { - return switch (c) { - ' ', '\t', '\r', '\n' => true, - else => false, - }; -} - -pub inline fn isDigit(c: u21) bool { - return switch (c) { - '0'...'9' => true, - else => false, - }; -} - -/// Note: only valid if `isDigit` returns true. -pub inline fn digitValue(c: u21) u4 { - return @intCast(c - '0'); -} - -pub inline fn isHexDigit(c: u21) bool { - return switch (c) { - '0'...'9', 'a'...'f', 'A'...'F' => true, - else => false, - }; -} - -/// Note: only valid if `isHexDigit` returns true. -pub inline fn hexDigitValue(c: u21) u4 { - return switch (c) { - 'a'...'f' => @intCast(c - 'a' + 10), - 'A'...'F' => @intCast(c - 'A' + 10), - else => @intCast(c - '0'), - }; -} - -/// Checks if `s` matches `NCName` from the namespaces spec. -/// -/// Note: only valid if `s` is valid UTF-8. 
-pub fn isNcName(s: []const u8) bool { - var view = unicode.Utf8View.initUnchecked(s); - var iter = view.iterator(); - const first_c = iter.nextCodepoint() orelse return false; - if (first_c == ':' or !isNameStartChar(first_c)) { - return false; - } - while (iter.nextCodepoint()) |c| { - if (c == ':' or !isNameChar(c)) { - return false; - } - } - return true; -} - -pub inline fn isNameStartChar(c: u21) bool { - return switch (c) { - ':', - 'A'...'Z', - '_', - 'a'...'z', - 0xC0...0xD6, - 0xD8...0xF6, - 0xF8...0x2FF, - 0x370...0x37D, - 0x37F...0x1FFF, - 0x200C...0x200D, - 0x2070...0x218F, - 0x2C00...0x2FEF, - 0x3001...0xD7FF, - 0xF900...0xFDCF, - 0xFDF0...0xFFFD, - 0x10000...0xEFFFF, - => true, - else => false, - }; -} - -pub inline fn isNameChar(c: u21) bool { - return if (isNameStartChar(c)) true else switch (c) { - '-', '.', '0'...'9', 0xB7, 0x0300...0x036F, 0x203F...0x2040 => true, - else => false, - }; -} - -pub inline fn isEncodingStartChar(c: u21) bool { - return switch (c) { - 'A'...'Z', 'a'...'z' => true, - else => false, - }; -} - -pub inline fn isEncodingChar(c: u21) bool { - return switch (c) { - 'A'...'Z', 'a'...'z', '0'...'9', '.', '_', '-' => true, - else => false, - }; -} diff --git a/src/token_reader.zig b/src/token_reader.zig deleted file mode 100644 index bcf7964..0000000 --- a/src/token_reader.zig +++ /dev/null @@ -1,621 +0,0 @@ -const std = @import("std"); -const mem = std.mem; -const testing = std.testing; -const unicode = std.unicode; -const encoding = @import("encoding.zig"); -const Scanner = @import("Scanner.zig"); - -/// A single XML token. -/// -/// For efficiency, this is merely an enum specifying the token type. The actual -/// token data is available in `Token.Data`, in the token reader's `token_data` -/// field. The `fullToken` function can be used to get a `Token.Full`, which is -/// a tagged union type and may be easier to consume in certain circumstances. -pub const Token = enum { - /// End of file. - eof, - /// XML declaration. 
- xml_declaration, - /// Element start tag. - element_start, - /// Element content. - element_content, - /// Element end tag. - element_end, - /// End of an empty element. - element_end_empty, - /// Attribute start. - attribute_start, - /// Attribute value content. - attribute_content, - /// Comment start. - comment_start, - /// Comment content. - comment_content, - /// Processing instruction (PI) start. - pi_start, - /// PI content. - pi_content, - - /// The data associated with a token. - /// - /// Even token types which have no associated data are represented here, to - /// provide some additional safety in safe build modes (where it can be - /// checked whether the caller is referencing the correct data field). - pub const Data = union { - eof: void, - xml_declaration: XmlDeclaration, - element_start: ElementStart, - element_content: ElementContent, - element_end: ElementEnd, - element_end_empty: void, - attribute_start: AttributeStart, - attribute_content: AttributeContent, - comment_start: void, - comment_content: CommentContent, - pi_start: PiStart, - pi_content: PiContent, - }; - - /// A token type plus data represented as a tagged union. 
- pub const Full = union(Token) { - eof, - xml_declaration: XmlDeclaration, - element_start: ElementStart, - element_content: ElementContent, - element_end: ElementEnd, - element_end_empty, - attribute_start: AttributeStart, - attribute_content: AttributeContent, - comment_start, - comment_content: CommentContent, - pi_start: PiStart, - pi_content: PiContent, - }; - - pub const XmlDeclaration = struct { - version: []const u8, - encoding: ?[]const u8 = null, - standalone: ?bool = null, - }; - - pub const ElementStart = struct { - name: []const u8, - }; - - pub const ElementContent = struct { - content: Content, - }; - - pub const ElementEnd = struct { - name: []const u8, - }; - - pub const AttributeStart = struct { - name: []const u8, - }; - - pub const AttributeContent = struct { - content: Content, - final: bool = false, - }; - - pub const CommentContent = struct { - content: []const u8, - final: bool = false, - }; - - pub const PiStart = struct { - target: []const u8, - }; - - pub const PiContent = struct { - content: []const u8, - final: bool = false, - }; - - /// A bit of content of an element or attribute. - pub const Content = union(enum) { - /// Raw text content (does not contain any entities). - text: []const u8, - /// A Unicode codepoint. - codepoint: u21, - /// An entity reference, such as `&`. The range covers the name (`amp`). - entity: []const u8, - }; -}; - -/// A location in a file. -pub const Location = struct { - /// The line number, starting at 1. - line: usize = 1, - /// The column number, starting at 1. Columns are counted using Unicode - /// codepoints. - column: usize = 1, - /// Whether the last character seen was a `\r`. - after_cr: bool = false, - - /// Advances the location by a single codepoint. 
- pub fn advance(self: *Location, c: u21) void { - if (c == '\n') { - self.line += 1; - self.column = 1; - self.after_cr = false; - } else if (c == '\r') { - if (self.after_cr) { - self.line += 1; - self.column = 1; - } - self.column += 1; - self.after_cr = true; - } else if (self.after_cr) { - self.line += 1; - // Plain CR line endings cannot be detected as new lines - // immediately, since they could be followed by LF. The following - // character is what completes the line ending interpretation. - self.column = 2; - self.after_cr = false; - } else { - self.column += 1; - } - } -}; - -test Location { - var loc = Location{}; - try expectLocation(loc, 1, 1); - loc.advance('A'); - try expectLocation(loc, 1, 2); - loc.advance('よ'); - try expectLocation(loc, 1, 3); - loc.advance('🥰'); - try expectLocation(loc, 1, 4); - loc.advance('\n'); - try expectLocation(loc, 2, 1); - loc.advance('\r'); - loc.advance('\n'); - try expectLocation(loc, 3, 1); - loc.advance('\r'); - loc.advance('A'); - try expectLocation(loc, 4, 2); - loc.advance('\r'); - loc.advance('\r'); - loc.advance('A'); - try expectLocation(loc, 6, 2); -} - -fn expectLocation(loc: Location, line: usize, column: usize) !void { - if (loc.line != line or loc.column != column) { - std.debug.print("expected {}:{}, found {}:{}", .{ line, column, loc.line, loc.column }); - return error.TestExpectedEqual; - } -} - -/// A drop-in replacement for `Location` which does not actually store location -/// information. -pub const NoOpLocation = struct { - pub inline fn advance(_: *NoOpLocation, _: u21) void {} -}; - -/// Wraps a `std.io.Reader` in a `TokenReader` with the default buffer size -/// (4096). -pub fn tokenReader( - reader: anytype, - comptime options: TokenReaderOptions, -) TokenReader(@TypeOf(reader), options) { - return TokenReader(@TypeOf(reader), options).init(reader, .{}); -} - -/// Options for a `TokenReader`. -pub const TokenReaderOptions = struct { - /// The type of decoder to use. 
- DecoderType: type = encoding.DefaultDecoder, - /// The size of the internal buffer. - /// - /// This limits the byte length of "non-splittable" content, such as - /// element and attribute names. Longer such content will result in - /// `error.Overflow`. - buffer_size: usize = 4096, - /// Whether to normalize line endings and attribute values according to the - /// XML specification. - /// - /// If this is set to false, no normalization will be done: for example, - /// the line ending sequence `\r\n` will appear as-is in returned tokens - /// rather than the normalized `\n`. - enable_normalization: bool = true, - /// Whether to keep track of the current location in the document. - track_location: bool = false, -}; - -/// An XML parser which wraps a `std.io.Reader` and returns low-level tokens. -/// -/// An internal buffer of size `buffer_size` is used to store data read from -/// the reader, which is referenced by the returned tokens. -/// -/// This parser offers several advantages over `Scanner` for higher-level -/// use-cases: -/// -/// - The returned `Token`s use byte slices rather than positional ranges. -/// - The `next` function can be used in the typical Zig iterator pattern. -/// There is no `ok` token which must be ignored, and there is no need to -/// directly signal the end of input (the `Reader` provides this indication). -/// - The line ending and attribute value normalization steps required by the -/// XML specification (minus further attribute value normalization which -/// depends on DTD information) are performed. -/// -/// However, due to its use of an internal buffer and transcoding all input to -/// UTF-8, it is not as efficient as a `Scanner` where these considerations are -/// important. Additionally, `buffer_size` limits the maximum byte length of -/// "unsplittable" content, such as element and attribute names (but not -/// "splittable" content, such as element text content and attribute values). 
-pub fn TokenReader(comptime ReaderType: type, comptime options: TokenReaderOptions) type { - return struct { - scanner: Scanner, - reader: ReaderType, - decoder: options.DecoderType, - /// The data for the most recently returned token. - token_data: Token.Data = undefined, - /// The current location in the file (if enabled). - location: if (options.track_location) Location else NoOpLocation = .{}, - /// Buffered content read by the reader for the current token. - /// - /// Events may reference this buffer via slices. The contents of the - /// buffer (up until `scanner.pos`) are always valid UTF-8. - buffer: [options.buffer_size]u8 = undefined, - /// Whether the last codepoint read was a carriage return (`\r`). - /// - /// This is relevant for line break normalization. - after_cr: if (options.enable_normalization) bool else void = if (options.enable_normalization) false, - - const Self = @This(); - - pub const Error = error{ - InvalidEncoding, - InvalidPiTarget, - Overflow, - UnexpectedEndOfInput, - } || ReaderType.Error || options.DecoderType.Error || Scanner.Error; - - const max_encoded_codepoint_len = @max(options.DecoderType.max_encoded_codepoint_len, 4); - - pub fn init(reader: ReaderType, decoder: options.DecoderType) Self { - return .{ - .scanner = Scanner{}, - .reader = reader, - .decoder = decoder, - }; - } - - /// Returns the full token (including data) from the most recent call to - /// `next`. `token` must be the token returned from the last call to - /// `next`. - pub fn fullToken(self: *const Self, token: Token) Token.Full { - return switch (token) { - inline else => |tag| @unionInit(Token.Full, @tagName(tag), @field(self.token_data, @tagName(tag))), - }; - } - - /// Returns the next token from the input. - /// - /// The slices in the `token_data` stored during this call are only - /// valid until the next call to `next`. 
- pub fn next(self: *Self) Error!Token { - if (self.scanner.pos > 0) { - // If the scanner position is > 0, that means we emitted an event - // on the last call to next, and should try to reset the - // position again in an effort to not run out of buffer space - // (ideally, the scanner should be resettable after every token, - // but we do not depend on this). - if (self.scanner.resetPos()) |token| { - if (token != .ok) { - return try self.bufToken(token); - } - } else |_| { - // Failure to reset isn't fatal (yet); we can still try to - // complete the token below - } - } - - while (true) { - if (self.scanner.pos + max_encoded_codepoint_len >= self.buffer.len) { - if (self.scanner.resetPos()) |token| { - if (token != .ok) { - return try self.bufToken(token); - } - } else |_| { - // Failure to reset here still isn't fatal, since we - // may end up getting shorter codepoints which manage - // to complete the current token. - } - } - - const c = try self.nextCodepoint(); - if (!c.present) { - try self.scanner.endInput(); - self.token_data = .{ .eof = {} }; - return .eof; - } - const token = try self.scanner.next(c.codepoint, c.byte_length); - if (token != .ok) { - return try self.bufToken(token); - } - } - } - - const nextCodepoint = if (options.enable_normalization) nextCodepointNormalized else nextCodepointRaw; - - fn nextCodepointNormalized(self: *Self) !encoding.ReadResult { - var c = try self.nextCodepointRaw(); - if (!c.present) return c; - if (self.after_cr) { - self.after_cr = false; - if (c.codepoint == '\n') { - // \n after \r is ignored because \r was already processed - // as a line ending - c = try self.nextCodepointRaw(); - if (!c.present) return c; - } - } - if (c.codepoint == '\r') { - self.after_cr = true; - c.codepoint = '\n'; - self.buffer[self.scanner.pos] = '\n'; - } - if (self.scanner.state == .attribute_content and - (c.codepoint == '\t' or c.codepoint == '\r' or c.codepoint == '\n')) - { - c.codepoint = ' '; - self.buffer[self.scanner.pos] = 
' '; - } - return c; - } - - fn nextCodepointRaw(self: *Self) !encoding.ReadResult { - const c = try self.decoder.readCodepoint(self.reader, self.buffer[self.scanner.pos..]); - if (c.present) self.location.advance(c.codepoint); - return c; - } - - fn bufToken(self: *Self, token: Scanner.Token) !Token { - switch (token) { - .ok => unreachable, - .xml_declaration => { - self.token_data = .{ .xml_declaration = .{ - .version = self.bufRange(self.scanner.token_data.xml_declaration.version), - .encoding = if (self.scanner.token_data.xml_declaration.encoding) |enc| self.bufRange(enc) else null, - .standalone = self.scanner.token_data.xml_declaration.standalone, - } }; - if (self.token_data.xml_declaration.encoding) |declared_encoding| { - try self.decoder.adaptTo(declared_encoding); - } - return .xml_declaration; - }, - .element_start => { - self.token_data = .{ .element_start = .{ - .name = self.bufRange(self.scanner.token_data.element_start.name), - } }; - return .element_start; - }, - .element_content => { - self.token_data = .{ .element_content = .{ - .content = self.bufContent(self.scanner.token_data.element_content.content), - } }; - return .element_content; - }, - .element_end => { - self.token_data = .{ .element_end = .{ - .name = self.bufRange(self.scanner.token_data.element_end.name), - } }; - return .element_end; - }, - .element_end_empty => { - self.token_data = .{ .element_end_empty = {} }; - return .element_end_empty; - }, - .attribute_start => { - self.token_data = .{ .attribute_start = .{ - .name = self.bufRange(self.scanner.token_data.attribute_start.name), - } }; - return .attribute_start; - }, - .attribute_content => { - self.token_data = .{ .attribute_content = .{ - .content = self.bufContent(self.scanner.token_data.attribute_content.content), - .final = self.scanner.token_data.attribute_content.final, - } }; - return .attribute_content; - }, - .comment_start => { - self.token_data = .{ .comment_start = {} }; - return .comment_start; - }, - 
.comment_content => { - self.token_data = .{ .comment_content = .{ - .content = self.bufRange(self.scanner.token_data.comment_content.content), - .final = self.scanner.token_data.comment_content.final, - } }; - return .comment_content; - }, - .pi_start => { - const target = self.bufRange(self.scanner.token_data.pi_start.target); - if (std.ascii.eqlIgnoreCase(target, "xml")) { - return error.InvalidPiTarget; - } - self.token_data = .{ .pi_start = .{ - .target = target, - } }; - return .pi_start; - }, - .pi_content => { - self.token_data = .{ .pi_content = .{ - .content = self.bufRange(self.scanner.token_data.pi_content.content), - .final = self.scanner.token_data.pi_content.final, - } }; - return .pi_content; - }, - } - } - - inline fn bufContent(self: *const Self, content: Scanner.Token.Content) Token.Content { - return switch (content) { - .text => |text| .{ .text = self.bufRange(text) }, - .codepoint => |codepoint| .{ .codepoint = codepoint }, - .entity => |entity| .{ .entity = self.bufRange(entity) }, - }; - } - - inline fn bufRange(self: *const Self, range: Scanner.Range) []const u8 { - return self.buffer[range.start..range.end]; - } - }; -} - -test TokenReader { - try testValid(.{}, - \\ - \\ - \\ - \\ - \\ - \\

Hello,

- \\ - \\ - \\ Text content goes here. - \\

&

- \\
- \\ - \\ - \\ - \\ - \\ - , &.{ - .{ .xml_declaration = .{ .version = "1.0" } }, - .{ .pi_start = .{ .target = "some-pi" } }, - .{ .pi_content = .{ .content = "", .final = true } }, - .comment_start, - .{ .comment_content = .{ .content = " A processing instruction with content follows ", .final = true } }, - .{ .pi_start = .{ .target = "some-pi-with-content" } }, - .{ .pi_content = .{ .content = "content", .final = true } }, - .{ .element_start = .{ .name = "root" } }, - .{ .element_content = .{ .content = .{ .text = "\n " } } }, - .{ .element_start = .{ .name = "p" } }, - .{ .attribute_start = .{ .name = "class" } }, - .{ .attribute_content = .{ .content = .{ .text = "test" }, .final = true } }, - .{ .element_content = .{ .content = .{ .text = "Hello, " } } }, - .{ .element_content = .{ .content = .{ .text = "world!" } } }, - .{ .element_end = .{ .name = "p" } }, - .{ .element_content = .{ .content = .{ .text = "\n " } } }, - .{ .element_start = .{ .name = "line" } }, - .element_end_empty, - .{ .element_content = .{ .content = .{ .text = "\n " } } }, - .{ .pi_start = .{ .target = "another-pi" } }, - .{ .pi_content = .{ .content = "", .final = true } }, - .{ .element_content = .{ .content = .{ .text = "\n Text content goes here.\n " } } }, - .{ .element_start = .{ .name = "div" } }, - .{ .element_start = .{ .name = "p" } }, - .{ .element_content = .{ .content = .{ .entity = "amp" } } }, - .{ .element_end = .{ .name = "p" } }, - .{ .element_end = .{ .name = "div" } }, - .{ .element_content = .{ .content = .{ .text = "\n" } } }, - .{ .element_end = .{ .name = "root" } }, - .comment_start, - .{ .comment_content = .{ .content = " Comments are allowed after the end of the root element ", .final = true } }, - .{ .pi_start = .{ .target = "comment" } }, - .{ .pi_content = .{ .content = "So are PIs ", .final = true } }, - }); -} - -test "normalization" { - try testValid(.{}, "Line 1\rLine 2\r\nLine 3\nLine 4\n\rLine 6\r\n\rLine 8", &.{ - .{ .element_start = .{ .name = 
"root" } }, - .{ .element_content = .{ .content = .{ .text = "Line 1\nLine 2\nLine 3\nLine 4\n\nLine 6\n\nLine 8" } } }, - .{ .element_end = .{ .name = "root" } }, - }); - try testValid(.{}, "", &.{ - .{ .element_start = .{ .name = "root" } }, - .{ .attribute_start = .{ .name = "attr" } }, - .{ .attribute_content = .{ - .content = .{ .text = " Line 1 Line 2 Line 3 Line 4 More content Line 6 Line 8 " }, - .final = true, - } }, - .element_end_empty, - }); - try testValid(.{ .enable_normalization = false }, "Line 1\rLine 2\r\nLine 3\nLine 4\n\rLine 6\r\n\rLine 8", &.{ - .{ .element_start = .{ .name = "root" } }, - .{ .element_content = .{ .content = .{ .text = "Line 1\rLine 2\r\nLine 3\nLine 4\n\rLine 6\r\n\rLine 8" } } }, - .{ .element_end = .{ .name = "root" } }, - }); - try testValid(.{ .enable_normalization = false }, "", &.{ - .{ .element_start = .{ .name = "root" } }, - .{ .attribute_start = .{ .name = "attr" } }, - .{ .attribute_content = .{ - .content = .{ .text = " Line 1\rLine 2\r\nLine 3\nLine 4\t\tMore content\n\rLine 6\r\n\rLine 8 " }, - .final = true, - } }, - .element_end_empty, - }); -} - -test "PI target" { - try testValid(.{}, "", &.{ - .{ .xml_declaration = .{ .version = "1.0" } }, - .{ .element_start = .{ .name = "root" } }, - .{ .pi_start = .{ .target = "some-pi" } }, - .{ .pi_content = .{ .content = "", .final = true } }, - .{ .element_end = .{ .name = "root" } }, - }); - try testValid(.{}, "", &.{ - .{ .element_start = .{ .name = "root" } }, - .{ .pi_start = .{ .target = "x" } }, - .{ .pi_content = .{ .content = "2", .final = true } }, - .{ .element_end = .{ .name = "root" } }, - }); - try testValid(.{}, "", &.{ - .{ .element_start = .{ .name = "root" } }, - .{ .pi_start = .{ .target = "xm" } }, - .{ .pi_content = .{ .content = "2", .final = true } }, - .{ .element_end = .{ .name = "root" } }, - }); - try testValid(.{}, "", &.{ - .{ .element_start = .{ .name = "root" } }, - .{ .pi_start = .{ .target = "xml2" } }, - .{ .pi_content = .{ .content = 
"2", .final = true } }, - .{ .element_end = .{ .name = "root" } }, - }); - try testInvalid(.{}, "", error.InvalidPiTarget); - try testInvalid(.{}, "", error.InvalidPiTarget); - try testInvalid(.{}, "", error.InvalidPiTarget); - try testInvalid(.{}, "", error.InvalidPiTarget); -} - -fn testValid(comptime options: TokenReaderOptions, input: []const u8, expected_tokens: []const Token.Full) !void { - var input_stream = std.io.fixedBufferStream(input); - var input_reader = tokenReader(input_stream.reader(), options); - var i: usize = 0; - while (true) : (i += 1) { - const token = try input_reader.next(); - if (token == .eof) break; - if (i >= expected_tokens.len) { - std.debug.print("Unexpected token after end: {}\n", .{token}); - return error.TestFailed; - } - testing.expectEqualDeep(expected_tokens[i], input_reader.fullToken(token)) catch |e| { - std.debug.print("(at index {})\n", .{i}); - return e; - }; - } - if (i != expected_tokens.len) { - std.debug.print("Expected {} tokens, found {}\n", .{ expected_tokens.len, i }); - return error.TestFailed; - } -} - -fn testInvalid(comptime options: TokenReaderOptions, input: []const u8, expected_error: anyerror) !void { - var input_stream = std.io.fixedBufferStream(input); - var input_reader = tokenReader(input_stream.reader(), options); - while (input_reader.next()) |token| { - if (token == .eof) return error.TestExpectedError; - } else |err| { - try testing.expectEqual(expected_error, err); - } -} diff --git a/src/writer.zig b/src/writer.zig deleted file mode 100644 index 8871139..0000000 --- a/src/writer.zig +++ /dev/null @@ -1,264 +0,0 @@ -const std = @import("std"); -const fmt = std.fmt; -const testing = std.testing; -const ArrayListUnmanaged = std.ArrayListUnmanaged; -const Event = @import("reader.zig").Event; -const QName = @import("reader.zig").QName; - -/// Returns a `Writer` wrapping a `std.io.Writer`. 
-pub fn writer(w: anytype) Writer(@TypeOf(w)) { - return .{ .w = w }; -} - -/// A streaming XML writer wrapping a `std.io.Writer`. -/// -/// This writer exposes a selection of functions to write XML content with -/// proper escaping where possible. -/// -/// Some write functions come in sets to allow streaming longer contents rather -/// than writing them all in one go: for example, `writeAttribute` is useful for -/// writing an entire attribute name-value pair in one shot, but if the attribute -/// value is potentially quite long, the sequence of `writeAttributeStart`, -/// followed by an arbitrary (even zero) number of `writeAttributeContent`, -/// followed by `writeAttributeEnd`, can be used as a lower-level alternative. -/// -/// One interesting lower-level function is `writeElementStartEnd`, which is used -/// to tell the writer to finish the current element start tag (all attributes -/// have been written), in preparation for writing other content. The other -/// functions (such as `writeElementContent`) will call this themselves if the -/// writer is in the middle of a start tag, but calling this function directly -/// could be useful if the user plans to write directly to the underlying -/// writer. -/// -/// Additionally, this writer makes no attempt at being able to write XML in -/// arbitrary styles. For example, the quote character is not configurable, and -/// there is no function for writing CDATA sections. -/// -/// # Safety -/// -/// There are caveats to the well-formedness of the resulting output: -/// -/// 1. There is no protection against calling the various write functions out of -/// order. For example, calling `writeElementEnd` without a corresponding -/// `writeElementStart` will result in non-well-formed XML. -/// 2. 
Processing instructions (PIs) and comments do not support escaping their -/// content, so passing content to the corresponding write functions which -/// contains illegal sequences for those constructs will result in -/// unexpected outcomes. For example, calling `writeComment` with a value -/// containing `-->` will result in the writer happily writing out the raw -/// `-->` in the text of the comment, which will close the comment and write -/// the rest of the provided text as raw XML (followed by the writer's -/// inserted `-->`). -/// 3. There are no validations that the names of elements and attributes match -/// the allowed syntax for names. Likewise, there are no validations that the -/// `version` and `encoding` passed to `writeXmlDeclaration` match the -/// allowed syntax for those values. -/// -/// As such, it is not safe to use all functionality of this writer with -/// arbitrary user-provided data. What _is_ safe, however, is the more common -/// case of using this writer with only attribute values and element content -/// containing user-provided data, since those can always be escaped properly. 
-pub fn Writer(comptime WriterType: type) type { - return struct { - w: WriterType, - in_element_start: bool = false, - - const Self = @This(); - - pub const Error = WriterType.Error; - - pub fn writeXmlDeclaration(self: *Self, version: []const u8, encoding: ?[]const u8, standalone: ?bool) Error!void { - try self.w.print(""); - } - - pub fn writeElementStart(self: *Self, name: QName) Error!void { - if (self.in_element_start) { - try self.writeElementStartEnd(); - } - try self.w.print("<{}", .{fmtQName(name)}); - self.in_element_start = true; - } - - pub fn writeElementStartEnd(self: *Self) Error!void { - try self.w.writeByte('>'); - self.in_element_start = false; - } - - pub fn writeElementContent(self: *Self, content: []const u8) Error!void { - if (self.in_element_start) { - try self.writeElementStartEnd(); - } - try self.w.print("{}", .{fmtElementContent(content)}); - } - - pub fn writeElementEnd(self: *Self, name: QName) Error!void { - if (self.in_element_start) { - try self.w.writeAll(" />"); - self.in_element_start = false; - } else { - try self.w.print("", .{fmtQName(name)}); - } - } - - pub fn writeAttribute(self: *Self, name: QName, content: []const u8) Error!void { - try self.writeAttributeStart(name); - try self.writeAttributeContent(content); - try self.writeAttributeEnd(); - } - - pub fn writeAttributeStart(self: *Self, name: QName) Error!void { - try self.w.print(" {}=\"", .{fmtQName(name)}); - } - - pub fn writeAttributeContent(self: *Self, content: []const u8) Error!void { - try self.w.print("{}", .{fmtAttributeContent(content)}); - } - - pub fn writeAttributeEnd(self: *Self) Error!void { - try self.w.writeByte('"'); - } - - pub fn writeComment(self: *Self, content: []const u8) Error!void { - try self.writeCommentStart(); - try self.writeCommentContent(content); - try self.writeCommentEnd(); - } - - pub fn writeCommentStart(self: *Self) Error!void { - if (self.in_element_start) { - try self.writeElementStartEnd(); - } - try self.w.writeAll(""); - } - 
- pub fn writePi(self: *Self, target: []const u8, content: []const u8) Error!void { - try self.writePiStart(target); - try self.writePiContent(content); - try self.writePiEnd(); - } - - pub fn writePiStart(self: *Self, target: []const u8) Error!void { - if (self.in_element_start) { - try self.writeElementStartEnd(); - } - try self.w.print(""); - } - }; -} - -test Writer { - var output = ArrayListUnmanaged(u8){}; - defer output.deinit(testing.allocator); - var xml_writer = writer(output.writer(testing.allocator)); - - const xmlns_ns = "http://www.w3.org/2000/xmlns/"; - try xml_writer.writeXmlDeclaration("1.0", "UTF-8", true); - // The ns part of the QName is not used when writing, but may factor in to - // future (optional) safety checks - try xml_writer.writeElementStart(.{ .prefix = "test", .ns = "http://example.com/ns/test", .local = "root" }); - try xml_writer.writeAttribute(.{ .prefix = "xmlns", .ns = xmlns_ns, .local = "test" }, "http://example.com/ns/test"); - try xml_writer.writeComment(" Hello, world! "); - try xml_writer.writeElementContent("Some text & some other text. "); - try xml_writer.writeElementContent("Another ."); - try xml_writer.writeElementStart(.{ .local = "sub" }); - try xml_writer.writeAttribute(.{ .local = "escaped" }, "&<>\"'"); - try xml_writer.writeElementEnd(.{ .local = "sub" }); - try xml_writer.writeElementEnd(.{ .prefix = "test", .ns = "http://example.com/ns/test", .local = "root" }); - - try testing.expectEqualStrings( - \\ - ++ - \\ - ++ - \\ - ++ - \\Some text & some other text. Another <sentence>. - ++ - \\ - ++ - \\ - , output.items); -} - -/// Returns a `std.fmt.Formatter` for escaped attribute content. 
-pub fn fmtAttributeContent(data: []const u8) fmt.Formatter(formatAttributeContent) { - return .{ .data = data }; -} - -fn formatAttributeContent( - data: []const u8, - comptime _: []const u8, - _: fmt.FormatOptions, - w: anytype, -) !void { - for (data) |b| switch (b) { - '\t' => try w.writeAll(" "), - '\n' => try w.writeAll(" "), - '\r' => try w.writeAll(" "), - '"' => try w.writeAll("""), - '&' => try w.writeAll("&"), - '<' => try w.writeAll("<"), - else => try w.writeByte(b), - }; -} - -/// Returns a `std.fmt.Formatter` for escaped element content. -pub fn fmtElementContent(data: []const u8) fmt.Formatter(formatElementContent) { - return .{ .data = data }; -} - -fn formatElementContent( - data: []const u8, - comptime _: []const u8, - _: fmt.FormatOptions, - w: anytype, -) !void { - for (data) |b| switch (b) { - '\r' => try w.writeAll(" "), - '&' => try w.writeAll("&"), - '<' => try w.writeAll("<"), - else => try w.writeByte(b), - }; -} - -/// Returns a `std.fmt.Formatter` for a QName (formats as `prefix:local` or -/// just `local` if no prefix). -pub fn fmtQName(data: QName) fmt.Formatter(formatQName) { - return .{ .data = data }; -} - -fn formatQName( - data: QName, - comptime _: []const u8, - _: fmt.FormatOptions, - w: anytype, -) !void { - if (data.prefix) |prefix| { - try w.writeAll(prefix); - try w.writeByte(':'); - } - try w.writeAll(data.local); -} diff --git a/src/xml.zig b/src/xml.zig index 5856a7b..bfe4569 100644 --- a/src/xml.zig +++ b/src/xml.zig @@ -1,53 +1,476 @@ -//! An XML library, currently supporting reading XML. -//! -//! Most applications will want to start with `Reader` and investigate the -//! other parser options if they want to avoid dynamic memory allocation or -//! want better performance at the expense of ease of use. -//! -//! There are three parsers available, with increasing levels of abstraction, -//! ease of use, and standard conformance. The documentation for each parser -//! 
provides more detailed information on its functionality. -//! -//! 1. `Scanner` - the lowest-level parser. A state machine that accepts -//! Unicode codepoints one by one and returns "tokens" referencing ranges of -//! input data. -//! 2. `TokenReader` - a mid-level parser that improves on `Scanner` by -//! buffering input so that returned tokens can use UTF-8-encoded byte -//! slices rather than ranges. It also uses a `std.io.Reader` and a decoder -//! (see `encoding`) rather than forcing the user to pass codepoints -//! directly. -//! 3. `Reader` - a general-purpose streaming parser which can handle -//! namespaces. Helper functions are available to parse some or all of a -//! document into a `Node`, which acts as a minimal DOM abstraction. - const std = @import("std"); -const testing = std.testing; +const Allocator = std.mem.Allocator; +const assert = std.debug.assert; +const expectEqual = std.testing.expectEqual; +const expectEqualStrings = std.testing.expectEqualStrings; + +pub const Location = struct { + line: usize, + column: usize, + + pub const start: Location = .{ .line = 1, .column = 1 }; + + pub fn update(loc: *Location, s: []const u8) void { + var pos: usize = 0; + while (std.mem.indexOfAnyPos(u8, s, pos, "\r\n")) |nl_pos| { + loc.line += 1; + loc.column = 1; + if (s[nl_pos] == '\r' and nl_pos + 1 < s.len and s[nl_pos + 1] == '\n') { + pos = nl_pos + 2; + } else { + pos = nl_pos + 1; + } + } + loc.column += s.len - pos; + } +}; + +pub const QName = struct { + ns: []const u8, + local: []const u8, +}; + +pub const PrefixedQName = struct { + prefix: []const u8, + ns: []const u8, + local: []const u8, +}; + +pub const predefined_entities = std.StaticStringMap([]const u8).initComptime(.{ + .{ "lt", "<" }, + .{ "gt", ">" }, + .{ "amp", "&" }, + .{ "apos", "'" }, + .{ "quot", "\"" }, +}); + +pub const ns_xml = "http://www.w3.org/XML/1998/namespace"; +pub const ns_xmlns = "http://www.w3.org/2000/xmlns/"; +pub const predefined_namespace_uris = 
std.StaticStringMap([]const u8).initComptime(.{ + .{ "xml", ns_xml }, + .{ "xmlns", ns_xmlns }, +}); + +pub const Reader = @import("Reader.zig"); + +pub fn GenericReader(comptime SourceError: type) type { + return struct { + reader: Reader, + + /// See `Reader.deinit`. + pub inline fn deinit(reader: *@This()) void { + reader.reader.deinit(); + } + + pub const ReadError = Reader.ReadError || SourceError; + + /// See `Reader.read`. + pub inline fn read(reader: *@This()) ReadError!Reader.Node { + return @errorCast(reader.reader.read()); + } + + /// See `Reader.readElementText`. + pub inline fn readElementText(reader: *@This()) (ReadError || Allocator.Error)![]const u8 { + return @errorCast(reader.reader.readElementText()); + } + + pub inline fn readElementTextAlloc(reader: *@This(), gpa: Allocator) (ReadError || Allocator.Error)![]u8 { + return @errorCast(reader.reader.readElementTextAlloc(gpa)); + } + + /// See `Reader.readElementTextWrite`. + pub inline fn readElementTextWrite(reader: *@This(), writer: anytype) (ReadError || @TypeOf(writer).Error)!void { + return @errorCast(reader.reader.readElementTextWrite(writer.any())); + } + + /// See `Reader.skipProlog`. + pub inline fn skipProlog(reader: *@This()) ReadError!void { + return @errorCast(reader.reader.skipProlog()); + } + + /// See `Reader.skipElement`. + pub inline fn skipElement(reader: *@This()) ReadError!void { + return @errorCast(reader.reader.skipElement()); + } + + /// See `Reader.location`. + pub inline fn location(reader: @This()) Location { + return reader.reader.location(); + } + + /// See `Reader.errorCode`. + pub inline fn errorCode(reader: @This()) Reader.ErrorCode { + return reader.reader.errorCode(); + } + + /// See `Reader.errorLocation`. + pub inline fn errorLocation(reader: @This()) Location { + return reader.reader.errorLocation(); + } + + /// See `Reader.xmlDeclarationVersion`. 
+ pub inline fn xmlDeclarationVersion(reader: @This()) []const u8 { + return reader.reader.xmlDeclarationVersion(); + } + + /// See `Reader.xmlDeclarationEncoding`. + pub inline fn xmlDeclarationEncoding(reader: @This()) ?[]const u8 { + return reader.reader.xmlDeclarationEncoding(); + } + + /// See `Reader.xmlDeclarationStandalone`. + pub inline fn xmlDeclarationStandalone(reader: @This()) ?bool { + return reader.reader.xmlDeclarationStandalone(); + } + + /// See `Reader.elementName`. + pub inline fn elementName(reader: @This()) []const u8 { + return reader.reader.elementName(); + } + + /// See `Reader.elementNameNs`. + pub inline fn elementNameNs(reader: @This()) PrefixedQName { + return reader.reader.elementNameNs(); + } + + /// See `Reader.attributeCount`. + pub inline fn attributeCount(reader: @This()) usize { + return reader.reader.attributeCount(); + } + + /// See `Reader.attributeName`. + pub inline fn attributeName(reader: @This(), n: usize) []const u8 { + return reader.reader.attributeName(n); + } + + /// See `Reader.attributeNameNs`. + pub inline fn attributeNameNs(reader: @This(), n: usize) PrefixedQName { + return reader.reader.attributeNameNs(n); + } + + /// See `Reader.attributeValue`. + pub inline fn attributeValue(reader: *@This(), n: usize) Allocator.Error![]const u8 { + return reader.reader.attributeValue(n); + } + + /// See `Reader.attributeValueAlloc`. + pub inline fn attributeValueAlloc(reader: @This(), gpa: Allocator, n: usize) Allocator.Error![]u8 { + return reader.reader.attributeValueAlloc(gpa, n); + } + + /// See `Reader.attributeValueWrite`. + pub inline fn attributeValueWrite(reader: @This(), n: usize, writer: anytype) @TypeOf(writer).Error!void { + return @errorCast(reader.reader.attributeValueWrite(n, writer.any())); + } + + /// See `Reader.attributeValueRaw`. + pub inline fn attributeValueRaw(reader: @This(), n: usize) []const u8 { + return reader.reader.attributeValueRaw(n); + } + + /// See `Reader.attributeLocation`. 
+ pub inline fn attributeLocation(reader: @This(), n: usize) Location { + return reader.reader.attributeLocation(n); + } + + /// See `Reader.attributeIndex`. + pub inline fn attributeIndex(reader: @This(), name: []const u8) ?usize { + return reader.reader.attributeIndex(name); + } + + /// See `Reader.attributeIndexNs`. + pub inline fn attributeIndexNs(reader: @This(), ns: []const u8, local: []const u8) ?usize { + return reader.reader.attributeIndexNs(ns, local); + } + + /// See `Reader.comment`. + pub inline fn comment(reader: *@This()) Allocator.Error![]const u8 { + return reader.reader.comment(); + } + + /// See `Reader.commentWrite`. + pub inline fn commentWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void { + return @errorCast(reader.reader.commentWrite(writer.any())); + } + + /// See `Reader.commentRaw`. + pub inline fn commentRaw(reader: @This()) []const u8 { + return reader.reader.commentRaw(); + } + + /// See `Reader.piTarget`. + pub inline fn piTarget(reader: @This()) []const u8 { + return reader.reader.piTarget(); + } + + /// See `Reader.piData`. + pub inline fn piData(reader: *@This()) Allocator.Error![]const u8 { + return reader.reader.piData(); + } + + /// See `Reader.piDataWrite`. + pub inline fn piDataWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void { + return @errorCast(reader.reader.piDataWrite(writer.any())); + } + + /// See `Reader.piDataRaw`. + pub inline fn piDataRaw(reader: @This()) []const u8 { + return reader.reader.piDataRaw(); + } + + /// See `Reader.text`. + pub inline fn text(reader: *@This()) Allocator.Error![]const u8 { + return reader.reader.text(); + } + + /// See `Reader.textWrite`. + pub inline fn textWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void { + return @errorCast(reader.reader.textWrite(writer.any())); + } + + /// See `Reader.textRaw`. + pub inline fn textRaw(reader: @This()) []const u8 { + return reader.reader.textRaw(); + } + + /// See `Reader.cdataWrite`. 
+ pub inline fn cdataWrite(reader: @This(), writer: anytype) @TypeOf(writer).Error!void { + return @errorCast(reader.reader.cdataWrite(writer.any())); + } + + /// See `Reader.cdata`. + pub inline fn cdata(reader: *@This()) Allocator.Error![]const u8 { + return reader.reader.cdata(); + } -pub const encoding = @import("encoding.zig"); + /// See `Reader.cdataRaw`. + pub inline fn cdataRaw(reader: @This()) []const u8 { + return reader.reader.cdataRaw(); + } -pub const Scanner = @import("Scanner.zig"); + /// See `Reader.entityReferenceName`. + pub inline fn entityReferenceName(reader: @This()) []const u8 { + return reader.reader.entityReferenceName(); + } -pub const tokenReader = @import("token_reader.zig").tokenReader; -pub const TokenReader = @import("token_reader.zig").TokenReader; -pub const TokenReaderOptions = @import("token_reader.zig").TokenReaderOptions; -pub const Token = @import("token_reader.zig").Token; + /// See `Reader.characterReferenceChar`. + pub inline fn characterReferenceChar(reader: @This()) u21 { + return reader.reader.characterReferenceChar(); + } -pub const reader = @import("reader.zig").reader; -pub const readDocument = @import("reader.zig").readDocument; -pub const Reader = @import("reader.zig").Reader; -pub const ReaderOptions = @import("reader.zig").ReaderOptions; -pub const QName = @import("reader.zig").QName; -pub const Event = @import("reader.zig").Event; + /// See `Reader.characterReferenceName`. + pub inline fn characterReferenceName(reader: @This()) []const u8 { + return reader.reader.characterReferenceName(); + } -pub const Node = @import("node.zig").Node; -pub const OwnedValue = @import("node.zig").OwnedValue; + /// See `Reader.namespaceUri`. 
+ pub inline fn namespaceUri(reader: @This(), prefix: []const u8) []const u8 { + return reader.reader.namespaceUri(prefix); + } -pub const writer = @import("writer.zig").writer; -pub const Writer = @import("writer.zig").Writer; -pub const fmtAttributeContent = @import("writer.zig").fmtAttributeContent; -pub const fmtElementContent = @import("writer.zig").fmtElementContent; -pub const fmtQName = @import("writer.zig").fmtQName; + /// Returns the underlying raw `Reader`. + pub inline fn raw(reader: *@This()) *Reader { + return &reader.reader; + } + }; +} + +pub const StaticDocument = struct { + data: []const u8, + pos: usize, + + pub const Error = error{}; + + pub fn init(data: []const u8) StaticDocument { + return .{ .data = data, .pos = 0 }; + } + + pub fn reader(doc: *StaticDocument, gpa: Allocator, options: Reader.Options) GenericReader(Error) { + return .{ .reader = Reader.init(gpa, doc.source(), options) }; + } + + pub fn source(doc: *StaticDocument) Reader.Source { + return .{ + .context = doc, + .moveFn = &move, + }; + } + + fn move(context: *const anyopaque, advance: usize, len: usize) anyerror![]const u8 { + const doc: *StaticDocument = @alignCast(@constCast(@ptrCast(context))); + doc.pos += advance; + const rest_doc = doc.data[doc.pos..]; + return rest_doc[0..@min(len, rest_doc.len)]; + } +}; + +pub fn StreamingDocument(comptime ReaderType: type) type { + return struct { + stream: ReaderType, + buf: []u8, + pos: usize, + avail: usize, + gpa: Allocator, + + pub const Error = ReaderType.Error || Allocator.Error; + + pub fn init(gpa: Allocator, stream: ReaderType) @This() { + return .{ + .stream = stream, + .buf = &.{}, + .pos = 0, + .avail = 0, + .gpa = gpa, + }; + } + + pub fn deinit(doc: *@This()) void { + doc.gpa.free(doc.buf); + doc.* = undefined; + } + + pub fn reader(doc: *@This(), gpa: Allocator, options: Reader.Options) GenericReader(Error) { + return .{ .reader = Reader.init(gpa, doc.source(), options) }; + } + + pub fn source(doc: *@This()) 
Reader.Source { + return .{ + .context = doc, + .moveFn = &move, + }; + } + + fn move(context: *const anyopaque, advance: usize, len: usize) anyerror![]const u8 { + const doc: *@This() = @alignCast(@constCast(@ptrCast(context))); + doc.pos += advance; + if (len <= doc.avail - doc.pos) return doc.buf[doc.pos..][0..len]; + doc.discardRead(); + try doc.fillBuffer(len); + return doc.buf[0..@min(len, doc.avail)]; + } + + fn discardRead(doc: *@This()) void { + doc.avail -= doc.pos; + std.mem.copyForwards(u8, doc.buf[0..doc.avail], doc.buf[doc.pos..][0..doc.avail]); + doc.pos = 0; + } + + const min_buf_len = 4096; + + fn fillBuffer(doc: *@This(), target_len: usize) !void { + if (target_len > doc.buf.len) { + const new_buf_len = @min(min_buf_len, std.math.ceilPowerOfTwoAssert(usize, target_len)); + doc.buf = try doc.gpa.realloc(doc.buf, new_buf_len); + } + doc.avail += try doc.stream.read(doc.buf[doc.avail..]); + } + }; +} + +pub fn streamingDocument(gpa: Allocator, reader: anytype) StreamingDocument(@TypeOf(reader)) { + return StreamingDocument(@TypeOf(reader)).init(gpa, reader); +} + +test streamingDocument { + var fbs = std.io.fixedBufferStream( + \\ + \\Hello, world! 
+ \\ + ); + var doc = streamingDocument(std.testing.allocator, fbs.reader()); + defer doc.deinit(); + var reader = doc.reader(std.testing.allocator, .{}); + defer reader.deinit(); + + try expectEqual(.xml_declaration, try reader.read()); + try expectEqualStrings("1.0", reader.xmlDeclarationVersion()); + + try expectEqual(.element_start, try reader.read()); + try expectEqualStrings("root", reader.elementName()); + + try expectEqual(.text, try reader.read()); + try expectEqualStrings("Hello, world!", reader.textRaw()); + + try expectEqual(.element_end, try reader.read()); + try expectEqualStrings("root", reader.elementName()); + + try expectEqual(.eof, try reader.read()); +} + +pub const Writer = @import("Writer.zig"); + +pub fn GenericWriter(comptime SinkError: type) type { + return struct { + writer: Writer, + + pub const WriteError = Writer.WriteError || SinkError; + + pub inline fn bom(writer: *@This()) WriteError!void { + return @errorCast(writer.writer.bom()); + } + + pub inline fn xmlDeclaration(writer: *@This(), encoding: ?[]const u8, standalone: ?bool) WriteError!void { + return @errorCast(writer.writer.xmlDeclaration(encoding, standalone)); + } + + pub inline fn elementStart(writer: *@This(), name: []const u8) WriteError!void { + return @errorCast(writer.writer.elementStart(name)); + } + + pub inline fn elementEnd(writer: *@This(), name: []const u8) WriteError!void { + return @errorCast(writer.writer.elementEnd(name)); + } + + pub inline fn elementEndEmpty(writer: *@This()) WriteError!void { + return @errorCast(writer.writer.elementEndEmpty()); + } + + pub inline fn attribute(writer: *@This(), name: []const u8, value: []const u8) WriteError!void { + return @errorCast(writer.writer.attribute(name, value)); + } + + pub inline fn pi(writer: *@This(), target: []const u8, data: []const u8) WriteError!void { + return @errorCast(writer.writer.pi(target, data)); + } + + pub inline fn text(writer: *@This(), s: []const u8) WriteError!void { + return 
@errorCast(writer.writer.text(s)); + } + }; +} + +pub fn StreamingOutput(comptime WriterType: type) type { + return struct { + stream: WriterType, + + pub const Error = WriterType.Error; + + pub fn writer(out: *const @This(), options: Writer.Options) GenericWriter(Error) { + return .{ .writer = Writer.init(out.sink(), options) }; + } + + pub fn sink(out: *const @This()) Writer.Sink { + return .{ + .context = out, + .writeFn = &write, + }; + } + + fn write(context: *const anyopaque, data: []const u8) anyerror!void { + const out: *const @This() = @alignCast(@ptrCast(context)); + var pos: usize = 0; + while (pos < data.len) { + pos += try out.stream.write(data[pos..]); + } + } + }; +} + +pub fn streamingOutput(writer: anytype) StreamingOutput(@TypeOf(writer)) { + return .{ .stream = writer }; +} test { - testing.refAllDecls(@This()); + _ = Reader; + _ = Writer; } diff --git a/test/xmlconf.zig b/test/xmlconf.zig deleted file mode 100644 index 4e17e69..0000000 --- a/test/xmlconf.zig +++ /dev/null @@ -1,471 +0,0 @@ -//! A test runner for the W3C XML conformance test suite: -//! https://www.w3.org/XML/Test/ - -const std = @import("std"); -const xml = @import("xml"); -const fs = std.fs; -const io = std.io; -const mem = std.mem; -const process = std.process; -const Allocator = mem.Allocator; -const ArrayListUnmanaged = std.ArrayListUnmanaged; - -const usage = - \\Usage: xmlconf [options] files... - \\ - \\The provided files are expected to be XML documents containing a root - \\TESTCASES element containing TESTs. 
- \\ - \\Options: - \\ -h, --help show help - \\ -v, --verbose enable verbose output - \\ -; - -const max_test_data_bytes = 2 * 1024 * 1024; // 2MB - -const Suite = struct { - profile: ?[]const u8, - tests: []const Test, -}; - -const Test = struct { - id: []const u8, - type: Type, - version: ?[]const u8, - edition: ?[]const u8, - entities: Entities, - namespace: bool, - sections: []const u8, - description: []const u8, - input: []const u8, - output: ?[]const u8, - - const Type = enum { - valid, - invalid, - @"not-wf", - @"error", - - fn parse(value: []const u8) !Type { - inline for (std.meta.fields(Type)) |field| { - if (mem.eql(u8, value, field.name)) { - return @enumFromInt(field.value); - } - } - return error.InvalidTest; - } - }; - - const Entities = enum { - both, - none, - parameter, - general, - - fn parse(value: []const u8) !Entities { - inline for (std.meta.fields(Entities)) |field| { - if (mem.eql(u8, value, field.name)) { - return @enumFromInt(field.value); - } - } - return error.InvalidTest; - } - }; -}; - -fn Context(comptime OutType: type) type { - return struct { - allocator: Allocator, - verbose: bool, - tty_config: io.tty.Config, - out: OutType, - passed: ArrayListUnmanaged(Test) = .{}, - failed: ArrayListUnmanaged(Test) = .{}, - skipped: ArrayListUnmanaged(Test) = .{}, - - const Self = @This(); - - fn msg(self: Self, comptime format: []const u8, args: anytype) !void { - try self.out.print(format ++ "\n", args); - } - - fn pass(self: *Self, @"test": Test) !void { - try self.passed.append(self.allocator, @"test"); - if (self.verbose) { - try self.tty_config.setColor(self.out, .green); - try self.out.print("PASS: {s} ({s})\n", .{ @"test".id, @"test".sections }); - try self.tty_config.setColor(self.out, .reset); - } - } - - fn fail(self: *Self, @"test": Test, reason: []const u8) !void { - try self.failed.append(self.allocator, @"test"); - try self.tty_config.setColor(self.out, .red); - try self.out.print("FAIL: {s} ({s}): {s}\n", .{ @"test".id, 
@"test".sections, reason }); - try self.tty_config.setColor(self.out, .reset); - } - - fn skip(self: *Self, @"test": Test, reason: []const u8) !void { - try self.skipped.append(self.allocator, @"test"); - if (self.verbose) { - try self.tty_config.setColor(self.out, .yellow); - try self.out.print("SKIP: {s} ({s}): {s}\n", .{ @"test".id, @"test".sections, reason }); - try self.tty_config.setColor(self.out, .reset); - } - } - }; -} - -fn context(allocator: Allocator, verbose: bool, tty_config: io.tty.Config, out: anytype) Context(@TypeOf(out)) { - return .{ .allocator = allocator, .verbose = verbose, .tty_config = tty_config, .out = out }; -} - -pub fn main() !void { - var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator); - defer arena.deinit(); - const allocator = arena.allocator(); - - var args_iter = try process.argsWithAllocator(allocator); - defer args_iter.deinit(); - _ = args_iter.skip(); - - const stderr = io.getStdErr().writer(); - - var allow_options = true; - var verbose = false; - var suites = ArrayListUnmanaged(Suite){}; - while (args_iter.next()) |arg| { - if (allow_options and mem.startsWith(u8, arg, "-")) { - if (std.mem.eql(u8, arg, "-h") or std.mem.eql(u8, arg, "--help")) { - try stderr.writeAll(usage); - process.exit(0); - } else if (std.mem.eql(u8, arg, "-v") or std.mem.eql(u8, arg, "--verbose")) { - verbose = true; - } else if (std.mem.eql(u8, arg, "--")) { - allow_options = false; - } else { - try stderr.print("unrecognized option: {s}", .{arg}); - process.exit(1); - } - } else { - var suite_dir = try fs.cwd().openDir(fs.path.dirname(arg) orelse ".", .{}); - defer suite_dir.close(); - var suite_file = try suite_dir.openFile(fs.path.basename(arg), .{}); - defer suite_file.close(); - - var buf_reader = io.bufferedReader(suite_file.reader()); - var suite_reader = xml.reader(allocator, buf_reader.reader(), xml.encoding.DefaultDecoder{}, .{}); - defer suite_reader.deinit(); - try suites.append(allocator, try readSuite(allocator, 
suite_dir, &suite_reader)); - } - } - - if (suites.items.len == 0) { - try stderr.writeAll("expected at least one test suite file"); - process.exit(1); - } - - const stdout = io.getStdOut(); - const tty_config = io.tty.detectConfig(stdout); - var stdout_buf = io.bufferedWriter(stdout.writer()); - var ctx = context(allocator, verbose, tty_config, stdout_buf.writer()); - - for (suites.items) |suite| { - try runSuite(suite, &ctx); - } - - try ctx.msg("DONE: {} passed, {} failed, {} skipped", .{ - ctx.passed.items.len, - ctx.failed.items.len, - ctx.skipped.items.len, - }); - try stdout_buf.flush(); -} - -fn readSuite(allocator: Allocator, suite_dir: fs.Dir, suite_reader: anytype) !Suite { - var profile: ?[]const u8 = null; - var tests = ArrayListUnmanaged(Test){}; - - while (try suite_reader.next()) |event| { - switch (event) { - .element_start => |element_start| if (element_start.name.is(null, "TESTCASES")) { - for (element_start.attributes) |attr| { - if (attr.name.is(null, "PROFILE")) { - profile = try allocator.dupe(u8, attr.value); - } - } - } else if (element_start.name.is(null, "TEST")) { - try tests.append(allocator, try readTest(allocator, suite_dir, element_start, suite_reader.children())); - } else { - try suite_reader.children().skip(); - }, - else => {}, - } - } - - return .{ - .profile = profile, - .tests = tests.items, - }; -} - -fn readTest(allocator: Allocator, suite_dir: fs.Dir, test_start: xml.Event.ElementStart, test_reader: anytype) !Test { - var id: ?[]const u8 = null; - var @"type": ?Test.Type = null; - var version: ?[]const u8 = null; - var edition: ?[]const u8 = null; - var entities = Test.Entities.none; - var namespace = true; - var sections: ?[]const u8 = null; - var description = ArrayListUnmanaged(u8){}; - var input: ?[]const u8 = null; - var output: ?[]const u8 = null; - - for (test_start.attributes) |attr| { - if (attr.name.is(null, "ID")) { - id = try allocator.dupe(u8, attr.value); - } else if (attr.name.is(null, "TYPE")) { - @"type" = 
try Test.Type.parse(attr.value); - } else if (attr.name.is(null, "VERSION")) { - version = try allocator.dupe(u8, attr.value); - } else if (attr.name.is(null, "EDITION")) { - edition = try allocator.dupe(u8, attr.value); - } else if (attr.name.is(null, "ENTITIES")) { - entities = try Test.Entities.parse(attr.value); - } else if (attr.name.is(null, "NAMESPACE")) { - namespace = mem.eql(u8, attr.value, "yes"); - } else if (attr.name.is(null, "SECTIONS")) { - sections = try allocator.dupe(u8, attr.value); - } else if (attr.name.is(null, "URI")) { - input = try suite_dir.readFileAlloc(allocator, attr.value, max_test_data_bytes); - } else if (attr.name.is(null, "OUTPUT")) { - output = try suite_dir.readFileAlloc(allocator, attr.value, max_test_data_bytes); - } - } - - while (try test_reader.next()) |event| { - switch (event) { - .element_content => |element_content| try description.appendSlice(allocator, element_content.content), - else => {}, - } - } - - return .{ - .id = id orelse return error.InvalidTest, - .type = @"type" orelse return error.InvalidTest, - .version = version, - .edition = edition, - .entities = entities, - .namespace = namespace, - .sections = sections orelse return error.InvalidTest, - .description = description.items, - .input = input orelse return error.InvalidTest, - .output = output, - }; -} - -fn runSuite(suite: Suite, ctx: anytype) !void { - try ctx.msg("START: {s}", .{suite.profile orelse "untitled"}); - var suite_ctx = context(ctx.allocator, ctx.verbose, ctx.tty_config, ctx.out); - for (suite.tests) |@"test"| { - try runTest(@"test", &suite_ctx); - } - try ctx.msg("DONE: {s}: passed={} failed={} skipped={}", .{ - suite.profile orelse "untitled", - suite_ctx.passed.items.len, - suite_ctx.failed.items.len, - suite_ctx.skipped.items.len, - }); - try ctx.passed.appendSlice(ctx.allocator, suite_ctx.passed.items); - try ctx.failed.appendSlice(ctx.allocator, suite_ctx.failed.items); - try ctx.skipped.appendSlice(ctx.allocator, 
suite_ctx.skipped.items); -} - -fn runTest(@"test": Test, ctx: anytype) !void { - if (@"test".version) |version| { - if (!mem.eql(u8, version, "1.0")) { - return try ctx.skip(@"test", "only XML 1.0 is supported"); - } - } - if (@"test".edition) |edition| { - // This check will technically be incorrect if a 15th edition is - // released at some point, which seems highly unlikely - if (mem.indexOfScalar(u8, edition, '5') == null) { - return try ctx.skip(@"test", "only the fifth edition of XML 1.0 is supported"); - } - } - - switch (@"test".type) { - .valid, .invalid => { - var input_stream = io.fixedBufferStream(@"test".input); - // TODO: making namespace_aware a comptime option makes this possibly more difficult than it should be - if (@"test".namespace) { - var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{}); - defer input_reader.deinit(); - try runTestValid(@"test", &input_reader, ctx); - } else { - var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{ - .namespace_aware = false, - }); - defer input_reader.deinit(); - try runTestValid(@"test", &input_reader, ctx); - } - }, - .@"not-wf" => { - var input_stream = io.fixedBufferStream(@"test".input); - if (@"test".namespace) { - var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{}); - defer input_reader.deinit(); - try runTestNonWf(@"test", &input_reader, ctx); - } else { - var input_reader = xml.reader(ctx.allocator, input_stream.reader(), xml.encoding.DefaultDecoder{}, .{ - .namespace_aware = false, - }); - defer input_reader.deinit(); - try runTestNonWf(@"test", &input_reader, ctx); - } - }, - .@"error" => return try ctx.skip(@"test", "TODO: not sure how to run error tests"), - } -} - -fn runTestValid(@"test": Test, input_reader: anytype, ctx: anytype) !void { - var buf = ArrayListUnmanaged(u8){}; - defer buf.deinit(ctx.allocator); - while (input_reader.next()) 
|maybe_event| { - if (maybe_event) |event| { - try writeCanonical(ctx.allocator, &buf, event); - } else { - if (@"test".output) |output| { - if (!mem.eql(u8, buf.items, output)) { - return try ctx.fail(@"test", "expected output does not match"); - } - } - return try ctx.pass(@"test"); - } - } else |e| switch (e) { - error.DoctypeNotSupported => return try ctx.skip(@"test", "doctype not supported"), - error.CannotUndeclareNsPrefix, - error.DuplicateAttribute, - error.InvalidCharacterReference, - error.InvalidEncoding, - error.InvalidNsBinding, - error.InvalidPiTarget, - error.InvalidQName, - error.InvalidUtf8, - error.InvalidUtf16, - error.MismatchedEndTag, - error.SyntaxError, - error.UndeclaredEntityReference, - error.UndeclaredNsPrefix, - error.UnexpectedEndOfInput, - error.QNameNotAllowed, - => return try ctx.fail(@"test", @errorName(e)), - else => |other_e| return other_e, - } -} - -fn runTestNonWf(@"test": Test, input_reader: anytype, ctx: anytype) !void { - while (input_reader.next()) |event| { - if (event == null) { - return try ctx.fail(@"test", "expected error, found none"); - } - } else |e| switch (e) { - error.DoctypeNotSupported => return try ctx.skip(@"test", "doctype not supported"), - error.CannotUndeclareNsPrefix, - error.DuplicateAttribute, - error.InvalidCharacterReference, - error.InvalidEncoding, - error.InvalidNsBinding, - error.InvalidPiTarget, - error.InvalidQName, - error.InvalidUtf8, - error.InvalidUtf16, - error.MismatchedEndTag, - error.SyntaxError, - error.UndeclaredEntityReference, - error.UndeclaredNsPrefix, - error.UnexpectedEndOfInput, - error.QNameNotAllowed, - => return try ctx.pass(@"test"), - else => |other_e| return other_e, - } -} - -fn writeCanonical(allocator: Allocator, buf: *ArrayListUnmanaged(u8), event: xml.Event) !void { - switch (event) { - .xml_declaration, .comment => {}, - .element_start => |element_start| { - try buf.append(allocator, '<'); - try writeQName(allocator, buf, element_start.name); - const attrs = try 
allocator.dupe(xml.Event.Attribute, element_start.attributes); - defer allocator.free(attrs); - std.sort.heap(xml.Event.Attribute, attrs, {}, attrLessThan); - for (attrs) |attr| { - try buf.append(allocator, ' '); - try writeQName(allocator, buf, attr.name); - try buf.appendSlice(allocator, "=\""); - try writeContent(allocator, buf, attr.value); - try buf.append(allocator, '"'); - } - try buf.append(allocator, '>'); - }, - .element_content => |element_content| { - try writeContent(allocator, buf, element_content.content); - }, - .element_end => |element_end| { - try buf.appendSlice(allocator, "'); - }, - .pi => |pi| { - try buf.appendSlice(allocator, ""); - }, - } -} - -fn writeQName(allocator: Allocator, buf: *ArrayListUnmanaged(u8), qname: xml.QName) !void { - if (qname.prefix) |prefix| { - try buf.appendSlice(allocator, prefix); - try buf.append(allocator, ':'); - } - try buf.appendSlice(allocator, qname.local); -} - -fn writeContent(allocator: Allocator, buf: *ArrayListUnmanaged(u8), content: []const u8) !void { - for (content) |c| { - switch (c) { - '&' => try buf.appendSlice(allocator, "&"), - '<' => try buf.appendSlice(allocator, "<"), - '>' => try buf.appendSlice(allocator, ">"), - '"' => try buf.appendSlice(allocator, """), - '\t' => try buf.appendSlice(allocator, " "), - '\n' => try buf.appendSlice(allocator, " "), - '\r' => try buf.appendSlice(allocator, " "), - else => try buf.append(allocator, c), - } - } -} - -fn attrLessThan(_: void, lhs: xml.Event.Attribute, rhs: xml.Event.Attribute) bool { - // This is a pretty stupid implementation, but it should work for all - // reasonable test cases - var lhs_buf: [1024]u8 = undefined; - const lhs_name = if (lhs.name.ns) |ns| - std.fmt.bufPrint(&lhs_buf, "{s}:{s}", .{ ns, lhs.name.local }) catch @panic("attribute name too long") - else - lhs.name.local; - - var rhs_buf: [1024]u8 = undefined; - const rhs_name = if (rhs.name.ns) |ns| - std.fmt.bufPrint(&rhs_buf, "{s}:{s}", .{ ns, rhs.name.local }) catch 
@panic("attribute name too long") - else - rhs.name.local; - - return mem.lessThan(u8, lhs_name, rhs_name); -} diff --git a/xmlconf/build.zig b/xmlconf/build.zig new file mode 100644 index 0000000..cc991f4 --- /dev/null +++ b/xmlconf/build.zig @@ -0,0 +1,44 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const xml = b.dependency("xml", .{ + .target = target, + .optimize = optimize, + }); + + const xmlconf_exe = b.addExecutable(.{ + .name = "xmlconf", + .root_source_file = b.path("src/xmlconf.zig"), + .target = target, + .optimize = optimize, + }); + xmlconf_exe.root_module.addImport("xml", xml.module("xml")); + b.installArtifact(xmlconf_exe); + + const xmlts = b.dependency("xmlts", .{}); + const xmlts_run = b.addRunArtifact(xmlconf_exe); + // Since we can't process DTDs yet, we need to manually specify the test + // suite root files individually. + const suite_paths: []const []const u8 = &.{ + "eduni/errata-2e/errata2e.xml", + "eduni/errata-3e/errata3e.xml", + "eduni/errata-4e/errata4e.xml", + "ibm/ibm_oasis_invalid.xml", + "ibm/ibm_oasis_not-wf.xml", + "ibm/ibm_oasis_valid.xml", + "japanese/japanese.xml", + "oasis/oasis.xml", + // The sun test suite files are not structured in a way we can handle + // without DTD support. 
+ "xmltest/xmltest.xml", + }; + for (suite_paths) |path| { + xmlts_run.addFileArg(xmlts.path(path)); + } + + const test_step = b.step("test", "Run the tests"); + test_step.dependOn(&xmlts_run.step); +} diff --git a/xmlconf/build.zig.zon b/xmlconf/build.zig.zon new file mode 100644 index 0000000..02735b6 --- /dev/null +++ b/xmlconf/build.zig.zon @@ -0,0 +1,18 @@ +.{ + .name = "xmlconf", + .version = "0.0.0", + .paths = .{ + "build.zig", + "build.zig.zon", + "src", + }, + .dependencies = .{ + .xml = .{ + .path = "..", + }, + .xmlts = .{ + .url = "https://www.w3.org/XML/Test/xmlts20130923.tar.gz", + .hash = "1220322f729089d5371fce0b0777edb9946cc54a389aa372c879d9c0843d862c4bbe", + }, + }, +} diff --git a/xmlconf/src/xmlconf.zig b/xmlconf/src/xmlconf.zig new file mode 100644 index 0000000..c9a4faa --- /dev/null +++ b/xmlconf/src/xmlconf.zig @@ -0,0 +1,456 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; +const log = std.log; +const xml = @import("xml"); + +const usage = + \\Usage: xmlconf [options] files... + \\ + \\Runs the provided xmlconf test suites. 
+ \\ + \\Options: + \\ -h, --help show help + \\ -v, --verbose increase verbosity + \\ +; + +var log_tty_config: std.io.tty.Config = undefined; // Will be initialized immediately in main +var log_level: std.log.Level = .warn; + +pub const std_options: std.Options = .{ + .logFn = logImpl, +}; + +pub fn logImpl( + comptime level: std.log.Level, + comptime scope: @Type(.enum_literal), + comptime format: []const u8, + args: anytype, +) void { + if (@intFromEnum(level) > @intFromEnum(log_level)) return; + + const prefix = if (scope == .default) + comptime level.asText() ++ ": " + else + comptime level.asText() ++ "(" ++ @tagName(scope) ++ "): "; + std.debug.lockStdErr(); + defer std.debug.unlockStdErr(); + const stderr = std.io.getStdErr().writer(); + log_tty_config.setColor(stderr, switch (level) { + .err => .bright_red, + .warn => .bright_yellow, + .info => .bright_blue, + .debug => .bright_magenta, + }) catch return; + stderr.writeAll(prefix) catch return; + log_tty_config.setColor(stderr, .reset) catch return; + stderr.print(format ++ "\n", args) catch return; +} + +pub fn main() !void { + log_tty_config = std.io.tty.detectConfig(std.io.getStdErr()); + + var arena_state = std.heap.ArenaAllocator.init(std.heap.page_allocator); + defer arena_state.deinit(); + const arena = arena_state.allocator(); + + var suite_paths = std.ArrayList([]const u8).init(arena); + + var args: ArgIterator = .{ .args = try std.process.argsWithAllocator(arena) }; + _ = args.next(); + while (args.next()) |arg| { + switch (arg) { + .option => |option| if (option.is('h', "help")) { + try std.io.getStdOut().writeAll(usage); + std.process.exit(0); + } else if (option.is('v', "verbose")) { + log_level = switch (log_level) { + .err => .warn, + .warn => .info, + .info => .debug, + .debug => .debug, + }; + } else { + fatal("unrecognized option: {}", .{option}); + }, + .param => |param| { + try suite_paths.append(try arena.dupe(u8, param)); + }, + .unexpected_value => |unexpected_value| 
fatal("unexpected value to --{s}: {s}", .{ + unexpected_value.option, + unexpected_value.value, + }), + } + } + + var gpa_state: std.heap.GeneralPurposeAllocator(.{}) = .{}; + defer _ = gpa_state.deinit(); + const gpa = gpa_state.allocator(); + + var results: Results = .{}; + for (suite_paths.items) |suite_path| { + runFile(gpa, suite_path, &results) catch |err| + results.err("running suite {s}: {}", .{ suite_path, err }); + } + std.debug.print("{} passed, {} failed, {} skipped\n", .{ results.passed, results.failed, results.skipped }); + std.process.exit(if (results.ok()) 0 else 1); +} + +fn fatal(comptime format: []const u8, args: anytype) noreturn { + log.err(format, args); + std.process.exit(1); +} + +const Results = struct { + passed: usize = 0, + failed: usize = 0, + skipped: usize = 0, + run_error: bool = false, + + fn ok(results: Results) bool { + return results.failed == 0 and !results.run_error; + } + + fn pass(results: *Results, id: []const u8) void { + log.debug("pass: {s}", .{id}); + results.passed += 1; + } + + fn fail(results: *Results, id: []const u8, comptime fmt: []const u8, args: anytype) void { + log.err("fail: {s}: " ++ fmt, .{id} ++ args); + results.failed += 1; + } + + fn skip(results: *Results, id: []const u8, comptime fmt: []const u8, args: anytype) void { + log.info("skip: {s}: " ++ fmt, .{id} ++ args); + results.skipped += 1; + } + + fn err(results: *Results, comptime fmt: []const u8, args: anytype) void { + log.err(fmt, args); + results.run_error = true; + } +}; + +const max_file_size = 2 * 1024 * 1024; + +fn runFile(gpa: Allocator, path: []const u8, results: *Results) !void { + var dir = try std.fs.cwd().openDir(std.fs.path.dirname(path) orelse ".", .{}); + defer dir.close(); + const data = try dir.readFileAlloc(gpa, std.fs.path.basename(path), max_file_size); + defer gpa.free(data); + var doc = xml.StaticDocument.init(data); + var reader = doc.reader(gpa, .{}); + defer reader.deinit(); + + try reader.skipProlog(); + if (!std.mem.eql(u8, 
"TESTCASES", reader.elementName())) return error.InvalidTest; + try runSuite(gpa, dir, reader.raw(), results); +} + +fn runSuite(gpa: Allocator, dir: std.fs.Dir, reader: *xml.Reader, results: *Results) !void { + if (reader.attributeIndex("PROFILE")) |profile_attr| { + log.info("suite: {s}", .{try reader.attributeValue(profile_attr)}); + } + + while (true) { + switch (try reader.read()) { + .element_start => if (std.mem.eql(u8, reader.elementName(), "TESTCASES")) { + try runSuite(gpa, dir, reader, results); + } else if (std.mem.eql(u8, reader.elementName(), "TEST")) { + try runTest(gpa, dir, reader, results); + } else { + return error.InvalidTest; + }, + .element_end => break, + else => {}, + } + } +} + +fn runTest(gpa: Allocator, dir: std.fs.Dir, reader: *xml.Reader, results: *Results) !void { + const @"type" = type: { + const index = reader.attributeIndex("TYPE") orelse return error.InvalidTest; + break :type std.meta.stringToEnum(TestType, try reader.attributeValue(index)) orelse return error.InvalidTest; + }; + const id = id: { + const index = reader.attributeIndex("ID") orelse return error.InvalidTest; + break :id try reader.attributeValueAlloc(gpa, index); + }; + defer gpa.free(id); + if (reader.attributeIndex("VERSION")) |index| check_version: { + const versions = try reader.attributeValue(index); + var iter = std.mem.splitScalar(u8, versions, ' '); + while (iter.next()) |version| { + if (std.mem.eql(u8, version, "1.0")) break :check_version; + } + return results.skip(id, "only XML 1.0 is supported", .{}); + } + if (reader.attributeIndex("EDITION")) |index| check_edition: { + const editions = try reader.attributeValue(index); + var iter = std.mem.splitScalar(u8, editions, ' '); + while (iter.next()) |edition| { + if (std.mem.eql(u8, edition, "5")) break :check_edition; + } + return results.skip(id, "only the fifth edition of XML 1.0 is supported", .{}); + } + const namespace = namespace: { + const index = reader.attributeIndex("NAMESPACE") orelse break 
:namespace .yes; + break :namespace std.meta.stringToEnum(enum { yes, no }, try reader.attributeValue(index)) orelse return error.InvalidTest; + }; + const input = input: { + const index = reader.attributeIndex("URI") orelse return error.InvalidTest; + const path = try reader.attributeValue(index); + break :input dir.readFileAlloc(gpa, path, max_file_size) catch |err| + return results.err("{s}: reading input file: {s}: {}", .{ id, path, err }); + }; + defer gpa.free(input); + const output = output: { + const index = reader.attributeIndex("OUTPUT") orelse break :output null; + const path = try reader.attributeValue(index); + break :output dir.readFileAlloc(gpa, path, max_file_size) catch |err| + return results.err("{s}: reading output file: {s}: {}", .{ id, path, err }); + }; + defer if (output) |o| gpa.free(o); + try reader.skipElement(); + + if (std.mem.startsWith(u8, input, "\xFE\xFF") or + std.mem.startsWith(u8, input, "\xFF\xFE")) + { + return results.skip(id, "UTF-16 unsupported", .{}); + } + + const options: TestOptions = .{ + .namespace = namespace == .yes, + }; + switch (@"type") { + .valid, .invalid => try runTestParseable(gpa, id, input, output, options, results), + .@"not-wf" => try runTestUnparseable(gpa, id, input, options, results), + .@"error" => results.skip(id, "not sure how to run error tests", .{}), + } +} + +const TestOptions = struct { + namespace: bool, +}; + +fn runTestParseable( + gpa: Allocator, + id: []const u8, + input: []const u8, + output: ?[]const u8, + options: TestOptions, + results: *Results, +) !void { + var doc = xml.StaticDocument.init(input); + var reader = doc.reader(gpa, .{ + .namespace_aware = options.namespace, + }); + defer reader.deinit(); + + var canonical_buf = std.ArrayList(u8).init(gpa); + defer canonical_buf.deinit(); + var canonical_output = xml.streamingOutput(canonical_buf.writer()); + var canonical = canonical_output.writer(.{}); + + while (true) { + const node = reader.read() catch |err| switch (err) { + 
error.MalformedXml => { + switch (reader.errorCode()) { + .doctype_unsupported => return results.skip(id, "doctype unsupported", .{}), + .xml_declaration_encoding_unsupported => return results.skip(id, "encoding unsupported", .{}), + else => |code| { + const loc = reader.errorLocation(); + return results.fail(id, "malformed: {}:{}: {}", .{ loc.line, loc.column, code }); + }, + } + }, + error.OutOfMemory => return error.OutOfMemory, + }; + switch (node) { + .eof => break, + .xml_declaration, .comment => {}, // ignored in canonical form + .element_start => { + try canonical.elementStart(reader.elementName()); + + const sorted_attrs = try gpa.alloc(usize, reader.attributeCount()); + defer gpa.free(sorted_attrs); + for (0..reader.attributeCount()) |i| sorted_attrs[i] = i; + std.sort.pdq(usize, sorted_attrs, reader, struct { + fn lessThan(r: @TypeOf(reader), lhs: usize, rhs: usize) bool { + return std.mem.lessThan(u8, r.attributeName(lhs), r.attributeName(rhs)); + } + }.lessThan); + for (sorted_attrs) |i| { + try canonical.attribute(reader.attributeName(i), try reader.attributeValue(i)); + } + }, + .element_end => { + try canonical.elementEnd(reader.elementName()); + }, + .pi => { + try canonical.pi(reader.piTarget(), try reader.piData()); + }, + .text => { + try canonical.text(try reader.text()); + }, + .cdata => { + try canonical.text(try reader.cdata()); + }, + .character_reference => { + var buf: [4]u8 = undefined; + const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable; + try canonical.text(buf[0..len]); + }, + .entity_reference => { + const value = xml.predefined_entities.get(reader.entityReferenceName()) orelse unreachable; + try canonical.text(value); + }, + } + } + + if (output) |expected_canonical| { + if (!std.mem.eql(u8, canonical_buf.items, expected_canonical)) { + return results.fail( + id, + "canonical output does not match\n\nexpected:\n{s}\n\nactual:{s}", + .{ expected_canonical, canonical_buf.items }, + ); + } + } 
+ return results.pass(id); +} + +fn runTestUnparseable( + gpa: Allocator, + id: []const u8, + input: []const u8, + options: TestOptions, + results: *Results, +) !void { + var doc = xml.StaticDocument.init(input); + var reader = doc.reader(gpa, .{ + .namespace_aware = options.namespace, + }); + defer reader.deinit(); + + while (true) { + const node = reader.read() catch |err| switch (err) { + error.MalformedXml => switch (reader.errorCode()) { + .doctype_unsupported => return results.skip(id, "doctype unsupported", .{}), + .xml_declaration_encoding_unsupported => return results.skip(id, "encoding unsupported", .{}), + else => return results.pass(id), + }, + error.OutOfMemory => return error.OutOfMemory, + }; + if (node == .eof) return results.fail(id, "expected to fail to parse", .{}); + } +} + +const TestType = enum { + valid, + invalid, + @"not-wf", + @"error", +}; + +// Inspired by https://github.com/judofyr/parg +const ArgIterator = struct { + args: std.process.ArgIterator, + state: union(enum) { + normal, + short: []const u8, + long: struct { + option: []const u8, + value: []const u8, + }, + params_only, + } = .normal, + + const Arg = union(enum) { + option: union(enum) { + short: u8, + long: []const u8, + + fn is(option: @This(), short: ?u8, long: ?[]const u8) bool { + return switch (option) { + .short => |c| short == c, + .long => |s| std.mem.eql(u8, long orelse return false, s), + }; + } + + pub fn format(option: @This(), comptime _: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + switch (option) { + .short => |c| try writer.print("-{c}", .{c}), + .long => |s| try writer.print("--{s}", .{s}), + } + } + }, + param: []const u8, + unexpected_value: struct { + option: []const u8, + value: []const u8, + }, + }; + + fn deinit(iter: *ArgIterator) void { + iter.args.deinit(); + iter.* = undefined; + } + + fn next(iter: *ArgIterator) ?Arg { + switch (iter.state) { + .normal => { + const arg = iter.args.next() orelse return null; + if (std.mem.eql(u8, 
arg, "--")) { + iter.state = .params_only; + return .{ .param = iter.args.next() orelse return null }; + } else if (std.mem.startsWith(u8, arg, "--")) { + if (std.mem.indexOfScalar(u8, arg, '=')) |equals_index| { + const option = arg["--".len..equals_index]; + iter.state = .{ .long = .{ + .option = option, + .value = arg[equals_index + 1 ..], + } }; + return .{ .option = .{ .long = option } }; + } else { + return .{ .option = .{ .long = arg["--".len..] } }; + } + } else if (std.mem.startsWith(u8, arg, "-") and arg.len > 1) { + if (arg.len > 2) { + iter.state = .{ .short = arg["-".len + 1 ..] }; + } + return .{ .option = .{ .short = arg["-".len] } }; + } else { + return .{ .param = arg }; + } + }, + .short => |rest| { + if (rest.len > 1) { + iter.state = .{ .short = rest[1..] }; + } + return .{ .option = .{ .short = rest[0] } }; + }, + .long => |long| return .{ .unexpected_value = .{ + .option = long.option, + .value = long.value, + } }, + .params_only => return .{ .param = iter.args.next() orelse return null }, + } + } + + fn optionValue(iter: *ArgIterator) ?[]const u8 { + switch (iter.state) { + .normal => return iter.args.next(), + .short => |rest| { + iter.state = .normal; + return rest; + }, + .long => |long| { + iter.state = .normal; + return long.value; + }, + .params_only => unreachable, + } + } +};