diff --git a/build.zig b/build.zig
index 55fdf6b..74e5707 100644
--- a/build.zig
+++ b/build.zig
@@ -35,8 +35,9 @@ pub fn build(b: *Build) void {
docs_step.dependOn(&xml_docs_copy.step);
const install_examples_step = b.step("install-examples", "Build and install the example programs");
+
const example_reader_exe = b.addExecutable(.{
- .name = "example-reader",
+ .name = "reader",
.root_source_file = b.path("examples/reader.zig"),
.target = target,
.optimize = optimize,
@@ -44,4 +45,14 @@ pub fn build(b: *Build) void {
example_reader_exe.root_module.addImport("xml", xml);
const example_reader_install = b.addInstallArtifact(example_reader_exe, .{});
install_examples_step.dependOn(&example_reader_install.step);
+
+ const example_canonicalize_exe = b.addExecutable(.{
+ .name = "canonicalize",
+ .root_source_file = b.path("examples/canonicalize.zig"),
+ .target = target,
+ .optimize = optimize,
+ });
+ example_canonicalize_exe.root_module.addImport("xml", xml);
+ const example_canonicalize_install = b.addInstallArtifact(example_canonicalize_exe, .{});
+ install_examples_step.dependOn(&example_canonicalize_install.step);
}
diff --git a/examples/canonicalize.zig b/examples/canonicalize.zig
new file mode 100644
index 0000000..149ba7d
--- /dev/null
+++ b/examples/canonicalize.zig
@@ -0,0 +1,91 @@
+const std = @import("std");
+const log = std.log;
+const xml = @import("xml");
+
+pub fn main() !void { // Reads the XML file named on the command line and re-emits it through the XML writer on stdout.
+ var gpa_state: std.heap.GeneralPurposeAllocator(.{}) = .{};
+ defer _ = gpa_state.deinit();
+ const gpa = gpa_state.allocator();
+
+ var args_iter = try std.process.argsWithAllocator(gpa);
+ defer args_iter.deinit();
+ _ = args_iter.next(); // discard the first argument (conventionally the program name)
+ var pretty = false;
+ var input: ?[]u8 = null; // owned copy of the input path, freed by the defer below
+ defer if (input) |f| gpa.free(f);
+ while (args_iter.next()) |arg| {
+ if (std.mem.eql(u8, arg, "-p") or std.mem.eql(u8, arg, "--pretty")) {
+ pretty = true;
+ } else {
+ if (input != null) return error.InvalidArguments; // usage: canonicalize [-p|--pretty] file
+ input = try gpa.dupe(u8, arg);
+ }
+ }
+
+ var input_file = try std.fs.cwd().openFile(input orelse return error.InvalidArguments, .{}); // a missing file argument is also InvalidArguments
+ defer input_file.close();
+ var doc = xml.streamingDocument(gpa, input_file.reader());
+ defer doc.deinit();
+ var reader = doc.reader(gpa, .{});
+ defer reader.deinit();
+
+ var stdout_buf = std.io.bufferedWriter(std.io.getStdOut().writer());
+ const stdout_output = xml.streamingOutput(stdout_buf.writer());
+ var writer = stdout_output.writer(gpa, .{
+ .indent = if (pretty) " " else "", // an empty indent disables pretty-printing
+ });
+ defer writer.deinit();
+
+ while (true) { // one iteration per node reported by the reader, until .eof
+ const node = reader.read() catch |err| switch (err) {
+ error.MalformedXml => {
+ const loc = reader.errorLocation();
+ log.err("{}:{}: {}", .{ loc.line, loc.column, reader.errorCode() }); // report position and error code before failing
+ return error.MalformedXml; // NOTE(review): returns without flushing stdout_buf, so buffered output is dropped on this path
+ },
+ else => |other| return other,
+ };
+ switch (node) {
+ .eof => break,
+ .xml_declaration, .comment => {}, // ignored in canonical form
+ .element_start => {
+ try writer.elementStart(reader.elementName());
+
+ const sorted_attrs = try gpa.alloc(usize, reader.attributeCount()); // attribute indexes, to be ordered by name
+ defer gpa.free(sorted_attrs);
+ for (0..reader.attributeCount()) |i| sorted_attrs[i] = i;
+ std.sort.pdq(usize, sorted_attrs, reader, struct { // the reader is the sort context so names can be looked up by index
+ fn lessThan(r: @TypeOf(reader), lhs: usize, rhs: usize) bool {
+ return std.mem.lessThan(u8, r.attributeName(lhs), r.attributeName(rhs)); // byte-wise comparison of attribute names
+ }
+ }.lessThan);
+ for (sorted_attrs) |i| { // emit attributes in sorted-name order
+ try writer.attribute(reader.attributeName(i), try reader.attributeValue(i));
+ }
+ },
+ .element_end => {
+ try writer.elementEnd();
+ },
+ .pi => {
+ try writer.pi(reader.piTarget(), try reader.piData());
+ },
+ .text => {
+ try writer.text(try reader.text());
+ },
+ .cdata => {
+ try writer.text(try reader.cdata()); // CDATA content is written as an ordinary text node
+ },
+ .character_reference => {
+ var buf: [4]u8 = undefined; // scratch space for one UTF-8 encoded code point
+ const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable; // assumes the reader only reports encodable code points; TODO confirm
+ try writer.text(buf[0..len]);
+ },
+ .entity_reference => {
+ const value = xml.predefined_entities.get(reader.entityReferenceName()) orelse unreachable; // assumes the reader rejects non-predefined entities; TODO confirm
+ try writer.text(value);
+ },
+ }
+ }
+
+ try stdout_buf.flush(); // push any remaining buffered output to stdout
+}
diff --git a/examples/reader.zig b/examples/reader.zig
index 40779f0..33610c5 100644
--- a/examples/reader.zig
+++ b/examples/reader.zig
@@ -9,7 +9,7 @@ pub fn main() !void {
const args = try std.process.argsAlloc(gpa);
defer std.process.argsFree(gpa, args);
if (args.len != 2) {
- return error.InvalidArguments; // usage: example-reader file
+ return error.InvalidArguments; // usage: reader file
}
var input_file = try std.fs.cwd().openFile(args[1], .{});
diff --git a/fuzz/src/fuzz.zig b/fuzz/src/fuzz.zig
index 86f553b..b2aa5e0 100644
--- a/fuzz/src/fuzz.zig
+++ b/fuzz/src/fuzz.zig
@@ -16,11 +16,53 @@ fn fuzz(gpa: Allocator, input: []const u8) !void {
var doc = xml.StaticDocument.init(input);
var reader = doc.reader(gpa, .{});
defer reader.deinit();
+
+ var out_bytes = std.ArrayList(u8).init(gpa);
+ defer out_bytes.deinit();
+ const output = xml.streamingOutput(out_bytes.writer());
+ var writer = output.writer(gpa, .{});
+ defer writer.deinit();
+
while (true) {
const node = reader.read() catch |err| switch (err) {
error.MalformedXml => break,
error.OutOfMemory => return error.OutOfMemory,
};
- if (node == .eof) break;
+ switch (node) {
+ .eof => break,
+ .xml_declaration => {
+ try writer.xmlDeclaration(reader.xmlDeclarationEncoding(), reader.xmlDeclarationStandalone());
+ },
+ .comment => {
+ // TODO: not implemented yet
+ },
+ .element_start => {
+ try writer.elementStart(reader.elementName());
+ for (0..reader.attributeCount()) |i| {
+ try writer.attribute(reader.attributeName(i), try reader.attributeValue(i));
+ }
+ },
+ .element_end => {
+ try writer.elementEnd();
+ },
+ .pi => {
+ try writer.pi(reader.piTarget(), try reader.piData());
+ },
+ .text => {
+ try writer.text(try reader.text());
+ },
+ .cdata => {
+ try writer.text(try reader.cdata());
+ },
+ .character_reference => {
+ var buf: [4]u8 = undefined;
+ const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable;
+ try writer.text(buf[0..len]);
+ },
+ .entity_reference => {
+ const value = xml.predefined_entities.get(reader.entityReferenceName()) orelse unreachable;
+ try writer.text(value);
+ },
+ }
}
}
diff --git a/src/Reader.zig b/src/Reader.zig
index fea9106..52ec967 100644
--- a/src/Reader.zig
+++ b/src/Reader.zig
@@ -2236,6 +2236,6 @@ fn addString(reader: *Reader, s: []const u8) !StringIndex {
return @enumFromInt(start);
}
-fn string(reader: Reader, index: StringIndex) []const u8 {
+fn string(reader: *const Reader, index: StringIndex) []const u8 {
return std.mem.sliceTo(reader.strings.items[@intFromEnum(index)..], 0);
}
diff --git a/src/Writer.zig b/src/Writer.zig
index 0e3a529..43dc5d8 100644
--- a/src/Writer.zig
+++ b/src/Writer.zig
@@ -1,17 +1,61 @@
const std = @import("std");
+const Allocator = std.mem.Allocator;
const assert = std.debug.assert;
+const expectEqual = std.testing.expectEqual;
+const expectEqualStrings = std.testing.expectEqualStrings;
+
+const ns_xmlns = @import("xml.zig").ns_xmlns;
+const predefined_namespace_prefixes = @import("xml.zig").predefined_namespace_prefixes;
+const streamingOutput = @import("xml.zig").streamingOutput;
options: Options,
state: State,
-indent_level: u32,
+/// String data for the current element nesting context.
+/// Each element start node appends the name of the element to this buffer, and
+/// the element name is followed by any namespace prefixes and URIs declared on
+/// the element so they can be referenced by `ns_prefixes`.
+strings: std.ArrayListUnmanaged(u8),
+/// The start indexes of the element names in `strings`.
+element_names: std.ArrayListUnmanaged(StringIndex),
+/// The namespace prefixes declared by the current nesting context of elements.
+ns_prefixes: std.ArrayListUnmanaged(std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex)),
+/// Pending namespace prefixes to be declared on the next element start.
+pending_ns: std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex),
+/// A counter for the next generated `ns123` namespace prefix to be used.
+gen_ns_prefix_counter: u32,
sink: Sink,
+gpa: Allocator,
+
const Writer = @This();
pub const Options = struct {
+ /// A string to be used as indentation for the output.
+ /// An empty value indicates no attempt should be made to pretty-print the
+ /// output.
+ ///
+ /// Using any value aside from an empty string may technically change the
+ /// content of the output according to the spec, because leading and
+ /// trailing whitespace within element content is always significant.
+ /// For example, the following XML samples are _not_ strictly equivalent:
+ ///
+ /// ```xml
+ /// <root><child/></root>
+ /// ```
+ ///
+ /// and
+ ///
+ /// ```xml
+ /// <root>
+ ///   <child/>
+ /// </root>
+ /// ```
indent: []const u8 = "",
+ /// Whether the writer should be aware of XML namespaces. The `Ns`-suffixed
+ /// functions of `Writer` may only be used when this is enabled.
+ namespace_aware: bool = true,
};
pub const Sink = struct {
@@ -33,50 +77,184 @@ const State = enum {
end,
};
-pub fn init(sink: Sink, options: Options) Writer {
+pub fn init(gpa: Allocator, sink: Sink, options: Options) Writer {
return .{
.options = options,
.state = .start,
- .indent_level = 0,
+ .strings = .{},
+ .element_names = .{},
+ .ns_prefixes = .{},
+ .pending_ns = .{},
+ .gen_ns_prefix_counter = 0,
.sink = sink,
+
+ .gpa = gpa,
};
}
+/// Releases every allocation owned by the writer and leaves it undefined.
+/// The writer must not be used again after this call.
+pub fn deinit(writer: *Writer) void {
+    const gpa = writer.gpa;
+    // Each entry of `ns_prefixes` is a map with its own allocation, so the
+    // maps are released before the list that holds them.
+    for (writer.ns_prefixes.items) |*scope| {
+        scope.deinit(gpa);
+    }
+    writer.ns_prefixes.deinit(gpa);
+    writer.pending_ns.deinit(gpa);
+    writer.element_names.deinit(gpa);
+    writer.strings.deinit(gpa);
+    writer.* = undefined;
+}
+
pub const WriteError = error{};
+/// Writes the BOM (byte-order mark).
+/// Asserts that the writer is at the beginning of the document.
pub fn bom(writer: *Writer) anyerror!void {
assert(writer.state == .start);
- try writer.raw("\u{FEFF}");
+ try writer.write("\u{FEFF}");
writer.state = .after_bom;
}
+test bom {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.bom();
+ try writer.elementStart("root");
+ try writer.elementEndEmpty();
+
+ try expectEqualStrings("\u{FEFF}", raw.items);
+}
+
+/// Writes the XML declaration.
+/// Asserts that the writer is at the beginning of the document or just after the BOM (if any).
pub fn xmlDeclaration(writer: *Writer, encoding: ?[]const u8, standalone: ?bool) anyerror!void {
assert(writer.state == .start or writer.state == .after_bom);
- try writer.raw("");
+ try writer.write("?>");
if (writer.options.indent.len > 0) try writer.newLineAndIndent();
writer.state = .after_xml_declaration;
}
+test xmlDeclaration {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.xmlDeclaration("UTF-8", true);
+ try writer.elementStart("root");
+ try writer.elementEndEmpty();
+
+ try expectEqualStrings(
+ \\
+ \\
+ , raw.items);
+}
+
+/// Starts an element.
+/// Asserts that the writer is not after the end of the root element.
pub fn elementStart(writer: *Writer, name: []const u8) anyerror!void {
+ if (writer.options.namespace_aware) prefixed: { // only split on ':' when namespaces are enabled
+ const colon_pos = std.mem.indexOfScalar(u8, name, ':') orelse break :prefixed; // no colon: fall through to the unprefixed path
+ const prefix = name[0..colon_pos]; // text before the first ':'
+ const local = name[colon_pos + 1 ..]; // text after the first ':'
+ try writer.elementStartInternal(prefix, local);
+ return;
+ }
+ try writer.elementStartInternal("", name); // unprefixed name, written as-is
+}
+
+test elementStart {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStart("root");
+ try writer.elementStart("element");
+ try writer.elementEnd();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ \\
+ , raw.items);
+}
+
+/// Starts a namespaced element.
+/// Asserts that the writer is namespace-aware and not after the end of the
+/// root element.
+///
+/// Currently, this function also asserts that `ns` is not empty, although that
+/// may be supported in the future.
+///
+/// If `ns` is already bound to a prefix (via an attribute or `bindNs`), that
+/// prefix will be used. Otherwise, a generated namespace prefix counting
+/// upwards from `ns0` will be declared and used.
+pub fn elementStartNs(writer: *Writer, ns: []const u8, local: []const u8) anyerror!void {
+ assert(writer.options.namespace_aware);
+ // TODO: XML 1.0 does not allow undeclaring namespace prefixes, so ensuring
+ // the empty namespace is actually used here is potentially quite tricky.
+ // For now, it is not allowed.
+ assert(ns.len > 0);
+ const prefix = writer.getNsPrefix(ns) orelse prefix: { // reuse an existing binding when there is one
+ const str = try writer.generateNsPrefix();
+ // If we are already inside an element start, we don't want to
+ // immediately bind our new prefix in that scope. Rather, we
+ // want to wait to bind it on the newly started element.
+ try writer.pending_ns.put(writer.gpa, str, try writer.addString(ns));
+ break :prefix writer.string(str); // resolved after addString, which may have resized `strings`
+ };
+ try writer.elementStartInternal(prefix, local); // also emits the pending xmlns declarations
+}
+
+test elementStartNs {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStartNs("http://example.com/foo", "root");
+ try writer.elementStartNs("http://example.com/bar", "element");
+ try writer.elementStartNs("http://example.com/foo", "element");
+ try writer.elementEnd();
+ try writer.elementEnd();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ \\
+ \\
+ \\
+ , raw.items);
+}
+
+fn elementStartInternal(writer: *Writer, prefix: []const u8, local: []const u8) !void {
switch (writer.state) {
.start, .after_bom, .after_xml_declaration, .text => {},
.element_start => {
- try writer.raw(">");
+ try writer.write(">");
try writer.newLineAndIndent();
},
.after_structure_end => {
@@ -84,18 +262,51 @@ pub fn elementStart(writer: *Writer, name: []const u8) anyerror!void {
},
.end => unreachable,
}
- try writer.raw("<");
- try writer.raw(name);
+
+ try writer.write("<");
+ if (prefix.len > 0) {
+ try writer.write(prefix);
+ try writer.write(":");
+ }
+ try writer.write(local);
+
+ // TODO: this is what I would _like_ to do, but prefix may point into
+ // strings, which can be invalidated while resizing it...
+ // const element_name = try writer.addPrefixedString(prefix, local);
+ // This temporary allocation is reliable, but ugly. At least local won't
+ // point into strings, so we can avoid the allocation if there's no prefix.
+ const element_name = if (prefix.len > 0) name: {
+ const tmp = try std.fmt.allocPrint(writer.gpa, "{s}:{s}", .{ prefix, local });
+ defer writer.gpa.free(tmp);
+ break :name try writer.addString(tmp);
+ } else try writer.addString(local);
+ try writer.element_names.append(writer.gpa, element_name);
writer.state = .element_start;
- writer.indent_level += 1;
+
+ if (writer.options.namespace_aware) {
+ var ns_prefixes: std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex) = .{};
+ try ns_prefixes.ensureUnusedCapacity(writer.gpa, writer.pending_ns.count());
+ var pending_ns_iter = writer.pending_ns.iterator();
+ while (pending_ns_iter.next()) |pending_ns| {
+ try writer.attributeInternal("xmlns", writer.string(pending_ns.key_ptr.*), writer.string(pending_ns.value_ptr.*));
+ // The pending_ns strings point into the string memory of the
+ // enclosing element, so they are guaranteed to remain valid for
+ // the lifetime of the current element.
+ try ns_prefixes.put(writer.gpa, pending_ns.key_ptr.*, pending_ns.value_ptr.*);
+ }
+ try writer.ns_prefixes.append(writer.gpa, ns_prefixes);
+ writer.pending_ns.clearRetainingCapacity();
+ }
}
-pub fn elementEnd(writer: *Writer, name: []const u8) anyerror!void {
- writer.indent_level -= 1;
+/// Ends the currently open element.
+/// Asserts that the writer is inside an element.
+pub fn elementEnd(writer: *Writer) anyerror!void {
+ const name = writer.element_names.pop();
switch (writer.state) {
.text => {},
.element_start => {
- try writer.raw(">");
+ try writer.write(">");
try writer.newLineAndIndent();
},
.after_structure_end => {
@@ -103,33 +314,193 @@ pub fn elementEnd(writer: *Writer, name: []const u8) anyerror!void {
},
.start, .after_bom, .after_xml_declaration, .end => unreachable,
}
- try writer.raw("");
- try writer.raw(name);
- try writer.raw(">");
- writer.state = if (writer.indent_level > 0) .after_structure_end else .end;
+ try writer.write("");
+ try writer.write(writer.string(name));
+ try writer.write(">");
+ writer.state = if (writer.element_names.items.len > 0) .after_structure_end else .end;
+ writer.strings.shrinkRetainingCapacity(@intFromEnum(name));
+ if (writer.options.namespace_aware) {
+ var ns_prefixes = writer.ns_prefixes.pop();
+ ns_prefixes.deinit(writer.gpa);
+ writer.pending_ns.clearRetainingCapacity();
+ }
+}
+
+test elementEnd {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStart("root");
+ try writer.elementStart("element");
+ try writer.elementEnd();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ \\
+ , raw.items);
}
+/// Ends the currently open element as an empty element (`<foo/>`).
+/// Asserts that the writer is in an element start.
+///
+/// Performs the same bookkeeping as `elementEnd`: the element's name and any
+/// strings added while it was open are discarded, its namespace declarations
+/// go out of scope, and the writer moves to the end state when the element
+/// that was closed is the root element.
 pub fn elementEndEmpty(writer: *Writer) anyerror!void {
     assert(writer.state == .element_start);
-    try writer.raw("/>");
-    writer.state = .after_structure_end;
-    writer.indent_level -= 1;
+    try writer.write("/>");
+    const name = writer.element_names.pop();
+    // Closing the outermost element ends the document, exactly as in
+    // `elementEnd`; otherwise we are back inside the parent's content.
+    writer.state = if (writer.element_names.items.len > 0) .after_structure_end else .end;
+    // Drop the element name and everything stored after it (for example
+    // prefixes and URIs declared on this element).
+    writer.strings.shrinkRetainingCapacity(@intFromEnum(name));
+    if (writer.options.namespace_aware) {
+        // `elementStartInternal` pushed a prefix map for this element. It must
+        // be popped here too, or the element's namespace bindings would stay
+        // visible to later siblings and the scope stack would drift out of
+        // step with `element_names`.
+        var ns_prefixes = writer.ns_prefixes.pop();
+        ns_prefixes.deinit(writer.gpa);
+        writer.pending_ns.clearRetainingCapacity();
+    }
 }
+test elementEndEmpty {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStart("root");
+ try writer.elementStart("element");
+ try writer.elementEndEmpty();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ , raw.items);
+}
+
+/// Adds an attribute to the current element start.
+/// Asserts that the writer is in an element start.
+///
+/// If the writer is namespace-aware, namespace declarations are recognized and
+/// registered for future use by "Ns"-suffixed functions.
pub fn attribute(writer: *Writer, name: []const u8, value: []const u8) anyerror!void {
assert(writer.state == .element_start);
- try writer.raw(" ");
- try writer.raw(name);
- try writer.raw("=\"");
+ if (writer.options.namespace_aware) prefixed: {
+ if (std.mem.eql(u8, name, "xmlns")) { // default namespace declaration
+ const new_ns = try writer.addString(value);
+ const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; // prefix map of the element being started
+ try ns_prefixes.put(writer.gpa, .empty, new_ns); // the empty prefix stands for the default namespace
+ }
+ const colon_pos = std.mem.indexOfScalar(u8, name, ':') orelse break :prefixed; // no colon: write the name unprefixed below
+ const prefix = name[0..colon_pos];
+ const local = name[colon_pos + 1 ..];
+ if (std.mem.eql(u8, prefix, "xmlns")) { // prefixed namespace declaration: xmlns:local="value"
+ const new_prefix = try writer.addString(local);
+ const new_ns = try writer.addString(value);
+ const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; // looked up after both addString calls
+ try ns_prefixes.put(writer.gpa, new_prefix, new_ns);
+ }
+ try writer.attributeInternal(prefix, local, value);
+ return;
+ }
+ try writer.attributeInternal("", name, value); // not namespace-aware, or no prefix in the name
+}
+
+test attribute {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStart("root");
+ try writer.attribute("key", "value");
+ try writer.attribute("xmlns", "http://example.com");
+ try writer.attribute("xmlns:a", "http://example.com/a");
+ try writer.elementStartNs("http://example.com", "element");
+ try writer.elementEndEmpty();
+ try writer.elementStartNs("http://example.com/a", "element");
+ try writer.elementEndEmpty();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ \\
+ , raw.items);
+}
+
+/// Adds a namespaced attribute to the current element start.
+/// Asserts that the writer is namespace-aware and in an element start.
+///
+/// Currently, this function also asserts that `ns` is not empty, although that
+/// may be supported in the future.
+///
+/// If `ns` is already bound to a non-empty prefix (via an attribute or
+/// `bindNs`), that prefix will be used. Otherwise, a generated namespace
+/// prefix counting upwards from `ns0` will be declared and used. A default
+/// namespace declaration (`xmlns="..."`) is never used for this purpose,
+/// because default namespaces do not apply to attribute names: an unprefixed
+/// attribute is in no namespace.
+///
+/// Attributes in the `xmlns` namespace are recognized as namespace
+/// declarations and registered for future use by "Ns"-suffixed functions.
+pub fn attributeNs(writer: *Writer, ns: []const u8, local: []const u8, value: []const u8) anyerror!void {
+    assert(writer.options.namespace_aware);
+    assert(writer.state == .element_start);
+    // TODO: XML 1.0 does not allow undeclaring namespace prefixes, so ensuring
+    // the empty namespace is actually used here is potentially quite tricky.
+    // For now, it is not allowed.
+    assert(ns.len > 0);
+    if (std.mem.eql(u8, ns, ns_xmlns)) {
+        // This attribute is itself a namespace declaration: remember the
+        // binding on the element currently being started.
+        const new_prefix = try writer.addString(local);
+        const new_ns = try writer.addString(value);
+        const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1];
+        try ns_prefixes.put(writer.gpa, new_prefix, new_ns);
+    }
+    const prefix = prefix: {
+        if (writer.getNsPrefix(ns)) |existing| {
+            // An empty prefix means `ns` is only known as the default
+            // namespace, which cannot be used to qualify an attribute.
+            if (existing.len > 0) break :prefix existing;
+        }
+        const str = try writer.generateNsPrefix();
+        try writer.bindNsImmediate(str, ns);
+        // Resolve the string only after the calls above, which may have
+        // resized the string buffer.
+        break :prefix writer.string(str);
+    };
+    try writer.attributeInternal(prefix, local, value);
+}
+
+test attributeNs {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStart("root");
+ try writer.attributeNs("http://example.com", "key", "value");
+ try writer.attributeNs("http://www.w3.org/2000/xmlns/", "a", "http://example.com/a");
+ try writer.elementStartNs("http://example.com", "element");
+ try writer.elementEndEmpty();
+ try writer.elementStartNs("http://example.com/a", "element");
+ try writer.elementEndEmpty();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ \\
+ , raw.items);
+}
+
+fn attributeInternal(writer: *Writer, prefix: []const u8, name: []const u8, value: []const u8) !void {
+ assert(writer.state == .element_start);
+ try writer.write(" ");
+ if (prefix.len > 0) {
+ try writer.write(prefix);
+ try writer.write(":");
+ }
+ try writer.write(name);
+ try writer.write("=\"");
try writer.attributeText(value);
- try writer.raw("\"");
+ try writer.write("\"");
}
fn attributeText(writer: *Writer, s: []const u8) anyerror!void {
var pos: usize = 0;
while (std.mem.indexOfAnyPos(u8, s, pos, "\r\n\t&<\"")) |esc_pos| {
- try writer.raw(s[pos..esc_pos]);
- try writer.raw(switch (s[esc_pos]) {
+ try writer.write(s[pos..esc_pos]);
+ try writer.write(switch (s[esc_pos]) {
'\r' => "
",
'\n' => "
",
'\t' => " ",
@@ -140,38 +511,64 @@ fn attributeText(writer: *Writer, s: []const u8) anyerror!void {
});
pos = esc_pos + 1;
}
- try writer.raw(s[pos..]);
+ try writer.write(s[pos..]);
}
+/// Writes a PI (processing instruction).
pub fn pi(writer: *Writer, target: []const u8, data: []const u8) anyerror!void {
switch (writer.state) {
.start, .after_bom, .after_xml_declaration, .text, .end => {},
.element_start => {
- try writer.raw(">");
+ try writer.write(">");
try writer.newLineAndIndent();
},
.after_structure_end => {
try writer.newLineAndIndent();
},
}
- try writer.raw("");
- try writer.raw(target);
- try writer.raw(" ");
- try writer.raw(data);
- try writer.raw("?>");
+ try writer.write("");
+ try writer.write(target);
+ if (data.len > 0) {
+ try writer.write(" ");
+ try writer.write(data);
+ }
+ try writer.write("?>");
writer.state = .after_structure_end;
}
+test pi {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.pi("some-pi", "some pi data");
+ try writer.elementStart("root");
+ try writer.pi("handle-me", "");
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ \\
+ , raw.items);
+}
+
+/// Writes a text node, escaping the text where necessary to preserve its value
+/// in the resulting XML.
+/// Asserts that the writer is in an element.
pub fn text(writer: *Writer, s: []const u8) anyerror!void {
switch (writer.state) {
.after_structure_end, .text => {},
- .element_start => try writer.raw(">"),
+ .element_start => try writer.write(">"),
.start, .after_bom, .after_xml_declaration, .end => unreachable,
}
var pos: usize = 0;
while (std.mem.indexOfAnyPos(u8, s, pos, "\r&<")) |esc_pos| {
- try writer.raw(s[pos..esc_pos]);
- try writer.raw(switch (s[esc_pos]) {
+ try writer.write(s[pos..esc_pos]);
+ try writer.write(switch (s[esc_pos]) {
'\r' => "
",
'&' => "&",
'<' => "<",
@@ -179,17 +576,39 @@ pub fn text(writer: *Writer, s: []const u8) anyerror!void {
});
pos = esc_pos + 1;
}
- try writer.raw(s[pos..]);
+ try writer.write(s[pos..]);
writer.state = .text;
}
-// insert some existing XML document without escaping anything
+test text {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.elementStart("root");
+ try writer.text("Sample XML: \r\n&\r\n");
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\Sample XML: <root>
+ \\&
+ \\</root>
+ , raw.items);
+}
+
+/// Writes an XML fragment without escaping anything.
+///
+/// For correctness, the XML fragment must not contain any unclosed structures.
+/// For example, the fragment `` is illegal, as the element `foo` remains
+/// unclosed after embedding. Similarly, ` {},
- .element_start => try writer.raw(">"),
+ .element_start => try writer.write(">"),
}
- try writer.raw(s);
+ try writer.write(s);
writer.state = switch (writer.state) {
.start, .after_bom, .after_xml_declaration => .after_xml_declaration,
.element_start, .after_structure_end, .text => .text,
@@ -197,60 +616,192 @@ pub fn embed(writer: *Writer, s: []const u8) anyerror!void {
};
}
-fn newLineAndIndent(writer: *Writer) anyerror!void {
- if (writer.options.indent.len == 0) return;
+test embed {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
- try writer.raw("\n");
- var n: usize = 0;
- while (n < writer.indent_level) : (n += 1) {
- try writer.raw(writer.options.indent);
- }
+ try writer.xmlDeclaration("UTF-8", null);
+ try writer.elementStart("foo");
+ try writer.embed("Baz!");
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\Baz!
+ , raw.items);
}
-fn raw(writer: *Writer, s: []const u8) anyerror!void {
- try writer.sink.write(s);
+/// Binds a namespace URI to a prefix.
+///
+/// If the writer is currently inside an element start, the namespace is
+/// declared immediately. Otherwise, it will be declared on the next element
+/// started.
+pub fn bindNs(writer: *Writer, prefix: []const u8, ns: []const u8) anyerror!void {
+ try writer.bindNsInternal(try writer.addString(prefix), ns);
}
-test {
- _ = T;
+test bindNs {
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ // Namespaces may be bound before the element they apply to, allowing a
+ // prefix to be bound for a namespaced element.
+ try writer.bindNs("ex", "http://example.com");
+ try writer.elementStartNs("http://example.com", "root");
+ try writer.attributeNs("http://example.com", "a", "value");
+ try writer.elementStartNs("http://example.com", "element");
+ try writer.bindNs("ex2", "http://example.com/ns2");
+ try writer.attributeNs("http://example.com/ns2", "a", "value");
+ // It doesn't matter if a namespace prefix is ever used: it will be
+ // declared regardless.
+ try writer.bindNs("ex3", "http://example.com/ns3");
+ try writer.elementEndEmpty();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\
+ \\
+ , raw.items);
}
-const T = struct {
- const Testbed = struct {
- buf: std.ArrayList(u8),
- fn init(a: std.mem.Allocator) Testbed {
- return .{
- .buf = std.ArrayList(u8).init(a),
- };
- }
- fn writer(self: *Testbed, indent: []const u8) Writer {
- return Writer.init(.{
- .context = self,
- .writeFn = write,
- }, .{ .indent = indent });
- }
- fn write(context: *const anyopaque, data: []const u8) anyerror!void {
- // TODO not sure why context is const.
- var self: *Testbed = @constCast(@alignCast(@ptrCast(context)));
- try self.buf.appendSlice(data);
+
+fn bindNsInternal(writer: *Writer, prefix_str: StringIndex, ns: []const u8) !void { // Binds an already-stored prefix to `ns`, now or on the next element start.
+ if (writer.state == .element_start) {
+ try writer.bindNsImmediate(prefix_str, ns); // an element start is open: declare on it right away
+ } else {
+ const ns_str = try writer.addString(ns);
+ try writer.pending_ns.put(writer.gpa, prefix_str, ns_str); // deferred: pending_ns is consumed by the next element start
+ }
+}
+
+fn bindNsImmediate(writer: *Writer, prefix_str: StringIndex, ns: []const u8) !void { // Writes an xmlns:prefix declaration on the open element start and records the binding.
+ const ns_str = try writer.addString(ns); // store the URI before resolving prefix_str, since addString may resize `strings`
+ try writer.attributeInternal("xmlns", writer.string(prefix_str), ns); // NOTE(review): an empty prefix would be written as `xmlns:=` here; confirm callers never pass one
+ const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; // prefix map of the innermost open element
+ try ns_prefixes.put(writer.gpa, prefix_str, ns_str);
+}
+
+/// Returns a prefix that refers to `ns` at the current position, or null if
+/// no usable binding exists. An empty result means `ns` is the default
+/// namespace.
+///
+/// Predefined namespaces are checked first, then declarations pending for the
+/// next element start, then the open elements from innermost to outermost.
+/// A binding from an outer element is skipped when the same prefix has been
+/// declared again closer to the current position, since the prefix would then
+/// resolve to a different namespace.
+///
+/// Except for predefined prefixes, the returned slice points into
+/// `writer.strings` and is invalidated by anything that adds a string.
+fn getNsPrefix(writer: *Writer, ns: []const u8) ?[]const u8 {
+    if (predefined_namespace_prefixes.get(ns)) |prefix| return prefix;
+
+    // Potential optimization opportunity: store a mapping of namespace URIs
+    // to prefixes and update it when an element closes or a new prefix is
+    // bound.
+
+    var pending_ns = writer.pending_ns.iterator();
+    while (pending_ns.next()) |pending| {
+        if (std.mem.eql(u8, ns, writer.string(pending.value_ptr.*))) {
+            return writer.string(pending.key_ptr.*);
         }
-        fn output(self: *Testbed) []const u8 {
-            return self.buf.items;
+    }
+
+    var i: usize = writer.ns_prefixes.items.len;
+    while (i > 0) {
+        i -= 1;
+        var ns_prefixes = writer.ns_prefixes.items[i].iterator();
+        while (ns_prefixes.next()) |ns_prefix| {
+            if (!std.mem.eql(u8, ns, writer.string(ns_prefix.value_ptr.*))) continue;
+            const prefix = writer.string(ns_prefix.key_ptr.*);
+            // Scopes deeper than `i` did not bind any prefix to `ns` (they
+            // were searched first), so a redeclaration of this prefix there
+            // necessarily points at some other namespace.
+            if (!writer.isNsPrefixRebound(prefix, i + 1)) return prefix;
         }
-        fn deinit(self: *Testbed) void {
-            self.buf.deinit();
+    }
+    return null;
+}
+
+/// Returns whether `prefix` is declared by the pending namespace declarations
+/// or by any open element at nesting index `first_scope` or deeper.
+fn isNsPrefixRebound(writer: *const Writer, prefix: []const u8, first_scope: usize) bool {
+    for (writer.pending_ns.keys()) |pending_prefix| {
+        if (std.mem.eql(u8, prefix, writer.string(pending_prefix))) return true;
+    }
+    for (writer.ns_prefixes.items[first_scope..]) |scope| {
+        for (scope.keys()) |scope_prefix| {
+            if (std.mem.eql(u8, prefix, writer.string(scope_prefix))) return true;
+        }
+    }
+    return false;
+}
+
+/// Generates a fresh `nsN` prefix, stores it in the string buffer and returns
+/// its index. The prefix is not bound to anything by this function.
+///
+/// Candidates are taken from `gen_ns_prefix_counter`, which is advanced for
+/// every candidate tried. A candidate is rejected when any open element or
+/// any declaration pending for the next element start already uses that
+/// prefix, so the generated declaration can never duplicate or shadow one
+/// that is in effect.
+fn generateNsPrefix(writer: *Writer) !StringIndex {
+    // Large enough for "ns" followed by the biggest possible counter value.
+    const max_len = std.fmt.comptimePrint("ns{}", .{std.math.maxInt(@TypeOf(writer.gen_ns_prefix_counter))}).len;
+    var buf: [max_len]u8 = undefined;
+    gen_prefix: while (true) {
+        const prefix = std.fmt.bufPrint(&buf, "ns{}", .{writer.gen_ns_prefix_counter}) catch unreachable;
+        writer.gen_ns_prefix_counter += 1;
+        // Pending declarations end up on the same element start as the
+        // generated one; a clash there would emit the same `xmlns:` attribute
+        // twice.
+        for (writer.pending_ns.keys()) |pending_prefix| {
+            if (std.mem.eql(u8, prefix, writer.string(pending_prefix))) {
+                continue :gen_prefix;
+            }
+        }
+        for (writer.ns_prefixes.items) |ns_prefixes| {
+            for (ns_prefixes.keys()) |existing_prefix| {
+                if (std.mem.eql(u8, prefix, writer.string(existing_prefix))) {
+                    continue :gen_prefix;
+                }
+            }
+        }
-    };
-    test "embed" {
-        var tb = Testbed.init(std.testing.allocator);
-        defer tb.deinit();
-        var wtr = tb.writer(" ");
-        try wtr.xmlDeclaration("UTF-8", null);
-        try wtr.elementStart("foo");
-        try wtr.embed("Baz!");
-        try wtr.elementEnd("foo");
-        try std.testing.expectEqualStrings(
-            \\
-            \\Baz!
-        , tb.output());
+        return try writer.addString(prefix);
     }
+}
+
+/// Starts a new output line and indents it once per currently open element.
+/// Does nothing when the `indent` option is empty.
+fn newLineAndIndent(writer: *Writer) anyerror!void {
+    const indent = writer.options.indent;
+    if (indent.len == 0) return;
+
+    try writer.write("\n");
+    var remaining = writer.element_names.items.len;
+    while (remaining > 0) : (remaining -= 1) {
+        try writer.write(indent);
+    }
+}
+
+fn write(writer: *Writer, s: []const u8) anyerror!void { // Passes `s` to the sink unchanged; no escaping happens here.
+ try writer.sink.write(s);
+}
+
+const StringIndex = enum(usize) { empty = 0, _ }; // Start offset of a 0-terminated string in `strings`; offset 0 always resolves to the empty string.
+
+const StringIndexAdapter = struct { // Hash-map adapter comparing a plain string key against stored StringIndex keys; not referenced in the visible code.
+ strings: []const u8, // buffer that the StringIndex values point into
+
+ pub fn hash(ctx: @This(), key: []const u8) u32 {
+ _ = ctx;
+ return @truncate(std.hash.Wyhash.hash(0, key)); // low 32 bits of the 64-bit Wyhash value
+ }
+
+ pub fn eql(ctx: @This(), a: []const u8, b: StringIndex, b_index: usize) bool {
+ _ = b_index;
+ const b_val = std.mem.sliceTo(ctx.strings[@intFromEnum(b)..], 0); // stored string runs up to the next 0 byte
+ return std.mem.eql(u8, a, b_val);
}
};
+
+/// Appends `s` to the string buffer, preceded by a 0 separator byte, and
+/// returns the index at which the copy of `s` starts.
+/// May resize the buffer, invalidating slices previously obtained from it.
+fn addString(writer: *Writer, s: []const u8) !StringIndex {
+    const strings = &writer.strings;
+    try strings.ensureUnusedCapacity(writer.gpa, 1 + s.len);
+    // The separator terminates whatever string precedes this one.
+    strings.appendAssumeCapacity(0);
+    const index: StringIndex = @enumFromInt(strings.items.len);
+    strings.appendSliceAssumeCapacity(s);
+    return index;
+}
+
+/// Appends the qualified name `prefix:s` to the string buffer, preceded by a
+/// 0 separator byte, and returns the index at which the name starts.
+/// With an empty `prefix` this is the same as `addString(s)`.
+/// May resize the buffer, so neither argument may point into it.
+fn addPrefixedString(writer: *Writer, prefix: []const u8, s: []const u8) !StringIndex {
+    if (prefix.len == 0) return writer.addString(s);
+
+    const strings = &writer.strings;
+    const needed = 1 + prefix.len + ":".len + s.len;
+    try strings.ensureUnusedCapacity(writer.gpa, needed);
+    strings.appendAssumeCapacity(0);
+    const index: StringIndex = @enumFromInt(strings.items.len);
+    for ([_][]const u8{ prefix, ":", s }) |part| {
+        strings.appendSliceAssumeCapacity(part);
+    }
+    return index;
+}
+
+fn string(writer: *const Writer, index: StringIndex) []const u8 { // Resolves a StringIndex to its text; the slice points into `strings`.
+ return std.mem.sliceTo(writer.strings.items[@intFromEnum(index)..], 0); // the string ends at the next 0 separator or at the end of the buffer
+}
+
+test "namespace prefix strings resize bug" {
+ // Reported here: https://github.com/ianprime0509/zig-xml/pull/41#issuecomment-2449960818
+ var raw = std.ArrayList(u8).init(std.testing.allocator);
+ defer raw.deinit();
+ const out = streamingOutput(raw.writer());
+ var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+ defer writer.deinit();
+
+ try writer.bindNs("d", "foospace");
+ try writer.elementStartNs("foospace", "root");
+ try writer.elementStartNs("foospace", "child");
+ try writer.text("Hello, Bug");
+ try writer.elementEnd();
+ try writer.elementEnd();
+
+ try expectEqualStrings(
+ \\
+ \\ Hello, Bug
+ \\
+ , raw.items);
+}
diff --git a/src/xml.zig b/src/xml.zig
index ceaaae9..5029e44 100644
--- a/src/xml.zig
+++ b/src/xml.zig
@@ -76,6 +76,10 @@ pub const predefined_namespace_uris = std.StaticStringMap([]const u8).initCompti
.{ "xml", ns_xml },
.{ "xmlns", ns_xmlns },
});
+/// Maps each predefined namespace URI to its prefix (`ns_xml` to "xml",
+/// `ns_xmlns` to "xmlns"); the reverse lookup of `predefined_namespace_uris`.
+pub const predefined_namespace_prefixes = std.StaticStringMap([]const u8).initComptime(.{
+ .{ ns_xml, "xml" },
+ .{ ns_xmlns, "xmlns" },
+});
pub const Reader = @import("Reader.zig");
@@ -434,43 +438,73 @@ pub fn GenericWriter(comptime SinkError: type) type {
+ // Wrapper around `Writer` whose methods report the narrower `WriteError`
+ // set instead of the wrapped call's error set.
return struct {
+ /// The wrapped writer; every method below forwards to it.
writer: Writer,
- pub const WriteError = Writer.WriteError || SinkError;
+ /// See `Writer.deinit`.
+ pub inline fn deinit(writer: *@This()) void {
+ writer.writer.deinit();
+ }
+
+ // TODO: not all the write functions actually need to allocate
+ /// Errors a write can return: the wrapped writer's own errors, the
+ /// sink's errors, and allocation failure.
+ pub const WriteError = Writer.WriteError || SinkError || Allocator.Error;
+ // Each wrapper below uses `@errorCast` to convert the wrapped call's
+ // error set to `WriteError`.
+ /// See `Writer.bom`.
pub inline fn bom(writer: *@This()) WriteError!void {
return @errorCast(writer.writer.bom());
}
+ /// See `Writer.xmlDeclaration`.
pub inline fn xmlDeclaration(writer: *@This(), encoding: ?[]const u8, standalone: ?bool) WriteError!void {
return @errorCast(writer.writer.xmlDeclaration(encoding, standalone));
}
+ /// See `Writer.elementStart`.
pub inline fn elementStart(writer: *@This(), name: []const u8) WriteError!void {
return @errorCast(writer.writer.elementStart(name));
}
- pub inline fn elementEnd(writer: *@This(), name: []const u8) WriteError!void {
- return @errorCast(writer.writer.elementEnd(name));
+ /// See `Writer.elementStartNs`.
+ pub inline fn elementStartNs(writer: *@This(), ns: []const u8, local: []const u8) WriteError!void {
+ return @errorCast(writer.writer.elementStartNs(ns, local));
}
+ /// See `Writer.elementEnd`.
+ pub inline fn elementEnd(writer: *@This()) WriteError!void {
+ return @errorCast(writer.writer.elementEnd());
+ }
+
+ /// See `Writer.elementEndEmpty`.
pub inline fn elementEndEmpty(writer: *@This()) WriteError!void {
return @errorCast(writer.writer.elementEndEmpty());
}
+ /// See `Writer.attribute`.
pub inline fn attribute(writer: *@This(), name: []const u8, value: []const u8) WriteError!void {
return @errorCast(writer.writer.attribute(name, value));
}
+ /// See `Writer.attributeNs`.
+ pub inline fn attributeNs(writer: *@This(), ns: []const u8, local: []const u8, value: []const u8) WriteError!void {
+ return @errorCast(writer.writer.attributeNs(ns, local, value));
+ }
+
+ /// See `Writer.pi`.
pub inline fn pi(writer: *@This(), target: []const u8, data: []const u8) WriteError!void {
return @errorCast(writer.writer.pi(target, data));
}
+ /// See `Writer.text`.
pub inline fn text(writer: *@This(), s: []const u8) WriteError!void {
return @errorCast(writer.writer.text(s));
}
+ /// See `Writer.embed`.
pub inline fn embed(writer: *@This(), s: []const u8) WriteError!void {
return @errorCast(writer.writer.embed(s));
}
+
+ /// See `Writer.bindNs`.
+ pub inline fn bindNs(writer: *@This(), prefix: []const u8, ns: []const u8) WriteError!void {
+ return @errorCast(writer.writer.bindNs(prefix, ns));
+ }
};
}
@@ -480,8 +514,8 @@ pub fn StreamingOutput(comptime WriterType: type) type {
pub const Error = WriterType.Error;
- pub fn writer(out: *const @This(), options: Writer.Options) GenericWriter(Error) {
- return .{ .writer = Writer.init(out.sink(), options) };
+ /// Returns a `GenericWriter` that writes to this output's sink. `gpa` and
+ /// `options` are passed through to `Writer.init`.
+ pub fn writer(out: *const @This(), gpa: Allocator, options: Writer.Options) GenericWriter(Error) {
+ return .{ .writer = Writer.init(gpa, out.sink(), options) };
}
pub fn sink(out: *const @This()) Writer.Sink {
@@ -505,6 +539,28 @@ pub fn streamingOutput(writer: anytype) StreamingOutput(@TypeOf(writer)) {
return .{ .stream = writer };
}
+test streamingOutput {
+    // Writes a small document through a writer backed by an in-memory byte
+    // list and checks the serialized result.
+    var raw = std.ArrayList(u8).init(std.testing.allocator);
+    defer raw.deinit();
+    const out = streamingOutput(raw.writer());
+    var writer = out.writer(std.testing.allocator, .{ .indent = " " });
+    defer writer.deinit();
+
+    try writer.xmlDeclaration("UTF-8", null);
+    try writer.elementStart("test");
+    try writer.elementStart("inner");
+    try writer.text("Hello, world!");
+    try writer.elementEnd();
+    try writer.elementEnd();
+
+    // NOTE(review): the expected markup below was missing (only the text
+    // content remained) and has been reconstructed from the calls above;
+    // confirm the declaration format and the indent width against the
+    // writer's actual output.
+    try expectEqualStrings(
+        \\<?xml version="1.0" encoding="UTF-8"?>
+        \\<test>
+        \\ <inner>Hello, world!</inner>
+        \\</test>
+    , raw.items);
+}
+
test {
_ = Location;
_ = QName;
diff --git a/xmlconf/src/xmlconf.zig b/xmlconf/src/xmlconf.zig
index c9a4faa..b21d410 100644
--- a/xmlconf/src/xmlconf.zig
+++ b/xmlconf/src/xmlconf.zig
@@ -250,8 +250,9 @@ fn runTestParseable(
var canonical_buf = std.ArrayList(u8).init(gpa);
defer canonical_buf.deinit();
- var canonical_output = xml.streamingOutput(canonical_buf.writer());
- var canonical = canonical_output.writer(.{});
+ const canonical_output = xml.streamingOutput(canonical_buf.writer());
+ var canonical = canonical_output.writer(gpa, .{});
+ defer canonical.deinit();
while (true) {
const node = reader.read() catch |err| switch (err) {
@@ -286,7 +287,7 @@ fn runTestParseable(
}
},
.element_end => {
- try canonical.elementEnd(reader.elementName());
+ try canonical.elementEnd();
},
.pi => {
try canonical.pi(reader.piTarget(), try reader.piData());