diff --git a/build.zig b/build.zig index 55fdf6b..74e5707 100644 --- a/build.zig +++ b/build.zig @@ -35,8 +35,9 @@ pub fn build(b: *Build) void { docs_step.dependOn(&xml_docs_copy.step); const install_examples_step = b.step("install-examples", "Build and install the example programs"); + const example_reader_exe = b.addExecutable(.{ - .name = "example-reader", + .name = "reader", .root_source_file = b.path("examples/reader.zig"), .target = target, .optimize = optimize, @@ -44,4 +45,14 @@ pub fn build(b: *Build) void { example_reader_exe.root_module.addImport("xml", xml); const example_reader_install = b.addInstallArtifact(example_reader_exe, .{}); install_examples_step.dependOn(&example_reader_install.step); + + const example_canonicalize_exe = b.addExecutable(.{ + .name = "canonicalize", + .root_source_file = b.path("examples/canonicalize.zig"), + .target = target, + .optimize = optimize, + }); + example_canonicalize_exe.root_module.addImport("xml", xml); + const example_canonicalize_install = b.addInstallArtifact(example_canonicalize_exe, .{}); + install_examples_step.dependOn(&example_canonicalize_install.step); } diff --git a/examples/canonicalize.zig b/examples/canonicalize.zig new file mode 100644 index 0000000..149ba7d --- /dev/null +++ b/examples/canonicalize.zig @@ -0,0 +1,91 @@ +const std = @import("std"); +const log = std.log; +const xml = @import("xml"); + +pub fn main() !void { + var gpa_state: std.heap.GeneralPurposeAllocator(.{}) = .{}; + defer _ = gpa_state.deinit(); + const gpa = gpa_state.allocator(); + + var args_iter = try std.process.argsWithAllocator(gpa); + defer args_iter.deinit(); + _ = args_iter.next(); + var pretty = false; + var input: ?[]u8 = null; + defer if (input) |f| gpa.free(f); + while (args_iter.next()) |arg| { + if (std.mem.eql(u8, arg, "-p") or std.mem.eql(u8, arg, "--pretty")) { + pretty = true; + } else { + if (input != null) return error.InvalidArguments; // usage: canonicalize [-p|--pretty] file + input = try gpa.dupe(u8, arg); + } + } + + var input_file = try std.fs.cwd().openFile(input orelse return error.InvalidArguments, .{}); + defer input_file.close(); + var doc = xml.streamingDocument(gpa, input_file.reader()); + defer doc.deinit(); + var reader = doc.reader(gpa, .{}); + defer reader.deinit(); + + var stdout_buf = std.io.bufferedWriter(std.io.getStdOut().writer()); + const stdout_output = xml.streamingOutput(stdout_buf.writer()); + var writer = stdout_output.writer(gpa, .{ + .indent = if (pretty) " " else "", + }); + defer writer.deinit(); + + while (true) { + const node = reader.read() catch |err| switch (err) { + error.MalformedXml => { + const loc = reader.errorLocation(); + log.err("{}:{}: {}", .{ loc.line, loc.column, reader.errorCode() }); + return error.MalformedXml; + }, + else => |other| return other, + }; + switch (node) { + .eof => break, + .xml_declaration, .comment => {}, // ignored in canonical form + .element_start => { + try writer.elementStart(reader.elementName()); + + const sorted_attrs = try gpa.alloc(usize, reader.attributeCount()); + defer gpa.free(sorted_attrs); + for (0..reader.attributeCount()) |i| sorted_attrs[i] = i; + std.sort.pdq(usize, sorted_attrs, reader, struct { + fn lessThan(r: @TypeOf(reader), lhs: usize, rhs: usize) bool { + return std.mem.lessThan(u8, r.attributeName(lhs), r.attributeName(rhs)); + } + }.lessThan); + for (sorted_attrs) |i| { + try writer.attribute(reader.attributeName(i), try reader.attributeValue(i)); + } + }, + .element_end => { + try writer.elementEnd(); + }, + .pi => { + try writer.pi(reader.piTarget(), try reader.piData()); + }, + .text => { + try writer.text(try reader.text()); + }, + .cdata => { + try writer.text(try reader.cdata()); + }, + .character_reference => { + var buf: [4]u8 = undefined; + const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable; + try writer.text(buf[0..len]); + }, + .entity_reference => { + const value = xml.predefined_entities.get(reader.entityReferenceName()) orelse unreachable; + try writer.text(value); + }, + } + } + + try stdout_buf.flush(); +} diff --git a/examples/reader.zig b/examples/reader.zig index 40779f0..33610c5 100644 --- a/examples/reader.zig +++ b/examples/reader.zig @@ -9,7 +9,7 @@ pub fn main() !void { const args = try std.process.argsAlloc(gpa); defer std.process.argsFree(gpa, args); if (args.len != 2) { - return error.InvalidArguments; // usage: example-reader file + return error.InvalidArguments; // usage: reader file } var input_file = try std.fs.cwd().openFile(args[1], .{}); diff --git a/fuzz/src/fuzz.zig b/fuzz/src/fuzz.zig index 86f553b..b2aa5e0 100644 --- a/fuzz/src/fuzz.zig +++ b/fuzz/src/fuzz.zig @@ -16,11 +16,53 @@ fn fuzz(gpa: Allocator, input: []const u8) !void { var doc = xml.StaticDocument.init(input); var reader = doc.reader(gpa, .{}); defer reader.deinit(); + + var out_bytes = std.ArrayList(u8).init(gpa); + defer out_bytes.deinit(); + const output = xml.streamingOutput(out_bytes.writer()); + var writer = output.writer(gpa, .{}); + defer writer.deinit(); + while (true) { const node = reader.read() catch |err| switch (err) { error.MalformedXml => break, error.OutOfMemory => return error.OutOfMemory, }; - if (node == .eof) break; + switch (node) { + .eof => break, + .xml_declaration => { + try writer.xmlDeclaration(reader.xmlDeclarationEncoding(), reader.xmlDeclarationStandalone()); + }, + .comment => { + // TODO: not implemented yet + }, + .element_start => { + try writer.elementStart(reader.elementName()); + for (0..reader.attributeCount()) |i| { + try writer.attribute(reader.attributeName(i), try reader.attributeValue(i)); + } + }, + .element_end => { + try writer.elementEnd(); + }, + .pi => { + try writer.pi(reader.piTarget(), try reader.piData()); + }, + .text => { + try writer.text(try reader.text()); + }, + .cdata => { + try writer.text(try reader.cdata()); + }, + .character_reference => { + var buf: [4]u8 = undefined; + const len = std.unicode.utf8Encode(reader.characterReferenceChar(), &buf) catch unreachable; + try writer.text(buf[0..len]); + }, + .entity_reference => { + const value = xml.predefined_entities.get(reader.entityReferenceName()) orelse unreachable; + try writer.text(value); + }, + } } } diff --git a/src/Reader.zig b/src/Reader.zig index fea9106..52ec967 100644 --- a/src/Reader.zig +++ b/src/Reader.zig @@ -2236,6 +2236,6 @@ fn addString(reader: *Reader, s: []const u8) !StringIndex { return @enumFromInt(start); } -fn string(reader: Reader, index: StringIndex) []const u8 { +fn string(reader: *const Reader, index: StringIndex) []const u8 { return std.mem.sliceTo(reader.strings.items[@intFromEnum(index)..], 0); } diff --git a/src/Writer.zig b/src/Writer.zig index 0e3a529..43dc5d8 100644 --- a/src/Writer.zig +++ b/src/Writer.zig @@ -1,17 +1,61 @@ const std = @import("std"); +const Allocator = std.mem.Allocator; const assert = std.debug.assert; +const expectEqual = std.testing.expectEqual; +const expectEqualStrings = std.testing.expectEqualStrings; + +const ns_xmlns = @import("xml.zig").ns_xmlns; +const predefined_namespace_prefixes = @import("xml.zig").predefined_namespace_prefixes; +const streamingOutput = @import("xml.zig").streamingOutput; options: Options, state: State, -indent_level: u32, +/// String data for the current element nesting context. +/// Each element start node appends the name of the element to this buffer, and +/// the element name is followed by any namespace prefixes and URIs declared on +/// the element so they can be referenced by `ns_prefixes`. +strings: std.ArrayListUnmanaged(u8), +/// The start indexes of the element names in `strings`. +element_names: std.ArrayListUnmanaged(StringIndex), +/// The namespace prefixes declared by the current nesting context of elements. +ns_prefixes: std.ArrayListUnmanaged(std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex)), +/// Pending namespace prefixes to be declared on the next element start. +pending_ns: std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex), +/// A counter for the next generated `ns123` namespace prefix to be used. +gen_ns_prefix_counter: u32, sink: Sink, +gpa: Allocator, + const Writer = @This(); pub const Options = struct { + /// A string to be used as indentation for the output. + /// An empty value indicates no attempt should be made to pretty-print the + /// output. + /// + /// Using any value aside from an empty string may technically change the + /// content of the output according to the spec, because leading and + /// trailing whitespace within element content is always significant. + /// For example, the following XML samples are _not_ strictly equivalent: + /// + /// ```xml + /// + /// ``` + /// + /// and + /// + /// ```xml + /// + /// + /// + /// ``` indent: []const u8 = "", + /// Whether the writer should be aware of XML namespaces. The `Ns`-suffixed + /// functions of `Writer` may only be used when this is enabled. + namespace_aware: bool = true, }; pub const Sink = struct { @@ -33,50 +77,184 @@ const State = enum { end, }; -pub fn init(sink: Sink, options: Options) Writer { +pub fn init(gpa: Allocator, sink: Sink, options: Options) Writer { return .{ .options = options, .state = .start, - .indent_level = 0, + .strings = .{}, + .element_names = .{}, + .ns_prefixes = .{}, + .pending_ns = .{}, + .gen_ns_prefix_counter = 0, .sink = sink, + + .gpa = gpa, }; } +pub fn deinit(writer: *Writer) void { + writer.strings.deinit(writer.gpa); + writer.element_names.deinit(writer.gpa); + for (writer.ns_prefixes.items) |*map| map.deinit(writer.gpa); + writer.ns_prefixes.deinit(writer.gpa); + writer.pending_ns.deinit(writer.gpa); + writer.* = undefined; +} + pub const WriteError = error{}; +/// Writes the BOM (byte-order mark). +/// Asserts that the writer is at the beginning of the document. pub fn bom(writer: *Writer) anyerror!void { assert(writer.state == .start); - try writer.raw("\u{FEFF}"); + try writer.write("\u{FEFF}"); writer.state = .after_bom; } +test bom { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.bom(); + try writer.elementStart("root"); + try writer.elementEndEmpty(); + + try expectEqualStrings("\u{FEFF}", raw.items); +} + +/// Writes the XML declaration. +/// Asserts that the writer is at the beginning of the document or just after the BOM (if any). pub fn xmlDeclaration(writer: *Writer, encoding: ?[]const u8, standalone: ?bool) anyerror!void { assert(writer.state == .start or writer.state == .after_bom); - try writer.raw(""); + try writer.write("?>"); if (writer.options.indent.len > 0) try writer.newLineAndIndent(); writer.state = .after_xml_declaration; } +test xmlDeclaration { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.xmlDeclaration("UTF-8", true); + try writer.elementStart("root"); + try writer.elementEndEmpty(); + + try expectEqualStrings( + \\ + \\ + , raw.items); +} + +/// Starts an element. +/// Asserts that the writer is not after the end of the root element. pub fn elementStart(writer: *Writer, name: []const u8) anyerror!void { + if (writer.options.namespace_aware) prefixed: { + const colon_pos = std.mem.indexOfScalar(u8, name, ':') orelse break :prefixed; + const prefix = name[0..colon_pos]; + const local = name[colon_pos + 1 ..]; + try writer.elementStartInternal(prefix, local); + return; + } + try writer.elementStartInternal("", name); +} + +test elementStart { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStart("root"); + try writer.elementStart("element"); + try writer.elementEnd(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + \\ + , raw.items); +} + +/// Starts a namespaced element. +/// Asserts that the writer is namespace-aware and not after the end of the +/// root element. +/// +/// Currently, this function also asserts that `ns` is not empty, although that +/// may be supported in the future. +/// +/// If `ns` is already bound to a prefix (via an attribute or `bindNs`), that +/// prefix will be used. Otherwise, a generated namespace prefix counting +/// upwards from `ns0` will be declared and used. +pub fn elementStartNs(writer: *Writer, ns: []const u8, local: []const u8) anyerror!void { + assert(writer.options.namespace_aware); + // TODO: XML 1.0 does not allow undeclaring namespace prefixes, so ensuring + // the empty namespace is actually used here is potentially quite tricky. + // For now, it is not allowed. + assert(ns.len > 0); + const prefix = writer.getNsPrefix(ns) orelse prefix: { + const str = try writer.generateNsPrefix(); + // If we are already inside an element start, we don't want to + // immediately bind our new prefix in that scope. Rather, we + // want to wait to bind it on the newly started element. + try writer.pending_ns.put(writer.gpa, str, try writer.addString(ns)); + break :prefix writer.string(str); + }; + try writer.elementStartInternal(prefix, local); +} + +test elementStartNs { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStartNs("http://example.com/foo", "root"); + try writer.elementStartNs("http://example.com/bar", "element"); + try writer.elementStartNs("http://example.com/foo", "element"); + try writer.elementEnd(); + try writer.elementEnd(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + \\ + \\ + \\ + , raw.items); +} + +fn elementStartInternal(writer: *Writer, prefix: []const u8, local: []const u8) !void { switch (writer.state) { .start, .after_bom, .after_xml_declaration, .text => {}, .element_start => { - try writer.raw(">"); + try writer.write(">"); try writer.newLineAndIndent(); }, .after_structure_end => { @@ -84,18 +262,51 @@ pub fn elementStart(writer: *Writer, name: []const u8) anyerror!void { }, .end => unreachable, } - try writer.raw("<"); - try writer.raw(name); + + try writer.write("<"); + if (prefix.len > 0) { + try writer.write(prefix); + try writer.write(":"); + } + try writer.write(local); + + // TODO: this is what I would _like_ to do, but prefix may point into + // strings, which can be invalidated while resizing it... + // const element_name = try writer.addPrefixedString(prefix, local); + // This temporary allocation is reliable, but ugly. At least local won't + // point into strings, so we can avoid the allocation if there's no prefix. + const element_name = if (prefix.len > 0) name: { + const tmp = try std.fmt.allocPrint(writer.gpa, "{s}:{s}", .{ prefix, local }); + defer writer.gpa.free(tmp); + break :name try writer.addString(tmp); + } else try writer.addString(local); + try writer.element_names.append(writer.gpa, element_name); writer.state = .element_start; - writer.indent_level += 1; + + if (writer.options.namespace_aware) { + var ns_prefixes: std.AutoArrayHashMapUnmanaged(StringIndex, StringIndex) = .{}; + try ns_prefixes.ensureUnusedCapacity(writer.gpa, writer.pending_ns.count()); + var pending_ns_iter = writer.pending_ns.iterator(); + while (pending_ns_iter.next()) |pending_ns| { + try writer.attributeInternal("xmlns", writer.string(pending_ns.key_ptr.*), writer.string(pending_ns.value_ptr.*)); + // The pending_ns strings point into the string memory of the + // enclosing element, so they are guaranteed to remain valid for + // the lifetime of the current element. + try ns_prefixes.put(writer.gpa, pending_ns.key_ptr.*, pending_ns.value_ptr.*); + } + try writer.ns_prefixes.append(writer.gpa, ns_prefixes); + writer.pending_ns.clearRetainingCapacity(); + } } -pub fn elementEnd(writer: *Writer, name: []const u8) anyerror!void { - writer.indent_level -= 1; +/// Ends the currently open element. +/// Asserts that the writer is inside an element. +pub fn elementEnd(writer: *Writer) anyerror!void { + const name = writer.element_names.pop(); switch (writer.state) { .text => {}, .element_start => { - try writer.raw(">"); + try writer.write(">"); try writer.newLineAndIndent(); }, .after_structure_end => { @@ -103,33 +314,193 @@ pub fn elementEnd(writer: *Writer, name: []const u8) anyerror!void { }, .start, .after_bom, .after_xml_declaration, .end => unreachable, } - try writer.raw(""); - writer.state = if (writer.indent_level > 0) .after_structure_end else .end; + try writer.write(""); + writer.state = if (writer.element_names.items.len > 0) .after_structure_end else .end; + writer.strings.shrinkRetainingCapacity(@intFromEnum(name)); + if (writer.options.namespace_aware) { + var ns_prefixes = writer.ns_prefixes.pop(); + ns_prefixes.deinit(writer.gpa); + writer.pending_ns.clearRetainingCapacity(); + } +} + +test elementEnd { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStart("root"); + try writer.elementStart("element"); + try writer.elementEnd(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + \\ + , raw.items); } +/// Ends the currently open element as an empty element (``). +/// Asserts that the writer is in an element start. pub fn elementEndEmpty(writer: *Writer) anyerror!void { assert(writer.state == .element_start); - try writer.raw("/>"); + try writer.write("/>"); writer.state = .after_structure_end; - writer.indent_level -= 1; + _ = writer.element_names.pop(); } +test elementEndEmpty { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStart("root"); + try writer.elementStart("element"); + try writer.elementEndEmpty(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + , raw.items); +} + +/// Adds an attribute to the current element start. +/// Asserts that the writer is in an element start. +/// +/// If the writer is namespace-aware, namespace declarations are recognized and +/// registered for future use by "Ns"-suffixed functions. pub fn attribute(writer: *Writer, name: []const u8, value: []const u8) anyerror!void { assert(writer.state == .element_start); - try writer.raw(" "); - try writer.raw(name); - try writer.raw("=\""); + if (writer.options.namespace_aware) prefixed: { + if (std.mem.eql(u8, name, "xmlns")) { + const new_ns = try writer.addString(value); + const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; + try ns_prefixes.put(writer.gpa, .empty, new_ns); + } + const colon_pos = std.mem.indexOfScalar(u8, name, ':') orelse break :prefixed; + const prefix = name[0..colon_pos]; + const local = name[colon_pos + 1 ..]; + if (std.mem.eql(u8, prefix, "xmlns")) { + const new_prefix = try writer.addString(local); + const new_ns = try writer.addString(value); + const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; + try ns_prefixes.put(writer.gpa, new_prefix, new_ns); + } + try writer.attributeInternal(prefix, local, value); + return; + } + try writer.attributeInternal("", name, value); +} + +test attribute { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStart("root"); + try writer.attribute("key", "value"); + try writer.attribute("xmlns", "http://example.com"); + try writer.attribute("xmlns:a", "http://example.com/a"); + try writer.elementStartNs("http://example.com", "element"); + try writer.elementEndEmpty(); + try writer.elementStartNs("http://example.com/a", "element"); + try writer.elementEndEmpty(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + \\ + , raw.items); +} + +/// Adds a namespaced attribute to the current element start. +/// Asserts that the writer is namespace-aware and in an element start. +/// +/// Currently, this function also asserts that `ns` is not empty, although that +/// may be supported in the future. +/// +/// If `ns` is already bound to a prefix (via an attribute or `bindNs`), that +/// prefix will be used. Otherwise, a generated namespace prefix counting +/// upwards from `ns0` will be declared and used. +/// +/// If the writer is namespace-aware, namespace declarations are recognized and +/// registered for future use by "Ns"-suffixed functions. +pub fn attributeNs(writer: *Writer, ns: []const u8, local: []const u8, value: []const u8) anyerror!void { + assert(writer.options.namespace_aware); + // TODO: XML 1.0 does not allow undeclaring namespace prefixes, so ensuring + // the empty namespace is actually used here is potentially quite tricky. + // For now, it is not allowed. + assert(ns.len > 0); + if (std.mem.eql(u8, ns, ns_xmlns)) { + const new_prefix = try writer.addString(local); + const new_ns = try writer.addString(value); + const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; + try ns_prefixes.put(writer.gpa, new_prefix, new_ns); + } + const prefix = writer.getNsPrefix(ns) orelse prefix: { + const str = try writer.generateNsPrefix(); + try writer.bindNsImmediate(str, ns); + break :prefix writer.string(str); + }; + try writer.attributeInternal(prefix, local, value); +} + +test attributeNs { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStart("root"); + try writer.attributeNs("http://example.com", "key", "value"); + try writer.attributeNs("http://www.w3.org/2000/xmlns/", "a", "http://example.com/a"); + try writer.elementStartNs("http://example.com", "element"); + try writer.elementEndEmpty(); + try writer.elementStartNs("http://example.com/a", "element"); + try writer.elementEndEmpty(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + \\ + , raw.items); +} + +fn attributeInternal(writer: *Writer, prefix: []const u8, name: []const u8, value: []const u8) !void { + assert(writer.state == .element_start); + try writer.write(" "); + if (prefix.len > 0) { + try writer.write(prefix); + try writer.write(":"); + } + try writer.write(name); + try writer.write("=\""); try writer.attributeText(value); - try writer.raw("\""); + try writer.write("\""); } fn attributeText(writer: *Writer, s: []const u8) anyerror!void { var pos: usize = 0; while (std.mem.indexOfAnyPos(u8, s, pos, "\r\n\t&<\"")) |esc_pos| { - try writer.raw(s[pos..esc_pos]); - try writer.raw(switch (s[esc_pos]) { + try writer.write(s[pos..esc_pos]); + try writer.write(switch (s[esc_pos]) { '\r' => " ", '\n' => " ", '\t' => " ", @@ -140,38 +511,64 @@ fn attributeText(writer: *Writer, s: []const u8) anyerror!void { }); pos = esc_pos + 1; } - try writer.raw(s[pos..]); + try writer.write(s[pos..]); } +/// Writes a PI (processing instruction). pub fn pi(writer: *Writer, target: []const u8, data: []const u8) anyerror!void { switch (writer.state) { .start, .after_bom, .after_xml_declaration, .text, .end => {}, .element_start => { - try writer.raw(">"); + try writer.write(">"); try writer.newLineAndIndent(); }, .after_structure_end => { try writer.newLineAndIndent(); }, } - try writer.raw(""); + try writer.write(" 0) { + try writer.write(" "); + try writer.write(data); + } + try writer.write("?>"); writer.state = .after_structure_end; } +test pi { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.pi("some-pi", "some pi data"); + try writer.elementStart("root"); + try writer.pi("handle-me", ""); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + \\ + , raw.items); +} + +/// Writes a text node, escaping the text where necessary to preserve its value +/// in the resulting XML. +/// Asserts that the writer is in an element. pub fn text(writer: *Writer, s: []const u8) anyerror!void { switch (writer.state) { .after_structure_end, .text => {}, - .element_start => try writer.raw(">"), + .element_start => try writer.write(">"), .start, .after_bom, .after_xml_declaration, .end => unreachable, } var pos: usize = 0; while (std.mem.indexOfAnyPos(u8, s, pos, "\r&<")) |esc_pos| { - try writer.raw(s[pos..esc_pos]); - try writer.raw(switch (s[esc_pos]) { + try writer.write(s[pos..esc_pos]); + try writer.write(switch (s[esc_pos]) { '\r' => " ", '&' => "&", '<' => "<", @@ -179,17 +576,39 @@ pub fn text(writer: *Writer, s: []const u8) anyerror!void { }); pos = esc_pos + 1; } - try writer.raw(s[pos..]); + try writer.write(s[pos..]); writer.state = .text; } -// insert some existing XML document without escaping anything +test text { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.elementStart("root"); + try writer.text("Sample XML: \r\n&\r\n"); + try writer.elementEnd(); + + try expectEqualStrings( + \\Sample XML: <root> + \\&amp; + \\</root> + , raw.items); +} + +/// Writes an XML fragment without escaping anything. +/// +/// For correctness, the XML fragment must not contain any unclosed structures. +/// For example, the fragment `` is illegal, as the element `foo` remains +/// unclosed after embedding. Similarly, ` {}, - .element_start => try writer.raw(">"), + .element_start => try writer.write(">"), } - try writer.raw(s); + try writer.write(s); writer.state = switch (writer.state) { .start, .after_bom, .after_xml_declaration => .after_xml_declaration, .element_start, .after_structure_end, .text => .text, @@ -197,60 +616,192 @@ pub fn embed(writer: *Writer, s: []const u8) anyerror!void { }; } -fn newLineAndIndent(writer: *Writer) anyerror!void { - if (writer.options.indent.len == 0) return; +test embed { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); - try writer.raw("\n"); - var n: usize = 0; - while (n < writer.indent_level) : (n += 1) { - try writer.raw(writer.options.indent); - } + try writer.xmlDeclaration("UTF-8", null); + try writer.elementStart("foo"); + try writer.embed("Baz!"); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\Baz! + , raw.items); } -fn raw(writer: *Writer, s: []const u8) anyerror!void { - try writer.sink.write(s); +/// Binds a namespace URI to a prefix. +/// +/// If the writer is currently inside an element start, the namespace is +/// declared immediately. Otherwise, it will be declared on the next element +/// started. +pub fn bindNs(writer: *Writer, prefix: []const u8, ns: []const u8) anyerror!void { + try writer.bindNsInternal(try writer.addString(prefix), ns); } -test { - _ = T; +test bindNs { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + // Namespaces may be bound before the element they apply to, allowing a + // prefix to be bound for a namespaced element. + try writer.bindNs("ex", "http://example.com"); + try writer.elementStartNs("http://example.com", "root"); + try writer.attributeNs("http://example.com", "a", "value"); + try writer.elementStartNs("http://example.com", "element"); + try writer.bindNs("ex2", "http://example.com/ns2"); + try writer.attributeNs("http://example.com/ns2", "a", "value"); + // It doesn't matter if a namespace prefix is ever used: it will be + // declared regardless. + try writer.bindNs("ex3", "http://example.com/ns3"); + try writer.elementEndEmpty(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ + , raw.items); } -const T = struct { - const Testbed = struct { - buf: std.ArrayList(u8), - fn init(a: std.mem.Allocator) Testbed { - return .{ - .buf = std.ArrayList(u8).init(a), - }; - } - fn writer(self: *Testbed, indent: []const u8) Writer { - return Writer.init(.{ - .context = self, - .writeFn = write, - }, .{ .indent = indent }); - } - fn write(context: *const anyopaque, data: []const u8) anyerror!void { - // TODO not sure why context is const. - var self: *Testbed = @constCast(@alignCast(@ptrCast(context))); - try self.buf.appendSlice(data); + +fn bindNsInternal(writer: *Writer, prefix_str: StringIndex, ns: []const u8) !void { + if (writer.state == .element_start) { + try writer.bindNsImmediate(prefix_str, ns); + } else { + const ns_str = try writer.addString(ns); + try writer.pending_ns.put(writer.gpa, prefix_str, ns_str); + } +} + +fn bindNsImmediate(writer: *Writer, prefix_str: StringIndex, ns: []const u8) !void { + const ns_str = try writer.addString(ns); + try writer.attributeInternal("xmlns", writer.string(prefix_str), ns); + const ns_prefixes = &writer.ns_prefixes.items[writer.ns_prefixes.items.len - 1]; + try ns_prefixes.put(writer.gpa, prefix_str, ns_str); +} + +fn getNsPrefix(writer: *Writer, ns: []const u8) ?[]const u8 { + if (predefined_namespace_prefixes.get(ns)) |prefix| return prefix; + + // Potential optimization opportunity: store a mapping of namespace URIs + // to prefixes and update it when an element closes or a new prefix is + // bound. + + var pending_ns = writer.pending_ns.iterator(); + while (pending_ns.next()) |pending| { + if (std.mem.eql(u8, ns, writer.string(pending.value_ptr.*))) { + return writer.string(pending.key_ptr.*); } - fn output(self: *Testbed) []const u8 { - return self.buf.items; + } + + var i: usize = writer.ns_prefixes.items.len; + while (i > 0) { + i -= 1; + var ns_prefixes = writer.ns_prefixes.items[i].iterator(); + while (ns_prefixes.next()) |ns_prefix| { + if (std.mem.eql(u8, ns, writer.string(ns_prefix.value_ptr.*))) { + return writer.string(ns_prefix.key_ptr.*); + } } - fn deinit(self: *Testbed) void { - self.buf.deinit(); + } + return null; +} + +fn generateNsPrefix(writer: *Writer) !StringIndex { + gen_prefix: while (true) { + const max_len = std.fmt.comptimePrint("ns{}", .{std.math.maxInt(@TypeOf(writer.gen_ns_prefix_counter))}).len; + var buf: [max_len]u8 = undefined; + const prefix = std.fmt.bufPrint(&buf, "ns{}", .{writer.gen_ns_prefix_counter}) catch unreachable; + writer.gen_ns_prefix_counter += 1; + for (writer.ns_prefixes.items) |ns_prefixes| { + for (ns_prefixes.keys()) |existing_prefix| { + if (std.mem.eql(u8, prefix, writer.string(existing_prefix))) { + continue :gen_prefix; + } + } } - }; - test "embed" { - var tb = Testbed.init(std.testing.allocator); - defer tb.deinit(); - var wtr = tb.writer(" "); - try wtr.xmlDeclaration("UTF-8", null); - try wtr.elementStart("foo"); - try wtr.embed("Baz!"); - try wtr.elementEnd("foo"); - try std.testing.expectEqualStrings( - \\ - \\Baz! - , tb.output()); + return try writer.addString(prefix); + } +} + +fn newLineAndIndent(writer: *Writer) anyerror!void { + if (writer.options.indent.len == 0) return; + + try writer.write("\n"); + for (0..writer.element_names.items.len) |_| { + try writer.write(writer.options.indent); + } +} + +fn write(writer: *Writer, s: []const u8) anyerror!void { + try writer.sink.write(s); +} + +const StringIndex = enum(usize) { empty = 0, _ }; + +const StringIndexAdapter = struct { + strings: []const u8, + + pub fn hash(ctx: @This(), key: []const u8) u32 { + _ = ctx; + return @truncate(std.hash.Wyhash.hash(0, key)); + } + + pub fn eql(ctx: @This(), a: []const u8, b: StringIndex, b_index: usize) bool { + _ = b_index; + const b_val = std.mem.sliceTo(ctx.strings[@intFromEnum(b)..], 0); + return std.mem.eql(u8, a, b_val); } }; + +fn addString(writer: *Writer, s: []const u8) !StringIndex { + try writer.strings.ensureUnusedCapacity(writer.gpa, 1 + s.len); + writer.strings.appendAssumeCapacity(0); + const start = writer.strings.items.len; + writer.strings.appendSliceAssumeCapacity(s); + return @enumFromInt(start); +} + +fn addPrefixedString(writer: *Writer, prefix: []const u8, s: []const u8) !StringIndex { + if (prefix.len == 0) return writer.addString(s); + try writer.strings.ensureUnusedCapacity(writer.gpa, 1 + prefix.len + ":".len + s.len); + writer.strings.appendAssumeCapacity(0); + const start = writer.strings.items.len; + writer.strings.appendSliceAssumeCapacity(prefix); + writer.strings.appendAssumeCapacity(':'); + writer.strings.appendSliceAssumeCapacity(s); + return @enumFromInt(start); +} + +fn string(writer: *const Writer, index: StringIndex) []const u8 { + return std.mem.sliceTo(writer.strings.items[@intFromEnum(index)..], 0); +} + +test "namespace prefix strings resize bug" { + // Reported here: https://github.com/ianprime0509/zig-xml/pull/41#issuecomment-2449960818 + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.bindNs("d", "foospace"); + try writer.elementStartNs("foospace", "root"); + try writer.elementStartNs("foospace", "child"); + try writer.text("Hello, Bug"); + try writer.elementEnd(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ Hello, Bug + \\ + , raw.items); +} diff --git a/src/xml.zig b/src/xml.zig index ceaaae9..5029e44 100644 --- a/src/xml.zig +++ b/src/xml.zig @@ -76,6 +76,10 @@ pub const predefined_namespace_uris = std.StaticStringMap([]const u8).initCompti .{ "xml", ns_xml }, .{ "xmlns", ns_xmlns }, }); +pub const predefined_namespace_prefixes = std.StaticStringMap([]const u8).initComptime(.{ + .{ ns_xml, "xml" }, + .{ ns_xmlns, "xmlns" }, +}); pub const Reader = @import("Reader.zig"); @@ -434,43 +438,73 @@ pub fn GenericWriter(comptime SinkError: type) type { return struct { writer: Writer, - pub const WriteError = Writer.WriteError || SinkError; + /// See `Writer.deinit`. + pub inline fn deinit(writer: *@This()) void { + writer.writer.deinit(); + } + + // TODO: not all the write functions actually need to allocate + pub const WriteError = Writer.WriteError || SinkError || Allocator.Error; + /// See `Writer.bom`. pub inline fn bom(writer: *@This()) WriteError!void { return @errorCast(writer.writer.bom()); } + /// See `Writer.xmlDeclaration`. pub inline fn xmlDeclaration(writer: *@This(), encoding: ?[]const u8, standalone: ?bool) WriteError!void { return @errorCast(writer.writer.xmlDeclaration(encoding, standalone)); } + /// See `Writer.elementStart`. pub inline fn elementStart(writer: *@This(), name: []const u8) WriteError!void { return @errorCast(writer.writer.elementStart(name)); } - pub inline fn elementEnd(writer: *@This(), name: []const u8) WriteError!void { - return @errorCast(writer.writer.elementEnd(name)); + /// See `Writer.elementStartNs`. + pub inline fn elementStartNs(writer: *@This(), ns: []const u8, local: []const u8) WriteError!void { + return @errorCast(writer.writer.elementStartNs(ns, local)); } + /// See `Writer.elementEnd`. + pub inline fn elementEnd(writer: *@This()) WriteError!void { + return @errorCast(writer.writer.elementEnd()); + } + + /// See `Writer.elementEndEmpty`. pub inline fn elementEndEmpty(writer: *@This()) WriteError!void { return @errorCast(writer.writer.elementEndEmpty()); } + /// See `Writer.attribute`. pub inline fn attribute(writer: *@This(), name: []const u8, value: []const u8) WriteError!void { return @errorCast(writer.writer.attribute(name, value)); } + /// See `Writer.attributeNs`. + pub inline fn attributeNs(writer: *@This(), ns: []const u8, local: []const u8, value: []const u8) WriteError!void { + return @errorCast(writer.writer.attributeNs(ns, local, value)); + } + + /// See `Writer.pi`. pub inline fn pi(writer: *@This(), target: []const u8, data: []const u8) WriteError!void { return @errorCast(writer.writer.pi(target, data)); } + /// See `Writer.text`. pub inline fn text(writer: *@This(), s: []const u8) WriteError!void { return @errorCast(writer.writer.text(s)); } + /// See `Writer.embed`. pub inline fn embed(writer: *@This(), s: []const u8) WriteError!void { return @errorCast(writer.writer.embed(s)); } + + /// See `Writer.bindNs`. + pub inline fn bindNs(writer: *@This(), prefix: []const u8, ns: []const u8) WriteError!void { + return @errorCast(writer.writer.bindNs(prefix, ns)); + } }; } @@ -480,8 +514,8 @@ pub fn StreamingOutput(comptime WriterType: type) type { pub const Error = WriterType.Error; - pub fn writer(out: *const @This(), options: Writer.Options) GenericWriter(Error) { - return .{ .writer = Writer.init(out.sink(), options) }; + pub fn writer(out: *const @This(), gpa: Allocator, options: Writer.Options) GenericWriter(Error) { + return .{ .writer = Writer.init(gpa, out.sink(), options) }; } pub fn sink(out: *const @This()) Writer.Sink { @@ -505,6 +539,28 @@ pub fn streamingOutput(writer: anytype) StreamingOutput(@TypeOf(writer)) { return .{ .stream = writer }; } +test streamingOutput { + var raw = std.ArrayList(u8).init(std.testing.allocator); + defer raw.deinit(); + const out = streamingOutput(raw.writer()); + var writer = out.writer(std.testing.allocator, .{ .indent = " " }); + defer writer.deinit(); + + try writer.xmlDeclaration("UTF-8", null); + try writer.elementStart("test"); + try writer.elementStart("inner"); + try writer.text("Hello, world!"); + try writer.elementEnd(); + try writer.elementEnd(); + + try expectEqualStrings( + \\ + \\ + \\ Hello, world! + \\ + , raw.items); +} + test { _ = Location; _ = QName; diff --git a/xmlconf/src/xmlconf.zig b/xmlconf/src/xmlconf.zig index c9a4faa..b21d410 100644 --- a/xmlconf/src/xmlconf.zig +++ b/xmlconf/src/xmlconf.zig @@ -250,8 +250,9 @@ fn runTestParseable( var canonical_buf = std.ArrayList(u8).init(gpa); defer canonical_buf.deinit(); - var canonical_output = xml.streamingOutput(canonical_buf.writer()); - var canonical = canonical_output.writer(.{}); + const canonical_output = xml.streamingOutput(canonical_buf.writer()); + var canonical = canonical_output.writer(gpa, .{}); + defer canonical.deinit(); while (true) { const node = reader.read() catch |err| switch (err) { @@ -286,7 +287,7 @@ fn runTestParseable( } }, .element_end => { - try canonical.elementEnd(reader.elementName()); + try canonical.elementEnd(); }, .pi => { try canonical.pi(reader.piTarget(), try reader.piData());