diff --git a/bench/src/scanner.zig b/bench/src/scanner.zig index 4820412..933f168 100644 --- a/bench/src/scanner.zig +++ b/bench/src/scanner.zig @@ -1,13 +1,16 @@ +const std = @import("std"); const xml = @import("xml"); pub const main = @import("common.zig").main; pub fn runBench(data: []const u8) !void { var scanner = xml.Scanner{}; + var data_stream = std.io.fixedBufferStream(data); var decoder = xml.encoding.Utf8Decoder{}; - for (data) |b| { - if (try decoder.next(b)) |c| { - _ = try scanner.next(c, 1); - } + var buf: [4]u8 = undefined; + while (true) { + const c = try decoder.readCodepoint(data_stream.reader(), &buf); + if (!c.present) break; + _ = try scanner.next(c.codepoint, c.byte_length); } } diff --git a/examples/scan.zig b/examples/scan.zig index 98e90a8..faccaa0 100644 --- a/examples/scan.zig +++ b/examples/scan.zig @@ -27,19 +27,11 @@ pub fn main() !void { var line: usize = 1; var column: usize = 1; - read: while (true) { - var codepoint_bytes: usize = 0; - const c = while (true) { - const b = input_reader.readByte() catch |e| switch (e) { - error.EndOfStream => break :read, - else => |other| return other, - }; - codepoint_bytes += 1; - if (try decoder.next(b)) |codepoint| { - break codepoint; - } - }; - const token = scanner.next(c, codepoint_bytes) catch |e| { + while (true) { + var buf: [4]u8 = undefined; + const c = try decoder.readCodepoint(input_reader, &buf); + if (!c.present) break; + const token = scanner.next(c.codepoint, c.byte_length) catch |e| { try stdout_buffered_writer.flush(); try stderr.print("error: {} ({}:{}): {}\n", .{ scanner.pos, line, column, e }); return; @@ -47,7 +39,7 @@ pub fn main() !void { if (token != .ok) { try stdout.print("{} ({}:{}): {}\n", .{ scanner.pos, line, column, token }); } - if (c == '\n') { + if (c.codepoint == '\n') { line += 1; column = 1; } else { diff --git a/src/encoding.zig b/src/encoding.zig index d08153c..936230f 100644 --- a/src/encoding.zig +++ b/src/encoding.zig @@ -6,18 +6,16 @@ //! //! 
- `const max_encoded_codepoint_len` - the maximum number of bytes a //! single Unicode codepoint may occupy in encoded form. -//! - `fn next(self: *Decoder, b: u8) Error!?u21` - accepts a single byte of -//! input, returning an error if the byte is invalid in the current state of -//! the decoder, a valid Unicode codepoint, or `null` if the byte is valid -//! but there is not yet a full codepoint to return. +//! - `fn readCodepoint(self: *Decoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult` - +//! reads a single codepoint from a `std.io.GenericReader` and writes its UTF-8 +//! encoding to `buf`. Should return `error.UnexpectedEndOfInput` if a full +//! codepoint cannot be read, `error.Overflow` if the UTF-8-encoded form cannot +//! be written to `buf`; other decoder-specific errors can also be used. //! - `fn adaptTo(self: *Decoder, encoding: []const u8) error{InvalidEncoding}!void` - //! accepts a UTF-8-encoded encoding name and returns an error if the desired //! encoding cannot be handled by the decoder. This is intended to support //! `Decoder` implementations which adapt to the encoding declared by an XML //! document. -//! - `fn isUtf8Compatible(self: Decoder) bool` - returns whether this decoder -//! decodes a subset of UTF-8. It is always safe to return false if this is -//! not known. const std = @import("std"); const ascii = std.ascii; @@ -27,6 +25,26 @@ const Allocator = std.mem.Allocator; const ArrayListUnmanaged = std.ArrayListUnmanaged; const BoundedArray = std.BoundedArray; +/// The result of reading a single codepoint successfully. +pub const ReadResult = packed struct(u32) { + /// The codepoint read. + codepoint: u21, + /// The length of the codepoint encoded in UTF-8. + byte_length: u10, + /// If https://github.com/ziglang/zig/issues/104 is implemented, a much + /// better API would be to make `ReadResult` a `packed struct(u31)` instead + /// and use `?ReadResult` elsewhere. 
But, for now, this indicates whether + /// `codepoint` and `byte_length` are present, so that the whole thing fits + /// in a `u32` rather than unnecessarily taking up 8 bytes. + present: bool = true, + + pub const none: ReadResult = .{ + .codepoint = 0, + .byte_length = 0, + .present = false, + }; +}; + /// A decoder which handles UTF-8 or UTF-16, using a BOM to detect UTF-16 /// endianness. /// @@ -35,123 +53,134 @@ const BoundedArray = std.BoundedArray; pub const DefaultDecoder = struct { state: union(enum) { start, - utf16_be_bom, - utf16_le_bom, utf8: Utf8Decoder, - utf16_le: Utf16Decoder(.little), - utf16_be: Utf16Decoder(.big), + utf16_le: Utf16Decoder(.Little), + utf16_be: Utf16Decoder(.Big), } = .start, - pub const Error = error{ InvalidUtf8, InvalidUtf16 }; + pub const Error = Utf8Decoder.Error || Utf16Decoder(.Little).Error || Utf16Decoder(.Big).Error; pub const max_encoded_codepoint_len = 4; + const bom = 0xFEFF; + const bom_byte_length = unicode.utf8CodepointSequenceLength(bom) catch unreachable; - pub fn next(self: *DefaultDecoder, b: u8) Error!?u21 { + pub fn readCodepoint(self: *DefaultDecoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult { switch (self.state) { - .start => if (b == 0xFE) { - self.state = .utf16_be_bom; - return null; - } else if (b == 0xFF) { - self.state = .utf16_le_bom; - return null; - } else { - self.state = .{ .utf8 = .{} }; - return try self.state.utf8.next(b); - }, - .utf16_be_bom => if (b == 0xFF) { + .start => {}, + inline else => |*inner| return inner.readCodepoint(reader, buf), + } + // If attempting to match the UTF-16 BOM fails for whatever reason, we + // will assume we are reading UTF-8. 
+ self.state = .{ .utf8 = .{} }; + const b = reader.readByte() catch |e| switch (e) { + error.EndOfStream => return error.UnexpectedEndOfInput, + else => |other| return other, + }; + switch (b) { + 0xFE => { + const b2 = reader.readByte() catch |e| switch (e) { + error.EndOfStream => return error.InvalidUtf8, + else => |other| return other, + }; + if (b2 != 0xFF) return error.InvalidUtf8; self.state = .{ .utf16_be = .{} }; - return 0xFEFF; - } else { - self.state = .{ .utf8 = .{} }; - return error.InvalidUtf8; + if (bom_byte_length > buf.len) return error.Overflow; + _ = unicode.utf8Encode(bom, buf) catch unreachable; + return .{ .codepoint = bom, .byte_length = bom_byte_length }; }, - .utf16_le_bom => if (b == 0xFE) { + 0xFF => { + const b2 = reader.readByte() catch |e| switch (e) { + error.EndOfStream => return error.InvalidUtf8, + else => |other| return other, + }; + if (b2 != 0xFE) return error.InvalidUtf8; self.state = .{ .utf16_le = .{} }; - return 0xFEFF; - } else { - self.state = .{ .utf8 = .{} }; - return error.InvalidUtf8; + if (bom_byte_length > buf.len) return error.Overflow; + _ = unicode.utf8Encode(bom, buf) catch unreachable; + return .{ .codepoint = bom, .byte_length = bom_byte_length }; + }, + else => { + // The rest of this branch is copied from Utf8Decoder + const byte_length = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8; + if (byte_length > buf.len) return error.Overflow; + buf[0] = b; + if (byte_length == 1) return .{ .codepoint = b, .byte_length = 1 }; + reader.readNoEof(buf[1..byte_length]) catch |e| switch (e) { + error.EndOfStream => return error.UnexpectedEndOfInput, + else => |other| return other, + }; + const codepoint = switch (byte_length) { + 2 => unicode.utf8Decode2(buf[0..2]), + 3 => unicode.utf8Decode3(buf[0..3]), + 4 => unicode.utf8Decode4(buf[0..4]), + else => unreachable, + } catch return error.InvalidUtf8; + return .{ .codepoint = codepoint, .byte_length = byte_length }; }, - inline else => |*decoder| 
return try decoder.next(b), } } pub fn adaptTo(self: *DefaultDecoder, encoding: []const u8) error{InvalidEncoding}!void { switch (self.state) { - .start, .utf16_be_bom, .utf16_le_bom => {}, + .start => {}, inline else => |*decoder| try decoder.adaptTo(encoding), } } - - pub inline fn isUtf8Compatible(self: DefaultDecoder) bool { - return self.state == .utf8; - } }; test DefaultDecoder { // UTF-8 no BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, 'H'), try decoder.next('H')); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xC3)); - try testing.expectEqual(@as(?u21, 'ü'), try decoder.next(0xBC)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xE6)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x97)); - try testing.expectEqual(@as(?u21, '日'), try decoder.next(0xA5)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xF0)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x9F)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x98)); - try testing.expectEqual(@as(?u21, '😀'), try decoder.next(0x80)); + const input = "Hü日😀"; + var decoder = try testDecode(DefaultDecoder, input, &.{ + 'H', + 'ü', + '日', + '😀', + }); try decoder.adaptTo("utf-8"); try decoder.adaptTo("UTF-8"); } // UTF-8 BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, null), try decoder.next(0xEF)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xBB)); - try testing.expectEqual(@as(?u21, 0xFEFF), try decoder.next(0xBF)); - try testing.expectEqual(@as(?u21, 'H'), try decoder.next('H')); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xC3)); - try testing.expectEqual(@as(?u21, 'ü'), try decoder.next(0xBC)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xE6)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x97)); - try testing.expectEqual(@as(?u21, '日'), try decoder.next(0xA5)); - try testing.expectEqual(@as(?u21, null), 
try decoder.next(0xF0)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x9F)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x98)); - try testing.expectEqual(@as(?u21, '😀'), try decoder.next(0x80)); + const input = "\u{FEFF}Hü日😀"; + var decoder = try testDecode(DefaultDecoder, input, &.{ + 0xFEFF, + 'H', + 'ü', + '日', + '😀', + }); try decoder.adaptTo("utf-8"); try decoder.adaptTo("UTF-8"); } // Invalid UTF-8 BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, null), try decoder.next(0xEF)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x00)); - try testing.expectError(error.InvalidUtf8, decoder.next(0x00)); - try testing.expectEqual(@as(?u21, 'H'), try decoder.next('H')); + const input = "\xEF\x00\x00H"; + var decoder = try testDecode(DefaultDecoder, input, &.{ + error.InvalidUtf8, + 'H', + }); try decoder.adaptTo("utf-8"); try decoder.adaptTo("UTF-8"); } // UTF-16BE BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, null), try decoder.next(0xFE)); - try testing.expectEqual(@as(?u21, 0xFEFF), try decoder.next(0xFF)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x00)); - try testing.expectEqual(@as(?u21, 'H'), try decoder.next('H')); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x00)); - try testing.expectEqual(@as(?u21, 'ü'), try decoder.next(0xFC)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x65)); - try testing.expectEqual(@as(?u21, '日'), try decoder.next(0xE5)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xD8)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x3D)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xDE)); - try testing.expectEqual(@as(?u21, '😀'), try decoder.next(0x00)); + const input = "\xFE\xFF" ++ // U+FEFF + "\x00H" ++ + "\x00\xFC" ++ // ü + "\x65\xE5" ++ // 日 + "\xD8\x3D\xDE\x00"; // 😀 + var decoder = try testDecode(DefaultDecoder, input, &.{ + 0xFEFF, + 'H', + 
'ü', + '日', + '😀', + }); try decoder.adaptTo("utf-16"); try decoder.adaptTo("UTF-16"); try decoder.adaptTo("utf-16be"); @@ -160,29 +189,29 @@ test DefaultDecoder { // Invalid UTF-16BE BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, null), try decoder.next(0xFE)); - try testing.expectError(error.InvalidUtf8, decoder.next(0x00)); - try testing.expectEqual(@as(?u21, 'H'), try decoder.next('H')); + const input = "\xFE\x00H"; + var decoder = try testDecode(DefaultDecoder, input, &.{ + error.InvalidUtf8, + 'H', + }); try decoder.adaptTo("utf-8"); try decoder.adaptTo("UTF-8"); } // UTF-16LE BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, null), try decoder.next(0xFF)); - try testing.expectEqual(@as(?u21, 0xFEFF), try decoder.next(0xFE)); - try testing.expectEqual(@as(?u21, null), try decoder.next('H')); - try testing.expectEqual(@as(?u21, 'H'), try decoder.next(0x00)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xFC)); - try testing.expectEqual(@as(?u21, 'ü'), try decoder.next(0x00)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xE5)); - try testing.expectEqual(@as(?u21, '日'), try decoder.next(0x65)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x3D)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0xD8)); - try testing.expectEqual(@as(?u21, null), try decoder.next(0x00)); - try testing.expectEqual(@as(?u21, '😀'), try decoder.next(0xDE)); + const input = "\xFF\xFE" ++ // U+FEFF + "H\x00" ++ + "\xFC\x00" ++ // ü + "\xE5\x65" ++ // 日 + "\x3D\xD8\x00\xDE"; // 😀 + var decoder = try testDecode(DefaultDecoder, input, &.{ + 0xFEFF, + 'H', + 'ü', + '日', + '😀', + }); try decoder.adaptTo("utf-16"); try decoder.adaptTo("UTF-16"); try decoder.adaptTo("utf-16le"); @@ -191,10 +220,11 @@ test DefaultDecoder { // Invalid UTF-16LE BOM { - var decoder = DefaultDecoder{}; - try testing.expectEqual(@as(?u21, null), try decoder.next(0xFF)); - try 
testing.expectError(error.InvalidUtf8, decoder.next(0xFF)); - try testing.expectEqual(@as(?u21, 'H'), try decoder.next('H')); + const input = "\xFF\xFFH"; + var decoder = try testDecode(DefaultDecoder, input, &.{ + error.InvalidUtf8, + 'H', + }); try decoder.adaptTo("utf-8"); try decoder.adaptTo("UTF-8"); } @@ -202,33 +232,30 @@ test DefaultDecoder { /// A decoder which handles only UTF-8. pub const Utf8Decoder = struct { - buffer: BoundedArray(u8, 4) = .{}, - expecting: u3 = 0, - - pub const Error = error{InvalidUtf8}; - pub const max_encoded_codepoint_len = 4; - pub fn next(self: *Utf8Decoder, b: u8) Error!?u21 { - if (self.expecting == 0) { - const len = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8; - if (len == 1) { - return b; - } - self.expecting = len; - self.buffer.appendAssumeCapacity(b); - return null; - } else { - self.buffer.appendAssumeCapacity(b); - if (self.buffer.len == self.expecting) { - const codepoint_or_error = unicode.utf8Decode(self.buffer.slice()); - self.expecting = 0; - self.buffer.len = 0; - return codepoint_or_error catch error.InvalidUtf8; - } else { - return null; - } - } + pub const Error = error{ InvalidUtf8, Overflow, UnexpectedEndOfInput }; + + pub fn readCodepoint(_: *Utf8Decoder, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult { + const b = reader.readByte() catch |e| switch (e) { + error.EndOfStream => return ReadResult.none, + else => |other| return other, + }; + const byte_length = unicode.utf8ByteSequenceLength(b) catch return error.InvalidUtf8; + if (byte_length > buf.len) return error.Overflow; + buf[0] = b; + if (byte_length == 1) return .{ .codepoint = b, .byte_length = 1 }; + reader.readNoEof(buf[1..byte_length]) catch |e| switch (e) { + error.EndOfStream => return error.UnexpectedEndOfInput, + else => |other| return other, + }; + const codepoint = switch (byte_length) { + 2 => unicode.utf8Decode2(buf[0..2]), + 3 => unicode.utf8Decode3(buf[0..3]), + 4 => 
unicode.utf8Decode4(buf[0..4]), + else => unreachable, + } catch return error.InvalidUtf8; + return .{ .codepoint = codepoint, .byte_length = byte_length }; } pub fn adaptTo(_: *Utf8Decoder, encoding: []const u8) error{InvalidEncoding}!void { @@ -236,10 +263,6 @@ pub const Utf8Decoder = struct { return error.InvalidEncoding; } } - - pub inline fn isUtf8Compatible(_: Utf8Decoder) bool { - return true; - } }; test Utf8Decoder { @@ -258,7 +281,7 @@ test Utf8Decoder { "\xF7\xBF\xBF\xBF" ++ // Surrogate halves "\xED\xA0\x80\xED\xBF\xBF"; - const expected: []const (error{InvalidUtf8}!u21) = &.{ + _ = try testDecode(Utf8Decoder, input, &.{ '\x00', '\x01', ' ', @@ -309,94 +332,53 @@ test Utf8Decoder { error.InvalidUtf8, // attempted U+1FFFFF error.InvalidUtf8, // U+D800 error.InvalidUtf8, // U+DFFF - }; - - var decoded = ArrayListUnmanaged(error{InvalidUtf8}!u21){}; - defer decoded.deinit(testing.allocator); - var decoder = Utf8Decoder{}; - for (input) |b| { - if (decoder.next(b)) |maybe_c| { - if (maybe_c) |c| { - try decoded.append(testing.allocator, c); - } - } else |err| { - try decoded.append(testing.allocator, err); - } - } - - try testing.expectEqualDeep(expected, decoded.items); + }); } -pub const Utf16Endianness = enum { - big, - little, -}; - /// A decoder which handles only UTF-16 of a given endianness. 
-pub fn Utf16Decoder(comptime endianness: Utf16Endianness) type { +pub fn Utf16Decoder(comptime endian: std.builtin.Endian) type { return struct { - buffer: BoundedArray(u8, 2) = .{}, - high_unit: ?u10 = null, - const Self = @This(); - pub const Error = error{InvalidUtf16}; + pub const Error = error{ InvalidUtf16, Overflow, UnexpectedEndOfInput }; pub const max_encoded_codepoint_len = 4; - pub fn next(self: *Self, b: u8) Error!?u21 { - self.buffer.appendAssumeCapacity(b); - if (self.buffer.len == 1) { - return null; - } - const u = self.takeCodeUnit(); - if (self.high_unit) |high_unit| { - self.high_unit = null; - if (!isLowSurrogate(u)) { - return error.InvalidUtf16; - } - return 0x10000 + ((@as(u21, high_unit) << 10) | surrogateValue(u)); - } else if (isHighSurrogate(u)) { - self.high_unit = surrogateValue(u); - return null; - } else if (isLowSurrogate(u)) { - return error.InvalidUtf16; - } else { - return u; + pub fn readCodepoint(_: *Self, reader: anytype, buf: []u8) (Error || @TypeOf(reader).Error)!ReadResult { + var u_buf: [2]u8 = undefined; + const u_len = try reader.readAll(&u_buf); + switch (u_len) { + 0 => return ReadResult.none, + 1 => return error.UnexpectedEndOfInput, + else => {}, } - } - - inline fn takeCodeUnit(self: *Self) u16 { - const b1 = self.buffer.buffer[0]; - const b2 = self.buffer.buffer[1]; - self.buffer.len = 0; - return if (endianness == .big) (@as(u16, b1) << 8) + b2 else (@as(u16, b2) << 8) + b1; - } - - inline fn isHighSurrogate(u: u16) bool { - return u & ~@as(u16, 0x3FF) == 0xD800; - } - - inline fn isLowSurrogate(u: u16) bool { - return u & ~@as(u16, 0x3FF) == 0xDC00; - } - - inline fn surrogateValue(u: u16) u10 { - return @intCast(u & 0x3FF); + const u = std.mem.readInt(u16, &u_buf, endian); + const code_unit_length = unicode.utf16CodeUnitSequenceLength(u) catch return error.InvalidUtf16; + const codepoint = switch (code_unit_length) { + 1 => u, + 2 => codepoint: { + const low = reader.readInt(u16, endian) catch |e| switch (e) { + 
error.EndOfStream => return error.UnexpectedEndOfInput, + else => |other| return other, + }; + break :codepoint unicode.utf16DecodeSurrogatePair(&.{ u, low }) catch return error.InvalidUtf16; + }, + else => unreachable, + }; + const byte_length = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable; + if (byte_length > buf.len) return error.Overflow; + _ = unicode.utf8Encode(codepoint, buf) catch unreachable; + return .{ .codepoint = codepoint, .byte_length = byte_length }; } pub fn adaptTo(_: *Self, encoding: []const u8) error{InvalidEncoding}!void { if (!(ascii.eqlIgnoreCase(encoding, "utf-16") or - (endianness == .big and ascii.eqlIgnoreCase(encoding, "utf-16be")) or - (endianness == .little and ascii.eqlIgnoreCase(encoding, "utf-16le")))) + (endian == .Big and ascii.eqlIgnoreCase(encoding, "utf-16be")) or + (endian == .Little and ascii.eqlIgnoreCase(encoding, "utf-16le")))) { return error.InvalidEncoding; } } - - pub inline fn isUtf8Compatible(_: Self) bool { - return false; - } }; } @@ -412,7 +394,7 @@ test Utf16Decoder { "\x00\xD8\x00\x00" ++ // unpaired high surrogate followed by U+0000 "\xFF\xDF" // unpaired low surrogate ; - const expected: []const (error{InvalidUtf16}!u21) = &.{ + _ = try testDecode(Utf16Decoder(.Little), input, &.{ '\x00', 'A', 'b', @@ -421,22 +403,7 @@ test Utf16Decoder { '😳', error.InvalidUtf16, error.InvalidUtf16, - }; - - var decoded = ArrayListUnmanaged(error{InvalidUtf16}!u21){}; - defer decoded.deinit(testing.allocator); - var decoder = Utf16Decoder(.little){}; - for (input) |b| { - if (decoder.next(b)) |maybe_c| { - if (maybe_c) |c| { - try decoded.append(testing.allocator, c); - } - } else |err| { - try decoded.append(testing.allocator, err); - } - } - - try testing.expectEqualDeep(expected, decoded.items); + }); } // big-endian @@ -450,7 +417,7 @@ test Utf16Decoder { "\xD8\x00\x00\x00" ++ // unpaired high surrogate followed by U+0000 "\xDF\xFF" // unpaired low surrogate ; - const expected: []const 
(error{InvalidUtf16}!u21) = &.{ + _ = try testDecode(Utf16Decoder(.Big), input, &.{ '\x00', 'A', 'b', @@ -459,21 +426,26 @@ test Utf16Decoder { '😳', error.InvalidUtf16, error.InvalidUtf16, - }; + }); + } +} - var decoded = ArrayListUnmanaged(error{InvalidUtf16}!u21){}; - defer decoded.deinit(testing.allocator); - var decoder = Utf16Decoder(.big){}; - for (input) |b| { - if (decoder.next(b)) |maybe_c| { - if (maybe_c) |c| { - try decoded.append(testing.allocator, c); - } - } else |err| { - try decoded.append(testing.allocator, err); - } +fn testDecode(comptime Decoder: type, input: []const u8, expected: []const (Decoder.Error!u21)) !Decoder { + var decoder: Decoder = .{}; + var decoded = ArrayListUnmanaged(Decoder.Error!u21){}; + defer decoded.deinit(testing.allocator); + var input_stream = std.io.fixedBufferStream(input); + var buf: [4]u8 = undefined; + while (true) { + if (decoder.readCodepoint(input_stream.reader(), &buf)) |c| { + if (!c.present) break; + try decoded.append(testing.allocator, c.codepoint); + } else |err| { + try decoded.append(testing.allocator, err); } - - try testing.expectEqualDeep(expected, decoded.items); } + + try testing.expectEqualDeep(expected, decoded.items); + + return decoder; } diff --git a/src/token_reader.zig b/src/token_reader.zig index 91e4bb3..d3601c6 100644 --- a/src/token_reader.zig +++ b/src/token_reader.zig @@ -224,9 +224,6 @@ pub fn TokenReader( /// /// This is relevant for line break normalization. after_cr: if (options.enable_normalization) bool else void = if (options.enable_normalization) false, - /// The length of the raw codepoint data currently stored in `buffer` - /// starting at `scanner.pos`. 
- cp_len: usize = 0, const Self = @This(); @@ -281,21 +278,12 @@ pub fn TokenReader( } } - const c = (try self.nextCodepoint()) orelse { + const c = try self.nextCodepoint(); + if (!c.present) { try self.scanner.endInput(); return null; - }; - if (!self.decoder.isUtf8Compatible()) { - // If the decoder is not compatible with UTF-8, we have to - // reencode the codepoint we just read into UTF-8, since - // `buffer` must always be valid UTF-8. - self.cp_len = unicode.utf8CodepointSequenceLength(c) catch unreachable; - if (self.scanner.pos + self.cp_len >= self.buffer.len) { - return error.Overflow; - } - _ = unicode.utf8Encode(c, self.buffer[self.scanner.pos .. self.scanner.pos + self.cp_len]) catch unreachable; } - const token = try self.scanner.next(c, self.cp_len); + const token = try self.scanner.next(c.codepoint, c.byte_length); if (token != .ok) { return try self.bufToken(token); } @@ -304,49 +292,36 @@ pub fn TokenReader( const nextCodepoint = if (options.enable_normalization) nextCodepointNormalized else nextCodepointRaw; - fn nextCodepointNormalized(self: *Self) !?u21 { - var b = (try self.nextCodepointRaw()) orelse return null; + fn nextCodepointNormalized(self: *Self) !encoding.ReadResult { + var c = try self.nextCodepointRaw(); + if (!c.present) return c; if (self.after_cr) { self.after_cr = false; - if (b == '\n') { + if (c.codepoint == '\n') { // \n after \r is ignored because \r was already processed // as a line ending - b = (try self.nextCodepointRaw()) orelse return null; + c = try self.nextCodepointRaw(); + if (!c.present) return c; } } - if (b == '\r') { + if (c.codepoint == '\r') { self.after_cr = true; - b = '\n'; + c.codepoint = '\n'; self.buffer[self.scanner.pos] = '\n'; } - if (self.scanner.state == .attribute_content and (b == '\t' or b == '\r' or b == '\n')) { - b = ' '; + if (self.scanner.state == .attribute_content and + (c.codepoint == '\t' or c.codepoint == '\r' or c.codepoint == '\n')) + { + c.codepoint = ' '; 
self.buffer[self.scanner.pos] = ' '; } - return b; + return c; } - fn nextCodepointRaw(self: *Self) !?u21 { - self.cp_len = 0; - var b = self.reader.readByte() catch |e| switch (e) { - error.EndOfStream => return null, - else => |other| return other, - }; - while (true) { - if (self.scanner.pos + self.cp_len == self.buffer.len) { - return error.Overflow; - } - self.buffer[self.scanner.pos + self.cp_len] = b; - self.cp_len += 1; - if (try self.decoder.next(b)) |c| { - self.location.advance(c); - return c; - } - b = self.reader.readByte() catch |e| switch (e) { - error.EndOfStream => return error.UnexpectedEndOfInput, - else => |other| return other, - }; - } + fn nextCodepointRaw(self: *Self) !encoding.ReadResult { + const c = try self.decoder.readCodepoint(self.reader, self.buffer[self.scanner.pos..]); + if (c.present) self.location.advance(c.codepoint); + return c; } fn bufToken(self: *Self, token: Scanner.Token) !Token {