std.zig.tokenizer: simplify line-based tokens

ianprime0509 · richerfu · commit 516d0fcb6115 · 2024-10-28T09:49:10.000+08:00
Closes ziglang#21358. Closes ziglang#21360. This commit modifies the `multiline_string_literal_line`, `doc_comment`, and `container_doc_comment` tokens to no longer include the line ending as part of the token. This makes it easier to handle line endings (which may be LF, CRLF, or in edge cases possibly nonexistent) consistently. In the two issues linked above, Autodoc was already assuming this for doc comments, and yielding incorrect results when handling files with CRLF line endings (both in Markdown parsing and source rendering). Applying the same simplification for multiline string literals also brings `zig fmt` into conformance with ziglang/zig-spec#38 regarding formatting of multiline strings with CRLF line endings: the spec says that `zig fmt` should remove the CR from such line endings, but this was not previously the case.
diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig
@@ -11721,16 +11721,14 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
-        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
+        const line_bytes = slice[2..];
         try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
         string_bytes.appendSliceAssumeCapacity(line_bytes);
diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig
@@ -3087,6 +3087,22 @@ test "zig fmt: multiline string" {
     );
 }
 
+test "zig fmt: multiline string with CRLF line endings" {
+    try testTransform("" ++
+        "const s =\r\n" ++
+        "    \\\\one\r\n" ++
+        "    \\\\two)\r\n" ++
+        "    \\\\three\r\n" ++
+        ";\r\n",
+        \\const s =
+        \\    \\one
+        \\    \\two)
+        \\    \\three
+        \\;
+        \\
+    );
+}
+
 test "zig fmt: values" {
     try testCanonical(
         \\test "values" {
@@ -4404,6 +4420,28 @@ test "zig fmt: invalid doc comments on comptime and test blocks" {
     });
 }
 
+test "zig fmt: comments with CRLF line endings" {
+    try testTransform("" ++
+        "//! Top-level doc comment\r\n" ++
+        "//! Continuing to another line\r\n" ++
+        "\r\n" ++
+        "/// Regular doc comment\r\n" ++
+        "const S = struct {\r\n" ++
+        "    // Regular comment\r\n" ++
+        "    // More content\r\n" ++
+        "};\r\n",
+        \\//! Top-level doc comment
+        \\//! Continuing to another line
+        \\
+        \\/// Regular doc comment
+        \\const S = struct {
+        \\    // Regular comment
+        \\    // More content
+        \\};
+        \\
+    );
+}
+
 test "zig fmt: else comptime expr" {
     try testCanonical(
         \\comptime {
diff --git a/lib/std/zig/render.zig b/lib/std/zig/render.zig
@@ -3170,9 +3170,6 @@ fn discardAllParams(r: *Render, fn_proto_node: Ast.Node.Index) Error!void {
 fn tokenSliceForRender(tree: Ast, token_index: Ast.TokenIndex) []const u8 {
     var ret = tree.tokenSlice(token_index);
     switch (tree.tokens.items(.tag)[token_index]) {
-        .multiline_string_literal_line => {
-            if (ret[ret.len - 1] == '\n') ret.len -= 1;
-        },
         .container_doc_comment, .doc_comment => {
             ret = mem.trimRight(u8, ret, &std.ascii.whitespace);
         },
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig
@@ -847,12 +847,10 @@ pub const Tokenizer = struct {
                         break;
                     },
                     '\n' => {
-                        self.index += 1;
                         break;
                     },
                     '\r' => {
                         if (self.buffer[self.index + 1] == '\n') {
-                            self.index += 2;
                             break;
                         } else {
                             state = .invalid;
@@ -1117,7 +1115,6 @@ pub const Tokenizer = struct {
                     },
                     '\r' => {
                         if (self.buffer[self.index + 1] == '\n') {
-                            self.index += 1;
                             result.tag = .doc_comment;
                             break;
                         } else {
@@ -1167,7 +1164,6 @@ pub const Tokenizer = struct {
                     },
                     '\r' => {
                         if (self.buffer[self.index + 1] == '\n') {
-                            self.index += 1;
                             break;
                         } else {
                             state = .invalid;