From 9764e026ac0c6c1f1d33e93685ad30e4ef33a931 Mon Sep 17 00:00:00 2001
From: Owen Wu <1449069+yubing744@users.noreply.github.com>
Date: Sat, 7 May 2022 00:25:35 +0800
Subject: [PATCH] Support CRLF as newline for Move files (#90)

* [move-compiler] Support CRLF as newline for Move files

Fix issue: https://github.com/starcoinorg/starcoin/issues/3130

* feat: validate crlf in source

* [move-compiler] Support CRLF as newline for Move files

Fix issue: https://github.com/starcoinorg/starcoin/issues/3130

* feat: fix test-ci fail

Signed-off-by: sahithiacn <sahithi.kalakonda@accenture.com>
Closes: #328
---
 .../src/character_sets.rs                     | 62 ++++++++++++++++---
 language/move-compiler/src/parser/comments.rs |  8 +--
 language/move-compiler/src/parser/lexer.rs    | 12 ++--
 .../parser/invalid_character_comment.exp      |  2 +-
 .../parser/invalid_character_non_ascii.exp    |  2 +-
 .../tests/move_check/parser/newline_crlf.move | 21 +++++++
 .../move-ir-to-bytecode/src/parser.rs         | 16 ++---
 .../move-ir-to-bytecode/syntax/src/lexer.rs   |  1 +
 .../tests/parsing/crlf.exp                    | 59 ++++++++++++++++++
 .../tests/parsing/crlf.mvir                   | 46 ++++++++++++++
 10 files changed, 202 insertions(+), 27 deletions(-)
 create mode 100644 language/move-compiler/tests/move_check/parser/newline_crlf.move
 create mode 100644 language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp
 create mode 100644 language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir
diff --git a/language/move-command-line-common/src/character_sets.rs b/language/move-command-line-common/src/character_sets.rs
index abe863f6da..702d0f42f6 100644
--- a/language/move-command-line-common/src/character_sets.rs
+++ b/language/move-command-line-common/src/character_sets.rs
@@ -13,12 +13,23 @@ pub fn is_permitted_printable_char(c: char) -> bool {
     (is_above_space && is_below_tilde) || is_tab
 }
 
-/// Determine if a character is a permitted newline character.
+/// Determine if a character is a permitted newline lf character.
 ///
-/// The only permitted newline character is \n. All others are invalid.
-pub fn is_permitted_newline_char(c: char) -> bool {
+/// The only permitted newline lf character is \n All others are invalid.
+pub fn is_permitted_newline_lf_char(c: char) -> bool {
     let x = c as u32;
-    x == 0x0A
+    x == 0x0A // \n
+}
+
+/// Determine if a character is a permitted newline crlf character.
+///
+/// The only permitted newline character is \r\n. All others are invalid.
+pub fn is_permitted_newline_crlf_chars(c1: char, c2: char) -> bool {
+    let x1 = c1 as u32;
+    let x2 = c2 as u32;
+    let is_cr = x1 == 0x0D; // \r
+    let is_lf = x2 == 0x0A; // \n
+    is_cr && is_lf
 }
 
 /// Determine if a character is permitted character.
@@ -26,7 +37,26 @@ pub fn is_permitted_newline_char(c: char) -> bool {
 /// A permitted character is either a permitted printable character, or a permitted
 /// newline. Any other characters are disallowed from appearing in the file.
 pub fn is_permitted_char(c: char) -> bool {
-    is_permitted_printable_char(c) || is_permitted_newline_char(c)
+    is_permitted_printable_char(c) || is_permitted_newline_lf_char(c)
+}
+
+/// Determine if the characters is permitted characters.
+///
+/// A permitted characters is either a permitted printable character, or a permitted
+/// newlines. Any other characters are disallowed from appearing in the file.
+pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool {
+    let c1 = chars[idx] as char;
+
+    if is_permitted_char(c1) {
+        return true;
+    }
+
+    if idx + 1 >= chars.len() {
+        return false;
+    }
+
+    let c2 = chars[idx + 1] as char;
+    is_permitted_newline_crlf_chars(c1, c2)
 }
 
 #[cfg(test)]
@@ -34,20 +64,34 @@ mod tests {
     #[test]
     fn test_permitted_characters() {
         let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
+        good_chars.push(0x0D); // \r
         good_chars.push(0x0A); // \n
         good_chars.push(0x09); // \t
-        for c in good_chars {
-            assert!(super::is_permitted_char(c as char));
+
+        for idx in 0..good_chars.len() {
+            assert!(super::is_permitted_chars(&good_chars, idx));
         }
     }
 
+    #[test]
+    fn test_forbidden_last_lf_characters() {
+        let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
+        good_chars.push(0x0D); // \r
+
+        assert!(!super::is_permitted_chars(
+            &good_chars,
+            good_chars.len() - 1
+        ));
+    }
+
     #[test]
     fn test_forbidden_characters() {
         let mut bad_chars = (0x0..0x09).collect::<Vec<u8>>();
         bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<u8>>());
         bad_chars.push(0x7F);
-        for c in bad_chars {
-            assert!(!super::is_permitted_char(c as char));
+
+        for idx in 0..bad_chars.len() {
+            assert!(!super::is_permitted_chars(&bad_chars, idx));
         }
     }
 }
diff --git a/language/move-compiler/src/parser/comments.rs b/language/move-compiler/src/parser/comments.rs
index ad0298f4e8..c00f324a29 100644
--- a/language/move-compiler/src/parser/comments.rs
+++ b/language/move-compiler/src/parser/comments.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use crate::{diag, diagnostics::Diagnostics};
-use move_command_line_common::{character_sets::is_permitted_char, files::FileHash};
+use move_command_line_common::{character_sets::is_permitted_chars, files::FileHash};
 use move_ir_types::location::*;
 use std::collections::BTreeMap;
 
@@ -12,19 +12,19 @@ pub type MatchedFileCommentMap = BTreeMap<u32, String>;
 pub type FileCommentMap = BTreeMap<(u32, u32), String>;
 
 // We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline
-// character--\n--or a tab--\t.
+// character--\r--,--\n--or a tab--\t.
 pub fn verify_string(file_hash: FileHash, string: &str) -> Result<(), Diagnostics> {
     match string
         .chars()
         .enumerate()
-        .find(|(_, c)| !is_permitted_char(*c))
+        .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
     {
         None => Ok(()),
         Some((idx, chr)) => {
             let loc = Loc::new(file_hash, idx as u32, idx as u32);
             let msg = format!(
                 "Invalid character '{}' found when reading file. Only ASCII printable characters, \
-                 tabs (\\t), and line endings (\\n) are permitted.",
+                 tabs (\\t), lf (\\n) and crlf (\\r\\n) are permitted.",
                 chr
             );
             Err(Diagnostics::from(vec![diag!(
diff --git a/language/move-compiler/src/parser/lexer.rs b/language/move-compiler/src/parser/lexer.rs
index cf1014b76e..5c0e3bf70b 100644
--- a/language/move-compiler/src/parser/lexer.rs
+++ b/language/move-compiler/src/parser/lexer.rs
@@ -223,7 +223,8 @@ impl<'input> Lexer<'input> {
         // Loop until we find text that isn't whitespace, and that isn't part of
         // a multi-line or single-line comment.
         loop {
-            // Trim only the whitespace characters we recognize: newline, tab, and space.
+            // Trim only the whitespace characters we recognize: newline(\n|\r\n), tab, and space.
+            text = text.trim_start_matches("\r\n");
             text = text.trim_start_matches(|c: char| matches!(c, '\n' | '\t' | ' '));
 
             if text.starts_with("/*") {
@@ -294,10 +295,11 @@ impl<'input> Lexer<'input> {
                 // If this was a documentation comment, record it in our map.
                 if is_doc {
                     let end = get_offset(text);
-                    self.doc_comments.insert(
-                        (start as u32, end as u32),
-                        self.text[(start + 3)..end].to_string(),
-                    );
+                    let mut comment = &self.text[(start + 3)..end];
+                    comment = comment.trim_end_matches(|c: char| c == '\r');
+
+                    self.doc_comments
+                        .insert((start as u32, end as u32), comment.to_string());
                 }
 
                 // Continue the loop on the following line, which may contain leading
diff --git a/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp b/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp
index e8b85beb46..aa375653ed 100644
--- a/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp
+++ b/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp
@@ -2,5 +2,5 @@ error[E01001]: invalid character
   ┌─ tests/move_check/parser/invalid_character_comment.move:3:44
   │
 3 │ // Non-ASCII characters in comments (e.g., ф) are also not allowed.
-  │                                            ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), and line endings (\n) are permitted.
+  │                                            ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted.
 
diff --git a/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp b/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp
index 17f7153b2c..3ffa9e0d47 100644
--- a/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp
+++ b/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp
@@ -2,5 +2,5 @@ error[E01001]: invalid character
   ┌─ tests/move_check/parser/invalid_character_non_ascii.move:5:4
   │
 5 │    ф
-  │    ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), and line endings (\n) are permitted.
+  │    ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted.
 
diff --git a/language/move-compiler/tests/move_check/parser/newline_crlf.move b/language/move-compiler/tests/move_check/parser/newline_crlf.move
new file mode 100644
index 0000000000..314fbd306b
--- /dev/null
+++ b/language/move-compiler/tests/move_check/parser/newline_crlf.move
@@ -0,0 +1,21 @@
+/// This is a test.
+module 0x8675309::M {
+    /**
+     * One can have /* nested */
+     * // block comments
+     */
+    fun f() { }
+
+    /* This is a nested /* regular comment // */ */
+    fun g() {}
+
+    // This is a line comment which contains unbalanced /* delimiter.
+    fun h() {}
+
+    // Comments in strings are not comments at all.
+    fun str(): vector<u8> {
+        b"http://diem.com"
+    }
+
+    // This is a regular comment which appears where a doc comment would not be allowed.
+}
diff --git a/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs b/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs
index f03075c5bc..0a9db21f1a 100644
--- a/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs
+++ b/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs
@@ -11,24 +11,25 @@ use codespan_reporting::{
         Config,
     },
 };
-use move_command_line_common::character_sets::is_permitted_char;
+use move_command_line_common::character_sets::is_permitted_chars;
 use move_ir_to_bytecode_syntax::syntax::{self, ParseError};
 use move_ir_types::{ast, location::*};
 
 // We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline
-// character--\n--or a tab--\t. Checking each character in the input string is more fool-proof
+// character--\r--,--\n--or a tab--\t. Checking each character in the input string is more fool-proof
 // than checking each character later during lexing & tokenization, since that would require special
 // handling of characters inside of comments (usually not included as tokens) and within byte
 // array literals.
 fn verify_string(string: &str) -> Result<()> {
     string
         .chars()
-        .find(|c| !is_permitted_char(*c))
-        .map_or(Ok(()), |chr| {
+        .enumerate()
+        .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
+        .map_or(Ok(()), |(_, c)| {
             bail!(
                 "Parser Error: invalid character {} found when reading file.\
-                 Only ascii printable, tabs (\\t), and \\n line ending characters are permitted.",
-                chr
+                 Only ascii printable, tabs (\\t), lf (\\n) and crlf (\\r\\n) characters are permitted.",
+                c
             )
         })
 }
@@ -86,7 +87,8 @@ mod tests {
         good_chars.push(0x09);
 
         let mut bad_chars = (0x0..0x09).collect::<Vec<_>>();
-        bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<_>>());
+        bad_chars.append(&mut vec![0x0B, 0x0C]);
+        bad_chars.append(&mut (0x0E..=0x1F).collect::<Vec<_>>());
         bad_chars.push(0x7F);
 
         // Test to make sure that all the characters that are in the allowlist pass.
diff --git a/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs b/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs
index e36ffbb635..9629bc4567 100644
--- a/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs
+++ b/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs
@@ -174,6 +174,7 @@ impl<'input> Lexer<'input> {
         let mut text = &self.text[self.cur_end..];
         loop {
             // Trim the only whitespace characters we recognize: newline, tab, and space.
+            text = text.trim_start_matches("\r\n");
             text = text.trim_start_matches(|c: char| matches!(c, '\n' | '\t' | ' '));
             // Trim the only comments we recognize: '// ... \n'.
             if text.starts_with("//") {
diff --git a/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp
new file mode 100644
index 0000000000..9adc3af979
--- /dev/null
+++ b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp
@@ -0,0 +1,59 @@
+processed 6 tasks
+
+task 0 'print-bytecode'. lines 1-6:
+// Move bytecode v5
+script {
+
+
+main() {
+B0:
+	0: Ret
+}
+}
+
+task 1 'print-bytecode'. lines 8-14:
+// Move bytecode v5
+script {
+
+
+main() {
+B0:
+	0: Ret
+}
+}
+
+task 2 'print-bytecode'. lines 16-20:
+// Move bytecode v5
+script {
+
+
+main() {
+B0:
+	0: Ret
+}
+}
+
+task 3 'print-bytecode'. lines 22-27:
+// Move bytecode v5
+script {
+
+
+main() {
+B0:
+	0: Ret
+}
+}
+
+task 4 'print-bytecode'. lines 29-36:
+Error: ParserError: Invalid Token: invalid token kind for statement Slash
+
+task 5 'print-bytecode'. lines 38-46:
+// Move bytecode v5
+script {
+
+
+main() {
+B0:
+	0: Ret
+}
+}
diff --git a/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir
new file mode 100644
index 0000000000..9e2cd428fe
--- /dev/null
+++ b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir
@@ -0,0 +1,46 @@
+//# print-bytecode
+main() {
+label b0:
+    // return;
+    return;
+}
+
+//# print-bytecode
+main() {
+label b0:
+    // return;
+    // return;
+    return;
+}
+
+//# print-bytecode
+main() {
+label b0:
+    return; // return;
+}
+
+//# print-bytecode
+main() {
+label b0:
+    // return;
+    return; // return;
+}
+
+//# print-bytecode
+// In Move, /* */ are block comment delimiters. Not so in Move IR, so the `/*` below
+// cannot be parsed.
+main() {
+label b0:
+    return;
+    /* return; */
+}
+
+//# print-bytecode
+// Since /* */ are not block comment delimiters, they do not behave in any unique way when
+// they appear within comments.
+main() {
+label b0:
+    // /*
+    return;
+    // */
+}