From 9764e026ac0c6c1f1d33e93685ad30e4ef33a931 Mon Sep 17 00:00:00 2001 From: Owen Wu <1449069+yubing744@users.noreply.github.com> Date: Sat, 7 May 2022 00:25:35 +0800 Subject: [PATCH] Support CRLF as newline for Move files (#90) * [move-compiler] Support CRLF as newline for Move files Fix issue: https://github.com/starcoinorg/starcoin/issues/3130 * feat: validate crlf in source * [move-compiler] Support CRLF as newline for Move files Fix issue: https://github.com/starcoinorg/starcoin/issues/3130 * feat: fix test-ci fail Signed-off-by: sahithiacn Closes: #328 --- .../src/character_sets.rs | 62 ++++++++++++++++--- language/move-compiler/src/parser/comments.rs | 8 +-- language/move-compiler/src/parser/lexer.rs | 12 ++-- .../parser/invalid_character_comment.exp | 2 +- .../parser/invalid_character_non_ascii.exp | 2 +- .../tests/move_check/parser/newline_crlf.move | 21 +++++++ .../move-ir-to-bytecode/src/parser.rs | 16 ++--- .../move-ir-to-bytecode/syntax/src/lexer.rs | 1 + .../tests/parsing/crlf.exp | 59 ++++++++++++++++++ .../tests/parsing/crlf.mvir | 46 ++++++++++++++ 10 files changed, 202 insertions(+), 27 deletions(-) create mode 100644 language/move-compiler/tests/move_check/parser/newline_crlf.move create mode 100644 language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp create mode 100644 language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir diff --git a/language/move-command-line-common/src/character_sets.rs b/language/move-command-line-common/src/character_sets.rs index abe863f6da..702d0f42f6 100644 --- a/language/move-command-line-common/src/character_sets.rs +++ b/language/move-command-line-common/src/character_sets.rs @@ -13,12 +13,23 @@ pub fn is_permitted_printable_char(c: char) -> bool { (is_above_space && is_below_tilde) || is_tab } -/// Determine if a character is a permitted newline character. +/// Determine if a character is a permitted newline lf character. /// -/// The only permitted newline character is \n. All others are invalid. -pub fn is_permitted_newline_char(c: char) -> bool { +/// The only permitted newline lf character is \n All others are invalid. +pub fn is_permitted_newline_lf_char(c: char) -> bool { let x = c as u32; - x == 0x0A + x == 0x0A // \n +} + +/// Determine if a character is a permitted newline crlf character. +/// +/// The only permitted newline character is \r\n. All others are invalid. +pub fn is_permitted_newline_crlf_chars(c1: char, c2: char) -> bool { + let x1 = c1 as u32; + let x2 = c2 as u32; + let is_cr = x1 == 0x0D; // \r + let is_lf = x2 == 0x0A; // \n + is_cr && is_lf } /// Determine if a character is permitted character. @@ -26,7 +37,26 @@ pub fn is_permitted_newline_char(c: char) -> bool { /// A permitted character is either a permitted printable character, or a permitted /// newline. Any other characters are disallowed from appearing in the file. pub fn is_permitted_char(c: char) -> bool { - is_permitted_printable_char(c) || is_permitted_newline_char(c) + is_permitted_printable_char(c) || is_permitted_newline_lf_char(c) +} + +/// Determine if the characters is permitted characters. +/// +/// A permitted characters is either a permitted printable character, or a permitted +/// newlines. Any other characters are disallowed from appearing in the file. +pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool { + let c1 = chars[idx] as char; + + if is_permitted_char(c1) { + return true; + } + + if idx + 1 >= chars.len() { + return false; + } + + let c2 = chars[idx + 1] as char; + is_permitted_newline_crlf_chars(c1, c2) } #[cfg(test)] @@ -34,20 +64,34 @@ mod tests { #[test] fn test_permitted_characters() { let mut good_chars = (0x20..=0x7E).collect::>(); + good_chars.push(0x0D); // \r good_chars.push(0x0A); // \n good_chars.push(0x09); // \t - for c in good_chars { - assert!(super::is_permitted_char(c as char)); + + for idx in 0..good_chars.len() { + assert!(super::is_permitted_chars(&good_chars, idx)); } } + #[test] + fn test_forbidden_last_lf_characters() { + let mut good_chars = (0x20..=0x7E).collect::>(); + good_chars.push(0x0D); // \r + + assert!(!super::is_permitted_chars( + &good_chars, + good_chars.len() - 1 + )); + } + #[test] fn test_forbidden_characters() { let mut bad_chars = (0x0..0x09).collect::>(); bad_chars.append(&mut (0x0B..=0x1F).collect::>()); bad_chars.push(0x7F); - for c in bad_chars { - assert!(!super::is_permitted_char(c as char)); + + for idx in 0..bad_chars.len() { + assert!(!super::is_permitted_chars(&bad_chars, idx)); } } } diff --git a/language/move-compiler/src/parser/comments.rs b/language/move-compiler/src/parser/comments.rs index ad0298f4e8..c00f324a29 100644 --- a/language/move-compiler/src/parser/comments.rs +++ b/language/move-compiler/src/parser/comments.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::{diag, diagnostics::Diagnostics}; -use move_command_line_common::{character_sets::is_permitted_char, files::FileHash}; +use move_command_line_common::{character_sets::is_permitted_chars, files::FileHash}; use move_ir_types::location::*; use std::collections::BTreeMap; @@ -12,19 +12,19 @@ pub type MatchedFileCommentMap = BTreeMap; pub type FileCommentMap = BTreeMap<(u32, u32), String>; // We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline -// character--\n--or a tab--\t. +// character--\r--,--\n--or a tab--\t. pub fn verify_string(file_hash: FileHash, string: &str) -> Result<(), Diagnostics> { match string .chars() .enumerate() - .find(|(_, c)| !is_permitted_char(*c)) + .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx)) { None => Ok(()), Some((idx, chr)) => { let loc = Loc::new(file_hash, idx as u32, idx as u32); let msg = format!( "Invalid character '{}' found when reading file. Only ASCII printable characters, \ - tabs (\\t), and line endings (\\n) are permitted.", + tabs (\\t), lf (\\n) and crlf (\\r\\n) are permitted.", chr ); Err(Diagnostics::from(vec![diag!( diff --git a/language/move-compiler/src/parser/lexer.rs b/language/move-compiler/src/parser/lexer.rs index cf1014b76e..5c0e3bf70b 100644 --- a/language/move-compiler/src/parser/lexer.rs +++ b/language/move-compiler/src/parser/lexer.rs @@ -223,7 +223,8 @@ impl<'input> Lexer<'input> { // Loop until we find text that isn't whitespace, and that isn't part of // a multi-line or single-line comment. loop { - // Trim only the whitespace characters we recognize: newline, tab, and space. + // Trim only the whitespace characters we recognize: newline(\n|\r\n), tab, and space. + text = text.trim_start_matches("\r\n"); text = text.trim_start_matches(|c: char| matches!(c, '\n' | '\t' | ' ')); if text.starts_with("/*") { @@ -294,10 +295,11 @@ impl<'input> Lexer<'input> { // If this was a documentation comment, record it in our map. if is_doc { let end = get_offset(text); - self.doc_comments.insert( - (start as u32, end as u32), - self.text[(start + 3)..end].to_string(), - ); + let mut comment = &self.text[(start + 3)..end]; + comment = comment.trim_end_matches(|c: char| c == '\r'); + + self.doc_comments + .insert((start as u32, end as u32), comment.to_string()); } // Continue the loop on the following line, which may contain leading diff --git a/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp b/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp index e8b85beb46..aa375653ed 100644 --- a/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp +++ b/language/move-compiler/tests/move_check/parser/invalid_character_comment.exp @@ -2,5 +2,5 @@ error[E01001]: invalid character ┌─ tests/move_check/parser/invalid_character_comment.move:3:44 │ 3 │ // Non-ASCII characters in comments (e.g., ф) are also not allowed. - │ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), and line endings (\n) are permitted. + │ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted. diff --git a/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp b/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp index 17f7153b2c..3ffa9e0d47 100644 --- a/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp +++ b/language/move-compiler/tests/move_check/parser/invalid_character_non_ascii.exp @@ -2,5 +2,5 @@ error[E01001]: invalid character ┌─ tests/move_check/parser/invalid_character_non_ascii.move:5:4 │ 5 │ ф - │ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), and line endings (\n) are permitted. + │ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted. diff --git a/language/move-compiler/tests/move_check/parser/newline_crlf.move b/language/move-compiler/tests/move_check/parser/newline_crlf.move new file mode 100644 index 0000000000..314fbd306b --- /dev/null +++ b/language/move-compiler/tests/move_check/parser/newline_crlf.move @@ -0,0 +1,21 @@ +/// This is a test. +module 0x8675309::M { + /** + * One can have /* nested */ + * // block comments + */ + fun f() { } + + /* This is a nested /* regular comment // */ */ + fun g() {} + + // This is a line comment which contains unbalanced /* delimiter. + fun h() {} + + // Comments in strings are not comments at all. + fun str(): vector { + b"http://diem.com" + } + + // This is a regular comment which appears where a doc comment would not be allowed. +} diff --git a/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs b/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs index f03075c5bc..0a9db21f1a 100644 --- a/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs +++ b/language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs @@ -11,24 +11,25 @@ use codespan_reporting::{ Config, }, }; -use move_command_line_common::character_sets::is_permitted_char; +use move_command_line_common::character_sets::is_permitted_chars; use move_ir_to_bytecode_syntax::syntax::{self, ParseError}; use move_ir_types::{ast, location::*}; // We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline -// character--\n--or a tab--\t. Checking each character in the input string is more fool-proof +// character--\r--,--\n--or a tab--\t. Checking each character in the input string is more fool-proof // than checking each character later during lexing & tokenization, since that would require special // handling of characters inside of comments (usually not included as tokens) and within byte // array literals. fn verify_string(string: &str) -> Result<()> { string .chars() - .find(|c| !is_permitted_char(*c)) - .map_or(Ok(()), |chr| { + .enumerate() + .find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx)) + .map_or(Ok(()), |(_, c)| { bail!( "Parser Error: invalid character {} found when reading file.\ - Only ascii printable, tabs (\\t), and \\n line ending characters are permitted.", - chr + Only ascii printable, tabs (\\t), lf (\\n) and crlf (\\r\\n) characters are permitted.", + c ) }) } @@ -86,7 +87,8 @@ mod tests { good_chars.push(0x09); let mut bad_chars = (0x0..0x09).collect::>(); - bad_chars.append(&mut (0x0B..=0x1F).collect::>()); + bad_chars.append(&mut vec![0x0B, 0x0C]); + bad_chars.append(&mut (0x0E..=0x1F).collect::>()); bad_chars.push(0x7F); // Test to make sure that all the characters that are in the allowlist pass. diff --git a/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs b/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs index e36ffbb635..9629bc4567 100644 --- a/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs +++ b/language/move-ir-compiler/move-ir-to-bytecode/syntax/src/lexer.rs @@ -174,6 +174,7 @@ impl<'input> Lexer<'input> { let mut text = &self.text[self.cur_end..]; loop { // Trim the only whitespace characters we recognize: newline, tab, and space. + text = text.trim_start_matches("\r\n"); text = text.trim_start_matches(|c: char| matches!(c, '\n' | '\t' | ' ')); // Trim the only comments we recognize: '// ... \n'. if text.starts_with("//") { diff --git a/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp new file mode 100644 index 0000000000..9adc3af979 --- /dev/null +++ b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.exp @@ -0,0 +1,59 @@ +processed 6 tasks + +task 0 'print-bytecode'. lines 1-6: +// Move bytecode v5 +script { + + +main() { +B0: + 0: Ret +} +} + +task 1 'print-bytecode'. lines 8-14: +// Move bytecode v5 +script { + + +main() { +B0: + 0: Ret +} +} + +task 2 'print-bytecode'. lines 16-20: +// Move bytecode v5 +script { + + +main() { +B0: + 0: Ret +} +} + +task 3 'print-bytecode'. lines 22-27: +// Move bytecode v5 +script { + + +main() { +B0: + 0: Ret +} +} + +task 4 'print-bytecode'. lines 29-36: +Error: ParserError: Invalid Token: invalid token kind for statement Slash + +task 5 'print-bytecode'. lines 38-46: +// Move bytecode v5 +script { + + +main() { +B0: + 0: Ret +} +} diff --git a/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir new file mode 100644 index 0000000000..9e2cd428fe --- /dev/null +++ b/language/move-ir-compiler/transactional-tests/tests/parsing/crlf.mvir @@ -0,0 +1,46 @@ +//# print-bytecode +main() { +label b0: + // return; + return; +} + +//# print-bytecode +main() { +label b0: + // return; + // return; + return; +} + +//# print-bytecode +main() { +label b0: + return; // return; +} + +//# print-bytecode +main() { +label b0: + // return; + return; // return; +} + +//# print-bytecode +// In Move, /* */ are block comment delimiters. Not so in Move IR, so the `/*` below +// cannot be parsed. +main() { +label b0: + return; + /* return; */ +} + +//# print-bytecode +// Since /* */ are not block comment delimiters, they do not behave in any unique way when +// they appear within comments. +main() { +label b0: + // /* + return; + // */ +}