Skip to content

Commit

Permalink
Support CRLF as newline for Move files (#90)
Browse files Browse the repository at this point in the history
* [move-compiler] Support CRLF as newline for Move files

Fix issue: starcoinorg/starcoin#3130

* feat: validate crlf in source

* [move-compiler] Support CRLF as newline for Move files

Fix issue: starcoinorg/starcoin#3130

* feat: fix test-ci fail

Signed-off-by: sahithiacn <[email protected]>
Closes: #328
  • Loading branch information
yubing744 authored and bors-diem committed Jan 11, 2023
1 parent b2760c7 commit 9764e02
Show file tree
Hide file tree
Showing 10 changed files with 202 additions and 27 deletions.
62 changes: 53 additions & 9 deletions language/move-command-line-common/src/character_sets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,41 +13,85 @@ pub fn is_permitted_printable_char(c: char) -> bool {
(is_above_space && is_below_tilde) || is_tab
}

/// Determine if a character is a permitted newline character.
/// Determine if a character is a permitted newline lf character.
///
/// The only permitted newline character is \n. All others are invalid.
pub fn is_permitted_newline_char(c: char) -> bool {
/// The only permitted newline lf character is \n All others are invalid.
pub fn is_permitted_newline_lf_char(c: char) -> bool {
let x = c as u32;
x == 0x0A
x == 0x0A // \n
}

/// Determine if a character is a permitted newline crlf character.
///
/// The only permitted newline character is \r\n. All others are invalid.
pub fn is_permitted_newline_crlf_chars(c1: char, c2: char) -> bool {
let x1 = c1 as u32;
let x2 = c2 as u32;
let is_cr = x1 == 0x0D; // \r
let is_lf = x2 == 0x0A; // \n
is_cr && is_lf
}

/// Determine if a character is permitted character.
///
/// A permitted character is either a permitted printable character, or a permitted
/// newline. Any other characters are disallowed from appearing in the file.
pub fn is_permitted_char(c: char) -> bool {
is_permitted_printable_char(c) || is_permitted_newline_char(c)
is_permitted_printable_char(c) || is_permitted_newline_lf_char(c)
}

/// Determine if the characters is permitted characters.
///
/// A permitted characters is either a permitted printable character, or a permitted
/// newlines. Any other characters are disallowed from appearing in the file.
pub fn is_permitted_chars(chars: &[u8], idx: usize) -> bool {
let c1 = chars[idx] as char;

if is_permitted_char(c1) {
return true;
}

if idx + 1 >= chars.len() {
return false;
}

let c2 = chars[idx + 1] as char;
is_permitted_newline_crlf_chars(c1, c2)
}

#[cfg(test)]
mod tests {
#[test]
fn test_permitted_characters() {
let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
good_chars.push(0x0D); // \r
good_chars.push(0x0A); // \n
good_chars.push(0x09); // \t
for c in good_chars {
assert!(super::is_permitted_char(c as char));

for idx in 0..good_chars.len() {
assert!(super::is_permitted_chars(&good_chars, idx));
}
}

#[test]
fn test_forbidden_last_lf_characters() {
let mut good_chars = (0x20..=0x7E).collect::<Vec<u8>>();
good_chars.push(0x0D); // \r

assert!(!super::is_permitted_chars(
&good_chars,
good_chars.len() - 1
));
}

#[test]
fn test_forbidden_characters() {
let mut bad_chars = (0x0..0x09).collect::<Vec<u8>>();
bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<u8>>());
bad_chars.push(0x7F);
for c in bad_chars {
assert!(!super::is_permitted_char(c as char));

for idx in 0..bad_chars.len() {
assert!(!super::is_permitted_chars(&bad_chars, idx));
}
}
}
8 changes: 4 additions & 4 deletions language/move-compiler/src/parser/comments.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// SPDX-License-Identifier: Apache-2.0

use crate::{diag, diagnostics::Diagnostics};
use move_command_line_common::{character_sets::is_permitted_char, files::FileHash};
use move_command_line_common::{character_sets::is_permitted_chars, files::FileHash};
use move_ir_types::location::*;
use std::collections::BTreeMap;

Expand All @@ -12,19 +12,19 @@ pub type MatchedFileCommentMap = BTreeMap<u32, String>;
pub type FileCommentMap = BTreeMap<(u32, u32), String>;

// We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline
// character--\n--or a tab--\t.
// character--\r--,--\n--or a tab--\t.
pub fn verify_string(file_hash: FileHash, string: &str) -> Result<(), Diagnostics> {
match string
.chars()
.enumerate()
.find(|(_, c)| !is_permitted_char(*c))
.find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
{
None => Ok(()),
Some((idx, chr)) => {
let loc = Loc::new(file_hash, idx as u32, idx as u32);
let msg = format!(
"Invalid character '{}' found when reading file. Only ASCII printable characters, \
tabs (\\t), and line endings (\\n) are permitted.",
tabs (\\t), lf (\\n) and crlf (\\r\\n) are permitted.",
chr
);
Err(Diagnostics::from(vec![diag!(
Expand Down
12 changes: 7 additions & 5 deletions language/move-compiler/src/parser/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ impl<'input> Lexer<'input> {
// Loop until we find text that isn't whitespace, and that isn't part of
// a multi-line or single-line comment.
loop {
// Trim only the whitespace characters we recognize: newline, tab, and space.
// Trim only the whitespace characters we recognize: newline(\n|\r\n), tab, and space.
text = text.trim_start_matches("\r\n");
text = text.trim_start_matches(|c: char| matches!(c, '\n' | '\t' | ' '));

if text.starts_with("/*") {
Expand Down Expand Up @@ -294,10 +295,11 @@ impl<'input> Lexer<'input> {
// If this was a documentation comment, record it in our map.
if is_doc {
let end = get_offset(text);
self.doc_comments.insert(
(start as u32, end as u32),
self.text[(start + 3)..end].to_string(),
);
let mut comment = &self.text[(start + 3)..end];
comment = comment.trim_end_matches(|c: char| c == '\r');

self.doc_comments
.insert((start as u32, end as u32), comment.to_string());
}

// Continue the loop on the following line, which may contain leading
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ error[E01001]: invalid character
┌─ tests/move_check/parser/invalid_character_comment.move:3:44
3 │ // Non-ASCII characters in comments (e.g., ф) are also not allowed.
│ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), and line endings (\n) are permitted.
│ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ error[E01001]: invalid character
┌─ tests/move_check/parser/invalid_character_non_ascii.move:5:4
5 │ ф
│ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), and line endings (\n) are permitted.
│ ^ Invalid character 'ф' found when reading file. Only ASCII printable characters, tabs (\t), lf (\n) and crlf (\r\n) are permitted.

21 changes: 21 additions & 0 deletions language/move-compiler/tests/move_check/parser/newline_crlf.move
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/// This is a test.
module 0x8675309::M {
/**
* One can have /* nested */
* // block comments
*/
fun f() { }

/* This is a nested /* regular comment // */ */
fun g() {}

// This is a line comment which contains unbalanced /* delimiter.
fun h() {}

// Comments in strings are not comments at all.
fun str(): vector<u8> {
b"http://diem.com"
}

// This is a regular comment which appears where a doc comment would not be allowed.
}
16 changes: 9 additions & 7 deletions language/move-ir-compiler/move-ir-to-bytecode/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,25 @@ use codespan_reporting::{
Config,
},
};
use move_command_line_common::character_sets::is_permitted_char;
use move_command_line_common::character_sets::is_permitted_chars;
use move_ir_to_bytecode_syntax::syntax::{self, ParseError};
use move_ir_types::{ast, location::*};

// We restrict strings to only ascii visual characters (0x20 <= c <= 0x7E) or a permitted newline
// character--\n--or a tab--\t. Checking each character in the input string is more fool-proof
// character--\r--,--\n--or a tab--\t. Checking each character in the input string is more fool-proof
// than checking each character later during lexing & tokenization, since that would require special
// handling of characters inside of comments (usually not included as tokens) and within byte
// array literals.
fn verify_string(string: &str) -> Result<()> {
string
.chars()
.find(|c| !is_permitted_char(*c))
.map_or(Ok(()), |chr| {
.enumerate()
.find(|(idx, _)| !is_permitted_chars(string.as_bytes(), *idx))
.map_or(Ok(()), |(_, c)| {
bail!(
"Parser Error: invalid character {} found when reading file.\
Only ascii printable, tabs (\\t), and \\n line ending characters are permitted.",
chr
Only ascii printable, tabs (\\t), lf (\\n) and crlf (\\r\\n) characters are permitted.",
c
)
})
}
Expand Down Expand Up @@ -86,7 +87,8 @@ mod tests {
good_chars.push(0x09);

let mut bad_chars = (0x0..0x09).collect::<Vec<_>>();
bad_chars.append(&mut (0x0B..=0x1F).collect::<Vec<_>>());
bad_chars.append(&mut vec![0x0B, 0x0C]);
bad_chars.append(&mut (0x0E..=0x1F).collect::<Vec<_>>());
bad_chars.push(0x7F);

// Test to make sure that all the characters that are in the allowlist pass.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ impl<'input> Lexer<'input> {
let mut text = &self.text[self.cur_end..];
loop {
// Trim the only whitespace characters we recognize: newline, tab, and space.
text = text.trim_start_matches("\r\n");
text = text.trim_start_matches(|c: char| matches!(c, '\n' | '\t' | ' '));
// Trim the only comments we recognize: '// ... \n'.
if text.starts_with("//") {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
processed 6 tasks

task 0 'print-bytecode'. lines 1-6:
// Move bytecode v5
script {


main() {
B0:
0: Ret
}
}

task 1 'print-bytecode'. lines 8-14:
// Move bytecode v5
script {


main() {
B0:
0: Ret
}
}

task 2 'print-bytecode'. lines 16-20:
// Move bytecode v5
script {


main() {
B0:
0: Ret
}
}

task 3 'print-bytecode'. lines 22-27:
// Move bytecode v5
script {


main() {
B0:
0: Ret
}
}

task 4 'print-bytecode'. lines 29-36:
Error: ParserError: Invalid Token: invalid token kind for statement Slash

task 5 'print-bytecode'. lines 38-46:
// Move bytecode v5
script {


main() {
B0:
0: Ret
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
//# print-bytecode
main() {
label b0:
// return;
return;
}

//# print-bytecode
main() {
label b0:
// return;
// return;
return;
}

//# print-bytecode
main() {
label b0:
return; // return;
}

//# print-bytecode
main() {
label b0:
// return;
return; // return;
}

//# print-bytecode
// In Move, /* */ are block comment delimiters. Not so in Move IR, so the `/*` below
// cannot be parsed.
main() {
label b0:
return;
/* return; */
}

//# print-bytecode
// Since /* */ are not block comment delimiters, they do not behave in any unique way when
// they appear within comments.
main() {
label b0:
// /*
return;
// */
}

0 comments on commit 9764e02

Please sign in to comment.