From 879d212e537096ed40c9da8c6b5afbf05079cfd6 Mon Sep 17 00:00:00 2001 From: Daniel Parks Date: Sat, 27 May 2023 22:48:26 -0700 Subject: [PATCH 1/2] Add support for operating on byte strings This adds a `bytes` submodule for operating on byte strings that might contain invalid UTF-8. Where possible I have switched the functions that operate on `str` to use the `bytes` functions internally to avoid duplicating code and eliminate the potential for differing behavior between the two functions. It includes trivial tests that confirm that the `bytes` version of functions actually work on invalid UTF-8. Fixes #12 --- CHANGELOG.md | 4 + README.md | 5 +- src/bytes.rs | 267 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 29 ++---- 4 files changed, 284 insertions(+), 21 deletions(-) create mode 100644 src/bytes.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 50d2e6e..2a926db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# Next release + +* Adds `bytes` module to support operating directly on byte strings. + # 1.1.0 * Adds the `std` feature (enabled by default) diff --git a/README.md b/README.md index 7bd0c44..6400a6f 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell: This implementation also deviates from the Python version in not treating \r specially, which I believe is more compliant. -The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate -over the bytes directly as a micro-optimization. +This crate can be used on either normal Rust strings, or on byte strings with +the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so +internally they all work on bytes directly as a micro-optimization. Disabling the `std` feature (which is enabled by default) will allow the crate to work in `no_std` environments, where the `alloc` crate, and a global diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..5c03d81 --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,267 @@ +// Copyright 2015 Nicholas Allegra (comex). +// Licensed under the Apache License, Version 2.0 or +// the MIT license , at your option. This file may not be +// copied, modified, or distributed except according to those terms. + +//! [`Shlex`] and friends for byte strings. +//! +//! This may be more convenient if you are working with byte slices (`[u8]`) +//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): +//! +//! ```rust +//! #[cfg(unix)] { +//! use shlex::bytes::quote; +//! use std::ffi::OsStr; +//! use std::os::unix::ffi::OsStrExt; +//! +//! // `\x80` is invalid in UTF-8. +//! let os_str = OsStr::from_bytes(b"a\x80b c"); +//! assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]); +//! } +//! ``` +//! +//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.) + +extern crate alloc; +use alloc::vec::Vec; +use alloc::borrow::Cow; +#[cfg(test)] +use alloc::vec; +#[cfg(test)] +use alloc::borrow::ToOwned; + +/// An iterator that takes an input byte string and splits it into the words using the same syntax as +/// the POSIX shell. +pub struct Shlex<'a> { + in_iter: core::slice::Iter<'a, u8>, + /// The number of newlines read so far, plus one. + pub line_no: usize, + /// An input string is erroneous if it ends while inside a quotation or right after an + /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that + /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to + /// true; best to check it after you're done iterating. + pub had_error: bool, +} + +impl<'a> Shlex<'a> { + pub fn new(in_bytes: &'a [u8]) -> Self { + Shlex { + in_iter: in_bytes.iter(), + line_no: 1, + had_error: false, + } + } + + fn parse_word(&mut self, mut ch: u8) -> Option> { + let mut result: Vec = Vec::new(); + loop { + match ch as char { + '"' => if let Err(()) = self.parse_double(&mut result) { + self.had_error = true; + return None; + }, + '\'' => if let Err(()) = self.parse_single(&mut result) { + self.had_error = true; + return None; + }, + '\\' => if let Some(ch2) = self.next_char() { + if ch2 != '\n' as u8 { result.push(ch2); } + } else { + self.had_error = true; + return None; + }, + ' ' | '\t' | '\n' => { break; }, + _ => { result.push(ch as u8); }, + } + if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } + } + Some(result) + } + + fn parse_double(&mut self, result: &mut Vec) -> Result<(), ()> { + loop { + if let Some(ch2) = self.next_char() { + match ch2 as char { + '\\' => { + if let Some(ch3) = self.next_char() { + match ch3 as char { + // \$ => $ + '$' | '`' | '"' | '\\' => { result.push(ch3); }, + // \ => nothing + '\n' => {}, + // \x => =x + _ => { result.push('\\' as u8); result.push(ch3); } + } + } else { + return Err(()); + } + }, + '"' => { return Ok(()); }, + _ => { result.push(ch2); }, + } + } else { + return Err(()); + } + } + } + + fn parse_single(&mut self, result: &mut Vec) -> Result<(), ()> { + loop { + if let Some(ch2) = self.next_char() { + match ch2 as char { + '\'' => { return Ok(()); }, + _ => { result.push(ch2); }, + } + } else { + return Err(()); + } + } + } + + fn next_char(&mut self) -> Option { + let res = self.in_iter.next().copied(); + if res == Some(b'\n') { self.line_no += 1; } + res + } +} + +impl<'a> Iterator for Shlex<'a> { + type Item = Vec; + fn next(&mut self) -> Option { + if let Some(mut ch) = self.next_char() { + // skip initial whitespace + loop { + match ch as char { + ' ' | '\t' | '\n' => {}, + '#' => { + while let Some(ch2) = self.next_char() { + if ch2 as char == '\n' { break; } + } + }, + _ => { break; } + } + if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } + } + self.parse_word(ch) + } else { // no initial character + None + } + } + +} + +/// Convenience function that consumes the whole byte string at once. Returns None if the input was +/// erroneous. +pub fn split(in_bytes: &[u8]) -> Option>> { + let mut shl = Shlex::new(in_bytes); + let res = shl.by_ref().collect(); + if shl.had_error { None } else { Some(res) } +} + +/// Given a single word, return a byte string suitable to encode it as a shell argument. +/// +/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only +/// ever inserts valid ASCII characters before or after existing ASCII characters (or +/// returns two double quotes if the input was an empty string). It will never modify a +/// multibyte UTF-8 character. +pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> { + if in_bytes.len() == 0 { + b"\"\""[..].into() + } else if in_bytes.iter().any(|c| match *c as char { + '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | + '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true, + _ => false + }) { + let mut out: Vec = Vec::new(); + out.push(b'"'); + for &c in in_bytes { + match c { + b'$' | b'`' | b'"' | b'\\' => out.push(b'\\'), + _ => () + } + out.push(c); + } + out.push(b'"'); + out.into() + } else { + in_bytes.into() + } +} + +/// Convenience function that consumes an iterable of words and turns it into a single byte string, +/// quoting words when necessary. Consecutive words will be separated by a single space. +pub fn join<'a, I: core::iter::IntoIterator>(words: I) -> Vec { + words.into_iter() + .map(quote) + .collect::>() + .join(&b' ') +} + +#[cfg(test)] +const INVALID_UTF8: &[u8] = b"\xa1"; + +#[test] +fn test_invalid_utf8() { + // Check that our test string is actually invalid UTF-8. + assert!(core::str::from_utf8(INVALID_UTF8).is_err()); +} + +#[cfg(test)] +static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[ + (b"foo$baz", Some(&[b"foo$baz"])), + (b"foo baz", Some(&[b"foo", b"baz"])), + (b"foo\"bar\"baz", Some(&[b"foobarbaz"])), + (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])), + (b" foo \nbar", Some(&[b"foo", b"bar"])), + (b"foo\\\nbar", Some(&[b"foobar"])), + (b"\"foo\\\nbar\"", Some(&[b"foobar"])), + (b"'baz\\$b'", Some(&[b"baz\\$b"])), + (b"'baz\\\''", None), + (b"\\", None), + (b"\"\\", None), + (b"'\\", None), + (b"\"", None), + (b"'", None), + (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])), + (b"foo #bar", Some(&[b"foo"])), + (b"foo#bar", Some(&[b"foo#bar"])), + (b"foo\"#bar", None), + (b"'\\n'", Some(&[b"\\n"])), + (b"'\\\\n'", Some(&[b"\\\\n"])), + (INVALID_UTF8, Some(&[INVALID_UTF8])), +]; + +#[test] +fn test_split() { + for &(input, output) in SPLIT_TEST_ITEMS { + assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect())); + } +} + +#[test] +fn test_lineno() { + let mut sh = Shlex::new(b"\nfoo\nbar"); + while let Some(word) = sh.next() { + if word == b"bar" { + assert_eq!(sh.line_no, 3); + } + } +} + +#[test] +fn test_quote() { + assert_eq!(quote(b"foobar"), &b"foobar"[..]); + assert_eq!(quote(b"foo bar"), &b"\"foo bar\""[..]); + assert_eq!(quote(b"\""), &b"\"\\\"\""[..]); + assert_eq!(quote(b""), &b"\"\""[..]); + assert_eq!(quote(INVALID_UTF8), INVALID_UTF8); +} + +#[test] +fn test_join() { + assert_eq!(join(vec![]), &b""[..]); + assert_eq!(join(vec![&b""[..]]), &b"\"\""[..]); + assert_eq!(join(vec![&b"a"[..], &b"b"[..]]), &b"a b"[..]); + assert_eq!(join(vec![&b"foo bar"[..], &b"baz"[..]]), &b"\"foo bar\" baz"[..]); + assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8); +} diff --git a/src/lib.rs b/src/lib.rs index 31b54bd..e6bf432 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,6 +29,8 @@ use alloc::vec; #[cfg(test)] use alloc::borrow::ToOwned; +pub mod bytes; + /// An iterator that takes an input string and splits it into the words using the same syntax as /// the POSIX shell. pub struct Shlex<'a> { @@ -159,26 +161,15 @@ pub fn split(in_str: &str) -> Option> { /// Given a single word, return a string suitable to encode it as a shell argument. pub fn quote(in_str: &str) -> Cow { - if in_str.len() == 0 { - "\"\"".into() - } else if in_str.bytes().any(|c| match c as char { - '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' | - '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true, - _ => false - }) { - let mut out: Vec = Vec::new(); - out.push('"' as u8); - for c in in_str.bytes() { - match c as char { - '$' | '`' | '"' | '\\' => out.push('\\' as u8), - _ => () - } - out.push(c); + match bytes::quote(in_str.as_bytes()) { + Cow::Borrowed(out) => { + // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8. + unsafe { core::str::from_utf8_unchecked(out) }.into() + } + Cow::Owned(out) => { + // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8. + unsafe { String::from_utf8_unchecked(out) }.into() } - out.push('"' as u8); - unsafe { String::from_utf8_unchecked(out) }.into() - } else { - in_str.into() } } From 0c786d42a288298228ff448e81991bd1d7289798 Mon Sep 17 00:00:00 2001 From: Daniel Parks Date: Mon, 29 May 2023 09:26:18 -0700 Subject: [PATCH 2/2] Implement Shlex with bytes::Shlex. --- src/bytes.rs | 5 +- src/lib.rs | 129 +++++++++------------------------------------------ 2 files changed, 26 insertions(+), 108 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 5c03d81..e3306f5 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -5,8 +5,9 @@ //! [`Shlex`] and friends for byte strings. //! -//! This may be more convenient if you are working with byte slices (`[u8]`) -//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): +//! This is used internally by the [outer module](crate), and may be more +//! convenient if you are working with byte slices (`[u8]`) or types that are +//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): //! //! ```rust //! #[cfg(unix)] { diff --git a/src/lib.rs b/src/lib.rs index e6bf432..444c1fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,9 @@ //! This implementation also deviates from the Python version in not treating `\r` specially, which //! I believe is more compliant. //! -//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes -//! directly as a micro-optimization. +//! This is a string-friendly wrapper around the [bytes] module that works on the underlying byte +//! slices. The algorithms in this crate are oblivious to UTF-8 high bytes, so working directly +//! with bytes is a safe micro-optimization. //! //! Disabling the `std` feature (which is enabled by default) will allow the crate to work in //! `no_std` environments, where the `alloc` crate, and a global allocator, are available. @@ -33,122 +34,38 @@ pub mod bytes; /// An iterator that takes an input string and splits it into the words using the same syntax as /// the POSIX shell. -pub struct Shlex<'a> { - in_iter: core::str::Bytes<'a>, - /// The number of newlines read so far, plus one. - pub line_no: usize, - /// An input string is erroneous if it ends while inside a quotation or right after an - /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that - /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to - /// true; best to check it after you're done iterating. - pub had_error: bool, -} +/// +/// See [`bytes::Shlex`]. +pub struct Shlex<'a>(bytes::Shlex<'a>); impl<'a> Shlex<'a> { pub fn new(in_str: &'a str) -> Self { - Shlex { - in_iter: in_str.bytes(), - line_no: 1, - had_error: false, - } - } - - fn parse_word(&mut self, mut ch: u8) -> Option { - let mut result: Vec = Vec::new(); - loop { - match ch as char { - '"' => if let Err(()) = self.parse_double(&mut result) { - self.had_error = true; - return None; - }, - '\'' => if let Err(()) = self.parse_single(&mut result) { - self.had_error = true; - return None; - }, - '\\' => if let Some(ch2) = self.next_char() { - if ch2 != '\n' as u8 { result.push(ch2); } - } else { - self.had_error = true; - return None; - }, - ' ' | '\t' | '\n' => { break; }, - _ => { result.push(ch as u8); }, - } - if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } - } - unsafe { Some(String::from_utf8_unchecked(result)) } + Self(bytes::Shlex::new(in_str.as_bytes())) } +} - fn parse_double(&mut self, result: &mut Vec) -> Result<(), ()> { - loop { - if let Some(ch2) = self.next_char() { - match ch2 as char { - '\\' => { - if let Some(ch3) = self.next_char() { - match ch3 as char { - // \$ => $ - '$' | '`' | '"' | '\\' => { result.push(ch3); }, - // \ => nothing - '\n' => {}, - // \x => =x - _ => { result.push('\\' as u8); result.push(ch3); } - } - } else { - return Err(()); - } - }, - '"' => { return Ok(()); }, - _ => { result.push(ch2); }, - } - } else { - return Err(()); - } - } +impl<'a> Iterator for Shlex<'a> { + type Item = String; + fn next(&mut self) -> Option { + self.0.next().map(|byte_word| { + // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8. + unsafe { String::from_utf8_unchecked(byte_word) } + }) } +} - fn parse_single(&mut self, result: &mut Vec) -> Result<(), ()> { - loop { - if let Some(ch2) = self.next_char() { - match ch2 as char { - '\'' => { return Ok(()); }, - _ => { result.push(ch2); }, - } - } else { - return Err(()); - } - } - } +impl<'a> core::ops::Deref for Shlex<'a> { + type Target = bytes::Shlex<'a>; - fn next_char(&mut self) -> Option { - let res = self.in_iter.next(); - if res == Some('\n' as u8) { self.line_no += 1; } - res + fn deref(&self) -> &Self::Target { + &self.0 } } -impl<'a> Iterator for Shlex<'a> { - type Item = String; - fn next(&mut self) -> Option { - if let Some(mut ch) = self.next_char() { - // skip initial whitespace - loop { - match ch as char { - ' ' | '\t' | '\n' => {}, - '#' => { - while let Some(ch2) = self.next_char() { - if ch2 as char == '\n' { break; } - } - }, - _ => { break; } - } - if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } - } - self.parse_word(ch) - } else { // no initial character - None - } +impl<'a> core::ops::DerefMut for Shlex<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 } - } /// Convenience function that consumes the whole string at once. Returns None if the input was