diff options
Diffstat (limited to 'library/std/src')
| -rw-r--r-- | library/std/src/lib.rs | 2 | ||||
| -rw-r--r-- | library/std/src/os/unix/fs.rs | 3 | ||||
| -rw-r--r-- | library/std/src/primitive_docs.rs | 18 | ||||
| -rw-r--r-- | library/std/src/sys/unix/fs.rs | 5 | ||||
| -rw-r--r-- | library/std/src/sys/windows/args.rs | 257 | ||||
| -rw-r--r-- | library/std/src/sys/windows/args/tests.rs | 64 | ||||
| -rw-r--r-- | library/std/src/sys/windows/c.rs | 2 | ||||
| -rw-r--r-- | library/std/src/sys/windows/stdio.rs | 104 |
8 files changed, 306 insertions, 149 deletions
diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 028a066b5a1..3a1eb625b57 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -253,6 +253,7 @@ #![feature(const_ip)] #![feature(const_ipv4)] #![feature(const_ipv6)] +#![feature(const_option)] #![feature(const_raw_ptr_deref)] #![feature(const_socketaddr)] #![feature(const_trait_impl)] @@ -331,7 +332,6 @@ #![feature(try_reserve)] #![feature(try_reserve_kind)] #![feature(unboxed_closures)] -#![feature(unsafe_cell_raw_get)] #![feature(unwrap_infallible)] #![feature(vec_into_raw_parts)] #![feature(vec_spare_capacity)] diff --git a/library/std/src/os/unix/fs.rs b/library/std/src/os/unix/fs.rs index e4ce788f741..6cf37f23c57 100644 --- a/library/std/src/os/unix/fs.rs +++ b/library/std/src/os/unix/fs.rs @@ -934,7 +934,6 @@ impl DirBuilderExt for fs::DirBuilder { /// # Examples /// /// ```no_run -/// #![feature(unix_chroot)] /// use std::os::unix::fs; /// /// fn main() -> std::io::Result<()> { @@ -944,7 +943,7 @@ impl DirBuilderExt for fs::DirBuilder { /// Ok(()) /// } /// ``` -#[unstable(feature = "unix_chroot", issue = "84715")] +#[stable(feature = "unix_chroot", since = "1.56.0")] #[cfg(not(any(target_os = "fuchsia", target_os = "vxworks")))] pub fn chroot<P: AsRef<Path>>(dir: P) -> io::Result<()> { sys::fs::chroot(dir.as_ref()) diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs index dc4572cd936..261d0e648e2 100644 --- a/library/std/src/primitive_docs.rs +++ b/library/std/src/primitive_docs.rs @@ -581,6 +581,8 @@ mod prim_pointer {} /// might be made consistent to the behavior of later editions. 
/// /// ```rust,edition2018 +/// // Rust 2015 and 2018: +/// /// # #![allow(array_into_iter)] // override our `deny(warnings)` /// let array: [i32; 3] = [0; 3]; /// @@ -604,11 +606,13 @@ mod prim_pointer {} /// } /// ``` /// -/// Starting in the 2021 edition, `array.into_iter()` will use `IntoIterator` normally to iterate +/// Starting in the 2021 edition, `array.into_iter()` uses `IntoIterator` normally to iterate /// by value, and `iter()` should be used to iterate by reference like previous editions. /// -/// ```rust,edition2021,ignore -/// # // FIXME: ignored because 2021 testing is still unstable +#[cfg_attr(bootstrap, doc = "```rust,edition2021,ignore")] +#[cfg_attr(not(bootstrap), doc = "```rust,edition2021")] +/// // Rust 2021: +/// /// let array: [i32; 3] = [0; 3]; /// /// // This iterates by reference: @@ -631,12 +635,12 @@ mod prim_pointer {} /// avoid the `into_iter` syntax on those editions. If an edition update is not /// viable/desired, there are multiple alternatives: /// * use `iter`, equivalent to the old behavior, creating references -/// * use [`array::IntoIter`], equivalent to the post-2021 behavior (Rust 1.51+) +/// * use [`IntoIterator::into_iter`], equivalent to the post-2021 behavior (Rust 1.53+) /// * replace `for ... in array.into_iter() {` with `for ... in array {`, /// equivalent to the post-2021 behavior (Rust 1.53+) /// /// ```rust,edition2018 -/// use std::array::IntoIter; +/// // Rust 2015 and 2018: /// /// let array: [i32; 3] = [0; 3]; /// @@ -647,7 +651,7 @@ mod prim_pointer {} /// } /// /// // This iterates by value: -/// for item in IntoIter::new(array) { +/// for item in IntoIterator::into_iter(array) { /// let x: i32 = item; /// println!("{}", x); /// } @@ -660,7 +664,7 @@ mod prim_pointer {} /// /// // IntoIter can also start a chain. 
/// // This iterates by value: -/// for item in IntoIter::new(array).enumerate() { +/// for item in IntoIterator::into_iter(array).enumerate() { /// let (i, x): (usize, i32) = item; /// println!("array[{}] = {}", i, x); /// } diff --git a/library/std/src/sys/unix/fs.rs b/library/std/src/sys/unix/fs.rs index 6075eb5c7c5..6d7524a733a 100644 --- a/library/std/src/sys/unix/fs.rs +++ b/library/std/src/sys/unix/fs.rs @@ -506,7 +506,8 @@ impl Iterator for ReadDir { let mut ret = DirEntry { entry: mem::zeroed(), dir: Arc::clone(&self.inner) }; let mut entry_ptr = ptr::null_mut(); loop { - if readdir64_r(self.inner.dirp.0, &mut ret.entry, &mut entry_ptr) != 0 { + let err = readdir64_r(self.inner.dirp.0, &mut ret.entry, &mut entry_ptr); + if err != 0 { if entry_ptr.is_null() { // We encountered an error (which will be returned in this iteration), but // we also reached the end of the directory stream. The `end_of_stream` @@ -514,7 +515,7 @@ impl Iterator for ReadDir { // (instead of looping forever) self.end_of_stream = true; } - return Some(Err(Error::last_os_error())); + return Some(Err(Error::from_raw_os_error(err))); } if entry_ptr.is_null() { return None; diff --git a/library/std/src/sys/windows/args.rs b/library/std/src/sys/windows/args.rs index f1264130faf..3919025b080 100644 --- a/library/std/src/sys/windows/args.rs +++ b/library/std/src/sys/windows/args.rs @@ -1,13 +1,18 @@ -#![allow(dead_code)] // runtime init functions not used during testing +//! The Windows command line is just a string +//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string> +//! +//! This module implements the parsing necessary to turn that string into a list of arguments. 
#[cfg(test)] mod tests; use crate::ffi::OsString; use crate::fmt; +use crate::marker::PhantomData; +use crate::num::NonZeroU16; use crate::os::windows::prelude::*; use crate::path::PathBuf; -use crate::slice; +use crate::ptr::NonNull; use crate::sys::c; use crate::sys::windows::os::current_exe; use crate::vec; @@ -15,9 +20,11 @@ use crate::vec; use core::iter; pub fn args() -> Args { + // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16 + // string so it's safe for `WStrUnits` to use. unsafe { let lp_cmd_line = c::GetCommandLineW(); - let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || { + let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || { current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new()) }); @@ -28,129 +35,120 @@ pub fn args() -> Args { /// Implements the Windows command-line argument parsing algorithm. /// /// Microsoft's documentation for the Windows CLI argument format can be found at -/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>. +/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments> /// -/// Windows includes a function to do this in shell32.dll, -/// but linking with that DLL causes the process to be registered as a GUI application. +/// A more in-depth explanation is here: +/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN> +/// +/// Windows includes a function to do command line parsing in shell32.dll. +/// However, this is not used for two reasons: +/// +/// 1. Linking with that DLL causes the process to be registered as a GUI application. /// GUI applications add a bunch of overhead, even if no windows are drawn. See /// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>. 
/// -/// This function was tested for equivalence to the shell32.dll implementation in -/// Windows 10 Pro v1803, using an exhaustive test suite available at -/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or -/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>. -unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>( - lp_cmd_line: *const u16, +/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above. +/// +/// This function was tested for equivalence to the C/C++ parsing rules using an +/// extensive test suite available at +/// <https://github.com/ChrisDenton/winarg/tree/std>. +fn parse_lp_cmd_line<'a, F: Fn() -> OsString>( + lp_cmd_line: Option<WStrUnits<'a>>, exe_name: F, ) -> Vec<OsString> { - const BACKSLASH: u16 = '\\' as u16; - const QUOTE: u16 = '"' as u16; - const TAB: u16 = '\t' as u16; - const SPACE: u16 = ' ' as u16; + const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap(); + const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap(); + const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap(); + const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap(); + let mut ret_val = Vec::new(); - if lp_cmd_line.is_null() || *lp_cmd_line == 0 { + // If the cmd line pointer is null or it points to an empty string then + // return the name of the executable as argv[0]. + if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() { ret_val.push(exe_name()); return ret_val; } - let mut cmd_line = { - let mut end = 0; - while *lp_cmd_line.offset(end) != 0 { - end += 1; - } - slice::from_raw_parts(lp_cmd_line, end as usize) - }; + let mut code_units = lp_cmd_line.unwrap(); + // The executable name at the beginning is special. - cmd_line = match cmd_line[0] { - // The executable name ends at the next quote mark, - // no matter what. 
- QUOTE => { - let args = { - let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE); - if let Some(exe) = cut.next() { - ret_val.push(OsString::from_wide(exe)); - } - cut.next() - }; - if let Some(args) = args { - args - } else { - return ret_val; - } - } - // Implement quirk: when they say whitespace here, - // they include the entire ASCII control plane: - // "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW - // will consider the first argument to be an empty string. Excess whitespace at the - // end of lpCmdLine is ignored." - 0..=SPACE => { - ret_val.push(OsString::new()); - &cmd_line[1..] - } - // The executable name ends at the next whitespace, - // no matter what. - _ => { - let args = { - let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE); - if let Some(exe) = cut.next() { - ret_val.push(OsString::from_wide(exe)); - } - cut.next() - }; - if let Some(args) = args { - args - } else { - return ret_val; - } + let mut in_quotes = false; + let mut cur = Vec::new(); + for w in &mut code_units { + match w { + // A quote mark always toggles `in_quotes` no matter what because + // there are no escape characters when parsing the executable name. + QUOTE => in_quotes = !in_quotes, + // If not `in_quotes` then whitespace ends argv[0]. + SPACE | TAB if !in_quotes => break, + // In all other cases the code unit is taken literally. + _ => cur.push(w.get()), } - }; + } + // Skip whitespace. + code_units.advance_while(|w| w == SPACE || w == TAB); + ret_val.push(OsString::from_wide(&cur)); + + // Parse the arguments according to these rules: + // * All code units are taken literally except space, tab, quote and backslash. + // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are + // treated as a single separator. + // * A space or tab `in_quotes` is taken literally. + // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally. 
+ // * A quote can be escaped if preceded by an odd number of backslashes. + // * If any number of backslashes is immediately followed by a quote then the number of + // backslashes is halved (rounding down). + // * Backslashes not followed by a quote are all taken literally. + // * If `in_quotes` then a quote can also be escaped using another quote + // (i.e. two consecutive quotes become one literal quote). let mut cur = Vec::new(); let mut in_quotes = false; - let mut was_in_quotes = false; - let mut backslash_count: usize = 0; - for &c in cmd_line { - match c { - // backslash - BACKSLASH => { - backslash_count += 1; - was_in_quotes = false; + while let Some(w) = code_units.next() { + match w { + // If not `in_quotes`, a space or tab ends the argument. + SPACE | TAB if !in_quotes => { + ret_val.push(OsString::from_wide(&cur[..])); + cur.truncate(0); + + // Skip whitespace. + code_units.advance_while(|w| w == SPACE || w == TAB); } - QUOTE if backslash_count % 2 == 0 => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2)); - backslash_count = 0; - if was_in_quotes { - cur.push('"' as u16); - was_in_quotes = false; + // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote. + BACKSLASH => { + let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1; + if code_units.peek() == Some(QUOTE) { + cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2)); + // The quote is escaped if there are an odd number of backslashes. + if backslash_count % 2 == 1 { + code_units.next(); + cur.push(QUOTE.get()); + } } else { - was_in_quotes = in_quotes; - in_quotes = !in_quotes; + // If there is no quote on the end then there is no escaping. 
+ cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count)); } } - QUOTE if backslash_count % 2 != 0 => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2)); - backslash_count = 0; - was_in_quotes = false; - cur.push(b'"' as u16); - } - SPACE | TAB if !in_quotes => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - if !cur.is_empty() || was_in_quotes { - ret_val.push(OsString::from_wide(&cur[..])); - cur.truncate(0); + // If `in_quotes` and not backslash escaped (see above) then a quote either + // unsets `in_quotes` or is escaped by another quote. + QUOTE if in_quotes => match code_units.peek() { + // Two consecutive quotes when `in_quotes` produces one literal quote. + Some(QUOTE) => { + cur.push(QUOTE.get()); + code_units.next(); } - backslash_count = 0; - was_in_quotes = false; - } - _ => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - backslash_count = 0; - was_in_quotes = false; - cur.push(c); - } + // Otherwise unset `in_quotes`. + Some(_) => in_quotes = false, + // The end of the command line. + // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set. + None => break, + }, + // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quotes`. + QUOTE => in_quotes = true, + // Everything else is always taken literally. + _ => cur.push(w.get()), } } - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - // include empty quoted strings at the end of the arguments list - if !cur.is_empty() || was_in_quotes || in_quotes { + // Push the final argument, if any. + if !cur.is_empty() || in_quotes { ret_val.push(OsString::from_wide(&cur[..])); } ret_val @@ -187,3 +185,52 @@ impl ExactSizeIterator for Args { self.parsed_args_list.len() } } + +/// A safe iterator over a LPWSTR +/// (aka a pointer to a series of UTF-16 code units terminated by a NULL). +struct WStrUnits<'a> { + // The pointer must never be null... 
+ lpwstr: NonNull<u16>, + // ...and the memory it points to must be valid for this lifetime. + lifetime: PhantomData<&'a [u16]>, +} +impl WStrUnits<'_> { + /// Create the iterator. Returns `None` if `lpwstr` is null. + /// + /// SAFETY: `lpwstr` must point to a null-terminated wide string that lives + /// at least as long as the lifetime of this struct. + unsafe fn new(lpwstr: *const u16) -> Option<Self> { + Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData }) + } + fn peek(&self) -> Option<NonZeroU16> { + // SAFETY: It's always safe to read the current item because we don't + // ever move out of the array's bounds. + unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) } + } + /// Advance the iterator while `predicate` returns true. + /// Returns the number of items it advanced by. + fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize { + let mut counter = 0; + while let Some(w) = self.peek() { + if !predicate(w) { + break; + } + counter += 1; + self.next(); + } + counter + } +} +impl Iterator for WStrUnits<'_> { + // This can never return zero as that marks the end of the string. + type Item = NonZeroU16; + fn next(&mut self) -> Option<NonZeroU16> { + // SAFETY: If NULL is reached we immediately return. + // Therefore it's safe to advance the pointer after that. 
+ unsafe { + let next = self.peek()?; + self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1)); + Some(next) + } + } +} diff --git a/library/std/src/sys/windows/args/tests.rs b/library/std/src/sys/windows/args/tests.rs index 756a4361ea3..82c32d08c5e 100644 --- a/library/std/src/sys/windows/args/tests.rs +++ b/library/std/src/sys/windows/args/tests.rs @@ -5,9 +5,9 @@ fn chk(string: &str, parts: &[&str]) { let mut wide: Vec<u16> = OsString::from(string).encode_wide().collect(); wide.push(0); let parsed = - unsafe { parse_lp_cmd_line(wide.as_ptr() as *const u16, || OsString::from("TEST.EXE")) }; + unsafe { parse_lp_cmd_line(WStrUnits::new(wide.as_ptr()), || OsString::from("TEST.EXE")) }; let expected: Vec<OsString> = parts.iter().map(|k| OsString::from(k)).collect(); - assert_eq!(parsed.as_slice(), expected.as_slice()); + assert_eq!(parsed.as_slice(), expected.as_slice(), "{:?}", string); } #[test] @@ -27,35 +27,65 @@ fn single_words() { #[test] fn official_examples() { chk(r#"EXE "abc" d e"#, &["EXE", "abc", "d", "e"]); - chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r#"a\\\b"#, "de fg", "h"]); + chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", "h"]); chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]); - chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r#"a\\b c"#, "d", "e"]); + chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]); } #[test] fn whitespace_behavior() { - chk(r#" test"#, &["", "test"]); - chk(r#" test"#, &["", "test"]); - chk(r#" test test2"#, &["", "test", "test2"]); - chk(r#" test test2"#, &["", "test", "test2"]); - chk(r#"test test2 "#, &["test", "test2"]); - chk(r#"test test2 "#, &["test", "test2"]); - chk(r#"test "#, &["test"]); + chk(" test", &["", "test"]); + chk(" test", &["", "test"]); + chk(" test test2", &["", "test", "test2"]); + chk(" test test2", &["", "test", "test2"]); + chk("test test2 ", &["test", "test2"]); + chk("test test2 ", &["test", "test2"]); + chk("test ", &["test"]); } #[test] fn genius_quotes() { chk(r#"EXE "" 
"""#, &["EXE", "", ""]); - chk(r#"EXE "" """"#, &["EXE", "", "\""]); + chk(r#"EXE "" """"#, &["EXE", "", r#"""#]); chk( r#"EXE "this is """all""" in the same argument""#, - &["EXE", "this is \"all\" in the same argument"], + &["EXE", r#"this is "all" in the same argument"#], ); - chk(r#"EXE "a"""#, &["EXE", "a\""]); - chk(r#"EXE "a"" a"#, &["EXE", "a\"", "a"]); + chk(r#"EXE "a"""#, &["EXE", r#"a""#]); + chk(r#"EXE "a"" a"#, &["EXE", r#"a" a"#]); // quotes cannot be escaped in command names chk(r#""EXE" check"#, &["EXE", "check"]); chk(r#""EXE check""#, &["EXE check"]); - chk(r#""EXE """for""" check"#, &["EXE ", r#"for""#, "check"]); - chk(r#""EXE \"for\" check"#, &[r#"EXE \"#, r#"for""#, "check"]); + chk(r#""EXE """for""" check"#, &["EXE for check"]); + chk(r#""EXE \"for\" check"#, &[r"EXE \for\ check"]); + chk(r#""EXE \" for \" check"#, &[r"EXE \", "for", r#"""#, "check"]); + chk(r#"E"X"E test"#, &["EXE", "test"]); + chk(r#"EX""E test"#, &["EXE", "test"]); +} + +// from https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX +#[test] +fn post_2008() { + chk("EXE CallMeIshmael", &["EXE", "CallMeIshmael"]); + chk(r#"EXE "Call Me Ishmael""#, &["EXE", "Call Me Ishmael"]); + chk(r#"EXE Cal"l Me I"shmael"#, &["EXE", "Call Me Ishmael"]); + chk(r#"EXE CallMe\"Ishmael"#, &["EXE", r#"CallMe"Ishmael"#]); + chk(r#"EXE "CallMe\"Ishmael""#, &["EXE", r#"CallMe"Ishmael"#]); + chk(r#"EXE "Call Me Ishmael\\""#, &["EXE", r"Call Me Ishmael\"]); + chk(r#"EXE "CallMe\\\"Ishmael""#, &["EXE", r#"CallMe\"Ishmael"#]); + chk(r#"EXE a\\\b"#, &["EXE", r"a\\\b"]); + chk(r#"EXE "a\\\b""#, &["EXE", r"a\\\b"]); + chk(r#"EXE "\"Call Me Ishmael\"""#, &["EXE", r#""Call Me Ishmael""#]); + chk(r#"EXE "C:\TEST A\\""#, &["EXE", r"C:\TEST A\"]); + chk(r#"EXE "\"C:\TEST A\\\"""#, &["EXE", r#""C:\TEST A\""#]); + chk(r#"EXE "a b c" d e"#, &["EXE", "a b c", "d", "e"]); + chk(r#"EXE "ab\"c" "\\" d"#, &["EXE", r#"ab"c"#, r"\", "d"]); + chk(r#"EXE a\\\b d"e f"g h"#, &["EXE", r"a\\\b", "de fg", 
"h"]); + chk(r#"EXE a\\\"b c d"#, &["EXE", r#"a\"b"#, "c", "d"]); + chk(r#"EXE a\\\\"b c" d e"#, &["EXE", r"a\\b c", "d", "e"]); + // Double Double Quotes + chk(r#"EXE "a b c"""#, &["EXE", r#"a b c""#]); + chk(r#"EXE """CallMeIshmael""" b c"#, &["EXE", r#""CallMeIshmael""#, "b", "c"]); + chk(r#"EXE """Call Me Ishmael""""#, &["EXE", r#""Call Me Ishmael""#]); + chk(r#"EXE """"Call Me Ishmael"" b c"#, &["EXE", r#""Call"#, "Me", "Ishmael", "b", "c"]); } diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs index cedf389fbf5..6fb850d1828 100644 --- a/library/std/src/sys/windows/c.rs +++ b/library/std/src/sys/windows/c.rs @@ -789,7 +789,7 @@ extern "system" { pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL; pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL; pub fn SetLastError(dwErrCode: DWORD); - pub fn GetCommandLineW() -> *mut LPCWSTR; + pub fn GetCommandLineW() -> LPWSTR; pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD; pub fn GetCurrentProcess() -> HANDLE; pub fn GetCurrentThread() -> HANDLE; diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs index 6f2618c63b5..2719a530dfd 100644 --- a/library/std/src/sys/windows/stdio.rs +++ b/library/std/src/sys/windows/stdio.rs @@ -9,14 +9,25 @@ use crate::str; use crate::sys::c; use crate::sys::cvt; use crate::sys::handle::Handle; +use core::str::utf8_char_width; // Don't cache handles but get them fresh for every read/write. This allows us to track changes to // the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490. 
pub struct Stdin { surrogate: u16, } -pub struct Stdout; -pub struct Stderr; +pub struct Stdout { + incomplete_utf8: IncompleteUtf8, +} + +pub struct Stderr { + incomplete_utf8: IncompleteUtf8, +} + +struct IncompleteUtf8 { + bytes: [u8; 4], + len: u8, +} // Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see // #13304 for details). @@ -51,7 +62,15 @@ fn is_console(handle: c::HANDLE) -> bool { unsafe { c::GetConsoleMode(handle, &mut mode) != 0 } } -fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> { +fn write( + handle_id: c::DWORD, + data: &[u8], + incomplete_utf8: &mut IncompleteUtf8, +) -> io::Result<usize> { + if data.is_empty() { + return Ok(0); + } + let handle = get_handle(handle_id)?; if !is_console(handle) { unsafe { @@ -62,22 +81,73 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> { } } - // As the console is meant for presenting text, we assume bytes of `data` come from a string - // and are encoded as UTF-8, which needs to be encoded as UTF-16. + if incomplete_utf8.len > 0 { + assert!( + incomplete_utf8.len < 4, + "Unexpected number of bytes for incomplete UTF-8 codepoint." 
+ ); + if data[0] >> 6 != 0b10 { + // not a continuation byte - reject + incomplete_utf8.len = 0; + return Err(io::Error::new_const( + io::ErrorKind::InvalidData, + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; + incomplete_utf8.len += 1; + let char_width = utf8_char_width(incomplete_utf8.bytes[0]); + if (incomplete_utf8.len as usize) < char_width { + // more bytes needed + return Ok(1); + } + let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]); + incomplete_utf8.len = 0; + match s { + Ok(s) => { + assert_eq!(char_width, s.len()); + let written = write_valid_utf8_to_console(handle, s)?; + assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes + return Ok(1); + } + Err(_) => { + return Err(io::Error::new_const( + io::ErrorKind::InvalidData, + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + } + } + + // As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8, + // which needs to be encoded as UTF-16. // // If the data is not valid UTF-8 we write out as many bytes as are valid. - // Only when there are no valid bytes (which will happen on the next call), return an error. + // If the first byte is invalid it is either the first byte of a multi-byte sequence but the + // provided byte slice is too short or it is the first byte of an invalid multi-byte sequence. 
let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2); let utf8 = match str::from_utf8(&data[..len]) { Ok(s) => s, Err(ref e) if e.valid_up_to() == 0 => { - return Err(io::Error::new_const( - io::ErrorKind::InvalidData, - &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", - )); + let first_byte_char_width = utf8_char_width(data[0]); + if first_byte_char_width > 1 && data.len() < first_byte_char_width { + incomplete_utf8.bytes[0] = data[0]; + incomplete_utf8.len = 1; + return Ok(1); + } else { + return Err(io::Error::new_const( + io::ErrorKind::InvalidData, + &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } } Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(), }; + + write_valid_utf8_to_console(handle, utf8) +} + +fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> { let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2]; let mut len_utf16 = 0; for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) { @@ -259,15 +329,21 @@ fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> { Ok(written) } +impl IncompleteUtf8 { + pub const fn new() -> IncompleteUtf8 { + IncompleteUtf8 { bytes: [0; 4], len: 0 } + } +} + impl Stdout { pub const fn new() -> Stdout { - Stdout + Stdout { incomplete_utf8: IncompleteUtf8::new() } } } impl io::Write for Stdout { fn write(&mut self, buf: &[u8]) -> io::Result<usize> { - write(c::STD_OUTPUT_HANDLE, buf) + write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8) } fn flush(&mut self) -> io::Result<()> { @@ -277,13 +353,13 @@ impl io::Write for Stdout { impl Stderr { pub const fn new() -> Stderr { - Stderr + Stderr { incomplete_utf8: IncompleteUtf8::new() } } } impl io::Write for Stderr { fn write(&mut self, buf: &[u8]) -> io::Result<usize> { - write(c::STD_ERROR_HANDLE, buf) + write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8) } fn flush(&mut self) -> io::Result<()> { |
