diff options
| author | Chris Denton <christophersdenton@gmail.com> | 2021-08-06 00:15:22 +0100 |
|---|---|---|
| committer | Chris Denton <christophersdenton@gmail.com> | 2021-08-08 22:11:30 +0100 |
| commit | e26dda564219341e25589ff745b16258ad424b78 (patch) | |
| tree | 076e9b4fc6f7e80f5b0f72c46921c06f502cf350 | |
| parent | 565a51973a4f64f522e66c8af87bad339c966f31 (diff) | |
| download | rust-e26dda564219341e25589ff745b16258ad424b78.tar.gz rust-e26dda564219341e25589ff745b16258ad424b78.zip | |
Implement modern Windows arg parsing
As derived from extensive testing of `argv` in a C/C++ application. Co-Authored-By: Jane Lusby <jlusby42@gmail.com>
| -rw-r--r-- | library/std/src/lib.rs | 1 | ||||
| -rw-r--r-- | library/std/src/sys/windows/args.rs | 257 | ||||
| -rw-r--r-- | library/std/src/sys/windows/c.rs | 2 |
3 files changed, 154 insertions, 106 deletions
diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index 1af6157ca68..4a9405905dc 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -253,6 +253,7 @@ #![feature(const_ip)] #![feature(const_ipv4)] #![feature(const_ipv6)] +#![feature(const_option)] #![feature(const_raw_ptr_deref)] #![feature(const_socketaddr)] #![feature(container_error_extra)] diff --git a/library/std/src/sys/windows/args.rs b/library/std/src/sys/windows/args.rs index f1264130faf..3919025b080 100644 --- a/library/std/src/sys/windows/args.rs +++ b/library/std/src/sys/windows/args.rs @@ -1,13 +1,18 @@ -#![allow(dead_code)] // runtime init functions not used during testing +//! The Windows command line is just a string +//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string> +//! +//! This module implements the parsing necessary to turn that string into a list of arguments. #[cfg(test)] mod tests; use crate::ffi::OsString; use crate::fmt; +use crate::marker::PhantomData; +use crate::num::NonZeroU16; use crate::os::windows::prelude::*; use crate::path::PathBuf; -use crate::slice; +use crate::ptr::NonNull; use crate::sys::c; use crate::sys::windows::os::current_exe; use crate::vec; @@ -15,9 +20,11 @@ use crate::vec; use core::iter; pub fn args() -> Args { + // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16 + // string so it's safe for `WStrUnits` to use. unsafe { let lp_cmd_line = c::GetCommandLineW(); - let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || { + let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || { current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new()) }); @@ -28,129 +35,120 @@ pub fn args() -> Args { /// Implements the Windows command-line argument parsing algorithm. /// /// Microsoft's documentation for the Windows CLI argument format can be found at -/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>. +/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments> /// -/// Windows includes a function to do this in shell32.dll, -/// but linking with that DLL causes the process to be registered as a GUI application. +/// A more in-depth explanation is here: +/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN> +/// +/// Windows includes a function to do command line parsing in shell32.dll. +/// However, this is not used for two reasons: +/// +/// 1. Linking with that DLL causes the process to be registered as a GUI application. /// GUI applications add a bunch of overhead, even if no windows are drawn. See /// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>. /// -/// This function was tested for equivalence to the shell32.dll implementation in -/// Windows 10 Pro v1803, using an exhaustive test suite available at -/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or -/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>. -unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>( - lp_cmd_line: *const u16, +/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above. +/// +/// This function was tested for equivalence to the C/C++ parsing rules using an +/// extensive test suite available at +/// <https://github.com/ChrisDenton/winarg/tree/std>. +fn parse_lp_cmd_line<'a, F: Fn() -> OsString>( + lp_cmd_line: Option<WStrUnits<'a>>, exe_name: F, ) -> Vec<OsString> { - const BACKSLASH: u16 = '\\' as u16; - const QUOTE: u16 = '"' as u16; - const TAB: u16 = '\t' as u16; - const SPACE: u16 = ' ' as u16; + const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap(); + const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap(); + const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap(); + const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap(); + let mut ret_val = Vec::new(); - if lp_cmd_line.is_null() || *lp_cmd_line == 0 { + // If the cmd line pointer is null or it points to an empty string then + // return the name of the executable as argv[0]. + if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() { ret_val.push(exe_name()); return ret_val; } - let mut cmd_line = { - let mut end = 0; - while *lp_cmd_line.offset(end) != 0 { - end += 1; - } - slice::from_raw_parts(lp_cmd_line, end as usize) - }; + let mut code_units = lp_cmd_line.unwrap(); + // The executable name at the beginning is special. - cmd_line = match cmd_line[0] { - // The executable name ends at the next quote mark, - // no matter what. - QUOTE => { - let args = { - let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE); - if let Some(exe) = cut.next() { - ret_val.push(OsString::from_wide(exe)); - } - cut.next() - }; - if let Some(args) = args { - args - } else { - return ret_val; - } - } - // Implement quirk: when they say whitespace here, - // they include the entire ASCII control plane: - // "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW - // will consider the first argument to be an empty string. Excess whitespace at the - // end of lpCmdLine is ignored." - 0..=SPACE => { - ret_val.push(OsString::new()); - &cmd_line[1..] - } - // The executable name ends at the next whitespace, - // no matter what. - _ => { - let args = { - let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE); - if let Some(exe) = cut.next() { - ret_val.push(OsString::from_wide(exe)); - } - cut.next() - }; - if let Some(args) = args { - args - } else { - return ret_val; - } + let mut in_quotes = false; + let mut cur = Vec::new(); + for w in &mut code_units { + match w { + // A quote mark always toggles `in_quotes` no matter what because + // there are no escape characters when parsing the executable name. + QUOTE => in_quotes = !in_quotes, + // If not `in_quotes` then whitespace ends argv[0]. + SPACE | TAB if !in_quotes => break, + // In all other cases the code unit is taken literally. + _ => cur.push(w.get()), } - }; + } + // Skip whitespace. + code_units.advance_while(|w| w == SPACE || w == TAB); + ret_val.push(OsString::from_wide(&cur)); + + // Parse the arguments according to these rules: + // * All code units are taken literally except space, tab, quote and backslash. + // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are + // treated as a single separator. + // * A space or tab `in_quotes` is taken literally. + // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally. + // * A quote can be escaped if preceded by an odd number of backslashes. + // * If any number of backslashes is immediately followed by a quote then the number of + // backslashes is halved (rounding down). + // * Backslashes not followed by a quote are all taken literally. + // * If `in_quotes` then a quote can also be escaped using another quote + // (i.e. two consecutive quotes become one literal quote). let mut cur = Vec::new(); let mut in_quotes = false; - let mut was_in_quotes = false; - let mut backslash_count: usize = 0; - for &c in cmd_line { - match c { - // backslash - BACKSLASH => { - backslash_count += 1; - was_in_quotes = false; + while let Some(w) = code_units.next() { + match w { + // If not `in_quotes`, a space or tab ends the argument. + SPACE | TAB if !in_quotes => { + ret_val.push(OsString::from_wide(&cur[..])); + cur.truncate(0); + + // Skip whitespace. + code_units.advance_while(|w| w == SPACE || w == TAB); } - QUOTE if backslash_count % 2 == 0 => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2)); - backslash_count = 0; - if was_in_quotes { - cur.push('"' as u16); - was_in_quotes = false; + // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote. + BACKSLASH => { + let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1; + if code_units.peek() == Some(QUOTE) { + cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2)); + // The quote is escaped if there are an odd number of backslashes. + if backslash_count % 2 == 1 { + code_units.next(); + cur.push(QUOTE.get()); + } } else { - was_in_quotes = in_quotes; - in_quotes = !in_quotes; + // If there is no quote on the end then there is no escaping. + cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count)); } } - QUOTE if backslash_count % 2 != 0 => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2)); - backslash_count = 0; - was_in_quotes = false; - cur.push(b'"' as u16); - } - SPACE | TAB if !in_quotes => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - if !cur.is_empty() || was_in_quotes { - ret_val.push(OsString::from_wide(&cur[..])); - cur.truncate(0); + // If `in_quotes` and not backslash escaped (see above) then a quote either + // unsets `in_quote` or is escaped by another quote. + QUOTE if in_quotes => match code_units.peek() { + // Two consecutive quotes when `in_quotes` produces one literal quote. + Some(QUOTE) => { + cur.push(QUOTE.get()); + code_units.next(); } - backslash_count = 0; - was_in_quotes = false; - } - _ => { - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - backslash_count = 0; - was_in_quotes = false; - cur.push(c); - } + // Otherwise set `in_quotes`. + Some(_) => in_quotes = false, + // The end of the command line. + // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set. + None => break, + }, + // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`. + QUOTE => in_quotes = true, + // Everything else is always taken literally. + _ => cur.push(w.get()), } } - cur.extend(iter::repeat(b'\\' as u16).take(backslash_count)); - // include empty quoted strings at the end of the arguments list - if !cur.is_empty() || was_in_quotes || in_quotes { + // Push the final argument, if any. + if !cur.is_empty() || in_quotes { ret_val.push(OsString::from_wide(&cur[..])); } ret_val @@ -187,3 +185,52 @@ impl ExactSizeIterator for Args { self.parsed_args_list.len() } } + +/// A safe iterator over a LPWSTR +/// (aka a pointer to a series of UTF-16 code units terminated by a NULL). +struct WStrUnits<'a> { + // The pointer must never be null... + lpwstr: NonNull<u16>, + // ...and the memory it points to must be valid for this lifetime. + lifetime: PhantomData<&'a [u16]>, +} +impl WStrUnits<'_> { + /// Create the iterator. Returns `None` if `lpwstr` is null. + /// + /// SAFETY: `lpwstr` must point to a null-terminated wide string that lives + /// at least as long as the lifetime of this struct. + unsafe fn new(lpwstr: *const u16) -> Option<Self> { + Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData }) + } + fn peek(&self) -> Option<NonZeroU16> { + // SAFETY: It's always safe to read the current item because we don't + // ever move out of the array's bounds. + unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) } + } + /// Advance the iterator while `predicate` returns true. + /// Returns the number of items it advanced by. + fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize { + let mut counter = 0; + while let Some(w) = self.peek() { + if !predicate(w) { + break; + } + counter += 1; + self.next(); + } + counter + } +} +impl Iterator for WStrUnits<'_> { + // This can never return zero as that marks the end of the string. + type Item = NonZeroU16; + fn next(&mut self) -> Option<NonZeroU16> { + // SAFETY: If NULL is reached we immediately return. + // Therefore it's safe to advance the pointer after that. + unsafe { + let next = self.peek()?; + self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1)); + Some(next) + } + } +} diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs index 63f9be7b7e3..023186c7a90 100644 --- a/library/std/src/sys/windows/c.rs +++ b/library/std/src/sys/windows/c.rs @@ -781,7 +781,7 @@ extern "system" { pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL; pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL; pub fn SetLastError(dwErrCode: DWORD); - pub fn GetCommandLineW() -> *mut LPCWSTR; + pub fn GetCommandLineW() -> LPWSTR; pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD; pub fn GetCurrentProcess() -> HANDLE; pub fn GetCurrentThread() -> HANDLE; |
