about summary refs log tree commit diff
diff options
context:
space:
mode:
authorChris Denton <christophersdenton@gmail.com>2021-08-06 00:15:22 +0100
committerChris Denton <christophersdenton@gmail.com>2021-08-08 22:11:30 +0100
commite26dda564219341e25589ff745b16258ad424b78 (patch)
tree076e9b4fc6f7e80f5b0f72c46921c06f502cf350
parent565a51973a4f64f522e66c8af87bad339c966f31 (diff)
downloadrust-e26dda564219341e25589ff745b16258ad424b78.tar.gz
rust-e26dda564219341e25589ff745b16258ad424b78.zip
Implement modern Windows arg parsing
As derived from extensive testing of `argv` in a C/C++ application.

Co-Authored-By: Jane Lusby <jlusby42@gmail.com>
-rw-r--r--library/std/src/lib.rs1
-rw-r--r--library/std/src/sys/windows/args.rs257
-rw-r--r--library/std/src/sys/windows/c.rs2
3 files changed, 154 insertions, 106 deletions
diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs
index 1af6157ca68..4a9405905dc 100644
--- a/library/std/src/lib.rs
+++ b/library/std/src/lib.rs
@@ -253,6 +253,7 @@
 #![feature(const_ip)]
 #![feature(const_ipv4)]
 #![feature(const_ipv6)]
+#![feature(const_option)]
 #![feature(const_raw_ptr_deref)]
 #![feature(const_socketaddr)]
 #![feature(container_error_extra)]
diff --git a/library/std/src/sys/windows/args.rs b/library/std/src/sys/windows/args.rs
index f1264130faf..3919025b080 100644
--- a/library/std/src/sys/windows/args.rs
+++ b/library/std/src/sys/windows/args.rs
@@ -1,13 +1,18 @@
-#![allow(dead_code)] // runtime init functions not used during testing
+//! The Windows command line is just a string
+//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
+//!
+//! This module implements the parsing necessary to turn that string into a list of arguments.
 
 #[cfg(test)]
 mod tests;
 
 use crate::ffi::OsString;
 use crate::fmt;
+use crate::marker::PhantomData;
+use crate::num::NonZeroU16;
 use crate::os::windows::prelude::*;
 use crate::path::PathBuf;
-use crate::slice;
+use crate::ptr::NonNull;
 use crate::sys::c;
 use crate::sys::windows::os::current_exe;
 use crate::vec;
@@ -15,9 +20,11 @@ use crate::vec;
 use core::iter;
 
 pub fn args() -> Args {
+    // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
+    // string so it's safe for `WStrUnits` to use.
     unsafe {
         let lp_cmd_line = c::GetCommandLineW();
-        let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || {
+        let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
             current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
         });
 
@@ -28,129 +35,120 @@ pub fn args() -> Args {
 /// Implements the Windows command-line argument parsing algorithm.
 ///
 /// Microsoft's documentation for the Windows CLI argument format can be found at
-/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
+/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
 ///
-/// Windows includes a function to do this in shell32.dll,
-/// but linking with that DLL causes the process to be registered as a GUI application.
+/// A more in-depth explanation is here:
+/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
+///
+/// Windows includes a function to do command line parsing in shell32.dll.
+/// However, this is not used for two reasons:
+///
+/// 1. Linking with that DLL causes the process to be registered as a GUI application.
 /// GUI applications add a bunch of overhead, even if no windows are drawn. See
 /// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
 ///
-/// This function was tested for equivalence to the shell32.dll implementation in
-/// Windows 10 Pro v1803, using an exhaustive test suite available at
-/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
-/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
-unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(
-    lp_cmd_line: *const u16,
+/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
+///
+/// This function was tested for equivalence to the C/C++ parsing rules using an
+/// extensive test suite available at
+/// <https://github.com/ChrisDenton/winarg/tree/std>.
+fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
+    lp_cmd_line: Option<WStrUnits<'a>>,
     exe_name: F,
 ) -> Vec<OsString> {
-    const BACKSLASH: u16 = '\\' as u16;
-    const QUOTE: u16 = '"' as u16;
-    const TAB: u16 = '\t' as u16;
-    const SPACE: u16 = ' ' as u16;
+    const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap();
+    const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap();
+    const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap();
+    const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap();
+
     let mut ret_val = Vec::new();
-    if lp_cmd_line.is_null() || *lp_cmd_line == 0 {
+    // If the cmd line pointer is null or it points to an empty string then
+    // return the name of the executable as argv[0].
+    if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
         ret_val.push(exe_name());
         return ret_val;
     }
-    let mut cmd_line = {
-        let mut end = 0;
-        while *lp_cmd_line.offset(end) != 0 {
-            end += 1;
-        }
-        slice::from_raw_parts(lp_cmd_line, end as usize)
-    };
+    let mut code_units = lp_cmd_line.unwrap();
+
     // The executable name at the beginning is special.
-    cmd_line = match cmd_line[0] {
-        // The executable name ends at the next quote mark,
-        // no matter what.
-        QUOTE => {
-            let args = {
-                let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE);
-                if let Some(exe) = cut.next() {
-                    ret_val.push(OsString::from_wide(exe));
-                }
-                cut.next()
-            };
-            if let Some(args) = args {
-                args
-            } else {
-                return ret_val;
-            }
-        }
-        // Implement quirk: when they say whitespace here,
-        // they include the entire ASCII control plane:
-        // "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
-        // will consider the first argument to be an empty string. Excess whitespace at the
-        // end of lpCmdLine is ignored."
-        0..=SPACE => {
-            ret_val.push(OsString::new());
-            &cmd_line[1..]
-        }
-        // The executable name ends at the next whitespace,
-        // no matter what.
-        _ => {
-            let args = {
-                let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE);
-                if let Some(exe) = cut.next() {
-                    ret_val.push(OsString::from_wide(exe));
-                }
-                cut.next()
-            };
-            if let Some(args) = args {
-                args
-            } else {
-                return ret_val;
-            }
+    let mut in_quotes = false;
+    let mut cur = Vec::new();
+    for w in &mut code_units {
+        match w {
+            // A quote mark always toggles `in_quotes` no matter what because
+            // there are no escape characters when parsing the executable name.
+            QUOTE => in_quotes = !in_quotes,
+            // If not `in_quotes` then whitespace ends argv[0].
+            SPACE | TAB if !in_quotes => break,
+            // In all other cases the code unit is taken literally.
+            _ => cur.push(w.get()),
         }
-    };
+    }
+    // Skip whitespace.
+    code_units.advance_while(|w| w == SPACE || w == TAB);
+    ret_val.push(OsString::from_wide(&cur));
+
+    // Parse the arguments according to these rules:
+    // * All code units are taken literally except space, tab, quote and backslash.
+    // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
+    // treated as a single separator.
+    // * A space or tab `in_quotes` is taken literally.
+    // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
+    // * A quote can be escaped if preceded by an odd number of backslashes.
+    // * If any number of backslashes is immediately followed by a quote then the number of
+    // backslashes is halved (rounding down).
+    // * Backslashes not followed by a quote are all taken literally.
+    // * If `in_quotes` then a quote can also be escaped using another quote
+    // (i.e. two consecutive quotes become one literal quote).
     let mut cur = Vec::new();
     let mut in_quotes = false;
-    let mut was_in_quotes = false;
-    let mut backslash_count: usize = 0;
-    for &c in cmd_line {
-        match c {
-            // backslash
-            BACKSLASH => {
-                backslash_count += 1;
-                was_in_quotes = false;
+    while let Some(w) = code_units.next() {
+        match w {
+            // If not `in_quotes`, a space or tab ends the argument.
+            SPACE | TAB if !in_quotes => {
+                ret_val.push(OsString::from_wide(&cur[..]));
+                cur.truncate(0);
+
+                // Skip whitespace.
+                code_units.advance_while(|w| w == SPACE || w == TAB);
             }
-            QUOTE if backslash_count % 2 == 0 => {
-                cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
-                backslash_count = 0;
-                if was_in_quotes {
-                    cur.push('"' as u16);
-                    was_in_quotes = false;
+            // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
+            BACKSLASH => {
+                let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
+                if code_units.peek() == Some(QUOTE) {
+                    cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
+                    // The quote is escaped if there are an odd number of backslashes.
+                    if backslash_count % 2 == 1 {
+                        code_units.next();
+                        cur.push(QUOTE.get());
+                    }
                 } else {
-                    was_in_quotes = in_quotes;
-                    in_quotes = !in_quotes;
+                    // If there is no quote on the end then there is no escaping.
+                    cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
                 }
             }
-            QUOTE if backslash_count % 2 != 0 => {
-                cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
-                backslash_count = 0;
-                was_in_quotes = false;
-                cur.push(b'"' as u16);
-            }
-            SPACE | TAB if !in_quotes => {
-                cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
-                if !cur.is_empty() || was_in_quotes {
-                    ret_val.push(OsString::from_wide(&cur[..]));
-                    cur.truncate(0);
+            // If `in_quotes` and not backslash escaped (see above) then a quote either
+            // unsets `in_quote` or is escaped by another quote.
+            QUOTE if in_quotes => match code_units.peek() {
+                // Two consecutive quotes when `in_quotes` produces one literal quote.
+                Some(QUOTE) => {
+                    cur.push(QUOTE.get());
+                    code_units.next();
                 }
-                backslash_count = 0;
-                was_in_quotes = false;
-            }
-            _ => {
-                cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
-                backslash_count = 0;
-                was_in_quotes = false;
-                cur.push(c);
-            }
+                // Otherwise set `in_quotes`.
+                Some(_) => in_quotes = false,
+                // The end of the command line.
+                // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
+                None => break,
+            },
+            // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
+            QUOTE => in_quotes = true,
+            // Everything else is always taken literally.
+            _ => cur.push(w.get()),
         }
     }
-    cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
-    // include empty quoted strings at the end of the arguments list
-    if !cur.is_empty() || was_in_quotes || in_quotes {
+    // Push the final argument, if any.
+    if !cur.is_empty() || in_quotes {
         ret_val.push(OsString::from_wide(&cur[..]));
     }
     ret_val
@@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
         self.parsed_args_list.len()
     }
 }
+
+/// A safe iterator over a LPWSTR
+/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
+struct WStrUnits<'a> {
+    // The pointer must never be null...
+    lpwstr: NonNull<u16>,
+    // ...and the memory it points to must be valid for this lifetime.
+    lifetime: PhantomData<&'a [u16]>,
+}
+impl WStrUnits<'_> {
+    /// Create the iterator. Returns `None` if `lpwstr` is null.
+    ///
+    /// SAFETY: `lpwstr` must point to a null-terminated wide string that lives
+    /// at least as long as the lifetime of this struct.
+    unsafe fn new(lpwstr: *const u16) -> Option<Self> {
+        Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData })
+    }
+    fn peek(&self) -> Option<NonZeroU16> {
+        // SAFETY: It's always safe to read the current item because we don't
+        // ever move out of the array's bounds.
+        unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
+    }
+    /// Advance the iterator while `predicate` returns true.
+    /// Returns the number of items it advanced by.
+    fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
+        let mut counter = 0;
+        while let Some(w) = self.peek() {
+            if !predicate(w) {
+                break;
+            }
+            counter += 1;
+            self.next();
+        }
+        counter
+    }
+}
+impl Iterator for WStrUnits<'_> {
+    // This can never return zero as that marks the end of the string.
+    type Item = NonZeroU16;
+    fn next(&mut self) -> Option<NonZeroU16> {
+        // SAFETY: If NULL is reached we immediately return.
+        // Therefore it's safe to advance the pointer after that.
+        unsafe {
+            let next = self.peek()?;
+            self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1));
+            Some(next)
+        }
+    }
+}
diff --git a/library/std/src/sys/windows/c.rs b/library/std/src/sys/windows/c.rs
index 63f9be7b7e3..023186c7a90 100644
--- a/library/std/src/sys/windows/c.rs
+++ b/library/std/src/sys/windows/c.rs
@@ -781,7 +781,7 @@ extern "system" {
     pub fn RemoveDirectoryW(lpPathName: LPCWSTR) -> BOOL;
     pub fn SetFileAttributesW(lpFileName: LPCWSTR, dwFileAttributes: DWORD) -> BOOL;
     pub fn SetLastError(dwErrCode: DWORD);
-    pub fn GetCommandLineW() -> *mut LPCWSTR;
+    pub fn GetCommandLineW() -> LPWSTR;
     pub fn GetTempPathW(nBufferLength: DWORD, lpBuffer: LPCWSTR) -> DWORD;
     pub fn GetCurrentProcess() -> HANDLE;
     pub fn GetCurrentThread() -> HANDLE;