about summary refs log tree commit diff
path: root/library/std/src/sys/args/windows.rs
diff options
context:
space:
mode:
Diffstat (limited to 'library/std/src/sys/args/windows.rs')
-rw-r--r--library/std/src/sys/args/windows.rs442
1 files changed, 442 insertions, 0 deletions
diff --git a/library/std/src/sys/args/windows.rs b/library/std/src/sys/args/windows.rs
new file mode 100644
index 00000000000..55bd6864cae
--- /dev/null
+++ b/library/std/src/sys/args/windows.rs
@@ -0,0 +1,442 @@
+//! The Windows command line is just a string
+//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
+//!
+//! This module implements the parsing necessary to turn that string into a list of arguments.
+
+#[cfg(test)]
+mod tests;
+
+use crate::ffi::{OsStr, OsString};
+use crate::num::NonZero;
+use crate::os::windows::prelude::*;
+use crate::path::{Path, PathBuf};
+use crate::sys::pal::os::current_exe;
+use crate::sys::pal::{ensure_no_nuls, fill_utf16_buf};
+use crate::sys::path::get_long_path;
+use crate::sys::{c, to_u16s};
+use crate::sys_common::AsInner;
+use crate::sys_common::wstr::WStrUnits;
+use crate::{fmt, io, iter, ptr, vec};
+
+pub fn args() -> Args {
+    // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
+    // string so it's safe for `WStrUnits` to use.
+    unsafe {
+        let lp_cmd_line = c::GetCommandLineW();
+        let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
+            current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
+        });
+
+        Args { parsed_args_list: parsed_args_list.into_iter() }
+    }
+}
+
+/// Implements the Windows command-line argument parsing algorithm.
+///
+/// Microsoft's documentation for the Windows CLI argument format can be found at
+/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
+///
+/// A more in-depth explanation is here:
+/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
+///
+/// Windows includes a function to do command line parsing in shell32.dll.
+/// However, this is not used for two reasons:
+///
+/// 1. Linking with that DLL causes the process to be registered as a GUI application.
+/// GUI applications add a bunch of overhead, even if no windows are drawn. See
+/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
+///
+/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
+///
+/// This function was tested for equivalence to the C/C++ parsing rules using an
+/// extensive test suite available at
+/// <https://github.com/ChrisDenton/winarg/tree/std>.
+fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
+    lp_cmd_line: Option<WStrUnits<'a>>,
+    exe_name: F,
+) -> Vec<OsString> {
+    const BACKSLASH: NonZero<u16> = NonZero::new(b'\\' as u16).unwrap();
+    const QUOTE: NonZero<u16> = NonZero::new(b'"' as u16).unwrap();
+    const TAB: NonZero<u16> = NonZero::new(b'\t' as u16).unwrap();
+    const SPACE: NonZero<u16> = NonZero::new(b' ' as u16).unwrap();
+
+    let mut ret_val = Vec::new();
+    // If the cmd line pointer is null or it points to an empty string then
+    // return the name of the executable as argv[0].
+    if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
+        ret_val.push(exe_name());
+        return ret_val;
+    }
+    let mut code_units = lp_cmd_line.unwrap();
+
+    // The executable name at the beginning is special.
+    let mut in_quotes = false;
+    let mut cur = Vec::new();
+    for w in &mut code_units {
+        match w {
+            // A quote mark always toggles `in_quotes` no matter what because
+            // there are no escape characters when parsing the executable name.
+            QUOTE => in_quotes = !in_quotes,
+            // If not `in_quotes` then whitespace ends argv[0].
+            SPACE | TAB if !in_quotes => break,
+            // In all other cases the code unit is taken literally.
+            _ => cur.push(w.get()),
+        }
+    }
+    // Skip whitespace.
+    code_units.advance_while(|w| w == SPACE || w == TAB);
+    ret_val.push(OsString::from_wide(&cur));
+
+    // Parse the arguments according to these rules:
+    // * All code units are taken literally except space, tab, quote and backslash.
+    // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
+    // treated as a single separator.
+    // * A space or tab `in_quotes` is taken literally.
+    // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
+    // * A quote can be escaped if preceded by an odd number of backslashes.
+    // * If any number of backslashes is immediately followed by a quote then the number of
+    // backslashes is halved (rounding down).
+    // * Backslashes not followed by a quote are all taken literally.
+    // * If `in_quotes` then a quote can also be escaped using another quote
+    // (i.e. two consecutive quotes become one literal quote).
+    let mut cur = Vec::new();
+    let mut in_quotes = false;
+    while let Some(w) = code_units.next() {
+        match w {
+            // If not `in_quotes`, a space or tab ends the argument.
+            SPACE | TAB if !in_quotes => {
+                ret_val.push(OsString::from_wide(&cur[..]));
+                cur.truncate(0);
+
+                // Skip whitespace.
+                code_units.advance_while(|w| w == SPACE || w == TAB);
+            }
+            // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
+            BACKSLASH => {
+                let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
+                if code_units.peek() == Some(QUOTE) {
+                    cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
+                    // The quote is escaped if there are an odd number of backslashes.
+                    if backslash_count % 2 == 1 {
+                        code_units.next();
+                        cur.push(QUOTE.get());
+                    }
+                } else {
+                    // If there is no quote on the end then there is no escaping.
+                    cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
+                }
+            }
+            // If `in_quotes` and not backslash escaped (see above) then a quote either
+            // unsets `in_quote` or is escaped by another quote.
+            QUOTE if in_quotes => match code_units.peek() {
+                // Two consecutive quotes when `in_quotes` produces one literal quote.
+                Some(QUOTE) => {
+                    cur.push(QUOTE.get());
+                    code_units.next();
+                }
+                // Otherwise set `in_quotes`.
+                Some(_) => in_quotes = false,
+                // The end of the command line.
+                // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
+                None => break,
+            },
+            // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
+            QUOTE => in_quotes = true,
+            // Everything else is always taken literally.
+            _ => cur.push(w.get()),
+        }
+    }
+    // Push the final argument, if any.
+    if !cur.is_empty() || in_quotes {
+        ret_val.push(OsString::from_wide(&cur[..]));
+    }
+    ret_val
+}
+
+pub struct Args {
+    parsed_args_list: vec::IntoIter<OsString>,
+}
+
+impl fmt::Debug for Args {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.parsed_args_list.as_slice().fmt(f)
+    }
+}
+
+impl Iterator for Args {
+    type Item = OsString;
+    fn next(&mut self) -> Option<OsString> {
+        self.parsed_args_list.next()
+    }
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.parsed_args_list.size_hint()
+    }
+}
+
+impl DoubleEndedIterator for Args {
+    fn next_back(&mut self) -> Option<OsString> {
+        self.parsed_args_list.next_back()
+    }
+}
+
+impl ExactSizeIterator for Args {
+    fn len(&self) -> usize {
+        self.parsed_args_list.len()
+    }
+}
+
+#[derive(Debug)]
+pub(crate) enum Arg {
+    /// Add quotes (if needed)
+    Regular(OsString),
+    /// Append raw string without quoting
+    Raw(OsString),
+}
+
+enum Quote {
+    // Every arg is quoted
+    Always,
+    // Whitespace and empty args are quoted
+    Auto,
+    // Arg appended without any changes (#29494)
+    Never,
+}
+
+pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()> {
+    let (arg, quote) = match arg {
+        Arg::Regular(arg) => (arg, if force_quotes { Quote::Always } else { Quote::Auto }),
+        Arg::Raw(arg) => (arg, Quote::Never),
+    };
+
+    // If an argument has 0 characters then we need to quote it to ensure
+    // that it actually gets passed through on the command line or otherwise
+    // it will be dropped entirely when parsed on the other end.
+    ensure_no_nuls(arg)?;
+    let arg_bytes = arg.as_encoded_bytes();
+    let (quote, escape) = match quote {
+        Quote::Always => (true, true),
+        Quote::Auto => {
+            (arg_bytes.iter().any(|c| *c == b' ' || *c == b'\t') || arg_bytes.is_empty(), true)
+        }
+        Quote::Never => (false, false),
+    };
+    if quote {
+        cmd.push('"' as u16);
+    }
+
+    let mut backslashes: usize = 0;
+    for x in arg.encode_wide() {
+        if escape {
+            if x == '\\' as u16 {
+                backslashes += 1;
+            } else {
+                if x == '"' as u16 {
+                    // Add n+1 backslashes to total 2n+1 before internal '"'.
+                    cmd.extend((0..=backslashes).map(|_| '\\' as u16));
+                }
+                backslashes = 0;
+            }
+        }
+        cmd.push(x);
+    }
+
+    if quote {
+        // Add n backslashes to total 2n before ending '"'.
+        cmd.extend((0..backslashes).map(|_| '\\' as u16));
+        cmd.push('"' as u16);
+    }
+    Ok(())
+}
+
+fn append_bat_arg(cmd: &mut Vec<u16>, arg: &OsStr, mut quote: bool) -> io::Result<()> {
+    ensure_no_nuls(arg)?;
+    // If an argument has 0 characters then we need to quote it to ensure
+    // that it actually gets passed through on the command line or otherwise
+    // it will be dropped entirely when parsed on the other end.
+    //
+    // We also need to quote the argument if it ends with `\` to guard against
+    // bat usage such as `"%~2"` (i.e. force quote arguments) otherwise a
+    // trailing slash will escape the closing quote.
+    if arg.is_empty() || arg.as_encoded_bytes().last() == Some(&b'\\') {
+        quote = true;
+    }
+    for cp in arg.as_inner().inner.code_points() {
+        if let Some(cp) = cp.to_char() {
+            // Rather than trying to find every ascii symbol that must be quoted,
+            // we assume that all ascii symbols must be quoted unless they're known to be good.
+            // We also quote Unicode control blocks for good measure.
+            // Note an unquoted `\` is fine so long as the argument isn't otherwise quoted.
+            static UNQUOTED: &str = r"#$*+-./:?@\_";
+            let ascii_needs_quotes =
+                cp.is_ascii() && !(cp.is_ascii_alphanumeric() || UNQUOTED.contains(cp));
+            if ascii_needs_quotes || cp.is_control() {
+                quote = true;
+            }
+        }
+    }
+
+    if quote {
+        cmd.push('"' as u16);
+    }
+    // Loop through the string, escaping `\` only if followed by `"`.
+    // And escaping `"` by doubling them.
+    let mut backslashes: usize = 0;
+    for x in arg.encode_wide() {
+        if x == '\\' as u16 {
+            backslashes += 1;
+        } else {
+            if x == '"' as u16 {
+                // Add n backslashes to total 2n before internal `"`.
+                cmd.extend((0..backslashes).map(|_| '\\' as u16));
+                // Appending an additional double-quote acts as an escape.
+                cmd.push(b'"' as u16)
+            } else if x == '%' as u16 || x == '\r' as u16 {
+                // yt-dlp hack: replaces `%` with `%%cd:~,%` to stop %VAR% being expanded as an environment variable.
+                //
+                // # Explanation
+                //
+                // cmd supports extracting a substring from a variable using the following syntax:
+                //     %variable:~start_index,end_index%
+                //
+                // In the above command `cd` is used as the variable and the start_index and end_index are left blank.
+                // `cd` is a built-in variable that dynamically expands to the current directory so it's always available.
+                // Explicitly omitting both the start and end index creates a zero-length substring.
+                //
+                // Therefore it all resolves to nothing. However, by doing this no-op we distract cmd.exe
+                // from potentially expanding %variables% in the argument.
+                cmd.extend_from_slice(&[
+                    '%' as u16, '%' as u16, 'c' as u16, 'd' as u16, ':' as u16, '~' as u16,
+                    ',' as u16,
+                ]);
+            }
+            backslashes = 0;
+        }
+        cmd.push(x);
+    }
+    if quote {
+        // Add n backslashes to total 2n before ending `"`.
+        cmd.extend((0..backslashes).map(|_| '\\' as u16));
+        cmd.push('"' as u16);
+    }
+    Ok(())
+}
+
+pub(crate) fn make_bat_command_line(
+    script: &[u16],
+    args: &[Arg],
+    force_quotes: bool,
+) -> io::Result<Vec<u16>> {
+    const INVALID_ARGUMENT_ERROR: io::Error =
+        io::const_error!(io::ErrorKind::InvalidInput, r#"batch file arguments are invalid"#);
+    // Set the start of the command line to `cmd.exe /c "`
+    // It is necessary to surround the command in an extra pair of quotes,
+    // hence the trailing quote here. It will be closed after all arguments
+    // have been added.
+    // Using /e:ON enables "command extensions" which is essential for the `%` hack to work.
+    let mut cmd: Vec<u16> = "cmd.exe /e:ON /v:OFF /d /c \"".encode_utf16().collect();
+
+    // Push the script name surrounded by its quote pair.
+    cmd.push(b'"' as u16);
+    // Windows file names cannot contain a `"` character or end with `\\`.
+    // If the script name does then return an error.
+    if script.contains(&(b'"' as u16)) || script.last() == Some(&(b'\\' as u16)) {
+        return Err(io::const_error!(
+            io::ErrorKind::InvalidInput,
+            "Windows file names may not contain `\"` or end with `\\`"
+        ));
+    }
+    cmd.extend_from_slice(script.strip_suffix(&[0]).unwrap_or(script));
+    cmd.push(b'"' as u16);
+
+    // Append the arguments.
+    // FIXME: This needs tests to ensure that the arguments are properly
+    // reconstructed by the batch script by default.
+    for arg in args {
+        cmd.push(' ' as u16);
+        match arg {
+            Arg::Regular(arg_os) => {
+                let arg_bytes = arg_os.as_encoded_bytes();
+                // Disallow \r and \n as they may truncate the arguments.
+                const DISALLOWED: &[u8] = b"\r\n";
+                if arg_bytes.iter().any(|c| DISALLOWED.contains(c)) {
+                    return Err(INVALID_ARGUMENT_ERROR);
+                }
+                append_bat_arg(&mut cmd, arg_os, force_quotes)?;
+            }
+            _ => {
+                // Raw arguments are passed on as-is.
+                // It's the user's responsibility to properly handle arguments in this case.
+                append_arg(&mut cmd, arg, force_quotes)?;
+            }
+        };
+    }
+
+    // Close the quote we left opened earlier.
+    cmd.push(b'"' as u16);
+
+    Ok(cmd)
+}
+
+/// Takes a path and tries to return a non-verbatim path.
+///
+/// This is necessary because cmd.exe does not support verbatim paths.
+pub(crate) fn to_user_path(path: &Path) -> io::Result<Vec<u16>> {
+    from_wide_to_user_path(to_u16s(path)?)
+}
+pub(crate) fn from_wide_to_user_path(mut path: Vec<u16>) -> io::Result<Vec<u16>> {
+    // UTF-16 encoded code points, used in parsing and building UTF-16 paths.
+    // All of these are in the ASCII range so they can be cast directly to `u16`.
+    const SEP: u16 = b'\\' as _;
+    const QUERY: u16 = b'?' as _;
+    const COLON: u16 = b':' as _;
+    const U: u16 = b'U' as _;
+    const N: u16 = b'N' as _;
+    const C: u16 = b'C' as _;
+
+    // Early return if the path is too long to remove the verbatim prefix.
+    const LEGACY_MAX_PATH: usize = 260;
+    if path.len() > LEGACY_MAX_PATH {
+        return Ok(path);
+    }
+
+    match &path[..] {
+        // `\\?\C:\...` => `C:\...`
+        [SEP, SEP, QUERY, SEP, _, COLON, SEP, ..] => unsafe {
+            let lpfilename = path[4..].as_ptr();
+            fill_utf16_buf(
+                |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
+                |full_path: &[u16]| {
+                    if full_path == &path[4..path.len() - 1] {
+                        let mut path: Vec<u16> = full_path.into();
+                        path.push(0);
+                        path
+                    } else {
+                        path
+                    }
+                },
+            )
+        },
+        // `\\?\UNC\...` => `\\...`
+        [SEP, SEP, QUERY, SEP, U, N, C, SEP, ..] => unsafe {
+            // Change the `C` in `UNC\` to `\` so we can get a slice that starts with `\\`.
+            path[6] = b'\\' as u16;
+            let lpfilename = path[6..].as_ptr();
+            fill_utf16_buf(
+                |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
+                |full_path: &[u16]| {
+                    if full_path == &path[6..path.len() - 1] {
+                        let mut path: Vec<u16> = full_path.into();
+                        path.push(0);
+                        path
+                    } else {
+                        // Restore the 'C' in "UNC".
+                        path[6] = b'C' as u16;
+                        path
+                    }
+                },
+            )
+        },
+        // For everything else, leave the path unchanged.
+        _ => get_long_path(path, false),
+    }
+}