Auto merge of #94887 - dylni:move-normpath-crate-impl-to-libstd, r=ChrisDenton

Improve Windows path prefix parsing This PR fixes improves parsing of Windows path prefixes. `parse_prefix` now supports both types of separators on Windows (`/` and `\`).
author: bors <bors@rust-lang.org> 2022-04-23 00:58:22 +0000
committer: bors <bors@rust-lang.org> 2022-04-23 00:58:22 +0000
commit: 8834629b861cd182be6b914d4e6bc5958160debc (patch)
tree: 32c7778c18bb94338e6d3f243cc7a91d91727092 /library/std/src
parent: c2b4c2dffaab572cc060e96e52ded4b108614b82 (diff)
parent: fb9731ea13a2526583cedcfa674be92d2a120970 (diff)
download: rust-8834629b861cd182be6b914d4e6bc5958160debc.tar.gz
rust-8834629b861cd182be6b914d4e6bc5958160debc.zip
6 files changed, 142 insertions, 57 deletions
diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs
index da7753216d0..f60b604f1b1 100644
--- a/library/std/src/lib.rs
+++ b/library/std/src/lib.rs
@@ -240,6 +240,7 @@
 #![feature(exhaustive_patterns)]
 #![feature(intra_doc_pointers)]
 #![feature(lang_items)]
+#![feature(let_chains)]
 #![feature(linkage)]
 #![feature(min_specialization)]
 #![feature(must_not_suspend)]
diff --git a/library/std/src/path.rs b/library/std/src/path.rs
index 8ecea8ce07f..c03d197e019 100644
--- a/library/std/src/path.rs
+++ b/library/std/src/path.rs
@@ -168,8 +168,8 @@ pub enum Prefix<'a> {
 
     /// Device namespace prefix, e.g., `\\.\COM42`.
     ///
-    /// Device namespace prefixes consist of `\\.\` immediately followed by the
-    /// device name.
+    /// Device namespace prefixes consist of `\\.\` (possibly using `/`
+    /// instead of `\`), immediately followed by the device name.
     #[stable(feature = "rust1", since = "1.0.0")]
     DeviceNS(#[stable(feature = "rust1", since = "1.0.0")] &'a OsStr),
 
@@ -193,7 +193,7 @@ impl<'a> Prefix<'a> {
     fn len(&self) -> usize {
         use self::Prefix::*;
         fn os_str_len(s: &OsStr) -> usize {
-            os_str_as_u8_slice(s).len()
+            s.bytes().len()
         }
         match *self {
             Verbatim(x) => 4 + os_str_len(x),
@@ -299,19 +299,17 @@ where
     }
 }
 
-// See note at the top of this module to understand why these are used:
-//
-// These casts are safe as OsStr is internally a wrapper around [u8] on all
-// platforms.
-//
-// Note that currently this relies on the special knowledge that libstd has;
-// these types are single-element structs but are not marked repr(transparent)
-// or repr(C) which would make these casts allowable outside std.
-fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
-    unsafe { &*(s as *const OsStr as *const [u8]) }
-}
 unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
-    // SAFETY: see the comment of `os_str_as_u8_slice`
+    // SAFETY: See note at the top of this module to understand why this and
+    // `OsStr::bytes` are used:
+    //
+    // This casts are safe as OsStr is internally a wrapper around [u8] on all
+    // platforms.
+    //
+    // Note that currently this relies on the special knowledge that libstd has;
+    // these types are single-element structs but are not marked
+    // repr(transparent) or repr(C) which would make these casts not allowable
+    // outside std.
     unsafe { &*(s as *const [u8] as *const OsStr) }
 }
 
@@ -332,7 +330,7 @@ fn has_physical_root(s: &[u8], prefix: Option<Prefix<'_>>) -> bool {
 
 // basic workhorse for splitting stem and extension
 fn rsplit_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) {
-    if os_str_as_u8_slice(file) == b".." {
+    if file.bytes() == b".." {
         return (Some(file), None);
     }
 
@@ -340,7 +338,7 @@ fn rsplit_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) {
     // and back. This is safe to do because (1) we only look at ASCII
     // contents of the encoding and (2) new &OsStr values are produced
     // only from ASCII-bounded slices of existing &OsStr values.
-    let mut iter = os_str_as_u8_slice(file).rsplitn(2, |b| *b == b'.');
+    let mut iter = file.bytes().rsplitn(2, |b| *b == b'.');
     let after = iter.next();
     let before = iter.next();
     if before == Some(b"") {
@@ -351,7 +349,7 @@ fn rsplit_file_at_dot(file: &OsStr) -> (Option<&OsStr>, Option<&OsStr>) {
 }
 
 fn split_file_at_dot(file: &OsStr) -> (&OsStr, Option<&OsStr>) {
-    let slice = os_str_as_u8_slice(file);
+    let slice = file.bytes();
     if slice == b".." {
         return (file, None);
     }
@@ -1445,17 +1443,17 @@ impl PathBuf {
     fn _set_extension(&mut self, extension: &OsStr) -> bool {
         let file_stem = match self.file_stem() {
             None => return false,
-            Some(f) => os_str_as_u8_slice(f),
+            Some(f) => f.bytes(),
         };
 
         // truncate until right after the file stem
         let end_file_stem = file_stem[file_stem.len()..].as_ptr().addr();
-        let start = os_str_as_u8_slice(&self.inner).as_ptr().addr();
+        let start = self.inner.bytes().as_ptr().addr();
         let v = self.as_mut_vec();
         v.truncate(end_file_stem.wrapping_sub(start));
 
         // add the new extension, if any
-        let new = os_str_as_u8_slice(extension);
+        let new = extension.bytes();
         if !new.is_empty() {
             v.reserve_exact(new.len() + 1);
             v.push(b'.');
@@ -1948,7 +1946,7 @@ impl Path {
     }
     // The following (private!) function reveals the byte encoding used for OsStr.
     fn as_u8_slice(&self) -> &[u8] {
-        os_str_as_u8_slice(&self.inner)
+        self.inner.bytes()
     }
 
     /// Directly wraps a string slice as a `Path` slice.
diff --git a/library/std/src/path/tests.rs b/library/std/src/path/tests.rs
index d1f59d2786e..0d8ea29c2be 100644
--- a/library/std/src/path/tests.rs
+++ b/library/std/src/path/tests.rs
@@ -971,15 +971,15 @@ pub fn test_decompositions_windows() {
     file_prefix: None
     );
 
-    t!("\\\\?\\C:/foo",
-    iter: ["\\\\?\\C:/foo"],
+    t!("\\\\?\\C:/foo/bar",
+    iter: ["\\\\?\\C:", "\\", "foo/bar"],
     has_root: true,
     is_absolute: true,
-    parent: None,
-    file_name: None,
-    file_stem: None,
+    parent: Some("\\\\?\\C:/"),
+    file_name: Some("foo/bar"),
+    file_stem: Some("foo/bar"),
     extension: None,
-    file_prefix: None
+    file_prefix: Some("foo/bar")
     );
 
     t!("\\\\.\\foo\\bar",
diff --git a/library/std/src/sys/windows/mod.rs b/library/std/src/sys/windows/mod.rs
index 31c7208bbf1..2c832aa75fd 100644
--- a/library/std/src/sys/windows/mod.rs
+++ b/library/std/src/sys/windows/mod.rs
@@ -190,6 +190,10 @@ where
 {
     // Start off with a stack buf but then spill over to the heap if we end up
     // needing more space.
+    //
+    // This initial size also works around `GetFullPathNameW` returning
+    // incorrect size hints for some short paths:
+    // https://github.com/dylni/normpath/issues/5
     let mut stack_buf = [0u16; 512];
     let mut heap_buf = Vec::new();
     unsafe {
diff --git a/library/std/src/sys/windows/path.rs b/library/std/src/sys/windows/path.rs
index e54fcaed495..a0f82207099 100644
--- a/library/std/src/sys/windows/path.rs
+++ b/library/std/src/sys/windows/path.rs
@@ -50,37 +50,101 @@ pub(crate) fn append_suffix(path: PathBuf, suffix: &OsStr) -> PathBuf {
     path.into()
 }
 
+struct PrefixParser<'a, const LEN: usize> {
+    path: &'a OsStr,
+    prefix: [u8; LEN],
+}
+
+impl<'a, const LEN: usize> PrefixParser<'a, LEN> {
+    #[inline]
+    fn get_prefix(path: &OsStr) -> [u8; LEN] {
+        let mut prefix = [0; LEN];
+        // SAFETY: Only ASCII characters are modified.
+        for (i, &ch) in path.bytes().iter().take(LEN).enumerate() {
+            prefix[i] = if ch == b'/' { b'\\' } else { ch };
+        }
+        prefix
+    }
+
+    fn new(path: &'a OsStr) -> Self {
+        Self { path, prefix: Self::get_prefix(path) }
+    }
+
+    fn as_slice(&self) -> PrefixParserSlice<'a, '_> {
+        PrefixParserSlice {
+            path: self.path,
+            prefix: &self.prefix[..LEN.min(self.path.len())],
+            index: 0,
+        }
+    }
+}
+
+struct PrefixParserSlice<'a, 'b> {
+    path: &'a OsStr,
+    prefix: &'b [u8],
+    index: usize,
+}
+
+impl<'a> PrefixParserSlice<'a, '_> {
+    fn strip_prefix(&self, prefix: &str) -> Option<Self> {
+        self.prefix[self.index..]
+            .starts_with(prefix.as_bytes())
+            .then(|| Self { index: self.index + prefix.len(), ..*self })
+    }
+
+    fn prefix_bytes(&self) -> &'a [u8] {
+        &self.path.bytes()[..self.index]
+    }
+
+    fn finish(self) -> &'a OsStr {
+        // SAFETY: The unsafety here stems from converting between &OsStr and
+        // &[u8] and back. This is safe to do because (1) we only look at ASCII
+        // contents of the encoding and (2) new &OsStr values are produced only
+        // from ASCII-bounded slices of existing &OsStr values.
+        unsafe { bytes_as_os_str(&self.path.bytes()[self.index..]) }
+    }
+}
+
 pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
     use Prefix::{DeviceNS, Disk, Verbatim, VerbatimDisk, VerbatimUNC, UNC};
 
-    if let Some(path) = strip_prefix(path, r"\\") {
+    let parser = PrefixParser::<8>::new(path);
+    let parser = parser.as_slice();
+    if let Some(parser) = parser.strip_prefix(r"\\") {
         // \\
-        if let Some(path) = strip_prefix(path, r"?\") {
+
+        // The meaning of verbatim paths can change when they use a different
+        // separator.
+        if let Some(parser) = parser.strip_prefix(r"?\") && !parser.prefix_bytes().iter().any(|&x| x == b'/') {
             // \\?\
-            if let Some(path) = strip_prefix(path, r"UNC\") {
+            if let Some(parser) = parser.strip_prefix(r"UNC\") {
                 // \\?\UNC\server\share
 
+                let path = parser.finish();
                 let (server, path) = parse_next_component(path, true);
                 let (share, _) = parse_next_component(path, true);
 
                 Some(VerbatimUNC(server, share))
             } else {
-                let (prefix, _) = parse_next_component(path, true);
+                let path = parser.finish();
 
                 // in verbatim paths only recognize an exact drive prefix
-                if let Some(drive) = parse_drive_exact(prefix) {
+                if let Some(drive) = parse_drive_exact(path) {
                     // \\?\C:
                     Some(VerbatimDisk(drive))
                 } else {
                     // \\?\prefix
+                    let (prefix, _) = parse_next_component(path, true);
                     Some(Verbatim(prefix))
                 }
             }
-        } else if let Some(path) = strip_prefix(path, r".\") {
+        } else if let Some(parser) = parser.strip_prefix(r".\") {
             // \\.\COM42
+            let path = parser.finish();
             let (prefix, _) = parse_next_component(path, false);
             Some(DeviceNS(prefix))
         } else {
+            let path = parser.finish();
             let (server, path) = parse_next_component(path, false);
             let (share, _) = parse_next_component(path, false);
 
@@ -102,31 +166,26 @@ pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
 }
 
 // Parses a drive prefix, e.g. "C:" and "C:\whatever"
-fn parse_drive(prefix: &OsStr) -> Option<u8> {
+fn parse_drive(path: &OsStr) -> Option<u8> {
     // In most DOS systems, it is not possible to have more than 26 drive letters.
     // See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
     fn is_valid_drive_letter(drive: &u8) -> bool {
         drive.is_ascii_alphabetic()
     }
 
-    match prefix.bytes() {
+    match path.bytes() {
         [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
         _ => None,
     }
 }
 
 // Parses a drive prefix exactly, e.g. "C:"
-fn parse_drive_exact(prefix: &OsStr) -> Option<u8> {
+fn parse_drive_exact(path: &OsStr) -> Option<u8> {
     // only parse two bytes: the drive letter and the drive separator
-    if prefix.len() == 2 { parse_drive(prefix) } else { None }
-}
-
-fn strip_prefix<'a>(path: &'a OsStr, prefix: &str) -> Option<&'a OsStr> {
-    // `path` and `prefix` are valid wtf8 and utf8 encoded slices respectively, `path[prefix.len()]`
-    // is thus a code point boundary and `path[prefix.len()..]` is a valid wtf8 encoded slice.
-    match path.bytes().strip_prefix(prefix.as_bytes()) {
-        Some(path) => unsafe { Some(bytes_as_os_str(path)) },
-        None => None,
+    if path.bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) {
+        parse_drive(path)
+    } else {
+        None
     }
 }
 
@@ -219,15 +278,7 @@ pub(crate) fn maybe_verbatim(path: &Path) -> io::Result<Vec<u16>> {
         // SAFETY: `fill_utf16_buf` ensures the `buffer` and `size` are valid.
         // `lpfilename` is a pointer to a null terminated string that is not
         // invalidated until after `GetFullPathNameW` returns successfully.
-        |buffer, size| unsafe {
-            // While the docs for `GetFullPathNameW` have the standard note
-            // about needing a `\\?\` path for a long lpfilename, this does not
-            // appear to be true in practice.
-            // See:
-            // https://stackoverflow.com/questions/38036943/getfullpathnamew-and-long-windows-file-paths
-            // https://googleprojectzero.blogspot.com/2016/02/the-definitive-guide-on-win32-to-nt.html
-            c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut())
-        },
+        |buffer, size| unsafe { c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()) },
         |mut absolute| {
             path.clear();
 
@@ -263,9 +314,20 @@ pub(crate) fn maybe_verbatim(path: &Path) -> io::Result<Vec<u16>> {
 
 /// Make a Windows path absolute.
 pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {
-    if path.as_os_str().bytes().starts_with(br"\\?\") {
-        return Ok(path.into());
+    let path = path.as_os_str();
+    let prefix = parse_prefix(path);
+    // Verbatim paths should not be modified.
+    if prefix.map(|x| x.is_verbatim()).unwrap_or(false) {
+        // NULs in verbatim paths are rejected for consistency.
+        if path.bytes().contains(&0) {
+            return Err(io::const_io_error!(
+                io::ErrorKind::InvalidInput,
+                "strings passed to WinAPI cannot contain NULs",
+            ));
+        }
+        return Ok(path.to_owned().into());
     }
+
     let path = to_u16s(path)?;
     let lpfilename = path.as_ptr();
     fill_utf16_buf(
diff --git a/library/std/src/sys/windows/path/tests.rs b/library/std/src/sys/windows/path/tests.rs
index 425c2011b32..8656b04e4f4 100644
--- a/library/std/src/sys/windows/path/tests.rs
+++ b/library/std/src/sys/windows/path/tests.rs
@@ -94,3 +94,23 @@ fn verbatim() {
     // A path that contains null is not a valid path.
     assert!(maybe_verbatim(Path::new("\0")).is_err());
 }
+
+fn parse_prefix(path: &str) -> Option<Prefix<'_>> {
+    super::parse_prefix(OsStr::new(path))
+}
+
+#[test]
+fn test_parse_prefix_verbatim() {
+    let prefix = Some(Prefix::VerbatimDisk(b'C'));
+    assert_eq!(prefix, parse_prefix(r"\\?\C:/windows/system32/notepad.exe"));
+    assert_eq!(prefix, parse_prefix(r"\\?\C:\windows\system32\notepad.exe"));
+}
+
+#[test]
+fn test_parse_prefix_verbatim_device() {
+    let prefix = Some(Prefix::UNC(OsStr::new("?"), OsStr::new("C:")));
+    assert_eq!(prefix, parse_prefix(r"//?/C:/windows/system32/notepad.exe"));
+    assert_eq!(prefix, parse_prefix(r"//?/C:\windows\system32\notepad.exe"));
+    assert_eq!(prefix, parse_prefix(r"/\?\C:\windows\system32\notepad.exe"));
+    assert_eq!(prefix, parse_prefix(r"\\?/C:\windows\system32\notepad.exe"));
+}
author	bors <bors@rust-lang.org>	2022-04-23 00:58:22 +0000
committer	bors <bors@rust-lang.org>	2022-04-23 00:58:22 +0000
commit	8834629b861cd182be6b914d4e6bc5958160debc (patch)
tree	32c7778c18bb94338e6d3f243cc7a91d91727092 /library/std/src
parent	c2b4c2dffaab572cc060e96e52ded4b108614b82 (diff)
parent	fb9731ea13a2526583cedcfa674be92d2a120970 (diff)
download	rust-8834629b861cd182be6b914d4e6bc5958160debc.tar.gz rust-8834629b861cd182be6b914d4e6bc5958160debc.zip