Handle win32 separator & prefixes for cygwin paths

author: 王宇逸 <Strawberry_Str@hotmail.com> 2025-06-01 23:12:35 +0800
committer: 王宇逸 <Strawberry_Str@hotmail.com> 2025-06-16 09:24:07 +0800
commit: 3cb0cba054d9d1871f3a10345d5c30cfc7ac214c (patch)
tree: 8e8c225bf85c5c9afc3afe80cdb7b5ebf49140ba /library/std/src/sys
parent: 015c7770ec0ffdba9ff03f1861144a827497f8ca (diff)
download: rust-3cb0cba054d9d1871f3a10345d5c30cfc7ac214c.tar.gz
rust-3cb0cba054d9d1871f3a10345d5c30cfc7ac214c.zip
5 files changed, 284 insertions, 172 deletions
diff --git a/library/std/src/sys/path/cygwin.rs b/library/std/src/sys/path/cygwin.rs
new file mode 100644
index 00000000000..e90372805bb
--- /dev/null
+++ b/library/std/src/sys/path/cygwin.rs
@@ -0,0 +1,92 @@
+use crate::ffi::OsString;
+use crate::os::unix::ffi::OsStringExt;
+use crate::path::{Path, PathBuf};
+use crate::sys::common::small_c_string::run_path_with_cstr;
+use crate::sys::cvt;
+use crate::{io, ptr};
+
+#[inline]
+pub fn is_sep_byte(b: u8) -> bool {
+    b == b'/' || b == b'\\'
+}
+
+/// Cygwin always prefers `/` over `\`, and it always converts all `/` to `\`
+/// internally when calling Win32 APIs. Therefore, the server component of path
+/// `\\?\UNC\localhost/share` is `localhost/share` on Win32, but `localhost`
+/// on Cygwin.
+#[inline]
+pub fn is_verbatim_sep(b: u8) -> bool {
+    b == b'/' || b == b'\\'
+}
+
+pub use super::windows_prefix::parse_prefix;
+
+pub const MAIN_SEP_STR: &str = "/";
+pub const MAIN_SEP: char = '/';
+
+unsafe extern "C" {
+    // Doc: https://cygwin.com/cygwin-api/func-cygwin-conv-path.html
+    // Src: https://github.com/cygwin/cygwin/blob/718a15ba50e0d01c79800bd658c2477f9a603540/winsup/cygwin/path.cc#L3902
+    // Safety:
+    // * `what` should be `CCP_WIN_A_TO_POSIX` here
+    // * `from` is a null-terminated UTF-8 path
+    // * `to` is a buffer, and the buffer size is `size`.
+    //
+    // Converts a path to an absolute POSIX path, whether the input is a Win32 or a POSIX path.
+    fn cygwin_conv_path(
+        what: libc::c_uint,
+        from: *const libc::c_char,
+        to: *mut u8,
+        size: libc::size_t,
+    ) -> libc::ssize_t;
+}
+
+const CCP_WIN_A_TO_POSIX: libc::c_uint = 2;
+
+/// Make a POSIX path absolute.
+pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {
+    run_path_with_cstr(path, &|path| {
+        let conv = CCP_WIN_A_TO_POSIX;
+        let size = cvt(unsafe { cygwin_conv_path(conv, path.as_ptr(), ptr::null_mut(), 0) })?;
+        // On success, size should not be 0.
+        debug_assert!(size >= 1);
+        let size = size as usize;
+        let mut buffer = Vec::with_capacity(size);
+        cvt(unsafe { cygwin_conv_path(conv, path.as_ptr(), buffer.as_mut_ptr(), size) })?;
+        unsafe {
+            buffer.set_len(size - 1);
+        }
+        Ok(PathBuf::from(OsString::from_vec(buffer)))
+    })
+    .map(|path| {
+        if path.prefix().is_some() {
+            return path;
+        }
+
+        // From unix.rs
+        let mut components = path.components();
+        let path_os = path.as_os_str().as_encoded_bytes();
+
+        let mut normalized = if path_os.starts_with(b"//") && !path_os.starts_with(b"///") {
+            components.next();
+            PathBuf::from("//")
+        } else {
+            PathBuf::new()
+        };
+        normalized.extend(components);
+
+        if path_os.ends_with(b"/") {
+            normalized.push("");
+        }
+
+        normalized
+    })
+}
+
+pub(crate) fn is_absolute(path: &Path) -> bool {
+    if path.as_os_str().as_encoded_bytes().starts_with(b"\\") {
+        path.has_root() && path.prefix().is_some()
+    } else {
+        path.has_root()
+    }
+}
diff --git a/library/std/src/sys/path/mod.rs b/library/std/src/sys/path/mod.rs
index 1fa4e80d678..a4ff4338cf5 100644
--- a/library/std/src/sys/path/mod.rs
+++ b/library/std/src/sys/path/mod.rs
@@ -1,6 +1,7 @@
 cfg_if::cfg_if! {
     if #[cfg(target_os = "windows")] {
         mod windows;
+        mod windows_prefix;
         pub use windows::*;
     } else if #[cfg(all(target_vendor = "fortanix", target_env = "sgx"))] {
         mod sgx;
@@ -11,6 +12,10 @@ cfg_if::cfg_if! {
     } else if #[cfg(target_os = "uefi")] {
         mod uefi;
         pub use uefi::*;
+    } else if #[cfg(target_os = "cygwin")] {
+        mod cygwin;
+        mod windows_prefix;
+        pub use cygwin::*;
     } else {
         mod unix;
         pub use unix::*;
diff --git a/library/std/src/sys/path/windows.rs b/library/std/src/sys/path/windows.rs
index e0e003f6a81..f124e1e5a71 100644
--- a/library/std/src/sys/path/windows.rs
+++ b/library/std/src/sys/path/windows.rs
@@ -1,5 +1,5 @@
 use crate::ffi::{OsStr, OsString};
-use crate::path::{Path, PathBuf, Prefix};
+use crate::path::{Path, PathBuf};
 use crate::sys::api::utf16;
 use crate::sys::pal::{c, fill_utf16_buf, os2path, to_u16s};
 use crate::{io, ptr};
@@ -7,6 +7,8 @@ use crate::{io, ptr};
 #[cfg(test)]
 mod tests;
 
+pub use super::windows_prefix::parse_prefix;
+
 pub const MAIN_SEP_STR: &str = "\\";
 pub const MAIN_SEP: char = '\\';
 
@@ -77,177 +79,6 @@ pub(crate) fn append_suffix(path: PathBuf, suffix: &OsStr) -> PathBuf {
     path.into()
 }
 
-struct PrefixParser<'a, const LEN: usize> {
-    path: &'a OsStr,
-    prefix: [u8; LEN],
-}
-
-impl<'a, const LEN: usize> PrefixParser<'a, LEN> {
-    #[inline]
-    fn get_prefix(path: &OsStr) -> [u8; LEN] {
-        let mut prefix = [0; LEN];
-        // SAFETY: Only ASCII characters are modified.
-        for (i, &ch) in path.as_encoded_bytes().iter().take(LEN).enumerate() {
-            prefix[i] = if ch == b'/' { b'\\' } else { ch };
-        }
-        prefix
-    }
-
-    fn new(path: &'a OsStr) -> Self {
-        Self { path, prefix: Self::get_prefix(path) }
-    }
-
-    fn as_slice(&self) -> PrefixParserSlice<'a, '_> {
-        PrefixParserSlice {
-            path: self.path,
-            prefix: &self.prefix[..LEN.min(self.path.len())],
-            index: 0,
-        }
-    }
-}
-
-struct PrefixParserSlice<'a, 'b> {
-    path: &'a OsStr,
-    prefix: &'b [u8],
-    index: usize,
-}
-
-impl<'a> PrefixParserSlice<'a, '_> {
-    fn strip_prefix(&self, prefix: &str) -> Option<Self> {
-        self.prefix[self.index..]
-            .starts_with(prefix.as_bytes())
-            .then_some(Self { index: self.index + prefix.len(), ..*self })
-    }
-
-    fn prefix_bytes(&self) -> &'a [u8] {
-        &self.path.as_encoded_bytes()[..self.index]
-    }
-
-    fn finish(self) -> &'a OsStr {
-        // SAFETY: The unsafety here stems from converting between &OsStr and
-        // &[u8] and back. This is safe to do because (1) we only look at ASCII
-        // contents of the encoding and (2) new &OsStr values are produced only
-        // from ASCII-bounded slices of existing &OsStr values.
-        unsafe { OsStr::from_encoded_bytes_unchecked(&self.path.as_encoded_bytes()[self.index..]) }
-    }
-}
-
-pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
-    use Prefix::{DeviceNS, Disk, UNC, Verbatim, VerbatimDisk, VerbatimUNC};
-
-    let parser = PrefixParser::<8>::new(path);
-    let parser = parser.as_slice();
-    if let Some(parser) = parser.strip_prefix(r"\\") {
-        // \\
-
-        // The meaning of verbatim paths can change when they use a different
-        // separator.
-        if let Some(parser) = parser.strip_prefix(r"?\")
-            && !parser.prefix_bytes().iter().any(|&x| x == b'/')
-        {
-            // \\?\
-            if let Some(parser) = parser.strip_prefix(r"UNC\") {
-                // \\?\UNC\server\share
-
-                let path = parser.finish();
-                let (server, path) = parse_next_component(path, true);
-                let (share, _) = parse_next_component(path, true);
-
-                Some(VerbatimUNC(server, share))
-            } else {
-                let path = parser.finish();
-
-                // in verbatim paths only recognize an exact drive prefix
-                if let Some(drive) = parse_drive_exact(path) {
-                    // \\?\C:
-                    Some(VerbatimDisk(drive))
-                } else {
-                    // \\?\prefix
-                    let (prefix, _) = parse_next_component(path, true);
-                    Some(Verbatim(prefix))
-                }
-            }
-        } else if let Some(parser) = parser.strip_prefix(r".\") {
-            // \\.\COM42
-            let path = parser.finish();
-            let (prefix, _) = parse_next_component(path, false);
-            Some(DeviceNS(prefix))
-        } else {
-            let path = parser.finish();
-            let (server, path) = parse_next_component(path, false);
-            let (share, _) = parse_next_component(path, false);
-
-            if !server.is_empty() && !share.is_empty() {
-                // \\server\share
-                Some(UNC(server, share))
-            } else {
-                // no valid prefix beginning with "\\" recognized
-                None
-            }
-        }
-    } else {
-        // If it has a drive like `C:` then it's a disk.
-        // Otherwise there is no prefix.
-        parse_drive(path).map(Disk)
-    }
-}
-
-// Parses a drive prefix, e.g. "C:" and "C:\whatever"
-fn parse_drive(path: &OsStr) -> Option<u8> {
-    // In most DOS systems, it is not possible to have more than 26 drive letters.
-    // See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
-    fn is_valid_drive_letter(drive: &u8) -> bool {
-        drive.is_ascii_alphabetic()
-    }
-
-    match path.as_encoded_bytes() {
-        [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
-        _ => None,
-    }
-}
-
-// Parses a drive prefix exactly, e.g. "C:"
-fn parse_drive_exact(path: &OsStr) -> Option<u8> {
-    // only parse two bytes: the drive letter and the drive separator
-    if path.as_encoded_bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) {
-        parse_drive(path)
-    } else {
-        None
-    }
-}
-
-// Parse the next path component.
-//
-// Returns the next component and the rest of the path excluding the component and separator.
-// Does not recognize `/` as a separator character if `verbatim` is true.
-fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
-    let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
-
-    match path.as_encoded_bytes().iter().position(|&x| separator(x)) {
-        Some(separator_start) => {
-            let separator_end = separator_start + 1;
-
-            let component = &path.as_encoded_bytes()[..separator_start];
-
-            // Panic safe
-            // The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
-            let path = &path.as_encoded_bytes()[separator_end..];
-
-            // SAFETY: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
-            // is encoded in a single byte, therefore `bytes[separator_start]` and
-            // `bytes[separator_end]` must be code point boundaries and thus
-            // `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
-            unsafe {
-                (
-                    OsStr::from_encoded_bytes_unchecked(component),
-                    OsStr::from_encoded_bytes_unchecked(path),
-                )
-            }
-        }
-        None => (path, OsStr::new("")),
-    }
-}
-
 /// Returns a UTF-16 encoded path capable of bypassing the legacy `MAX_PATH` limits.
 ///
 /// This path may or may not have a verbatim prefix.
diff --git a/library/std/src/sys/path/windows/tests.rs b/library/std/src/sys/path/windows/tests.rs
index 9eb79203dca..830f48d7bfc 100644
--- a/library/std/src/sys/path/windows/tests.rs
+++ b/library/std/src/sys/path/windows/tests.rs
@@ -1,4 +1,6 @@
+use super::super::windows_prefix::*;
 use super::*;
+use crate::path::Prefix;
 
 #[test]
 fn test_parse_next_component() {
diff --git a/library/std/src/sys/path/windows_prefix.rs b/library/std/src/sys/path/windows_prefix.rs
new file mode 100644
index 00000000000..b9dfe754485
--- /dev/null
+++ b/library/std/src/sys/path/windows_prefix.rs
@@ -0,0 +1,182 @@
+//! Parse Windows prefixes, for both Windows and Cygwin.
+
+use super::{is_sep_byte, is_verbatim_sep};
+use crate::ffi::OsStr;
+use crate::path::Prefix;
+
+struct PrefixParser<'a, const LEN: usize> {
+    path: &'a OsStr,
+    prefix: [u8; LEN],
+}
+
+impl<'a, const LEN: usize> PrefixParser<'a, LEN> {
+    #[inline]
+    fn get_prefix(path: &OsStr) -> [u8; LEN] {
+        let mut prefix = [0; LEN];
+        // SAFETY: Only ASCII characters are modified.
+        for (i, &ch) in path.as_encoded_bytes().iter().take(LEN).enumerate() {
+            prefix[i] = if ch == b'/' { b'\\' } else { ch };
+        }
+        prefix
+    }
+
+    fn new(path: &'a OsStr) -> Self {
+        Self { path, prefix: Self::get_prefix(path) }
+    }
+
+    fn as_slice(&self) -> PrefixParserSlice<'a, '_> {
+        PrefixParserSlice {
+            path: self.path,
+            prefix: &self.prefix[..LEN.min(self.path.len())],
+            index: 0,
+        }
+    }
+}
+
+struct PrefixParserSlice<'a, 'b> {
+    path: &'a OsStr,
+    prefix: &'b [u8],
+    index: usize,
+}
+
+impl<'a> PrefixParserSlice<'a, '_> {
+    fn strip_prefix(&self, prefix: &str) -> Option<Self> {
+        self.prefix[self.index..]
+            .starts_with(prefix.as_bytes())
+            .then_some(Self { index: self.index + prefix.len(), ..*self })
+    }
+
+    fn prefix_bytes(&self) -> &'a [u8] {
+        &self.path.as_encoded_bytes()[..self.index]
+    }
+
+    fn finish(self) -> &'a OsStr {
+        // SAFETY: The unsafety here stems from converting between &OsStr and
+        // &[u8] and back. This is safe to do because (1) we only look at ASCII
+        // contents of the encoding and (2) new &OsStr values are produced only
+        // from ASCII-bounded slices of existing &OsStr values.
+        unsafe { OsStr::from_encoded_bytes_unchecked(&self.path.as_encoded_bytes()[self.index..]) }
+    }
+}
+
+pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
+    use Prefix::{DeviceNS, Disk, UNC, Verbatim, VerbatimDisk, VerbatimUNC};
+
+    let parser = PrefixParser::<8>::new(path);
+    let parser = parser.as_slice();
+    if let Some(parser) = parser.strip_prefix(r"\\") {
+        // \\
+
+        // It's a POSIX path.
+        if cfg!(target_os = "cygwin") && !path.as_encoded_bytes().iter().any(|&x| x == b'\\') {
+            return None;
+        }
+
+        // The meaning of verbatim paths can change when they use a different
+        // separator.
+        if let Some(parser) = parser.strip_prefix(r"?\")
+            // Cygwin allows `/` in verbatim paths.
+            && (cfg!(target_os = "cygwin") || !parser.prefix_bytes().iter().any(|&x| x == b'/'))
+        {
+            // \\?\
+            if let Some(parser) = parser.strip_prefix(r"UNC\") {
+                // \\?\UNC\server\share
+
+                let path = parser.finish();
+                let (server, path) = parse_next_component(path, true);
+                let (share, _) = parse_next_component(path, true);
+
+                Some(VerbatimUNC(server, share))
+            } else {
+                let path = parser.finish();
+
+                // in verbatim paths only recognize an exact drive prefix
+                if let Some(drive) = parse_drive_exact(path) {
+                    // \\?\C:
+                    Some(VerbatimDisk(drive))
+                } else {
+                    // \\?\prefix
+                    let (prefix, _) = parse_next_component(path, true);
+                    Some(Verbatim(prefix))
+                }
+            }
+        } else if let Some(parser) = parser.strip_prefix(r".\") {
+            // \\.\COM42
+            let path = parser.finish();
+            let (prefix, _) = parse_next_component(path, false);
+            Some(DeviceNS(prefix))
+        } else {
+            let path = parser.finish();
+            let (server, path) = parse_next_component(path, false);
+            let (share, _) = parse_next_component(path, false);
+
+            if !server.is_empty() && !share.is_empty() {
+                // \\server\share
+                Some(UNC(server, share))
+            } else {
+                // no valid prefix beginning with "\\" recognized
+                None
+            }
+        }
+    } else {
+        // If it has a drive like `C:` then it's a disk.
+        // Otherwise there is no prefix.
+        Some(Disk(parse_drive(path)?))
+    }
+}
+
+// Parses a drive prefix, e.g. "C:" and "C:\whatever"
+fn parse_drive(path: &OsStr) -> Option<u8> {
+    // In most DOS systems, it is not possible to have more than 26 drive letters.
+    // See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
+    fn is_valid_drive_letter(drive: &u8) -> bool {
+        drive.is_ascii_alphabetic()
+    }
+
+    match path.as_encoded_bytes() {
+        [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
+        _ => None,
+    }
+}
+
+// Parses a drive prefix exactly, e.g. "C:"
+fn parse_drive_exact(path: &OsStr) -> Option<u8> {
+    // only parse two bytes: the drive letter and the drive separator
+    if path.as_encoded_bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) {
+        parse_drive(path)
+    } else {
+        None
+    }
+}
+
+// Parse the next path component.
+//
+// Returns the next component and the rest of the path excluding the component and separator.
+// Does not recognize `/` as a separator character on Windows if `verbatim` is true.
+pub(crate) fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
+    let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
+
+    match path.as_encoded_bytes().iter().position(|&x| separator(x)) {
+        Some(separator_start) => {
+            let separator_end = separator_start + 1;
+
+            let component = &path.as_encoded_bytes()[..separator_start];
+
+            // Panic safe
+            // The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
+            let path = &path.as_encoded_bytes()[separator_end..];
+
+            // SAFETY: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
+            // is encoded in a single byte, therefore `bytes[separator_start]` and
+            // `bytes[separator_end]` must be code point boundaries and thus
+            // `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
+            unsafe {
+                (
+                    OsStr::from_encoded_bytes_unchecked(component),
+                    OsStr::from_encoded_bytes_unchecked(path),
+                )
+            }
+        }
+        None => (path, OsStr::new("")),
+    }
+}
author	王宇逸 <Strawberry_Str@hotmail.com>	2025-06-01 23:12:35 +0800
committer	王宇逸 <Strawberry_Str@hotmail.com>	2025-06-16 09:24:07 +0800
commit	3cb0cba054d9d1871f3a10345d5c30cfc7ac214c (patch)
tree	8e8c225bf85c5c9afc3afe80cdb7b5ebf49140ba /library/std/src/sys
parent	015c7770ec0ffdba9ff03f1861144a827497f8ca (diff)
download	rust-3cb0cba054d9d1871f3a10345d5c30cfc7ac214c.tar.gz rust-3cb0cba054d9d1871f3a10345d5c30cfc7ac214c.zip