about summary refs log tree commit diff
path: root/library/std/src/sys
diff options
context:
space:
mode:
authorbors <bors@rust-lang.org>2023-05-30 21:26:22 +0000
committerbors <bors@rust-lang.org>2023-05-30 21:26:22 +0000
commit9610dfe5a9a731ced1ea4923ecbd0c57fe367898 (patch)
tree9d34e6dd7c85cc272964c3b8232e401cea3bbfea /library/std/src/sys
parentf0411ffcebcd7f75ac02ed45feb53ffd07b75398 (diff)
parente6a35c49533dd734f178b04677404df0da518737 (diff)
downloadrust-9610dfe5a9a731ced1ea4923ecbd0c57fe367898.tar.gz
rust-9610dfe5a9a731ced1ea4923ecbd0c57fe367898.zip
Auto merge of #109698 - epage:wtf, r=Amanieu
Allow limited access to `OsStr` bytes

`OsStr` has historically kept its implementation details private out of
concern for locking us into a specific encoding on Windows.

This is an alternative to rust-lang#95290 which proposed specifying the encoding on Windows.  Instead, this
only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines
rules for safely interacting with it

At minimum, this can greatly simplify the `os_str_bytes` crate and every
arg parser that interacts with `OsStr` directly (which is most of those
that support invalid UTF-8).

Tracking issue: #111544
Diffstat (limited to 'library/std/src/sys')
-rw-r--r--library/std/src/sys/common/small_c_string.rs2
-rw-r--r--library/std/src/sys/common/tests.rs4
-rw-r--r--library/std/src/sys/unix/os_str.rs9
-rw-r--r--library/std/src/sys/unix/os_str/tests.rs9
-rw-r--r--library/std/src/sys/unix/path.rs2
-rw-r--r--library/std/src/sys/unix/process/process_common.rs4
-rw-r--r--library/std/src/sys/windows/args.rs6
-rw-r--r--library/std/src/sys/windows/os_str.rs10
-rw-r--r--library/std/src/sys/windows/path.rs42
-rw-r--r--library/std/src/sys/windows/process.rs4
10 files changed, 51 insertions, 41 deletions
diff --git a/library/std/src/sys/common/small_c_string.rs b/library/std/src/sys/common/small_c_string.rs
index 01acd519135..963d17a47e4 100644
--- a/library/std/src/sys/common/small_c_string.rs
+++ b/library/std/src/sys/common/small_c_string.rs
@@ -19,7 +19,7 @@ pub fn run_path_with_cstr<T, F>(path: &Path, f: F) -> io::Result<T>
 where
     F: FnOnce(&CStr) -> io::Result<T>,
 {
-    run_with_cstr(path.as_os_str().bytes(), f)
+    run_with_cstr(path.as_os_str().as_os_str_bytes(), f)
 }
 
 #[inline]
diff --git a/library/std/src/sys/common/tests.rs b/library/std/src/sys/common/tests.rs
index fb6f5d6af83..0a1cbcbe8ef 100644
--- a/library/std/src/sys/common/tests.rs
+++ b/library/std/src/sys/common/tests.rs
@@ -8,7 +8,7 @@ use core::iter::repeat;
 fn stack_allocation_works() {
     let path = Path::new("abc");
     let result = run_path_with_cstr(path, |p| {
-        assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap());
+        assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
         Ok(42)
     });
     assert_eq!(result.unwrap(), 42);
@@ -25,7 +25,7 @@ fn heap_allocation_works() {
     let path = repeat("a").take(384).collect::<String>();
     let path = Path::new(&path);
     let result = run_path_with_cstr(path, |p| {
-        assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap());
+        assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap());
         Ok(42)
     });
     assert_eq!(result.unwrap(), 42);
diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs
index 488217f3941..142fcb9ed0b 100644
--- a/library/std/src/sys/unix/os_str.rs
+++ b/library/std/src/sys/unix/os_str.rs
@@ -193,13 +193,18 @@ impl Buf {
 
 impl Slice {
     #[inline]
-    fn from_u8_slice(s: &[u8]) -> &Slice {
+    pub fn as_os_str_bytes(&self) -> &[u8] {
+        &self.inner
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
         unsafe { mem::transmute(s) }
     }
 
     #[inline]
     pub fn from_str(s: &str) -> &Slice {
-        Slice::from_u8_slice(s.as_bytes())
+        unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) }
     }
 
     pub fn to_str(&self) -> Option<&str> {
diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs
index 22ba0c92350..91bc0e61a4a 100644
--- a/library/std/src/sys/unix/os_str/tests.rs
+++ b/library/std/src/sys/unix/os_str/tests.rs
@@ -2,7 +2,7 @@ use super::*;
 
 #[test]
 fn slice_debug_output() {
-    let input = Slice::from_u8_slice(b"\xF0hello,\tworld");
+    let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") };
     let expected = r#""\xF0hello,\tworld""#;
     let output = format!("{input:?}");
 
@@ -11,8 +11,7 @@ fn slice_debug_output() {
 
 #[test]
 fn display() {
-    assert_eq!(
-        "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
-        Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(),
-    );
+    assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe {
+        Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string()
+    },);
 }
diff --git a/library/std/src/sys/unix/path.rs b/library/std/src/sys/unix/path.rs
index a98a69e2db8..935245f637b 100644
--- a/library/std/src/sys/unix/path.rs
+++ b/library/std/src/sys/unix/path.rs
@@ -30,7 +30,7 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {
 
     // Get the components, skipping the redundant leading "." component if it exists.
     let mut components = path.strip_prefix(".").unwrap_or(path).components();
-    let path_os = path.as_os_str().bytes();
+    let path_os = path.as_os_str().as_os_str_bytes();
 
     let mut normalized = if path.is_absolute() {
         // "If a pathname begins with two successive <slash> characters, the
diff --git a/library/std/src/sys/unix/process/process_common.rs b/library/std/src/sys/unix/process/process_common.rs
index afd03d79c0b..640648e8707 100644
--- a/library/std/src/sys/unix/process/process_common.rs
+++ b/library/std/src/sys/unix/process/process_common.rs
@@ -164,9 +164,9 @@ pub enum ProgramKind {
 
 impl ProgramKind {
     fn new(program: &OsStr) -> Self {
-        if program.bytes().starts_with(b"/") {
+        if program.as_os_str_bytes().starts_with(b"/") {
             Self::Absolute
-        } else if program.bytes().contains(&b'/') {
+        } else if program.as_os_str_bytes().contains(&b'/') {
             // If the program has more than one component in it, it is a relative path.
             Self::Relative
         } else {
diff --git a/library/std/src/sys/windows/args.rs b/library/std/src/sys/windows/args.rs
index 5bfd8b52ed0..6b597f499bc 100644
--- a/library/std/src/sys/windows/args.rs
+++ b/library/std/src/sys/windows/args.rs
@@ -226,7 +226,7 @@ pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> i
     // that it actually gets passed through on the command line or otherwise
     // it will be dropped entirely when parsed on the other end.
     ensure_no_nuls(arg)?;
-    let arg_bytes = arg.bytes();
+    let arg_bytes = arg.as_os_str_bytes();
     let (quote, escape) = match quote {
         Quote::Always => (true, true),
         Quote::Auto => {
@@ -297,7 +297,9 @@ pub(crate) fn make_bat_command_line(
         // * `|<>` pipe/redirect characters.
         const SPECIAL: &[u8] = b"\t &()[]{}^=;!'+,`~%|<>";
         let force_quotes = match arg {
-            Arg::Regular(arg) if !force_quotes => arg.bytes().iter().any(|c| SPECIAL.contains(c)),
+            Arg::Regular(arg) if !force_quotes => {
+                arg.as_os_str_bytes().iter().any(|c| SPECIAL.contains(c))
+            }
             _ => force_quotes,
         };
         append_arg(&mut cmd, arg, force_quotes)?;
diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs
index 2f2b0e56e08..611f0d040f0 100644
--- a/library/std/src/sys/windows/os_str.rs
+++ b/library/std/src/sys/windows/os_str.rs
@@ -152,6 +152,16 @@ impl Buf {
 
 impl Slice {
     #[inline]
+    pub fn as_os_str_bytes(&self) -> &[u8] {
+        self.inner.as_bytes()
+    }
+
+    #[inline]
+    pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice {
+        mem::transmute(Wtf8::from_bytes_unchecked(s))
+    }
+
+    #[inline]
     pub fn from_str(s: &str) -> &Slice {
         unsafe { mem::transmute(Wtf8::from_str(s)) }
     }
diff --git a/library/std/src/sys/windows/path.rs b/library/std/src/sys/windows/path.rs
index c3573d14c7f..c9c2d10e6c4 100644
--- a/library/std/src/sys/windows/path.rs
+++ b/library/std/src/sys/windows/path.rs
@@ -1,7 +1,6 @@
 use super::{c, fill_utf16_buf, to_u16s};
 use crate::ffi::{OsStr, OsString};
 use crate::io;
-use crate::mem;
 use crate::path::{Path, PathBuf, Prefix};
 use crate::ptr;
 
@@ -11,16 +10,6 @@ mod tests;
 pub const MAIN_SEP_STR: &str = "\\";
 pub const MAIN_SEP: char = '\\';
 
-/// # Safety
-///
-/// `bytes` must be a valid wtf8 encoded slice
-#[inline]
-unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr {
-    // &OsStr is layout compatible with &Slice, which is compatible with &Wtf8,
-    // which is compatible with &[u8].
-    mem::transmute(bytes)
-}
-
 #[inline]
 pub fn is_sep_byte(b: u8) -> bool {
     b == b'/' || b == b'\\'
@@ -33,12 +22,12 @@ pub fn is_verbatim_sep(b: u8) -> bool {
 
 /// Returns true if `path` looks like a lone filename.
 pub(crate) fn is_file_name(path: &OsStr) -> bool {
-    !path.bytes().iter().copied().any(is_sep_byte)
+    !path.as_os_str_bytes().iter().copied().any(is_sep_byte)
 }
 pub(crate) fn has_trailing_slash(path: &OsStr) -> bool {
-    let is_verbatim = path.bytes().starts_with(br"\\?\");
+    let is_verbatim = path.as_os_str_bytes().starts_with(br"\\?\");
     let is_separator = if is_verbatim { is_verbatim_sep } else { is_sep_byte };
-    if let Some(&c) = path.bytes().last() { is_separator(c) } else { false }
+    if let Some(&c) = path.as_os_str_bytes().last() { is_separator(c) } else { false }
 }
 
 /// Appends a suffix to a path.
@@ -60,7 +49,7 @@ impl<'a, const LEN: usize> PrefixParser<'a, LEN> {
     fn get_prefix(path: &OsStr) -> [u8; LEN] {
         let mut prefix = [0; LEN];
         // SAFETY: Only ASCII characters are modified.
-        for (i, &ch) in path.bytes().iter().take(LEN).enumerate() {
+        for (i, &ch) in path.as_os_str_bytes().iter().take(LEN).enumerate() {
             prefix[i] = if ch == b'/' { b'\\' } else { ch };
         }
         prefix
@@ -93,7 +82,7 @@ impl<'a> PrefixParserSlice<'a, '_> {
     }
 
     fn prefix_bytes(&self) -> &'a [u8] {
-        &self.path.bytes()[..self.index]
+        &self.path.as_os_str_bytes()[..self.index]
     }
 
     fn finish(self) -> &'a OsStr {
@@ -101,7 +90,7 @@ impl<'a> PrefixParserSlice<'a, '_> {
         // &[u8] and back. This is safe to do because (1) we only look at ASCII
         // contents of the encoding and (2) new &OsStr values are produced only
         // from ASCII-bounded slices of existing &OsStr values.
-        unsafe { bytes_as_os_str(&self.path.bytes()[self.index..]) }
+        unsafe { OsStr::from_os_str_bytes_unchecked(&self.path.as_os_str_bytes()[self.index..]) }
     }
 }
 
@@ -173,7 +162,7 @@ fn parse_drive(path: &OsStr) -> Option<u8> {
         drive.is_ascii_alphabetic()
     }
 
-    match path.bytes() {
+    match path.as_os_str_bytes() {
         [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()),
         _ => None,
     }
@@ -182,7 +171,7 @@ fn parse_drive(path: &OsStr) -> Option<u8> {
 // Parses a drive prefix exactly, e.g. "C:"
 fn parse_drive_exact(path: &OsStr) -> Option<u8> {
     // only parse two bytes: the drive letter and the drive separator
-    if path.bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) {
+    if path.as_os_str_bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) {
         parse_drive(path)
     } else {
         None
@@ -196,21 +185,26 @@ fn parse_drive_exact(path: &OsStr) -> Option<u8> {
 fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
     let separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
 
-    match path.bytes().iter().position(|&x| separator(x)) {
+    match path.as_os_str_bytes().iter().position(|&x| separator(x)) {
         Some(separator_start) => {
             let separator_end = separator_start + 1;
 
-            let component = &path.bytes()[..separator_start];
+            let component = &path.as_os_str_bytes()[..separator_start];
 
             // Panic safe
             // The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index.
-            let path = &path.bytes()[separator_end..];
+            let path = &path.as_os_str_bytes()[separator_end..];
 
             // SAFETY: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
             // is encoded in a single byte, therefore `bytes[separator_start]` and
             // `bytes[separator_end]` must be code point boundaries and thus
             // `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices.
-            unsafe { (bytes_as_os_str(component), bytes_as_os_str(path)) }
+            unsafe {
+                (
+                    OsStr::from_os_str_bytes_unchecked(component),
+                    OsStr::from_os_str_bytes_unchecked(path),
+                )
+            }
         }
         None => (path, OsStr::new("")),
     }
@@ -329,7 +323,7 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> {
     // Verbatim paths should not be modified.
     if prefix.map(|x| x.is_verbatim()).unwrap_or(false) {
         // NULs in verbatim paths are rejected for consistency.
-        if path.bytes().contains(&0) {
+        if path.as_os_str_bytes().contains(&0) {
             return Err(io::const_io_error!(
                 io::ErrorKind::InvalidInput,
                 "strings passed to WinAPI cannot contain NULs",
diff --git a/library/std/src/sys/windows/process.rs b/library/std/src/sys/windows/process.rs
index df3667c0fd7..a573a05c39c 100644
--- a/library/std/src/sys/windows/process.rs
+++ b/library/std/src/sys/windows/process.rs
@@ -395,7 +395,7 @@ fn resolve_exe<'a>(
     // Test if the file name has the `exe` extension.
     // This does a case-insensitive `ends_with`.
     let has_exe_suffix = if exe_path.len() >= EXE_SUFFIX.len() {
-        exe_path.bytes()[exe_path.len() - EXE_SUFFIX.len()..]
+        exe_path.as_os_str_bytes()[exe_path.len() - EXE_SUFFIX.len()..]
             .eq_ignore_ascii_case(EXE_SUFFIX.as_bytes())
     } else {
         false
@@ -425,7 +425,7 @@ fn resolve_exe<'a>(
         // From the `CreateProcessW` docs:
         // > If the file name does not contain an extension, .exe is appended.
         // Note that this rule only applies when searching paths.
-        let has_extension = exe_path.bytes().contains(&b'.');
+        let has_extension = exe_path.as_os_str_bytes().contains(&b'.');
 
         // Search the directories given by `search_paths`.
         let result = search_paths(parent_paths, child_paths, |mut path| {