diff options
| author | bors <bors@rust-lang.org> | 2023-05-30 21:26:22 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2023-05-30 21:26:22 +0000 |
| commit | 9610dfe5a9a731ced1ea4923ecbd0c57fe367898 (patch) | |
| tree | 9d34e6dd7c85cc272964c3b8232e401cea3bbfea /library/std/src/sys | |
| parent | f0411ffcebcd7f75ac02ed45feb53ffd07b75398 (diff) | |
| parent | e6a35c49533dd734f178b04677404df0da518737 (diff) | |
| download | rust-9610dfe5a9a731ced1ea4923ecbd0c57fe367898.tar.gz rust-9610dfe5a9a731ced1ea4923ecbd0c57fe367898.zip | |
Auto merge of #109698 - epage:wtf, r=Amanieu
Allow limited access to `OsStr` bytes `OsStr` has historically kept its implementation details private out of concern for locking us into a specific encoding on Windows. This is an alternative to rust-lang#95290 which proposed specifying the encoding on Windows. Instead, this only specifies that for cross-platform code, `OsStr`'s encoding is a superset of UTF-8 and defines rules for safely interacting with it At minimum, this can greatly simplify the `os_str_bytes` crate and every arg parser that interacts with `OsStr` directly (which is most of those that support invalid UTF-8). Tracking issue: #111544
Diffstat (limited to 'library/std/src/sys')
| -rw-r--r-- | library/std/src/sys/common/small_c_string.rs | 2 | ||||
| -rw-r--r-- | library/std/src/sys/common/tests.rs | 4 | ||||
| -rw-r--r-- | library/std/src/sys/unix/os_str.rs | 9 | ||||
| -rw-r--r-- | library/std/src/sys/unix/os_str/tests.rs | 9 | ||||
| -rw-r--r-- | library/std/src/sys/unix/path.rs | 2 | ||||
| -rw-r--r-- | library/std/src/sys/unix/process/process_common.rs | 4 | ||||
| -rw-r--r-- | library/std/src/sys/windows/args.rs | 6 | ||||
| -rw-r--r-- | library/std/src/sys/windows/os_str.rs | 10 | ||||
| -rw-r--r-- | library/std/src/sys/windows/path.rs | 42 | ||||
| -rw-r--r-- | library/std/src/sys/windows/process.rs | 4 |
10 files changed, 51 insertions, 41 deletions
diff --git a/library/std/src/sys/common/small_c_string.rs b/library/std/src/sys/common/small_c_string.rs index 01acd519135..963d17a47e4 100644 --- a/library/std/src/sys/common/small_c_string.rs +++ b/library/std/src/sys/common/small_c_string.rs @@ -19,7 +19,7 @@ pub fn run_path_with_cstr<T, F>(path: &Path, f: F) -> io::Result<T> where F: FnOnce(&CStr) -> io::Result<T>, { - run_with_cstr(path.as_os_str().bytes(), f) + run_with_cstr(path.as_os_str().as_os_str_bytes(), f) } #[inline] diff --git a/library/std/src/sys/common/tests.rs b/library/std/src/sys/common/tests.rs index fb6f5d6af83..0a1cbcbe8ef 100644 --- a/library/std/src/sys/common/tests.rs +++ b/library/std/src/sys/common/tests.rs @@ -8,7 +8,7 @@ use core::iter::repeat; fn stack_allocation_works() { let path = Path::new("abc"); let result = run_path_with_cstr(path, |p| { - assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap()); + assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap()); Ok(42) }); assert_eq!(result.unwrap(), 42); @@ -25,7 +25,7 @@ fn heap_allocation_works() { let path = repeat("a").take(384).collect::<String>(); let path = Path::new(&path); let result = run_path_with_cstr(path, |p| { - assert_eq!(p, &*CString::new(path.as_os_str().bytes()).unwrap()); + assert_eq!(p, &*CString::new(path.as_os_str().as_os_str_bytes()).unwrap()); Ok(42) }); assert_eq!(result.unwrap(), 42); diff --git a/library/std/src/sys/unix/os_str.rs b/library/std/src/sys/unix/os_str.rs index 488217f3941..142fcb9ed0b 100644 --- a/library/std/src/sys/unix/os_str.rs +++ b/library/std/src/sys/unix/os_str.rs @@ -193,13 +193,18 @@ impl Buf { impl Slice { #[inline] - fn from_u8_slice(s: &[u8]) -> &Slice { + pub fn as_os_str_bytes(&self) -> &[u8] { + &self.inner + } + + #[inline] + pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice { unsafe { mem::transmute(s) } } #[inline] pub fn from_str(s: &str) -> &Slice { - Slice::from_u8_slice(s.as_bytes()) + unsafe { Slice::from_os_str_bytes_unchecked(s.as_bytes()) } } pub fn to_str(&self) -> Option<&str> { diff --git a/library/std/src/sys/unix/os_str/tests.rs b/library/std/src/sys/unix/os_str/tests.rs index 22ba0c92350..91bc0e61a4a 100644 --- a/library/std/src/sys/unix/os_str/tests.rs +++ b/library/std/src/sys/unix/os_str/tests.rs @@ -2,7 +2,7 @@ use super::*; #[test] fn slice_debug_output() { - let input = Slice::from_u8_slice(b"\xF0hello,\tworld"); + let input = unsafe { Slice::from_os_str_bytes_unchecked(b"\xF0hello,\tworld") }; let expected = r#""\xF0hello,\tworld""#; let output = format!("{input:?}"); @@ -11,8 +11,7 @@ fn slice_debug_output() { #[test] fn display() { - assert_eq!( - "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", - Slice::from_u8_slice(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string(), - ); + assert_eq!("Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye", unsafe { + Slice::from_os_str_bytes_unchecked(b"Hello\xC0\x80 There\xE6\x83 Goodbye").to_string() + },); } diff --git a/library/std/src/sys/unix/path.rs b/library/std/src/sys/unix/path.rs index a98a69e2db8..935245f637b 100644 --- a/library/std/src/sys/unix/path.rs +++ b/library/std/src/sys/unix/path.rs @@ -30,7 +30,7 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> { // Get the components, skipping the redundant leading "." component if it exists. let mut components = path.strip_prefix(".").unwrap_or(path).components(); - let path_os = path.as_os_str().bytes(); + let path_os = path.as_os_str().as_os_str_bytes(); let mut normalized = if path.is_absolute() { // "If a pathname begins with two successive <slash> characters, the diff --git a/library/std/src/sys/unix/process/process_common.rs b/library/std/src/sys/unix/process/process_common.rs index afd03d79c0b..640648e8707 100644 --- a/library/std/src/sys/unix/process/process_common.rs +++ b/library/std/src/sys/unix/process/process_common.rs @@ -164,9 +164,9 @@ pub enum ProgramKind { impl ProgramKind { fn new(program: &OsStr) -> Self { - if program.bytes().starts_with(b"/") { + if program.as_os_str_bytes().starts_with(b"/") { Self::Absolute - } else if program.bytes().contains(&b'/') { + } else if program.as_os_str_bytes().contains(&b'/') { // If the program has more than one component in it, it is a relative path. Self::Relative } else { diff --git a/library/std/src/sys/windows/args.rs b/library/std/src/sys/windows/args.rs index 5bfd8b52ed0..6b597f499bc 100644 --- a/library/std/src/sys/windows/args.rs +++ b/library/std/src/sys/windows/args.rs @@ -226,7 +226,7 @@ pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> i // that it actually gets passed through on the command line or otherwise // it will be dropped entirely when parsed on the other end. ensure_no_nuls(arg)?; - let arg_bytes = arg.bytes(); + let arg_bytes = arg.as_os_str_bytes(); let (quote, escape) = match quote { Quote::Always => (true, true), Quote::Auto => { @@ -297,7 +297,9 @@ pub(crate) fn make_bat_command_line( // * `|<>` pipe/redirect characters. const SPECIAL: &[u8] = b"\t &()[]{}^=;!'+,`~%|<>"; let force_quotes = match arg { - Arg::Regular(arg) if !force_quotes => arg.bytes().iter().any(|c| SPECIAL.contains(c)), + Arg::Regular(arg) if !force_quotes => { + arg.as_os_str_bytes().iter().any(|c| SPECIAL.contains(c)) + } _ => force_quotes, }; append_arg(&mut cmd, arg, force_quotes)?; diff --git a/library/std/src/sys/windows/os_str.rs b/library/std/src/sys/windows/os_str.rs index 2f2b0e56e08..611f0d040f0 100644 --- a/library/std/src/sys/windows/os_str.rs +++ b/library/std/src/sys/windows/os_str.rs @@ -152,6 +152,16 @@ impl Buf { impl Slice { #[inline] + pub fn as_os_str_bytes(&self) -> &[u8] { + self.inner.as_bytes() + } + + #[inline] + pub unsafe fn from_os_str_bytes_unchecked(s: &[u8]) -> &Slice { + mem::transmute(Wtf8::from_bytes_unchecked(s)) + } + + #[inline] pub fn from_str(s: &str) -> &Slice { unsafe { mem::transmute(Wtf8::from_str(s)) } } diff --git a/library/std/src/sys/windows/path.rs b/library/std/src/sys/windows/path.rs index c3573d14c7f..c9c2d10e6c4 100644 --- a/library/std/src/sys/windows/path.rs +++ b/library/std/src/sys/windows/path.rs @@ -1,7 +1,6 @@ use super::{c, fill_utf16_buf, to_u16s}; use crate::ffi::{OsStr, OsString}; use crate::io; -use crate::mem; use crate::path::{Path, PathBuf, Prefix}; use crate::ptr; @@ -11,16 +10,6 @@ mod tests; pub const MAIN_SEP_STR: &str = "\\"; pub const MAIN_SEP: char = '\\'; -/// # Safety -/// -/// `bytes` must be a valid wtf8 encoded slice -#[inline] -unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr { - // &OsStr is layout compatible with &Slice, which is compatible with &Wtf8, - // which is compatible with &[u8]. - mem::transmute(bytes) -} - #[inline] pub fn is_sep_byte(b: u8) -> bool { b == b'/' || b == b'\\' @@ -33,12 +22,12 @@ pub fn is_verbatim_sep(b: u8) -> bool { /// Returns true if `path` looks like a lone filename. pub(crate) fn is_file_name(path: &OsStr) -> bool { - !path.bytes().iter().copied().any(is_sep_byte) + !path.as_os_str_bytes().iter().copied().any(is_sep_byte) } pub(crate) fn has_trailing_slash(path: &OsStr) -> bool { - let is_verbatim = path.bytes().starts_with(br"\\?\"); + let is_verbatim = path.as_os_str_bytes().starts_with(br"\\?\"); let is_separator = if is_verbatim { is_verbatim_sep } else { is_sep_byte }; - if let Some(&c) = path.bytes().last() { is_separator(c) } else { false } + if let Some(&c) = path.as_os_str_bytes().last() { is_separator(c) } else { false } } /// Appends a suffix to a path. @@ -60,7 +49,7 @@ impl<'a, const LEN: usize> PrefixParser<'a, LEN> { fn get_prefix(path: &OsStr) -> [u8; LEN] { let mut prefix = [0; LEN]; // SAFETY: Only ASCII characters are modified. - for (i, &ch) in path.bytes().iter().take(LEN).enumerate() { + for (i, &ch) in path.as_os_str_bytes().iter().take(LEN).enumerate() { prefix[i] = if ch == b'/' { b'\\' } else { ch }; } prefix @@ -93,7 +82,7 @@ impl<'a> PrefixParserSlice<'a, '_> { } fn prefix_bytes(&self) -> &'a [u8] { - &self.path.bytes()[..self.index] + &self.path.as_os_str_bytes()[..self.index] } fn finish(self) -> &'a OsStr { @@ -101,7 +90,7 @@ impl<'a> PrefixParserSlice<'a, '_> { // &[u8] and back. This is safe to do because (1) we only look at ASCII // contents of the encoding and (2) new &OsStr values are produced only // from ASCII-bounded slices of existing &OsStr values. - unsafe { bytes_as_os_str(&self.path.bytes()[self.index..]) } + unsafe { OsStr::from_os_str_bytes_unchecked(&self.path.as_os_str_bytes()[self.index..]) } } } @@ -173,7 +162,7 @@ fn parse_drive(path: &OsStr) -> Option<u8> { drive.is_ascii_alphabetic() } - match path.bytes() { + match path.as_os_str_bytes() { [drive, b':', ..] if is_valid_drive_letter(drive) => Some(drive.to_ascii_uppercase()), _ => None, } @@ -182,7 +171,7 @@ fn parse_drive(path: &OsStr) -> Option<u8> { // Parses a drive prefix exactly, e.g. "C:" fn parse_drive_exact(path: &OsStr) -> Option<u8> { // only parse two bytes: the drive letter and the drive separator - if path.bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) { + if path.as_os_str_bytes().get(2).map(|&x| is_sep_byte(x)).unwrap_or(true) { parse_drive(path) } else { None @@ -196,21 +185,26 @@ fn parse_drive_exact(path: &OsStr) -> Option<u8> { fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) { let separator = if verbatim { is_verbatim_sep } else { is_sep_byte }; - match path.bytes().iter().position(|&x| separator(x)) { + match path.as_os_str_bytes().iter().position(|&x| separator(x)) { Some(separator_start) => { let separator_end = separator_start + 1; - let component = &path.bytes()[..separator_start]; + let component = &path.as_os_str_bytes()[..separator_start]; // Panic safe // The max `separator_end` is `bytes.len()` and `bytes[bytes.len()..]` is a valid index. - let path = &path.bytes()[separator_end..]; + let path = &path.as_os_str_bytes()[separator_end..]; // SAFETY: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\') // is encoded in a single byte, therefore `bytes[separator_start]` and // `bytes[separator_end]` must be code point boundaries and thus // `bytes[..separator_start]` and `bytes[separator_end..]` are valid wtf8 slices. - unsafe { (bytes_as_os_str(component), bytes_as_os_str(path)) } + unsafe { + ( + OsStr::from_os_str_bytes_unchecked(component), + OsStr::from_os_str_bytes_unchecked(path), + ) + } } None => (path, OsStr::new("")), } @@ -329,7 +323,7 @@ pub(crate) fn absolute(path: &Path) -> io::Result<PathBuf> { // Verbatim paths should not be modified. if prefix.map(|x| x.is_verbatim()).unwrap_or(false) { // NULs in verbatim paths are rejected for consistency. - if path.bytes().contains(&0) { + if path.as_os_str_bytes().contains(&0) { return Err(io::const_io_error!( io::ErrorKind::InvalidInput, "strings passed to WinAPI cannot contain NULs", diff --git a/library/std/src/sys/windows/process.rs b/library/std/src/sys/windows/process.rs index df3667c0fd7..a573a05c39c 100644 --- a/library/std/src/sys/windows/process.rs +++ b/library/std/src/sys/windows/process.rs @@ -395,7 +395,7 @@ fn resolve_exe<'a>( // Test if the file name has the `exe` extension. // This does a case-insensitive `ends_with`. let has_exe_suffix = if exe_path.len() >= EXE_SUFFIX.len() { - exe_path.bytes()[exe_path.len() - EXE_SUFFIX.len()..] + exe_path.as_os_str_bytes()[exe_path.len() - EXE_SUFFIX.len()..] .eq_ignore_ascii_case(EXE_SUFFIX.as_bytes()) } else { false @@ -425,7 +425,7 @@ fn resolve_exe<'a>( // From the `CreateProcessW` docs: // > If the file name does not contain an extension, .exe is appended. // Note that this rule only applies when searching paths. - let has_extension = exe_path.bytes().contains(&b'.'); + let has_extension = exe_path.as_os_str_bytes().contains(&b'.'); // Search the directories given by `search_paths`. let result = search_paths(parent_paths, child_paths, |mut path| { |
