diff options
Diffstat (limited to 'library/std/src/sys/stdio/windows.rs')
| -rw-r--r-- | library/std/src/sys/stdio/windows.rs | 472 |
1 files changed, 472 insertions, 0 deletions
diff --git a/library/std/src/sys/stdio/windows.rs b/library/std/src/sys/stdio/windows.rs new file mode 100644 index 00000000000..9b27f76b9dd --- /dev/null +++ b/library/std/src/sys/stdio/windows.rs @@ -0,0 +1,472 @@ +#![unstable(issue = "none", feature = "windows_stdio")] + +use core::char::MAX_LEN_UTF8; +use core::str::utf8_char_width; + +use crate::mem::MaybeUninit; +use crate::os::windows::io::{FromRawHandle, IntoRawHandle}; +use crate::sys::handle::Handle; +use crate::sys::pal::api::{self, WinError}; +use crate::sys::{c, cvt}; +use crate::{cmp, io, ptr, str}; + +#[cfg(test)] +mod tests; + +// Don't cache handles but get them fresh for every read/write. This allows us to track changes to +// the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490. +pub struct Stdin { + surrogate: u16, + incomplete_utf8: IncompleteUtf8, +} + +pub struct Stdout { + incomplete_utf8: IncompleteUtf8, +} + +pub struct Stderr { + incomplete_utf8: IncompleteUtf8, +} + +struct IncompleteUtf8 { + bytes: [u8; 4], + len: u8, +} + +impl IncompleteUtf8 { + // Implemented for use in Stdin::read. + fn read(&mut self, buf: &mut [u8]) -> usize { + // Write to buffer until the buffer is full or we run out of bytes. + let to_write = cmp::min(buf.len(), self.len as usize); + buf[..to_write].copy_from_slice(&self.bytes[..to_write]); + + // Rotate the remaining bytes if not enough remaining space in buffer. + if usize::from(self.len) > buf.len() { + self.bytes.copy_within(to_write.., 0); + self.len -= to_write as u8; + } else { + self.len = 0; + } + + to_write + } +} + +// Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see +// #13304 for details). +// +// From MSDN (2011): "The storage for this buffer is allocated from a shared heap for the +// process that is 64 KB in size. The maximum size of the buffer will depend on heap usage." +// +// We choose the cap at 8 KiB because libuv does the same, and it seems to be acceptable so far. +const MAX_BUFFER_SIZE: usize = 8192; + +// The standard buffer size of BufReader for Stdin should be able to hold 3x more bytes than there +// are `u16`'s in MAX_BUFFER_SIZE. This ensures the read data can always be completely decoded from +// UTF-16 to UTF-8. +pub const STDIN_BUF_SIZE: usize = MAX_BUFFER_SIZE / 2 * 3; + +pub fn get_handle(handle_id: u32) -> io::Result<c::HANDLE> { + let handle = unsafe { c::GetStdHandle(handle_id) }; + if handle == c::INVALID_HANDLE_VALUE { + Err(io::Error::last_os_error()) + } else if handle.is_null() { + Err(io::Error::from_raw_os_error(c::ERROR_INVALID_HANDLE as i32)) + } else { + Ok(handle) + } +} + +fn is_console(handle: c::HANDLE) -> bool { + // `GetConsoleMode` will return false (0) if this is a pipe (we don't care about the reported + // mode). This will only detect Windows Console, not other terminals connected to a pipe like + // MSYS. Which is exactly what we need, as only Windows Console needs a conversion to UTF-16. + let mut mode = 0; + unsafe { c::GetConsoleMode(handle, &mut mode) != 0 } +} + +/// Returns true if the attached console's code page is currently UTF-8. +#[cfg(not(target_vendor = "win7"))] +fn is_utf8_console() -> bool { + unsafe { c::GetConsoleOutputCP() == c::CP_UTF8 } +} + +#[cfg(target_vendor = "win7")] +fn is_utf8_console() -> bool { + // Windows 7 has a fun "feature" where WriteFile on a console handle will return + // the number of UTF-16 code units written and not the number of bytes from the input string. + // So we always claim the console isn't UTF-8 to trigger the WriteConsole fallback code. + false +} + +fn write(handle_id: u32, data: &[u8], incomplete_utf8: &mut IncompleteUtf8) -> io::Result<usize> { + if data.is_empty() { + return Ok(0); + } + + let handle = get_handle(handle_id)?; + if !is_console(handle) || is_utf8_console() { + unsafe { + let handle = Handle::from_raw_handle(handle); + let ret = handle.write(data); + let _ = handle.into_raw_handle(); // Don't close the handle + return ret; + } + } else { + write_console_utf16(data, incomplete_utf8, handle) + } +} + +fn write_console_utf16( + data: &[u8], + incomplete_utf8: &mut IncompleteUtf8, + handle: c::HANDLE, +) -> io::Result<usize> { + if incomplete_utf8.len > 0 { + assert!( + incomplete_utf8.len < 4, + "Unexpected number of bytes for incomplete UTF-8 codepoint." + ); + if data[0] >> 6 != 0b10 { + // not a continuation byte - reject + incomplete_utf8.len = 0; + return Err(io::const_error!( + io::ErrorKind::InvalidData, + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0]; + incomplete_utf8.len += 1; + let char_width = utf8_char_width(incomplete_utf8.bytes[0]); + if (incomplete_utf8.len as usize) < char_width { + // more bytes needed + return Ok(1); + } + let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]); + incomplete_utf8.len = 0; + match s { + Ok(s) => { + assert_eq!(char_width, s.len()); + let written = write_valid_utf8_to_console(handle, s)?; + assert_eq!(written, s.len()); // guaranteed by write_valid_utf8_to_console() for single codepoint writes + return Ok(1); + } + Err(_) => { + return Err(io::const_error!( + io::ErrorKind::InvalidData, + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + } + } + + // As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8, + // which needs to be encoded as UTF-16. + // + // If the data is not valid UTF-8 we write out as many bytes as are valid. + // If the first byte is invalid it is either first byte of a multi-byte sequence but the + // provided byte slice is too short or it is the first byte of an invalid multi-byte sequence. + let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2); + let utf8 = match str::from_utf8(&data[..len]) { + Ok(s) => s, + Err(ref e) if e.valid_up_to() == 0 => { + let first_byte_char_width = utf8_char_width(data[0]); + if first_byte_char_width > 1 && data.len() < first_byte_char_width { + incomplete_utf8.bytes[0] = data[0]; + incomplete_utf8.len = 1; + return Ok(1); + } else { + return Err(io::const_error!( + io::ErrorKind::InvalidData, + "Windows stdio in console mode does not support writing non-UTF-8 byte sequences", + )); + } + } + Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(), + }; + + write_valid_utf8_to_console(handle, utf8) +} + +fn write_valid_utf8_to_console(handle: c::HANDLE, utf8: &str) -> io::Result<usize> { + debug_assert!(!utf8.is_empty()); + + let mut utf16 = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2]; + let utf8 = &utf8[..utf8.floor_char_boundary(utf16.len())]; + + let utf16: &[u16] = unsafe { + // Note that this theoretically checks validity twice in the (most common) case + // where the underlying byte sequence is valid utf-8 (given the check in `write()`). + let result = c::MultiByteToWideChar( + c::CP_UTF8, // CodePage + c::MB_ERR_INVALID_CHARS, // dwFlags + utf8.as_ptr(), // lpMultiByteStr + utf8.len() as i32, // cbMultiByte + utf16.as_mut_ptr() as *mut c::WCHAR, // lpWideCharStr + utf16.len() as i32, // cchWideChar + ); + assert!(result != 0, "Unexpected error in MultiByteToWideChar"); + + // Safety: MultiByteToWideChar initializes `result` values. + utf16[..result as usize].assume_init_ref() + }; + + let mut written = write_u16s(handle, utf16)?; + + // Figure out how many bytes of as UTF-8 were written away as UTF-16. + if written == utf16.len() { + Ok(utf8.len()) + } else { + // Make sure we didn't end up writing only half of a surrogate pair (even though the chance + // is tiny). Because it is not possible for user code to re-slice `data` in such a way that + // a missing surrogate can be produced (and also because of the UTF-8 validation above), + // write the missing surrogate out now. + // Buffering it would mean we have to lie about the number of bytes written. + let first_code_unit_remaining = utf16[written]; + if matches!(first_code_unit_remaining, 0xDCEE..=0xDFFF) { + // low surrogate + // We just hope this works, and give up otherwise + let _ = write_u16s(handle, &utf16[written..written + 1]); + written += 1; + } + // Calculate the number of bytes of `utf8` that were actually written. + let mut count = 0; + for ch in utf16[..written].iter() { + count += match ch { + 0x0000..=0x007F => 1, + 0x0080..=0x07FF => 2, + 0xDCEE..=0xDFFF => 1, // Low surrogate. We already counted 3 bytes for the other. + _ => 3, + }; + } + debug_assert!(String::from_utf16(&utf16[..written]).unwrap() == utf8[..count]); + Ok(count) + } +} + +fn write_u16s(handle: c::HANDLE, data: &[u16]) -> io::Result<usize> { + debug_assert!(data.len() < u32::MAX as usize); + let mut written = 0; + cvt(unsafe { + c::WriteConsoleW(handle, data.as_ptr(), data.len() as u32, &mut written, ptr::null_mut()) + })?; + Ok(written as usize) +} + +impl Stdin { + pub const fn new() -> Stdin { + Stdin { surrogate: 0, incomplete_utf8: IncompleteUtf8::new() } + } +} + +impl io::Read for Stdin { + fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { + let handle = get_handle(c::STD_INPUT_HANDLE)?; + if !is_console(handle) { + unsafe { + let handle = Handle::from_raw_handle(handle); + let ret = handle.read(buf); + let _ = handle.into_raw_handle(); // Don't close the handle + return ret; + } + } + + // If there are bytes in the incomplete utf-8, start with those. + // (No-op if there is nothing in the buffer.) + let mut bytes_copied = self.incomplete_utf8.read(buf); + + if bytes_copied == buf.len() { + Ok(bytes_copied) + } else if buf.len() - bytes_copied < 4 { + // Not enough space to get a UTF-8 byte. We will use the incomplete UTF8. + let mut utf16_buf = [MaybeUninit::new(0); 1]; + // Read one u16 character. + let read = read_u16s_fixup_surrogates(handle, &mut utf16_buf, 1, &mut self.surrogate)?; + // Read bytes, using the (now-empty) self.incomplete_utf8 as extra space. + let read_bytes = utf16_to_utf8( + unsafe { utf16_buf[..read].assume_init_ref() }, + &mut self.incomplete_utf8.bytes, + )?; + + // Read in the bytes from incomplete_utf8 until the buffer is full. + self.incomplete_utf8.len = read_bytes as u8; + // No-op if no bytes. + bytes_copied += self.incomplete_utf8.read(&mut buf[bytes_copied..]); + Ok(bytes_copied) + } else { + let mut utf16_buf = [MaybeUninit::<u16>::uninit(); MAX_BUFFER_SIZE / 2]; + + // In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16. So + // we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets + // lost. + let amount = cmp::min(buf.len() / 3, utf16_buf.len()); + let read = + read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount, &mut self.surrogate)?; + // Safety `read_u16s_fixup_surrogates` returns the number of items + // initialized. + let utf16s = unsafe { utf16_buf[..read].assume_init_ref() }; + match utf16_to_utf8(utf16s, buf) { + Ok(value) => return Ok(bytes_copied + value), + Err(e) => return Err(e), + } + } + } +} + +// We assume that if the last `u16` is an unpaired surrogate they got sliced apart by our +// buffer size, and keep it around for the next read hoping to put them together. +// This is a best effort, and might not work if we are not the only reader on Stdin. +fn read_u16s_fixup_surrogates( + handle: c::HANDLE, + buf: &mut [MaybeUninit<u16>], + mut amount: usize, + surrogate: &mut u16, +) -> io::Result<usize> { + // Insert possibly remaining unpaired surrogate from last read. + let mut start = 0; + if *surrogate != 0 { + buf[0] = MaybeUninit::new(*surrogate); + *surrogate = 0; + start = 1; + if amount == 1 { + // Special case: `Stdin::read` guarantees we can always read at least one new `u16` + // and combine it with an unpaired surrogate, because the UTF-8 buffer is at least + // 4 bytes. + amount = 2; + } + } + let mut amount = read_u16s(handle, &mut buf[start..amount])? + start; + + if amount > 0 { + // Safety: The returned `amount` is the number of values initialized, + // and it is not 0, so we know that `buf[amount - 1]` have been + // initialized. + let last_char = unsafe { buf[amount - 1].assume_init() }; + if matches!(last_char, 0xD800..=0xDBFF) { + // high surrogate + *surrogate = last_char; + amount -= 1; + } + } + Ok(amount) +} + +// Returns `Ok(n)` if it initialized `n` values in `buf`. +fn read_u16s(handle: c::HANDLE, buf: &mut [MaybeUninit<u16>]) -> io::Result<usize> { + // Configure the `pInputControl` parameter to not only return on `\r\n` but also Ctrl-Z, the + // traditional DOS method to indicate end of character stream / user input (SUB). + // See #38274 and https://stackoverflow.com/questions/43836040/win-api-readconsole. + const CTRL_Z: u16 = 0x1A; + const CTRL_Z_MASK: u32 = 1 << CTRL_Z; + let input_control = c::CONSOLE_READCONSOLE_CONTROL { + nLength: size_of::<c::CONSOLE_READCONSOLE_CONTROL>() as u32, + nInitialChars: 0, + dwCtrlWakeupMask: CTRL_Z_MASK, + dwControlKeyState: 0, + }; + + let mut amount = 0; + loop { + cvt(unsafe { + c::SetLastError(0); + c::ReadConsoleW( + handle, + buf.as_mut_ptr() as *mut core::ffi::c_void, + buf.len() as u32, + &mut amount, + &input_control, + ) + })?; + + // ReadConsoleW returns success with ERROR_OPERATION_ABORTED for Ctrl-C or Ctrl-Break. + // Explicitly check for that case here and try again. + if amount == 0 && api::get_last_error() == WinError::OPERATION_ABORTED { + continue; + } + break; + } + // Safety: if `amount > 0`, then that many bytes were written, so + // `buf[amount as usize - 1]` has been initialized. + if amount > 0 && unsafe { buf[amount as usize - 1].assume_init() } == CTRL_Z { + amount -= 1; + } + Ok(amount as usize) +} + +fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> { + debug_assert!(utf16.len() <= i32::MAX as usize); + debug_assert!(utf8.len() <= i32::MAX as usize); + + if utf16.is_empty() { + return Ok(0); + } + + let result = unsafe { + c::WideCharToMultiByte( + c::CP_UTF8, // CodePage + c::WC_ERR_INVALID_CHARS, // dwFlags + utf16.as_ptr(), // lpWideCharStr + utf16.len() as i32, // cchWideChar + utf8.as_mut_ptr(), // lpMultiByteStr + utf8.len() as i32, // cbMultiByte + ptr::null(), // lpDefaultChar + ptr::null_mut(), // lpUsedDefaultChar + ) + }; + if result == 0 { + // We can't really do any better than forget all data and return an error. + Err(io::const_error!( + io::ErrorKind::InvalidData, + "Windows stdin in console mode does not support non-UTF-16 input; \ + encountered unpaired surrogate", + )) + } else { + Ok(result as usize) + } +} + +impl IncompleteUtf8 { + pub const fn new() -> IncompleteUtf8 { + IncompleteUtf8 { bytes: [0; MAX_LEN_UTF8], len: 0 } + } +} + +impl Stdout { + pub const fn new() -> Stdout { + Stdout { incomplete_utf8: IncompleteUtf8::new() } + } +} + +impl io::Write for Stdout { + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +impl Stderr { + pub const fn new() -> Stderr { + Stderr { incomplete_utf8: IncompleteUtf8::new() } + } +} + +impl io::Write for Stderr { + fn write(&mut self, buf: &[u8]) -> io::Result<usize> { + write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8) + } + + fn flush(&mut self) -> io::Result<()> { + Ok(()) + } +} + +pub fn is_ebadf(err: &io::Error) -> bool { + err.raw_os_error() == Some(c::ERROR_INVALID_HANDLE as i32) +} + +pub fn panic_output() -> Option<impl io::Write> { + Some(Stderr::new()) +} |
