about summary refs log tree commit diff
path: root/library/std/src
diff options
context:
space:
mode:
authorThe8472 <git@infinite-source.de>2020-09-10 22:12:42 +0200
committerThe8472 <git@infinite-source.de>2020-11-13 19:46:35 +0100
commit46e7fbe60b53e486ff39d29c571428c8a345e925 (patch)
tree2d9b663b4581d9875074eec29eb6ab08823e859d /library/std/src
parent0624730d9e9e2b6de974b6f4edd1ea48ab5f240c (diff)
downloadrust-46e7fbe60b53e486ff39d29c571428c8a345e925.tar.gz
rust-46e7fbe60b53e486ff39d29c571428c8a345e925.zip
reduce syscalls by inferring FD types based on source struct instead of calling stat()
also adds handling for edge-cases involving large sparse files where sendfile could fail with EOVERFLOW
Diffstat (limited to 'library/std/src')
-rw-r--r--library/std/src/io/copy.rs168
-rw-r--r--library/std/src/sys/unix/fs.rs83
2 files changed, 158 insertions, 93 deletions
diff --git a/library/std/src/io/copy.rs b/library/std/src/io/copy.rs
index e8cbe6a7e71..31bfdb63386 100644
--- a/library/std/src/io/copy.rs
+++ b/library/std/src/io/copy.rs
@@ -99,6 +99,7 @@ mod kernel_copy {
     use crate::os::unix::fs::FileTypeExt;
     use crate::os::unix::io::{AsRawFd, FromRawFd, RawFd};
     use crate::process::{ChildStderr, ChildStdin, ChildStdout};
+    use crate::sys::fs::{copy_regular_files, sendfile_splice, CopyResult, SpliceMode};
 
     pub(super) fn copy_spec<R: Read + ?Sized, W: Write + ?Sized>(
         read: &mut R,
@@ -108,20 +109,55 @@ mod kernel_copy {
         SpecCopy::copy(copier)
     }
 
+    /// This type represents either the inferred `FileType` of a `RawFd` based on the source
+    /// type from which it was extracted or the actual metadata
+    ///
+    /// The methods on this type only provide hints, due to `AsRawFd` and `FromRawFd` the inferred
+    /// type may be wrong.
     enum FdMeta {
+        /// We obtained the FD from a type that can contain any type of `FileType` and queried the metadata
+        /// because it is cheaper than probing all possible syscalls (reader side)
         Metadata(Metadata),
         Socket,
         Pipe,
-        None,
+        /// We don't have any metadata, e.g. because the original type was `File` which can represent
+        /// any `FileType` and we did not query the metadata either since it did not seem beneficial
+        /// (writer side)
+        NoneObtained,
     }
 
     impl FdMeta {
-        fn is_fifo(&self) -> bool {
+        fn maybe_fifo(&self) -> bool {
             match self {
                 FdMeta::Metadata(meta) => meta.file_type().is_fifo(),
                 FdMeta::Socket => false,
                 FdMeta::Pipe => true,
-                FdMeta::None => false,
+                FdMeta::NoneObtained => true,
+            }
+        }
+
+        fn potential_sendfile_source(&self) -> bool {
+            match self {
+                // procfs erroneously shows 0 length on non-empty readable files.
+                // and if a file is truly empty then a `read` syscall will determine that and skip the write syscall
+                // thus there would be no benefit from attempting sendfile
+                FdMeta::Metadata(meta)
+                    if meta.file_type().is_file() && meta.len() > 0
+                        || meta.file_type().is_block_device() =>
+                {
+                    true
+                }
+                _ => false,
+            }
+        }
+
+        fn copy_file_range_candidate(&self) -> bool {
+            match self {
+                // copy_file_range will fail on empty procfs files. `read` can determine whether EOF has been reached
+                // without extra cost and skip the write, thus there is no benefit in attempting copy_file_range
+                FdMeta::Metadata(meta) if meta.is_file() && meta.len() > 0 => true,
+                FdMeta::NoneObtained => true,
+                _ => false,
             }
         }
     }
@@ -149,66 +185,65 @@ mod kernel_copy {
             let r_cfg = reader.properties();
             let w_cfg = writer.properties();
 
-            // before direct operations  on file descriptors ensure that all source and sink buffers are emtpy
+            // before direct operations on file descriptors ensure that all source and sink buffers are empty
             let mut flush = || -> crate::io::Result<u64> {
                 let bytes = reader.drain_to(writer, u64::MAX)?;
+                // BufWriter buffered bytes have already been accounted for in earlier write() calls
                 writer.flush()?;
                 Ok(bytes)
             };
 
-            match (r_cfg, w_cfg) {
-                (
-                    CopyParams(FdMeta::Metadata(reader_meta), Some(readfd)),
-                    CopyParams(FdMeta::Metadata(writer_meta), Some(writefd)),
-                ) if reader_meta.is_file() && writer_meta.is_file() => {
-                    let bytes_flushed = flush()?;
-                    let max_write = reader.min_limit();
-                    let (mut reader, mut writer) =
-                        unsafe { (fd_as_file(readfd), fd_as_file(writefd)) };
-                    let len = reader_meta.len();
-                    crate::sys::fs::copy_regular_files(
-                        &mut reader,
-                        &mut writer,
-                        min(len, max_write),
-                    )
-                    .map(|bytes_copied| bytes_copied + bytes_flushed)
+            let mut written = 0u64;
+
+            if let (CopyParams(input_meta, Some(readfd)), CopyParams(output_meta, Some(writefd))) =
+                (r_cfg, w_cfg)
+            {
+                written += flush()?;
+                let max_write = reader.min_limit();
+
+                if input_meta.copy_file_range_candidate() && output_meta.copy_file_range_candidate()
+                {
+                    let result = copy_regular_files(readfd, writefd, max_write);
+
+                    match result {
+                        CopyResult::Ended(Ok(bytes_copied)) => return Ok(bytes_copied + written),
+                        CopyResult::Ended(err) => return err,
+                        CopyResult::Fallback(bytes) => written += bytes,
+                    }
                 }
-                (
-                    CopyParams(FdMeta::Metadata(reader_meta), Some(readfd)),
-                    CopyParams(_, Some(writefd)),
-                ) if reader_meta.is_file() => {
-                    // try sendfile, most modern systems it should work with any target as long as the source is a mmapable file.
-                    // in the rare cases where it's no supported the wrapper function will fall back to a normal copy loop
-                    let bytes_flushed = flush()?;
-                    let (mut reader, mut writer) =
-                        unsafe { (fd_as_file(readfd), fd_as_file(writefd)) };
-                    let len = reader_meta.len();
-                    let max_write = reader.min_limit();
-                    crate::sys::fs::sendfile_splice(
-                        crate::sys::fs::SpliceMode::Sendfile,
-                        &mut reader,
-                        &mut writer,
-                        min(len, max_write),
-                    )
-                        .map(|bytes_sent| bytes_sent + bytes_flushed)
+
+                // on modern kernels sendfile can copy from any mmapable type (some but not all regular files and block devices)
+                // to any writable file descriptor. On older kernels the writer side can only be a socket.
+                // So we just try and fallback if needed.
+                // If current file offsets + write sizes overflow it may also fail, we do not try to fix that and instead
+                // fall back to the generic copy loop.
+                if input_meta.potential_sendfile_source() {
+                    let result = sendfile_splice(SpliceMode::Sendfile, readfd, writefd, max_write);
+
+                    match result {
+                        CopyResult::Ended(Ok(bytes_copied)) => return Ok(bytes_copied + written),
+                        CopyResult::Ended(err) => return err,
+                        CopyResult::Fallback(bytes) => written += bytes,
+                    }
                 }
-                (CopyParams(reader_meta, Some(readfd)), CopyParams(writer_meta, Some(writefd)))
-                    if reader_meta.is_fifo() || writer_meta.is_fifo() =>
-                {
-                    // splice
-                    let bytes_flushed = flush()?;
-                    let max_write = reader.min_limit();
-                    let (mut reader, mut writer) =
-                        unsafe { (fd_as_file(readfd), fd_as_file(writefd)) };
-                    crate::sys::fs::sendfile_splice(
-                        crate::sys::fs::SpliceMode::Splice,
-                        &mut reader,
-                        &mut writer,
-                        max_write,
-                    )
-                    .map(|bytes_sent| bytes_sent + bytes_flushed)
+
+                if input_meta.maybe_fifo() || output_meta.maybe_fifo() {
+                    let result = sendfile_splice(SpliceMode::Splice, readfd, writefd, max_write);
+
+                    match result {
+                        CopyResult::Ended(Ok(bytes_copied)) => return Ok(bytes_copied + written),
+                        CopyResult::Ended(err) => return err,
+                        CopyResult::Fallback(0) => { /* use fallback */ }
+                        CopyResult::Fallback(_) => {
+                            unreachable!("splice should not return > 0 bytes on the fallback path")
+                        }
+                    }
                 }
-                _ => super::generic_copy(reader, writer),
+            }
+
+            match super::generic_copy(reader, writer) {
+                Ok(bytes) => Ok(bytes + written),
+                err => err,
             }
         }
     }
@@ -235,7 +270,10 @@ mod kernel_copy {
         fn properties(&self) -> CopyParams;
     }
 
-    impl<T> CopyRead for &mut T where T: CopyRead {
+    impl<T> CopyRead for &mut T
+    where
+        T: CopyRead,
+    {
         fn drain_to<W: Write>(&mut self, writer: &mut W, limit: u64) -> Result<u64> {
             (**self).drain_to(writer, limit)
         }
@@ -249,13 +287,15 @@ mod kernel_copy {
         }
     }
 
-    impl<T> CopyWrite for &mut T where T: CopyWrite {
+    impl<T> CopyWrite for &mut T
+    where
+        T: CopyWrite,
+    {
         fn properties(&self) -> CopyParams {
             (**self).properties()
         }
     }
 
-
     impl CopyRead for File {
         fn properties(&self) -> CopyParams {
             CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
@@ -270,13 +310,13 @@ mod kernel_copy {
 
     impl CopyWrite for File {
         fn properties(&self) -> CopyParams {
-            CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
+            CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
         }
     }
 
     impl CopyWrite for &File {
         fn properties(&self) -> CopyParams {
-            CopyParams(fd_to_meta(*self), Some(self.as_raw_fd()))
+            CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
         }
     }
 
@@ -345,13 +385,13 @@ mod kernel_copy {
 
     impl CopyWrite for StdoutLock<'_> {
         fn properties(&self) -> CopyParams {
-            CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
+            CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
         }
     }
 
     impl CopyWrite for StderrLock<'_> {
         fn properties(&self) -> CopyParams {
-            CopyParams(fd_to_meta(self), Some(self.as_raw_fd()))
+            CopyParams(FdMeta::NoneObtained, Some(self.as_raw_fd()))
         }
     }
 
@@ -411,11 +451,7 @@ mod kernel_copy {
         let file: ManuallyDrop<File> = ManuallyDrop::new(unsafe { File::from_raw_fd(fd) });
         match file.metadata() {
             Ok(meta) => FdMeta::Metadata(meta),
-            Err(_) => FdMeta::None,
+            Err(_) => FdMeta::NoneObtained,
         }
     }
-
-    unsafe fn fd_as_file(fd: RawFd) -> ManuallyDrop<File> {
-        ManuallyDrop::new(File::from_raw_fd(fd))
-    }
 }
diff --git a/library/std/src/sys/unix/fs.rs b/library/std/src/sys/unix/fs.rs
index b106eb0a5b6..0bab95053a4 100644
--- a/library/std/src/sys/unix/fs.rs
+++ b/library/std/src/sys/unix/fs.rs
@@ -1195,17 +1195,26 @@ pub fn copy(from: &Path, to: &Path) -> io::Result<u64> {
     let max_len = u64::MAX;
     let (mut writer, _) = open_to_and_set_permissions(to, reader_metadata)?;
 
-    copy_regular_files(&mut reader, &mut writer, max_len)
+    return match copy_regular_files(reader.as_raw_fd(), writer.as_raw_fd(), max_len) {
+        CopyResult::Ended(result) => result,
+        CopyResult::Fallback(written) => {
+            // fallback is only > 0 on EOVERFLOW, which shouldn't happen
+            // because the copy loop starts at file offset 0 and counts down from `len`
+            assert_eq!(0, written);
+            io::copy::generic_copy(&mut reader, &mut writer)
+        }
+    };
 }
 
 /// linux-specific implementation that will attempt to use copy_file_range for copy offloading
 /// as the name says, it only works on regular files
+///
+/// Callers must handle fallback to a generic copy loop.
+/// `Fallback` may indicate non-zero number of bytes already written
+/// if one of the files' cursor + `max_len` would exceed `u64::MAX` (`EOVERFLOW`).
+/// If the initial file offset was 0 then `Fallback` will only contain `0`.
 #[cfg(any(target_os = "linux", target_os = "android"))]
-pub(crate) fn copy_regular_files(
-    reader: &mut crate::fs::File,
-    writer: &mut crate::fs::File,
-    max_len: u64,
-) -> io::Result<u64> {
+pub(crate) fn copy_regular_files(reader: RawFd, writer: RawFd, max_len: u64) -> CopyResult {
     use crate::cmp;
     use crate::sync::atomic::{AtomicBool, Ordering};
 
@@ -1228,14 +1237,18 @@ pub(crate) fn copy_regular_files(
     let mut written = 0u64;
     while written < max_len {
         let copy_result = if has_copy_file_range {
-            let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64) as usize;
+            let bytes_to_copy = cmp::min(max_len - written, usize::MAX as u64);
+            // cap to 2GB chunks in case u64::MAX is passed in as file size and the file has a non-zero offset
+            // this allows us to copy large chunks without hitting the limit,
+            // unless someone sets a file offset close to u64::MAX - 2GB, in which case the fallback would kick in
+            let bytes_to_copy = cmp::min(bytes_to_copy as usize, 0x8000_0000usize);
             let copy_result = unsafe {
                 // We actually don't have to adjust the offsets,
                 // because copy_file_range adjusts the file offset automatically
                 cvt(copy_file_range(
-                    reader.as_raw_fd(),
+                    reader,
                     ptr::null_mut(),
-                    writer.as_raw_fd(),
+                    writer,
                     ptr::null_mut(),
                     bytes_to_copy,
                     0,
@@ -1260,12 +1273,14 @@ pub(crate) fn copy_regular_files(
                 // - reading virtual files from the proc filesystem which appear to have 0 size
                 //   but are not empty. noted in coreutils to affect kernels at least up to 5.6.19.
                 // - copying from an overlay filesystem in docker. reported to occur on fedora 32.
-                return io::copy(reader, writer);
+                return CopyResult::Fallback(0);
             }
-            Ok(0) => return Ok(written), // reached EOF
+            Ok(0) => return CopyResult::Ended(Ok(written)), // reached EOF
             Ok(ret) => written += ret as u64,
             Err(err) => {
                 match err.raw_os_error() {
+                    // when file offset + max_length > u64::MAX
+                    Some(libc::EOVERFLOW) => return CopyResult::Fallback(written),
                     Some(
                         libc::ENOSYS | libc::EXDEV | libc::EINVAL | libc::EPERM | libc::EOPNOTSUPP,
                     ) => {
@@ -1276,43 +1291,55 @@ pub(crate) fn copy_regular_files(
                         // - copy_file_range is disallowed, for example by seccomp (EPERM)
                         // - copy_file_range cannot be used with pipes or device nodes (EINVAL)
                         assert_eq!(written, 0);
-                        return io::copy::generic_copy(reader, writer);
+                        return CopyResult::Fallback(0);
                     }
-                    _ => return Err(err),
+                    _ => return CopyResult::Ended(Err(err)),
                 }
             }
         }
     }
-    Ok(written)
+    CopyResult::Ended(Ok(written))
 }
 
+#[derive(PartialEq)]
 pub(crate) enum SpliceMode {
     Sendfile,
     Splice,
 }
 
+pub(crate) enum CopyResult {
+    Ended(io::Result<u64>),
+    Fallback(u64),
+}
+
 /// performs splice or sendfile between file descriptors
+/// Does _not_ fall back to a generic copy loop.
 #[cfg(any(target_os = "linux", target_os = "android"))]
 pub(crate) fn sendfile_splice(
     mode: SpliceMode,
-    reader: &mut crate::fs::File,
-    writer: &mut crate::fs::File,
+    reader: RawFd,
+    writer: RawFd,
     len: u64,
-) -> io::Result<u64> {
+) -> CopyResult {
     let mut written = 0u64;
     while written < len {
         let chunk_size = crate::cmp::min(len - written, 0x7ffff000_u64) as usize;
 
         let result = match mode {
-            SpliceMode::Sendfile => cvt(unsafe {
-                libc::sendfile(writer.as_raw_fd(), reader.as_raw_fd(), ptr::null_mut(), chunk_size)
-            }),
+            SpliceMode::Sendfile => {
+                cvt(unsafe { libc::sendfile(writer, reader, ptr::null_mut(), chunk_size) })
+            }
             SpliceMode::Splice => cvt(unsafe {
                 libc::splice(
-                    reader.as_raw_fd(),
+                    reader,
                     ptr::null_mut(),
-                    writer.as_raw_fd(),
+                    writer,
                     ptr::null_mut(),
+                    // default pipe size is 64KiB. try to only fill/drain half of that capacity
+                    // so that the next loop iteration won't be put to sleep.
+                    // If reader and writer operate at the same pace they will experience fewer blocking waits.
+                    // This is only needed for splice since sendfile stays in kernel space when it has to block.
+                    //crate::cmp::min(32*1024, chunk_size),
                     chunk_size,
                     0,
                 )
@@ -1325,17 +1352,19 @@ pub(crate) fn sendfile_splice(
             Err(err) => {
                 match err.raw_os_error() {
                     Some(os_err) if os_err == libc::EINVAL => {
-                        // Try fallback io::copy if splice/sendfile do not support this particular
-                        // file descritor (EINVAL)
+                        // splice/sendfile do not support this particular file descriptor (EINVAL)
                         assert_eq!(written, 0);
-                        return io::copy::generic_copy(reader, writer);
+                        return CopyResult::Fallback(0);
+                    }
+                    Some(os_err) if mode == SpliceMode::Sendfile && os_err == libc::EOVERFLOW => {
+                        return CopyResult::Fallback(written);
                     }
-                    _ => return Err(err),
+                    _ => return CopyResult::Ended(Err(err)),
                 }
             }
         }
     }
-    Ok(written)
+    CopyResult::Ended(Ok(written))
 }
 
 #[cfg(any(target_os = "macos", target_os = "ios"))]