Auto merge of #98943 - WilliamVenner:feat/bufread_skip_until, r=dtolnay

Add `BufRead::skip_until` Alternative version of `BufRead::read_until` that simply discards data, rather than copying it into a buffer. Useful for situations like skipping irrelevant data in a binary file format that is NUL-terminated. <details> <summary>Benchmark</summary> ``` running 2 tests test bench_read_until ... bench: 123 ns/iter (+/- 6) test bench_skip_until ... bench: 66 ns/iter (+/- 3) ``` ```rs #![feature(test)] extern crate test; use test::Bencher; use std::io::{ErrorKind, BufRead}; fn skip_until<R: BufRead + ?Sized>(r: &mut R, delim: u8) -> Result<usize, std::io::Error> { let mut read = 0; loop { let (done, used) = { let available = match r.fill_buf() { Ok(n) => n, Err(ref e) if e.kind() == ErrorKind::Interrupted => continue, Err(e) => return Err(e), }; match memchr::memchr(delim, available) { Some(i) => (true, i + 1), None => (false, available.len()), } }; r.consume(used); read += used; if done || used == 0 { return Ok(read); } } } const STR: &[u8] = b"Ferris\0Hello, world!\0"; #[bench] fn bench_skip_until(b: &mut Bencher) { b.iter(|| { let mut io = std::io::Cursor::new(test::black_box(STR)); skip_until(&mut io, b'\0').unwrap(); let mut hello = Vec::with_capacity(b"Hello, world!\0".len()); let num_bytes = io.read_until(b'\0', &mut hello).unwrap(); assert_eq!(num_bytes, b"Hello, world!\0".len()); assert_eq!(hello, b"Hello, world!\0"); }); } #[bench] fn bench_read_until(b: &mut Bencher) { b.iter(|| { let mut io = std::io::Cursor::new(test::black_box(STR)); io.read_until(b'\0', &mut Vec::new()).unwrap(); let mut hello = Vec::with_capacity(b"Hello, world!\0".len()); let num_bytes = io.read_until(b'\0', &mut hello).unwrap(); assert_eq!(num_bytes, b"Hello, world!\0".len()); assert_eq!(hello, b"Hello, world!\0"); }); } ``` </details>
author: bors <bors@rust-lang.org> 2023-11-23 22:28:14 +0000
committer: bors <bors@rust-lang.org> 2023-11-23 22:28:14 +0000
commit: e68f935117fef05e7663605a5a6671a6bb4ce719 (patch)
tree: 4975abe955374ee74c729da510d5c9ce50e0abcf
parent: a1a37735cbc3db359d0b24ba9085c9fcbe1bc274 (diff)
parent: 7c1ab71f71087d88aade79925ae68a447397422f (diff)
download: rust-e68f935117fef05e7663605a5a6671a6bb4ce719.tar.gz
rust-e68f935117fef05e7663605a5a6671a6bb4ce719.zip
2 files changed, 114 insertions, 0 deletions
diff --git a/library/std/src/io/mod.rs b/library/std/src/io/mod.rs
index ff5b4e76d44..260200115c8 100644
--- a/library/std/src/io/mod.rs
+++ b/library/std/src/io/mod.rs
@@ -2044,6 +2044,28 @@ fn read_until<R: BufRead + ?Sized>(r: &mut R, delim: u8, buf: &mut Vec<u8>) -> R
     }
 }
 
+fn skip_until<R: BufRead + ?Sized>(r: &mut R, delim: u8) -> Result<usize> {
+    let mut read = 0;
+    loop {
+        let (done, used) = {
+            let available = match r.fill_buf() {
+                Ok(n) => n,
+                Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
+                Err(e) => return Err(e),
+            };
+            match memchr::memchr(delim, available) {
+                Some(i) => (true, i + 1),
+                None => (false, available.len()),
+            }
+        };
+        r.consume(used);
+        read += used;
+        if done || used == 0 {
+            return Ok(read);
+        }
+    }
+}
+
 /// A `BufRead` is a type of `Read`er which has an internal buffer, allowing it
 /// to perform extra ways of reading.
 ///
@@ -2247,6 +2269,68 @@ pub trait BufRead: Read {
         read_until(self, byte, buf)
     }
 
+    /// Skip all bytes until the delimiter `byte` or EOF is reached.
+    ///
+    /// This function will read (and discard) bytes from the underlying stream until the
+    /// delimiter or EOF is found.
+    ///
+    /// If successful, this function will return the total number of bytes read,
+    /// including the delimiter byte.
+    ///
+    /// This is useful for efficiently skipping data such as NUL-terminated strings
+    /// in binary file formats without buffering.
+    ///
+    /// This function is blocking and should be used carefully: it is possible for
+    /// an attacker to continuously send bytes without ever sending the delimiter
+    /// or EOF.
+    ///
+    /// # Errors
+    ///
+    /// This function will ignore all instances of [`ErrorKind::Interrupted`] and
+    /// will otherwise return any errors returned by [`fill_buf`].
+    ///
+    /// If an I/O error is encountered then all bytes read so far will be
+    /// present in `buf` and its length will have been adjusted appropriately.
+    ///
+    /// [`fill_buf`]: BufRead::fill_buf
+    ///
+    /// # Examples
+    ///
+    /// [`std::io::Cursor`][`Cursor`] is a type that implements `BufRead`. In
+    /// this example, we use [`Cursor`] to read some NUL-terminated information
+    /// about Ferris from a binary string, skipping the fun fact:
+    ///
+    /// ```
+    /// #![feature(bufread_skip_until)]
+    ///
+    /// use std::io::{self, BufRead};
+    ///
+    /// let mut cursor = io::Cursor::new(b"Ferris\0Likes long walks on the beach\0Crustacean\0");
+    ///
+    /// // read name
+    /// let mut name = Vec::new();
+    /// let num_bytes = cursor.read_until(b'\0', &mut name)
+    ///     .expect("reading from cursor won't fail");
+    /// assert_eq!(num_bytes, 7);
+    /// assert_eq!(name, b"Ferris\0");
+    ///
+    /// // skip fun fact
+    /// let num_bytes = cursor.skip_until(b'\0')
+    ///     .expect("reading from cursor won't fail");
+    /// assert_eq!(num_bytes, 30);
+    ///
+    /// // read animal type
+    /// let mut animal = Vec::new();
+    /// let num_bytes = cursor.read_until(b'\0', &mut animal)
+    ///     .expect("reading from cursor won't fail");
+    /// assert_eq!(num_bytes, 11);
+    /// assert_eq!(animal, b"Crustacean\0");
+    /// ```
+    #[unstable(feature = "bufread_skip_until", issue = "111735")]
+    fn skip_until(&mut self, byte: u8) -> Result<usize> {
+        skip_until(self, byte)
+    }
+
     /// Read all bytes until a newline (the `0xA` byte) is reached, and append
     /// them to the provided `String` buffer.
     ///
diff --git a/library/std/src/io/tests.rs b/library/std/src/io/tests.rs
index 6d30f5e6c6c..bda5b721adc 100644
--- a/library/std/src/io/tests.rs
+++ b/library/std/src/io/tests.rs
@@ -26,6 +26,36 @@ fn read_until() {
 }
 
 #[test]
+fn skip_until() {
+    let bytes: &[u8] = b"read\0ignore\0read\0ignore\0read\0ignore\0";
+    let mut reader = BufReader::new(bytes);
+
+    // read from the bytes, alternating between
+    // consuming `read\0`s and skipping `ignore\0`s
+    loop {
+        // consume `read\0`
+        let mut out = Vec::new();
+        let read = reader.read_until(0, &mut out).unwrap();
+        if read == 0 {
+            // eof
+            break;
+        } else {
+            assert_eq!(out, b"read\0");
+            assert_eq!(read, b"read\0".len());
+        }
+
+        // skip past `ignore\0`
+        let skipped = reader.skip_until(0).unwrap();
+        assert_eq!(skipped, b"ignore\0".len());
+    }
+
+    // ensure we are at the end of the byte slice and that we can skip no further
+    // also ensure skip_until matches the behavior of read_until at EOF
+    let skipped = reader.skip_until(0).unwrap();
+    assert_eq!(skipped, 0);
+}
+
+#[test]
 fn split() {
     let buf = Cursor::new(&b"12"[..]);
     let mut s = buf.split(b'3');
author	bors <bors@rust-lang.org>	2023-11-23 22:28:14 +0000
committer	bors <bors@rust-lang.org>	2023-11-23 22:28:14 +0000
commit	e68f935117fef05e7663605a5a6671a6bb4ce719 (patch)
tree	4975abe955374ee74c729da510d5c9ce50e0abcf
parent	a1a37735cbc3db359d0b24ba9085c9fcbe1bc274 (diff)
parent	7c1ab71f71087d88aade79925ae68a447397422f (diff)
download	rust-e68f935117fef05e7663605a5a6671a6bb4ce719.tar.gz rust-e68f935117fef05e7663605a5a6671a6bb4ce719.zip