about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--src/test/bench/shootout-reverse-complement.rs347
1 files changed, 259 insertions, 88 deletions
diff --git a/src/test/bench/shootout-reverse-complement.rs b/src/test/bench/shootout-reverse-complement.rs
index 352d351973b..85529cec0e9 100644
--- a/src/test/bench/shootout-reverse-complement.rs
+++ b/src/test/bench/shootout-reverse-complement.rs
@@ -38,113 +38,284 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 // OF THE POSSIBILITY OF SUCH DAMAGE.
 
-// ignore-pretty very bad with line comments
-// ignore-android doesn't terminate?
-
-#![feature(slicing_syntax)]
-
-use std::iter::range_step;
-use std::io::{stdin, stdout, File};
-
-static LINE_LEN: uint = 60;
-
-fn make_complements() -> [u8, ..256] {
-    let transforms = [
-        ('A', 'T'), ('C', 'G'), ('G', 'C'), ('T', 'A'),
-        ('U', 'A'), ('M', 'K'), ('R', 'Y'), ('W', 'W'),
-        ('S', 'S'), ('Y', 'R'), ('K', 'M'), ('V', 'B'),
-        ('H', 'D'), ('D', 'H'), ('B', 'V'), ('N', 'N'),
-        ('\n', '\n')];
-    let mut complements: [u8, ..256] = [0, ..256];
-    for (i, c) in complements.iter_mut().enumerate() {
-        *c = i as u8;
+#![feature(slicing_syntax, asm, if_let, tuple_indexing)]
+
+extern crate libc;
+
+use std::io::stdio::{stdin_raw, stdout_raw};
+use std::sync::{Future};
+use std::num::{div_rem};
+use std::ptr::{copy_memory};
+use std::io::{IoResult, EndOfFile};
+use std::slice::raw::{mut_buf_as_slice};
+
+use shared_memory::{SharedMemory};
+
+mod tables {
+    use std::sync::{Once, ONCE_INIT};
+
+    /// Lookup tables.
+    static mut CPL16: [u16, ..1 << 16] = [0, ..1 << 16];
+    static mut CPL8:  [u8,  ..1 << 8]  = [0, ..1 << 8];
+
+    /// Generates the tables.
+    pub fn get() -> Tables {
+        /// To make sure we initialize the tables only once.
+        static INIT: Once = ONCE_INIT;
+        INIT.doit(|| {
+            unsafe {
+                for i in range(0, 1 << 8) {
+                    CPL8[i] = match i as u8 {
+                        b'A' | b'a' => b'T',
+                        b'C' | b'c' => b'G',
+                        b'G' | b'g' => b'C',
+                        b'T' | b't' => b'A',
+                        b'U' | b'u' => b'A',
+                        b'M' | b'm' => b'K',
+                        b'R' | b'r' => b'Y',
+                        b'W' | b'w' => b'W',
+                        b'S' | b's' => b'S',
+                        b'Y' | b'y' => b'R',
+                        b'K' | b'k' => b'M',
+                        b'V' | b'v' => b'B',
+                        b'H' | b'h' => b'D',
+                        b'D' | b'd' => b'H',
+                        b'B' | b'b' => b'V',
+                        b'N' | b'n' => b'N',
+                        i => i,
+                    };
+                }
+
+                for (i, v) in CPL16.iter_mut().enumerate() {
+                    *v = *CPL8.unsafe_get(i & 255) as u16 << 8 |
+                         *CPL8.unsafe_get(i >> 8)  as u16;
+                }
+            }
+        });
+        Tables { _dummy: () }
+    }
+
+    /// Accessor for the static arrays.
+    ///
+    /// To make sure that the tables can't be accessed without having been initialized.
+    pub struct Tables {
+        _dummy: ()
     }
-    let lower = 'A' as u8 - 'a' as u8;
-    for &(from, to) in transforms.iter() {
-        complements[from as uint] = to as u8;
-        complements[(from as u8 - lower) as uint] = to as u8;
+
+    impl Tables {
+        /// Retreives the complement for `i`.
+        pub fn cpl8(self, i: u8) -> u8 {
+            // Not really unsafe.
+            unsafe { CPL8[i as uint] }
+        }
+
+        /// Retreives the complement for `i`.
+        pub fn cpl16(self, i: u16) -> u16 {
+            unsafe { CPL16[i as uint] }
+        }
     }
-    complements
 }
 
-fn main() {
-    let complements = make_complements();
-    let data = if std::os::getenv("RUST_BENCH").is_some() {
-        File::open(&Path::new("shootout-k-nucleotide.data")).read_to_end()
-    } else {
-        stdin().read_to_end()
-    };
-    let mut data = data.unwrap();
+mod shared_memory {
+    use std::sync::{Arc};
+    use std::mem::{transmute};
+    use std::raw::{Slice};
 
-    for seq in data.as_mut_slice().split_mut(|c| *c == '>' as u8) {
-        // skip header and last \n
-        let begin = match seq.iter().position(|c| *c == '\n' as u8) {
-            None => continue,
-            Some(c) => c
-        };
-        let len = seq.len();
-        let seq = seq[mut begin+1..len-1];
-
-        // arrange line breaks
-        let len = seq.len();
-        let off = LINE_LEN - len % (LINE_LEN + 1);
-        for i in range_step(LINE_LEN, len, LINE_LEN + 1) {
-            for j in std::iter::count(i, -1).take(off) {
-                seq[j] = seq[j - 1];
+    /// Structure for sharing disjoint parts of a vector mutably across tasks.
+    pub struct SharedMemory {
+        ptr: Arc<Vec<u8>>,
+        start: uint,
+        len: uint,
+    }
+
+    impl SharedMemory {
+        pub fn new(ptr: Vec<u8>) -> SharedMemory {
+            let len = ptr.len();
+            SharedMemory {
+                ptr: Arc::new(ptr),
+                start: 0,
+                len: len,
+            }
+        }
+
+        pub fn as_mut_slice(&mut self) -> &mut [u8] {
+            unsafe {
+                transmute(Slice {
+                    data: self.ptr.as_ptr().offset(self.start as int) as *const u8,
+                    len: self.len,
+                })
             }
-            seq[i - off] = '\n' as u8;
         }
 
-        // reverse complement, as
-        //    seq.reverse(); for c in seq.iter_mut() { *c = complements[*c] }
-        // but faster:
-        for (front, back) in two_side_iter(seq) {
-            let tmp = complements[*front as uint];
-            *front = complements[*back as uint];
-            *back = tmp;
+        pub fn len(&self) -> uint {
+            self.len
         }
-        if seq.len() % 2 == 1 {
-            let middle = &mut seq[seq.len() / 2];
-            *middle = complements[*middle as uint];
+
+        pub fn split_at(self, mid: uint) -> (SharedMemory, SharedMemory) {
+            assert!(mid <= self.len);
+            let left = SharedMemory {
+                ptr: self.ptr.clone(),
+                start: self.start,
+                len: mid,
+            };
+            let right = SharedMemory {
+                ptr: self.ptr,
+                start: self.start + mid,
+                len: self.len - mid,
+            };
+            (left, right)
         }
-    }
 
-    stdout().write(data.as_slice()).unwrap();
+        /// Resets the object so that it covers the whole range of the contained vector.
+        ///
+        /// You must not call this method if `self` is not the only reference to the
+        /// shared memory.
+        ///
+        /// FIXME: If `Arc` had a method to check if the reference is unique, then we
+        /// wouldn't need the `unsafe` here.
+        ///
+        /// FIXME: If `Arc` had a method to unwrap the contained value, then we could
+        /// simply unwrap here.
+        pub unsafe fn reset(self) -> SharedMemory {
+            let len = self.ptr.len();
+            SharedMemory {
+                ptr: self.ptr,
+                start: 0,
+                len: len,
+            }
+        }
+    }
 }
 
-pub struct TwoSideIter<'a, T: 'a> {
-    first: *mut T,
-    last: *mut T,
-    marker: std::kinds::marker::ContravariantLifetime<'a>,
-    marker2: std::kinds::marker::NoCopy
+
+/// Reads all remaining bytes from the stream.
+fn read_to_end<R: Reader>(r: &mut R) -> IoResult<Vec<u8>> {
+    const CHUNK: uint = 64 * 1024;
+
+    let mut vec = Vec::with_capacity(1024 * 1024);
+    loop {
+        if vec.capacity() - vec.len() < CHUNK {
+            let cap = vec.capacity();
+            let mult = if cap < 256 * 1024 * 1024 {
+                // FIXME (mahkoh): Temporary workaround for jemalloc on linux. Replace
+                // this by 2x once the jemalloc preformance issue has been fixed.
+                16
+            } else {
+                2
+            };
+            vec.reserve_exact(mult * cap);
+        }
+        unsafe {
+            let ptr = vec.as_mut_ptr().offset(vec.len() as int);
+            match mut_buf_as_slice(ptr, CHUNK, |s| r.read(s)) {
+                Ok(n) => {
+                    let len = vec.len();
+                    vec.set_len(len + n);
+                },
+                Err(ref e) if e.kind == EndOfFile => break,
+                Err(e) => return Err(e),
+            }
+        }
+    }
+    Ok(vec)
 }
 
-pub fn two_side_iter<'a, T>(slice: &'a mut [T]) -> TwoSideIter<'a, T> {
-    let len = slice.len();
-    let first = slice.as_mut_ptr();
-    let last = if len == 0 {
-        first
-    } else {
-        unsafe { first.offset(len as int - 1) }
+/// Finds the first position at which `b` occurs in `s`.
+fn memchr(h: &[u8], n: u8) -> Option<uint> {
+    use libc::{c_void, c_int, size_t};
+    extern {
+        fn memchr(h: *const c_void, n: c_int, s: size_t) -> *mut c_void;
+    }
+    let res = unsafe {
+        memchr(h.as_ptr() as *const c_void, n as c_int, h.len() as size_t)
     };
-
-    TwoSideIter {
-        first: first,
-        last: last,
-        marker: std::kinds::marker::ContravariantLifetime,
-        marker2: std::kinds::marker::NoCopy
+    if res.is_null() {
+        None
+    } else {
+        Some(res as uint - h.as_ptr() as uint)
     }
 }
 
-impl<'a, T> Iterator<(&'a mut T, &'a mut T)> for TwoSideIter<'a, T> {
-    fn next(&mut self) -> Option<(&'a mut T, &'a mut T)> {
-        if self.first < self.last {
-            let result = unsafe { (&mut *self.first, &mut *self.last) };
-            self.first = unsafe { self.first.offset(1) };
-            self.last = unsafe { self.last.offset(-1) };
-            Some(result)
-        } else {
-            None
+/// Length of a normal line without the terminating \n.
+const LINE_LEN: uint = 60;
+
+/// Compute the reverse complement.
+fn reverse_complement(mut view: SharedMemory, tables: tables::Tables) {
+    // Drop the last newline
+    let seq = view.as_mut_slice().init_mut();
+    let len = seq.len();
+    let off = LINE_LEN - len % (LINE_LEN + 1);
+    let mut i = LINE_LEN;
+    while i < len {
+        unsafe {
+            copy_memory(seq.as_mut_ptr().offset((i - off + 1) as int),
+                        seq.as_ptr().offset((i - off) as int), off);
+            *seq.unsafe_mut(i - off) = b'\n';
         }
+        i += LINE_LEN + 1;
     }
+
+    let (div, rem) = div_rem(len, 4);
+    unsafe {
+        let mut left = seq.as_mut_ptr() as *mut u16;
+        // This is slow if len % 2 != 0 but still faster than bytewise operations.
+        let mut right = seq.as_mut_ptr().offset(len as int - 2) as *mut u16;
+        let end = left.offset(div as int);
+        while left != end {
+            let tmp = tables.cpl16(*left);
+            *left = tables.cpl16(*right);
+            *right = tmp;
+            left = left.offset(1);
+            right = right.offset(-1);
+        }
+
+        let end = end as *mut u8;
+        match rem {
+            1 => *end = tables.cpl8(*end),
+            2 => {
+                let tmp = tables.cpl8(*end);
+                *end = tables.cpl8(*end.offset(1));
+                *end.offset(1) = tmp;
+            },
+            3 => {
+                *end.offset(1) = tables.cpl8(*end.offset(1));
+                let tmp = tables.cpl8(*end);
+                *end = tables.cpl8(*end.offset(2));
+                *end.offset(2) = tmp;
+            },
+            _ => { },
+        }
+    }
+}
+
+fn main() {
+    let mut data = SharedMemory::new(read_to_end(&mut stdin_raw()).unwrap());
+    let tables = tables::get();
+
+    let mut futures = vec!();
+    loop {
+        let (_, mut tmp_data) = match memchr(data.as_mut_slice(), b'\n') {
+            Some(i) => data.split_at(i + 1),
+            _ => break,
+        };
+        let (view, tmp_data) = match memchr(tmp_data.as_mut_slice(), b'>') {
+            Some(i) => tmp_data.split_at(i),
+            None => {
+                let len = tmp_data.len();
+                tmp_data.split_at(len)
+            },
+        };
+        futures.push(Future::spawn(proc() reverse_complement(view, tables)));
+        data = tmp_data;
+    }
+
+    for f in futures.iter_mut() {
+        f.get();
+    }
+
+    // Not actually unsafe. If Arc had a way to check uniqueness then we could do that in
+    // `reset` and it would tell us that, yes, it is unique at this point.
+    data = unsafe { data.reset() };
+
+    stdout_raw().write(data.as_mut_slice()).unwrap();
 }