From 5aee5a11e3d4807c6df190e33cc6c4dc81ef7ea3 Mon Sep 17 00:00:00 2001
From: Gary Linscott
Date: Wed, 10 Jul 2013 17:06:16 -0400
Subject: Optimize is_utf8

Manually unroll the multibyte loops, and optimize for the single byte chars.
---
 src/libstd/str.rs | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'src/libstd')

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index bc59164637e..1d8a2d404a7 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -596,17 +596,25 @@ pub fn is_utf8(v: &[u8]) -> bool {
     let mut i = 0u;
     let total = v.len();
     while i < total {
-        let mut chsize = utf8_char_width(v[i]);
-        if chsize == 0u { return false; }
-        if i + chsize > total { return false; }
-        i += 1u;
-        while chsize > 1u {
-            if v[i] & 192u8 != TAG_CONT_U8 { return false; }
+        if v[i] < 128u8 {
             i += 1u;
-            chsize -= 1u;
+        } else {
+            let w = utf8_char_width(v[i]);
+            if w == 0u { return false; }
+
+            let nexti = i + w;
+            if nexti > total { return false; }
+
+            if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
+            if w > 2 {
+                if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
+                if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
+            }
+
+            i = nexti;
         }
     }
-    return true;
+    true
 }
 
 /// Determines if a vector of `u16` contains valid UTF-16
-- 
cgit 1.4.1-3-g733a5