diff options
| -rw-r--r-- | library/alloc/tests/str.rs | 40 | ||||
| -rw-r--r-- | library/core/benches/str.rs | 29 | ||||
| -rw-r--r-- | library/core/benches/str/char_count.rs | 101 | ||||
| -rw-r--r-- | library/core/benches/str/corpora.rs | 83 | ||||
| -rw-r--r-- | library/core/src/str/count.rs | 116 | ||||
| -rw-r--r-- | library/core/src/str/iter.rs | 5 | ||||
| -rw-r--r-- | library/core/src/str/mod.rs | 1 |
7 files changed, 346 insertions, 29 deletions
diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index e92881b1049..3dcbc54be4e 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -2230,3 +2230,43 @@ fn utf8_chars() { assert!((!from_utf8(&[0xf0, 0xff, 0x10]).is_ok())); assert!((!from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_ok())); } + +#[test] +fn utf8_char_counts() { + let strs = [("e", 1), ("é", 1), ("€", 1), ("\u{10000}", 1), ("eé€\u{10000}", 4)]; + let mut reps = vec![1, 8, 64, 256, 512, 1024]; + if cfg!(not(miri)) { + reps.push(1 << 16); + } + let counts = if cfg!(miri) { 0..1 } else { 0..8 }; + let padding = counts.map(|len| " ".repeat(len)).collect::<Vec<String>>(); + + for repeat in reps { + for (tmpl_str, tmpl_char_count) in strs { + for pad_start in &padding { + for pad_end in &padding { + // Create a string with padding... + let with_padding = + format!("{}{}{}", pad_start, tmpl_str.repeat(repeat), pad_end); + // ...and then skip past that padding. This should ensure + // that we test several different alignments for both head + // and tail. + let si = pad_start.len(); + let ei = with_padding.len() - pad_end.len(); + let target = &with_padding[si..ei]; + + assert!(!target.starts_with(" ") && !target.ends_with(" ")); + let expected_count = tmpl_char_count * repeat; + assert_eq!( + expected_count, + target.chars().count(), + "wrong count for `{:?}.repeat({})` (padding: `{:?}`)", + tmpl_str, + repeat, + (pad_start.len(), pad_end.len()), + ); + } + } + } + } +} diff --git a/library/core/benches/str.rs b/library/core/benches/str.rs index 1527aa0bd66..78865d81fb9 100644 --- a/library/core/benches/str.rs +++ b/library/core/benches/str.rs @@ -1,33 +1,10 @@ use std::str; use test::{black_box, Bencher}; -const LOREM_SHORT: &str = "Lorem ipsum"; - -const LOREM: &str = "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. 
At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. -Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. -Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. -Nam liber tempor cum soluta nobis eleifend option congue nihil imperdiet doming id quod mazim placerat facer possim assum. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. 
-Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis. -At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, At accusam aliquyam diam diam dolore dolores duo eirmod eos erat, et nonumy sed tempor et et invidunt justo labore Stet clita ea et gubergren, kasd magna no rebum. sanctus sea sed takimata ut vero voluptua. est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur"; - -const EMOJI: &str = "๐๐๐๐๐๐ ๐คฃ๐๐๐๐๐๐๐ฅฐ๐๐คฉ๐๐โบ๐๐๐ฅฒ๐๐๐๐คช๐๐ค๐ค๐คญ๐คซ๐ค๐ค๐คจ๐๐๐ถ๐ถโ๐ซ๏ธ๐๐๐๐ฌ๐ฎโ๐จ๐คฅ๐๐๐ช๐คค๐ด๐ท๐ค๐ค๐คข๐คฎ๐คง๐ฅต๐ฅถ๐ฅด๐ต๐ตโ๐ซ๐คฏ๐ค ๐ฅณ๐ฅธ๐๐ค๐ง๐๐๐โน๐ฎ๐ฏ๐ฒ๐ณ๐ฅบ๐ฆ๐ง๐จ๐ฐ๐ฅ๐ข๐ญ๐ฑ๐๐ฃ๐๐๐ฉ๐ซ๐ฅฑ๐ค๐ก๐ ๐คฌ๐๐ฟ๐โ ๐ฉ๐คก๐น๐บ๐ป๐ฝ๐พ๐ค๐บ๐ธ๐น๐ป๐ผ๐ฝ๐๐ฟ๐พ๐๐๐๐๐๐๐๐๐๐๐๐๐โฃ๐โค๏ธโ๐ฅโค๏ธโ๐ฉนโค๐งก๐๐๐๐๐ค๐ค๐ค๐ฏ๐ข๐ฅ๐ซ๐ฆ๐จ๐ณ๐ฃ๐ฌ๐๏ธโ๐จ๏ธ๐จ๐ฏ๐ญ๐ค๐๐ค๐โ๐๐๐ค๐คโ๐ค๐ค๐ค๐ค๐๐๐๐๐โ๐๐โ๐๐ค๐ค๐๐๐๐คฒ๐ค๐โ๐ ๐คณ๐ช๐ฆพ๐ฆฟ๐ฆต๐ฆถ๐๐ฆป๐๐ง ๐ซ๐ซ๐ฆท๐ฆด๐๐๐ ๐๐ถ๐ง๐ฆ๐ง๐ง๐ฑ๐จ๐ง๐งโโ๏ธ๐งโโ๏ธ๐จโ๐ฆฐ๐จโ๐ฆฑ๐จโ๐ฆณ๐จโ๐ฆฒ๐ฉ๐ฉโ๐ฆฐ๐งโ๐ฆฐ๐ฉโ๐ฆฑ๐งโ๐ฆฑ๐ฉโ๐ฆณ๐งโ๐ฆณ๐ฉโ๐ฆฒ๐งโ๐ฆฒ๐ฑโโ๏ธ๐ฑโโ๏ธ๐ง๐ด๐ต๐๐โโ๏ธ๐โโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐ ๐ โโ๏ธ๐ โโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐คฆ๐คฆโโ๏ธ๐คฆโโ๏ธ๐คท๐คทโโ๏ธ๐คทโโ๏ธ๐งโโ๏ธ๐จโโ๏ธ๐ฉโโ๏ธ๐งโ๐๐จโ๐๐ฉโ๐๐งโ๐ซ๐จโ๐ซ๐ฉโ๐ซ๐งโโ๏ธ๐จโโ๏ธ๐ฉโโ๏ธ๐งโ๐พ๐จโ๐พ๐ฉโ๐พ๐งโ๐ณ๐จโ๐ณ๐ฉโ๐ณ๐งโ๐ง๐จโ๐ง๐ฉโ๐ง๐งโ๐ญ๐จโ๐ญ๐ฉโ๐ญ๐งโ๐ผ๐จโ๐ผ๐ฉโ๐ผ๐งโ๐ฌ๐จโ๐ฌ๐ฉโ๐ฌ๐งโ๐ป๐จโ๐ป๐ฉโ๐ป๐งโ๐ค๐จโ๐ค๐ฉโ๐ค๐งโ๐จ๐จโ๐จ๐ฉโ๐จ๐งโโ๏ธ๐จโโ๏ธ๐ฉโโ๏ธ๐งโ๐๐จโ๐๐ฉโ๐๐งโ๐๐จโ๐๐ฉโ๐๐ฎ๐ฎโโ๏ธ๐ฎโโ๏ธ๐ต๐ต๏ธโโ๏ธ๐ต๏ธโโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐ฅท๐ท๐ทโโ๏ธ๐ทโโ๏ธ๐คด๐ธ๐ณ๐ณโโ๏ธ๐ณโโ๏ธ๐ฒ๐ง๐คต๐คตโโ๏ธ๐คตโโ๏ธ๐ฐ๐ฐโโ๏ธ๐ฐโโ๏ธ๐คฐ๐คฑ๐ฉโ๐ผ๐จโ๐ผ๐งโ๐ผ๐ผ๐ 
๐คถ๐งโ๐๐ฆธ๐ฆธโโ๏ธ๐ฆธโโ๏ธ๐ฆน๐ฆนโโ๏ธ๐ฆนโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐ถ๐ถโโ๏ธ๐ถโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐งโ๐ฆฏ๐จโ๐ฆฏ๐ฉโ๐ฆฏ๐งโ๐ฆผ๐จโ๐ฆผ๐ฉโ๐ฆผ๐งโ๐ฆฝ๐จโ๐ฆฝ๐ฉโ๐ฆฝ๐๐โโ๏ธ๐โโ๏ธ๐๐บ๐ด๐ฏ๐ฏโโ๏ธ๐ฏโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐คบ๐โท๐๐๐๏ธโโ๏ธ๐๏ธโโ๏ธ๐๐โโ๏ธ๐โโ๏ธ๐ฃ๐ฃโโ๏ธ๐ฃโโ๏ธ๐๐โโ๏ธ๐โโ๏ธโนโน๏ธโโ๏ธโน๏ธโโ๏ธ๐๐๏ธโโ๏ธ๐๏ธโโ๏ธ๐ด๐ดโโ๏ธ๐ดโโ๏ธ๐ต๐ตโโ๏ธ๐ตโโ๏ธ๐คธ๐คธโโ๏ธ๐คธโโ๏ธ๐คผ๐คผโโ๏ธ๐คผโโ๏ธ๐คฝ๐คฝโโ๏ธ๐คฝโโ๏ธ๐คพ๐คพโโ๏ธ๐คพโโ๏ธ๐คน๐คนโโ๏ธ๐คนโโ๏ธ๐ง๐งโโ๏ธ๐งโโ๏ธ๐๐๐งโ๐คโ๐ง๐ญ๐ซ๐ฌ๐๐ฉโโค๏ธโ๐โ๐จ๐จโโค๏ธโ๐โ๐จ๐ฉโโค๏ธโ๐โ๐ฉ๐๐ฉโโค๏ธโ๐จ๐จโโค๏ธโ๐จ๐ฉโโค๏ธโ๐ฉ๐ช๐จโ๐ฉโ๐ฆ๐จโ๐ฉโ๐ง๐จโ๐ฉโ๐งโ๐ฆ๐จโ๐ฉโ๐ฆโ๐ฆ๐จโ๐ฉโ๐งโ๐ง๐จโ๐จโ๐ฆ๐จโ๐จโ๐ง๐จโ๐จโ๐งโ๐ฆ๐จโ๐จโ๐ฆโ๐ฆ๐จโ๐จโ๐งโ๐ง๐ฉโ๐ฉโ๐ฆ๐ฉโ๐ฉโ๐ง๐ฉโ๐ฉโ๐งโ๐ฆ๐ฉโ๐ฉโ๐ฆโ๐ฆ๐ฉโ๐ฉโ๐งโ๐ง๐จโ๐ฆ๐จโ๐ฆโ๐ฆ๐จโ๐ง๐จโ๐งโ๐ฆ๐จโ๐งโ๐ง๐ฉโ๐ฆ๐ฉโ๐ฆโ๐ฆ๐ฉโ๐ง๐ฉโ๐งโ๐ฆ๐ฉโ๐งโ๐ง๐ฃ๐ค๐ฅ๐ซ๐ฃ๐ฆฐ๐ฆฑ๐ฆณ๐ฆฒ๐ต๐๐ฆ๐ฆง๐ถ๐๐ฆฎ๐โ๐ฆบ๐ฉ๐บ๐ฆ๐ฆ๐ฑ๐๐โโฌ๐ฆ๐ฏ๐ ๐๐ด๐๐ฆ๐ฆ๐ฆ๐ฆฌ๐ฎ๐๐๐๐ท๐๐๐ฝ๐๐๐๐ช๐ซ๐ฆ๐ฆ๐๐ฆฃ๐ฆ๐ฆ๐ญ๐๐๐น๐ฐ๐๐ฟ๐ฆซ๐ฆ๐ฆ๐ป๐ปโโ๏ธ๐จ๐ผ๐ฆฅ๐ฆฆ๐ฆจ๐ฆ๐ฆก๐พ๐ฆ๐๐๐ฃ๐ค๐ฅ๐ฆ๐ง๐๐ฆ ๐ฆ๐ฆข๐ฆ๐ฆค๐ชถ๐ฆฉ๐ฆ๐ฆ๐ธ๐๐ข๐ฆ๐๐ฒ๐๐ฆ๐ฆ๐ณ๐๐ฌ๐ฆญ๐๐ ๐ก๐ฆ๐๐๐๐ฆ๐๐๐๐ชฒ๐๐ฆ๐ชณ๐ท๐ธ๐ฆ๐ฆ๐ชฐ๐ชฑ๐ฆ ๐๐ธ๐ฎ๐ต๐น๐ฅ๐บ๐ป๐ผ๐ท๐ฑ๐ชด๐ฒ๐ณ๐ด๐ต๐พ๐ฟโ๐๐๐๐๐๐๐๐๐๐๐๐ฅญ๐๐๐๐๐๐๐ซ๐ฅ๐ ๐ซ๐ฅฅ๐ฅ๐๐ฅ๐ฅ๐ฝ๐ถ๐ซ๐ฅ๐ฅฌ๐ฅฆ๐ง๐ง ๐๐ฅ๐ฐ๐๐ฅ๐ฅ๐ซ๐ฅจ๐ฅฏ๐ฅ๐ง๐ง๐๐๐ฅฉ๐ฅ๐๐๐๐ญ๐ฅช๐ฎ๐ฏ๐ซ๐ฅ๐ง๐ฅ๐ณ๐ฅ๐ฒ๐ซ๐ฅฃ๐ฅ๐ฟ๐ง๐ง๐ฅซ๐ฑ๐๐๐๐๐๐๐ ๐ข๐ฃ๐ค๐ฅ๐ฅฎ๐ก๐ฅ๐ฅ ๐ฅก๐ฆ๐ฆ๐ฆ๐ฆ๐ฆช๐ฆ๐ง๐จ๐ฉ๐ช๐๐ฐ๐ง๐ฅง๐ซ๐ฌ๐ญ๐ฎ๐ฏ๐ผ๐ฅโ๐ซ๐ต๐ถ๐พ๐ท๐ธ๐น๐บ๐ป๐ฅ๐ฅ๐ฅค๐ง๐ง๐ง๐ง๐ฅข๐ฝ๐ด๐ฅ๐ช๐บ๐๐๐๐๐บ๐พ๐งญ๐โฐ๐๐ป๐๐๐๐๐๐๐๐๐งฑ๐ชจ๐ชต๐๐๐๐ ๐ก๐ข๐ฃ๐ค๐ฅ๐ฆ๐จ๐ฉ๐ช๐ซ๐ฌ๐ญ๐ฏ๐ฐ๐๐ผ๐ฝโช๐๐๐โฉ๐โฒโบ๐๐๐๐๐ ๐๐๐โจ๐ ๐ก๐ข๐๐ช๐๐๐๐ ๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐๐ป๐๐๐๐๐๐ต๐ฆฝ๐ฆผ๐บ๐ฒ๐ด๐น๐ผ๐๐ฃ๐ค๐ขโฝ๐จ๐ฅ๐ฆ๐๐งโโต๐ถ๐ค๐ณโด๐ฅ๐ขโ๐ฉ๐ซ๐ฌ๐ช๐บ๐๐๐ ๐ก๐ฐ๐๐ธ๐๐งณโโณโโฐโฑโฒ๐ฐ๐๐ง๐๐๐๐๐๐๐๐๐๐ ๐๐ก๐๐ข๐๐ฃ๐๐ค๐๐ฅ๐๐ฆ๐๐๐๐๐๐๐๐๐๐๐๐๐กโ๐๐๐ชโญ๐๐ ๐โโ โ๐ค๐ฅ๐ฆ๐ง๐จ๐ฉ๐ช๐ซ๐ฌ๐๐๐โโโฑโกโโโโ๐ฅ๐ง๐๐๐๐๐๐งจโจ๐๐๐๐๐๐๐๐๐๐งง๐๐๐๐๐ซ๐๐๐ ๐ฅ๐ฅ๐ฅโฝโพ๐ฅ๐๐๐๐๐พ๐ฅ๐ณ๐๐๐๐ฅ๐๐ธ๐ฅ๐ฅ๐ฅ โณโธ๐ฃ๐คฟ๐ฝ๐ฟ๐ท๐ฅ๐ฏ๐ช๐ช๐ฑ๐ฎ๐ช๐งฟ๐ฎ๐น๐ฐ๐ฒ๐งฉ๐งธ๐ช ๐ชโ โฅโฆโฃโ๐๐๐ด๐ญ๐ผ๐จ๐งต๐ชก๐งถ๐ชข๐๐ถ๐ฅฝ๐ฅผ๐ฆบ๐๐๐๐งฃ๐งค๐งฅ๐งฆ๐๐๐ฅป๐ฉฑ๐ฉฒ๐ฉณ๐๐๐๐๐๐๐๐ฉด๐๐๐ฅพ๐ฅฟ๐ ๐ก๐ฉฐ๐ข๐๐๐ฉ๐๐งข๐ชโ๐ฟ๐๐๐๐๐๐๐๐ข๐ฃ๐ฏ๐๐๐ผ๐ต๐ถ๐๐๐๐ค๐ง๐ป๐ท๐ช๐ธ๐น๐บ๐ป๐ช๐ฅ"; - -#[bench] -fn 
str_char_count_lorem(b: &mut Bencher) { - b.iter(|| black_box(LOREM).chars().count()); -} - -#[bench] -fn str_char_count_lorem_short(b: &mut Bencher) { - b.iter(|| black_box(LOREM_SHORT).chars().count()); -} - -#[bench] -fn str_char_count_emoji(b: &mut Bencher) { - b.iter(|| black_box(EMOJI).chars().count()); -} +mod char_count; +mod corpora; #[bench] fn str_validate_emoji(b: &mut Bencher) { - b.iter(|| str::from_utf8(black_box(EMOJI.as_bytes()))); + b.iter(|| str::from_utf8(black_box(corpora::emoji::LARGE.as_bytes()))); } diff --git a/library/core/benches/str/char_count.rs b/library/core/benches/str/char_count.rs new file mode 100644 index 00000000000..f19d0941142 --- /dev/null +++ b/library/core/benches/str/char_count.rs @@ -0,0 +1,101 @@ +use super::corpora::*; +use test::{black_box, Bencher}; + +macro_rules! define_benches { + ($( fn $name: ident($arg: ident: &str) $body: block )+) => { + define_benches!(mod en_small, en::SMALL, $($name $arg $body)+); + define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+); + define_benches!(mod en_large, en::LARGE, $($name $arg $body)+); + define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+); + + define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+); + define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+); + define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+); + define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+); + + define_benches!(mod ru_small, ru::SMALL, $($name $arg $body)+); + define_benches!(mod ru_medium, ru::MEDIUM, $($name $arg $body)+); + define_benches!(mod ru_large, ru::LARGE, $($name $arg $body)+); + define_benches!(mod ru_huge, ru::HUGE, $($name $arg $body)+); + + define_benches!(mod emoji_small, emoji::SMALL, $($name $arg $body)+); + define_benches!(mod emoji_medium, emoji::MEDIUM, $($name $arg $body)+); + define_benches!(mod emoji_large, emoji::LARGE, $($name $arg $body)+); + define_benches!(mod emoji_huge, emoji::HUGE, $($name $arg $body)+); + }; + 
(mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => { + mod $mod_name { + use super::*; + $( + #[bench] + fn $name(bencher: &mut Bencher) { + let input = $input; + bencher.bytes = input.len() as u64; + let mut input_s = input.to_string(); + bencher.iter(|| { + let $arg: &str = &black_box(&mut input_s); + black_box($body) + }) + } + )+ + } + }; +} + +define_benches! { + fn case00_cur_libcore(s: &str) { + cur_libcore(s) + } + + fn case01_old_libcore(s: &str) { + old_libcore(s) + } + + fn case02_iter_increment(s: &str) { + iterator_increment(s) + } + + fn case03_manual_char_len(s: &str) { + manual_char_len(s) + } +} + +fn cur_libcore(s: &str) -> usize { + s.chars().count() +} +#[inline] +fn utf8_is_cont_byte(byte: u8) -> bool { + (byte as i8) < -64 +} +fn old_libcore(s: &str) -> usize { + s.as_bytes().iter().filter(|&&byte| !utf8_is_cont_byte(byte)).count() +} + +fn iterator_increment(s: &str) -> usize { + let mut c = 0; + for _ in s.chars() { + c += 1; + } + c +} + +fn manual_char_len(s: &str) -> usize { + let s = s.as_bytes(); + let mut c = 0; + let mut i = 0; + let l = s.len(); + while i < l { + let b = s[i]; + if b < 0x80 { + i += 1; + } else if b < 0xe0 { + i += 2; + } else if b < 0xf0 { + i += 3; + } else { + i += 4; + } + c += 1; + } + c +} diff --git a/library/core/benches/str/corpora.rs b/library/core/benches/str/corpora.rs new file mode 100644 index 00000000000..fefde757150 --- /dev/null +++ b/library/core/benches/str/corpora.rs @@ -0,0 +1,83 @@ +//! Exposes a number of modules with different kinds of strings. +//! +//! Each module contains `&str` constants named `SMALL`, `MEDIUM`, `LARGE`, and +//! `HUGE`. +//! +//! - The `SMALL` string is generally around 30-40 bytes. +//! - The `MEDIUM` string is generally around 600-700 bytes. +//! - The `LARGE` string is the `MEDIUM` string repeated 8x, and is around 5kb. +//! - The `HUGE` string is the `LARGE` string repeated 8x (or the `MEDIUM` +//! 
string repeated 64x), and is around 40kb. +//! +//! Except for `mod emoji` (which is just a bunch of emoji), the strings were +//! pulled from (localizations of) rust-lang.org. + +macro_rules! repeat8 { + ($s:expr) => { + concat!($s, $s, $s, $s, $s, $s, $s, $s) + }; +} + +macro_rules! define_consts { + ($s:literal) => { + pub const MEDIUM: &str = $s; + pub const LARGE: &str = repeat8!($s); + pub const HUGE: &str = repeat8!(repeat8!(repeat8!($s))); + }; +} + +pub mod en { + pub const SMALL: &str = "Mary had a little lamb, Little lamb"; + define_consts! { + "Rust is blazingly fast and memory-efficient: with no runtime or garbage + collector, it can power performance-critical services, run on embedded + devices, and easily integrate with other languages. Rustโs rich type system + and ownership model guarantee memory-safety and thread-safety โ enabling you + to eliminate many classes of bugs at compile-time. Rust has great + documentation, a friendly compiler with useful error messages, and top-notch + tooling โ an integrated package manager and build tool, smart multi-editor + support with auto-completion and type inspections, an auto-formatter, and + more." + } +} + +pub mod zh { + pub const SMALL: &str = "ๅบฆๆไบบไธๅ ๅญๅฉ็จ็ๆ้ซ"; + define_consts! { + "Rust ้ๅบฆๆไบบไธๅ ๅญๅฉ็จ็ๆ้ซใ็ฑไบ\ + ๆฒกๆ่ฟ่กๆถๅๅๅพๅๆถ๏ผๅฎ่ฝๅค่ไปปๅฏนๆง่ฝ่ฆ\ + ๆฑ็นๅซ้ซ็ๆๅก๏ผๅฏไปฅๅจๅตๅ ฅๅผ่ฎพๅคไธ่ฟ่ก๏ผ\ + ่ฟ่ฝ่ฝปๆพๅๅ ถไป่ฏญ่จ้ๆใRust ไธฐๅฏ็็ฑปๅ\ + ็ณป็ปๅๆๆๆๆจกๅไฟ่ฏไบๅ ๅญๅฎๅ จๅ็บฟ็จๅฎๅ จ๏ผ\ + ่ฎฉๆจๅจ็ผ่ฏๆๅฐฑ่ฝๅคๆถ้คๅ็งๅๆ ท็้่ฏฏใ\ + Rust ๆฅๆๅบ่ฒ็ๆๆกฃใๅๅฅฝ็็ผ่ฏๅจๅๆธ ๆฐ\ + ็้่ฏฏๆ็คบไฟกๆฏ๏ผ ่ฟ้ๆไบไธๆต็ๅทฅๅ ทโโ\ + ๅ ็ฎก็ๅจๅๆๅปบๅทฅๅ ท๏ผ ๆบ่ฝๅฐ่ชๅจ่กฅๅ จๅ็ฑป\ + ๅๆฃ้ช็ๅค็ผ่พๅจๆฏๆ๏ผ ไปฅๅ่ชๅจๆ ผๅผๅไปฃ\ + ็ ็ญ็ญใ" + } +} + +pub mod ru { + pub const SMALL: &str = "ะกะพัะฝะธ ะบะพะผะฟะฐะฝะธะน ะฟะพ"; + define_consts! 
{ + "ะกะพัะฝะธ ะบะพะผะฟะฐะฝะธะน ะฟะพ ะฒัะตะผั ะผะธัั ะธัะฟะพะปัะทััั Rust ะฒ ัะตะฐะปัะฝัั \ + ะฟัะพะตะบัะฐั ะดะปั ะฑัััััั ะบัะพัั-ะฟะปะฐััะพัะผะตะฝะฝัั ัะตัะตะฝะธะน ั\ + ะพะณัะฐะฝะธัะตะฝะฝัะผะธ ัะตััััะฐะผะธ. ะขะฐะบะธะต ะฟัะพะตะบัั, ะบะฐะบ Firefox,\ + Dropbox ะธ Cloudflare, ะธัะฟะพะปัะทััั Rust. Rust ะพัะปะธัะฝะพ\ + ะฟะพะดั ะพะดะธั ะบะฐะบ ะดะปั ััะฐััะฐะฟะพะฒ, ัะฐะบ ะธ ะดะปั ะฑะพะปััะธั ะบะพะผะฟะฐะฝะธะน,\ + ะบะฐะบ ะดะปั ะฒัััะฐะธะฒะฐะตะผัั ััััะพะนััะฒ, ัะฐะบ ะธ ะดะปั ะผะฐัััะฐะฑะธััะตะผัั \ + web-ัะตัะฒะธัะพะฒ. ะะพะน ัะฐะผัะน ะฑะพะปััะพะน ะบะพะผะฟะปะธะผะตะฝั Rust." + } +} + +pub mod emoji { + pub const SMALL: &str = "๐๐๐๐๐๐ ๐คฃ๐๐๐๐๐๐๐ฅฐ๐๐คฉ๐"; + define_consts! { + "๐๐๐๐๐๐ ๐คฃ๐๐๐๐๐๐๐ฅฐ๐๐คฉ๐๐โบ๐๐๐ฅฒ๐๐๐๐คช๐๐ค๐ค๐คญ๐คซ๐ค๐ค๐คจ๐๐๐ถ๐ถโ๐ซ๏ธ๐๐\ + ๐๐ฌ๐ฎโ๐จ๐คฅ๐๐๐ช๐คค๐ด๐ท๐ค๐ค๐คข๐คฎ๐คง๐ฅต๐ฅถ๐ฅด๐ต๐ตโ๐ซ๐คฏ๏ฟฝ๏ฟฝ๐ฅณ๐ฅธ๐๐ค๐ง๐๐๐โน๐ฎ๐ฏ๐ฒ๐ณ๐ฅบ๐ฆ๐ง๐จ\ + ๐ฐ๐ฅ๐ข๐ญ๐ฑ๐๐ฃ๐๐๐ฉ๐ซ๐ฅฑ๐ค๐ก๐ ๐คฌ๐๐ฟ๐โ ๐ฉ๐คก๐น๐บ๐ป๐ฝ๐พ๐ค๐บ๐ธ๐น๐ป๐ผ๐ฝ๐๐ฟ๐พ๐๐๐\ + ๐๐๐๐๐๐๐๏ฟฝ๏ฟฝ๐๐โฃ๐โค๏ธโ๐ฅโค๏ธโ๐ฉนโค๐งก๐๐๐๐๐ค๐ค๐ค๐ฏ๐ข๐ฅ๐ซ๐ฆ๐จ๐ณ๐ฌ๐๏ธโ๐จ๏ธ๐จ๐ฏ๐ญ๐ค๐\ + ๐ค๐โ๐๐๐ค๐คโ" + } +} diff --git a/library/core/src/str/count.rs b/library/core/src/str/count.rs new file mode 100644 index 00000000000..464c6889c32 --- /dev/null +++ b/library/core/src/str/count.rs @@ -0,0 +1,116 @@ +//! Code for efficiently counting the number of `char`s in a UTF-8 encoded +//! string. +//! +//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`, +//! followed by some number (possibly 0) of continuation bytes. +//! +//! The leading byte can have a number of bit-patterns (with the specific +//! pattern indicating how many continuation bytes follow), but the continuation +//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any +//! value). That is, the most significant bit is set, and the second most +//! significant bit is unset. +//! +//! To count the number of characters, we can just count the number of bytes in +//! the string which are not continuation bytes, which can be done many bytes at +//! 
a time fairly easily. +//! +//! Note: Because the term "leading byte" can sometimes be ambiguous (for +//! example, it could also refer to the first byte of a slice), we'll often use +//! the term "non-continuation byte" to refer to these bytes in the code. + +pub(super) fn count_chars(s: &str) -> usize { + // For correctness, `CHUNK_SIZE` must be: + // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. + // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the + // `body.chunks(CHUNK_SIZE)` loop is incorrect. + // + // For performance, `CHUNK_SIZE` should be: + // - Relatively cheap to `%` against. + // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize` + // too often. + const CHUNK_SIZE: usize = 192; + const UNROLL_INNER: usize = 4; + + // Check the properties of `CHUNK_SIZE` / `UNROLL_INNER` that are required + // for correctness. + const _: [(); 1] = [(); (CHUNK_SIZE < 256 && (CHUNK_SIZE % UNROLL_INNER) == 0) as usize]; + // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size + // differences which are handled by `align_to`. + let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() }; + + let mut total = char_count_general_case(head) + char_count_general_case(tail); + // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which + // we call `sum_bytes_in_usize`. + for chunk in body.chunks(CHUNK_SIZE) { + // We accumulate intermediate sums in `counts`, where each byte contains + // a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`. + let mut counts = 0; + let unrolled_chunks = chunk.array_chunks::<UNROLL_INNER>(); + // If there's a remainder (which can only happen for the last item in + // `chunks`, because `CHUNK_SIZE % UNROLL_INNER == 0`), then we need to + // account for that (although we don't use it until later). 
+ let remainder = unrolled_chunks.remainder(); + for unrolled in unrolled_chunks { + for &word in unrolled { + // Because `CHUNK_SIZE` is < 256, this addition can't cause the + // count in any of the bytes to overflow into a subsequent byte. + counts += contains_non_continuation_byte(word); + } + } + + // Sum the values in `counts` (which, again, is conceptually a `[u8; + // size_of::<usize>()]`), and accumulate the result into `total`. + total += sum_bytes_in_usize(counts); + + // If there's any data in `remainder`, then handle it. This will only + // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE` + // is divisible by `UNROLL_INNER`), so we explicitly break at the end + // (which seems to help LLVM out). + if !remainder.is_empty() { + // Accumulate all the data in the remainder. + let mut counts = 0; + for &word in remainder { + counts += contains_non_continuation_byte(word); + } + total += sum_bytes_in_usize(counts); + break; + } + } + total +} + +// Checks each byte of `w` to see if it contains the first byte in a UTF-8 +// sequence. Bytes in `w` which are continuation bytes are left as `0x00` (i.e. +// false), and bytes which are non-continuation bytes are left as `0x01` (i.e. +// true) +#[inline] +fn contains_non_continuation_byte(w: usize) -> usize { + let lsb = 0x0101_0101_0101_0101u64 as usize; + ((!w >> 7) | (w >> 6)) & lsb +} + +// Morally equivalent to `values.to_ne_bytes().into_iter().sum::<usize>()`, but +// more efficient. 
+#[inline] +fn sum_bytes_in_usize(values: usize) -> usize { + const LSB_SHORTS: usize = 0x0001_0001_0001_0001_u64 as usize; + const SKIP_BYTES: usize = 0x00ff_00ff_00ff_00ff_u64 as usize; + + let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES); + pair_sum.wrapping_mul(LSB_SHORTS) >> ((core::mem::size_of::<usize>() - 2) * 8) +} + +// This is the most direct implementation of the concept of "count the number of +// bytes in the string which are not continuation bytes", and is used for the +// head and tail of the input string (the first and last item in the tuple +// returned by `slice::align_to`). +fn char_count_general_case(s: &[u8]) -> usize { + const CONT_MASK_U8: u8 = 0b0011_1111; + const TAG_CONT_U8: u8 = 0b1000_0000; + let mut leads = 0; + for &byte in s { + let is_lead = (byte & !CONT_MASK_U8) != TAG_CONT_U8; + leads += is_lead as usize; + } + leads +} diff --git a/library/core/src/str/iter.rs b/library/core/src/str/iter.rs index de6e6d52b36..e529bccbc79 100644 --- a/library/core/src/str/iter.rs +++ b/library/core/src/str/iter.rs @@ -12,7 +12,7 @@ use crate::slice::{self, Split as SliceSplit}; use super::from_utf8_unchecked; use super::pattern::Pattern; use super::pattern::{DoubleEndedSearcher, ReverseSearcher, Searcher}; -use super::validations::{next_code_point, next_code_point_reverse, utf8_is_cont_byte}; +use super::validations::{next_code_point, next_code_point_reverse}; use super::LinesAnyMap; use super::{BytesIsNotEmpty, UnsafeBytesToStr}; use super::{CharEscapeDebugContinue, CharEscapeDefault, CharEscapeUnicode}; @@ -46,8 +46,7 @@ impl<'a> Iterator for Chars<'a> { #[inline] fn count(self) -> usize { - // length in `char` is equal to the number of non-continuation bytes - self.iter.filter(|&&byte| !utf8_is_cont_byte(byte)).count() + super::count::count_chars(self.as_str()) } #[inline] diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 1d4600fa4a2..fceea2366da 100644 --- a/library/core/src/str/mod.rs +++ 
b/library/core/src/str/mod.rs @@ -7,6 +7,7 @@ #![stable(feature = "rust1", since = "1.0.0")] mod converts; +mod count; mod error; mod iter; mod traits; |
