about summary refs log tree commit diff
path: root/compiler/rustc_parse/src
diff options
context:
space:
mode:
author: Hans Kratz <hans@appfour.com> 2021-11-03 23:37:23 +0100
committer: Hans Kratz <hans@appfour.com> 2021-11-04 12:01:26 +0100
commit: 2d9f0e2c50ff6131643fd0b2d5a9f65a7006f50c (patch)
tree: 7522008a2e577c95e9fcccca07e108b3bfa31ab0 /compiler/rustc_parse/src
parent: 473eaa42e9365c47d129f72693b5d163a20cf369 (diff)
download: rust-2d9f0e2c50ff6131643fd0b2d5a9f65a7006f50c.tar.gz
download: rust-2d9f0e2c50ff6131643fd0b2d5a9f65a7006f50c.zip
Optimize bidi character detection.
Diffstat (limited to 'compiler/rustc_parse/src')
-rw-r--r--  compiler/rustc_parse/src/lexer/mod.rs  45
-rw-r--r--  compiler/rustc_parse/src/lib.rs  1
2 files changed, 40 insertions, 6 deletions
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 09a3d1b9028..21d0ee60cda 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -137,12 +137,45 @@ impl<'a> StringReader<'a> {
         // Opening delimiter of the length 2 is not included into the comment text.
         let content_start = start + BytePos(2);
         let content = self.str_from(content_start);
-        let span = self.mk_sp(start, self.pos);
-        const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
-            '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
-            '\u{202C}', '\u{2069}',
-        ];
-        if content.contains(UNICODE_TEXT_FLOW_CHARS) {
+
+        // Char   - UTF-8
+        // U+202A - E2 80 AA
+        // U+202B - E2 80 AB
+        // U+202C - E2 80 AC
+        // U+202D - E2 80 AD
+        // U+202E - E2 80 AE
+        // U+2066 - E2 81 A6
+        // U+2067 - E2 81 A7
+        // U+2068 - E2 81 A8
+        // U+2069 - E2 81 A9
+        let mut bytes = content.as_bytes();
+        let contains_flow_control_chars = loop {
+            match core::slice::memchr::memchr(0xE2, &bytes) {
+                Some(idx) => {
+                    // bytes are valid UTF-8 -> E2 must be followed by two bytes
+                    match bytes[idx + 1] {
+                        0x80 => {
+                            if (0xAA..=0xAE).contains(&bytes[idx + 2]) {
+                                break true;
+                            }
+                        }
+                        0x81 => {
+                            if (0xA6..=0xA9).contains(&bytes[idx + 2]) {
+                                break true;
+                            }
+                        }
+                        _ => {}
+                    }
+                    bytes = &bytes[idx + 3..];
+                }
+                None => {
+                    break false;
+                }
+            }
+        };
+
+        if contains_flow_control_chars {
+            let span = self.mk_sp(start, self.pos);
             self.sess.buffer_lint_with_diagnostic(
                 &TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
                 span,
diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs
index a40f47f895b..063b0183a8f 100644
--- a/compiler/rustc_parse/src/lib.rs
+++ b/compiler/rustc_parse/src/lib.rs
@@ -4,6 +4,7 @@
 #![feature(crate_visibility_modifier)]
 #![feature(if_let_guard)]
 #![feature(box_patterns)]
+#![feature(slice_internals)]
 #![recursion_limit = "256"]
 
 #[macro_use]