about summary refs log tree commit diff
path: root/src/libsyntax/parse/lexer
diff options
context:
space:
mode:
author Kevin Butler <haqkrs@gmail.com> 2015-11-12 02:43:43 +0000
committer Kevin Butler <haqkrs@gmail.com> 2016-01-16 00:57:12 +0000
commit 24578e0fe555f267bef40528b8ac79bc7e898007 (patch)
tree 336f724d68ee4dc447ced71e23636de956d16870 /src/libsyntax/parse/lexer
parent 9e3e43f3f6bb0d87da5f5b7fd92db0cc990e62a3 (diff)
download rust-24578e0fe555f267bef40528b8ac79bc7e898007.tar.gz
rust-24578e0fe555f267bef40528b8ac79bc7e898007.zip
libsyntax: accept only whitespace with the PATTERN_WHITE_SPACE property
This aligns with Unicode recommendations and should be stable for all future
Unicode releases. See http://unicode.org/reports/tr31/#R3.

This renames `libsyntax::lexer::is_whitespace` to `is_pattern_whitespace`,
so it potentially breaks users of libsyntax.
Diffstat (limited to 'src/libsyntax/parse/lexer')
-rw-r--r-- src/libsyntax/parse/lexer/comments.rs 4
-rw-r--r-- src/libsyntax/parse/lexer/mod.rs 17
2 files changed, 12 insertions, 9 deletions
diff --git a/src/libsyntax/parse/lexer/comments.rs b/src/libsyntax/parse/lexer/comments.rs
index e336c98f03c..629edced804 100644
--- a/src/libsyntax/parse/lexer/comments.rs
+++ b/src/libsyntax/parse/lexer/comments.rs
@@ -15,7 +15,7 @@ use codemap::{BytePos, CharPos, CodeMap, Pos};
 use errors;
 use parse::lexer::is_block_doc_comment;
 use parse::lexer::{StringReader, TokenAndSpan};
-use parse::lexer::{is_whitespace, Reader};
+use parse::lexer::{is_pattern_whitespace, Reader};
 use parse::lexer;
 use print::pprust;
 use str::char_at;
@@ -153,7 +153,7 @@ fn push_blank_line_comment(rdr: &StringReader, comments: &mut Vec<Comment>) {
 }
 
 fn consume_whitespace_counting_blank_lines(rdr: &mut StringReader, comments: &mut Vec<Comment>) {
-    while is_whitespace(rdr.curr) && !rdr.is_eof() {
+    while is_pattern_whitespace(rdr.curr) && !rdr.is_eof() {
         if rdr.col == CharPos(0) && rdr.curr_is('\n') {
             push_blank_line_comment(rdr, &mut *comments);
         }
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 3e61aaff3c9..88a876cac73 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -16,6 +16,7 @@ use ext::tt::transcribe::tt_next_token;
 use parse::token::str_to_ident;
 use parse::token;
 use str::char_at;
+use rustc_unicode::property::Pattern_White_Space;
 
 use std::borrow::Cow;
 use std::char;
@@ -546,10 +547,10 @@ impl<'a> StringReader<'a> {
                 let c = self.scan_comment();
                 debug!("scanning a comment {:?}", c);
                 c
-            }
-            c if is_whitespace(Some(c)) => {
+            },
+            c if is_pattern_whitespace(Some(c)) => {
                 let start_bpos = self.last_pos;
-                while is_whitespace(self.curr) {
+                while is_pattern_whitespace(self.curr) {
                     self.bump();
                 }
                 let c = Some(TokenAndSpan {
@@ -1435,7 +1436,7 @@ impl<'a> StringReader<'a> {
     }
 
     fn consume_whitespace(&mut self) {
-        while is_whitespace(self.curr) && !self.is_eof() {
+        while is_pattern_whitespace(self.curr) && !self.is_eof() {
             self.bump();
         }
     }
@@ -1460,7 +1461,7 @@ impl<'a> StringReader<'a> {
     }
 
     fn consume_non_eol_whitespace(&mut self) {
-        while is_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
+        while is_pattern_whitespace(self.curr) && !self.curr_is('\n') && !self.is_eof() {
             self.bump();
         }
     }
@@ -1591,8 +1592,10 @@ impl<'a> StringReader<'a> {
     }
 }
 
-pub fn is_whitespace(c: Option<char>) -> bool {
-    c.map_or(false, char::is_whitespace)
+// This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
+// is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
+pub fn is_pattern_whitespace(c: Option<char>) -> bool {
+    c.map_or(false, Pattern_White_Space)
 }
 
 fn in_range(c: Option<char>, lo: char, hi: char) -> bool {