Lex binary and octal literals more eagerly.

Previously 0b12 was considered two tokens, 0b1 and 2, as 2 isn't a valid base 2 digit. This patch changes that to collapse them into one (and makes `0b12` etc. an error: 2 isn't a valid base 2 digit). This may break some macro invocations of macros with `tt` (or syntax extensions) that rely on adjacent digits being separate tokens and hence is a [breaking-change] The fix is to separate the tokens, e.g. `0b12` -> `0b1 2`. cc https://github.com/rust-lang/rfcs/pull/879
author: Huon Wilson <dbau.pp+github@gmail.com> 2015-03-31 00:27:13 +1100
committer: Huon Wilson <dbau.pp+github@gmail.com> 2015-03-31 12:16:42 +1100
commit: 606f50c46dd9a3852d36456d2015e1ccf832642e (patch)
tree: 1c992543bb91d02c72328b9021202a9fdcdba01a /src/libsyntax/parse
parent: f00264074fb8f7e3b603eb3469822813632a6d30 (diff)
download: rust-606f50c46dd9a3852d36456d2015e1ccf832642e.tar.gz
rust-606f50c46dd9a3852d36456d2015e1ccf832642e.zip
2 files changed, 39 insertions, 14 deletions
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs
index 532b632fac8..ae5c99123a5 100644
--- a/src/libsyntax/parse/lexer/mod.rs
+++ b/src/libsyntax/parse/lexer/mod.rs
@@ -621,7 +621,7 @@ impl<'a> StringReader<'a> {
         let base = 10;
 
         // find the integer representing the name
-        self.scan_digits(base);
+        self.scan_digits(base, base);
         let encoded_name : u32 = self.with_str_from(start_bpos, |s| {
             num::from_str_radix(s, 10).unwrap_or_else(|_| {
                 panic!("expected digits representing a name, got {:?}, {}, range [{:?},{:?}]",
@@ -639,7 +639,7 @@ impl<'a> StringReader<'a> {
 
         // find the integer representing the ctxt
         let start_bpos = self.last_pos;
-        self.scan_digits(base);
+        self.scan_digits(base, base);
         let encoded_ctxt : ast::SyntaxContext = self.with_str_from(start_bpos, |s| {
             num::from_str_radix(s, 10).unwrap_or_else(|_| {
                 panic!("expected digits representing a ctxt, got {:?}, {}", s, whence);
@@ -653,16 +653,28 @@ impl<'a> StringReader<'a> {
                      ctxt: encoded_ctxt, }
     }
 
-    /// Scan through any digits (base `radix`) or underscores, and return how
-    /// many digits there were.
-    fn scan_digits(&mut self, radix: u32) -> usize {
+    /// Scan through any digits (base `scan_radix`) or underscores,
+    /// and return how many digits there were.
+    ///
+    /// `real_radix` represents the true radix of the number we're
+    /// interested in, and errors will be emitted for any digits
+    /// between `real_radix` and `scan_radix`.
+    fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
+        assert!(real_radix <= scan_radix);
         let mut len = 0;
         loop {
             let c = self.curr;
             if c == Some('_') { debug!("skipping a _"); self.bump(); continue; }
-            match c.and_then(|cc| cc.to_digit(radix)) {
+            match c.and_then(|cc| cc.to_digit(scan_radix)) {
                 Some(_) => {
                     debug!("{:?} in scan_digits", c);
+                    // check that the hypothetical digit is actually
+                    // in range for the true radix
+                    if c.unwrap().to_digit(real_radix).is_none() {
+                        self.err_span_(self.last_pos, self.pos,
+                                       &format!("invalid digit for a base {} literal",
+                                                real_radix));
+                    }
                     len += 1;
                     self.bump();
                 }
@@ -681,11 +693,11 @@ impl<'a> StringReader<'a> {
 
         if c == '0' {
             match self.curr.unwrap_or('\0') {
-                'b' => { self.bump(); base = 2; num_digits = self.scan_digits(2); }
-                'o' => { self.bump(); base = 8; num_digits = self.scan_digits(8); }
-                'x' => { self.bump(); base = 16; num_digits = self.scan_digits(16); }
+                'b' => { self.bump(); base = 2; num_digits = self.scan_digits(2, 10); }
+                'o' => { self.bump(); base = 8; num_digits = self.scan_digits(8, 10); }
+                'x' => { self.bump(); base = 16; num_digits = self.scan_digits(16, 16); }
                 '0'...'9' | '_' | '.' => {
-                    num_digits = self.scan_digits(10) + 1;
+                    num_digits = self.scan_digits(10, 10) + 1;
                 }
                 _ => {
                     // just a 0
@@ -693,7 +705,7 @@ impl<'a> StringReader<'a> {
                 }
             }
         } else if c.is_digit(10) {
-            num_digits = self.scan_digits(10) + 1;
+            num_digits = self.scan_digits(10, 10) + 1;
         } else {
             num_digits = 0;
         }
@@ -712,7 +724,7 @@ impl<'a> StringReader<'a> {
             // with a number
             self.bump();
             if self.curr.unwrap_or('\0').is_digit(10) {
-                self.scan_digits(10);
+                self.scan_digits(10, 10);
                 self.scan_float_exponent();
             }
             let last_pos = self.last_pos;
@@ -935,7 +947,7 @@ impl<'a> StringReader<'a> {
             if self.curr_is('-') || self.curr_is('+') {
                 self.bump();
             }
-            if self.scan_digits(10) == 0 {
+            if self.scan_digits(10, 10) == 0 {
                 self.err_span_(self.last_pos, self.pos, "expected at least one digit in exponent")
             }
         }
diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs
index bea42a88bf5..f8820999c9d 100644
--- a/src/libsyntax/parse/mod.rs
+++ b/src/libsyntax/parse/mod.rs
@@ -735,7 +735,20 @@ pub fn integer_lit(s: &str, suffix: Option<&str>, sd: &SpanHandler, sp: Span) ->
 
     let res: u64 = match ::std::num::from_str_radix(s, base).ok() {
         Some(r) => r,
-        None => { sd.span_err(sp, "int literal is too large"); 0 }
+        None => {
+            // small bases are lexed as if they were base 10, e.g, the string
+            // might be `0b10201`. This will cause the conversion above to fail,
+            // but these cases have errors in the lexer: we don't want to emit
+            // two errors, and we especially don't want to emit this error since
+            // it isn't necessarily true.
+            let already_errored = base < 10 &&
+                s.chars().any(|c| c.to_digit(10).map_or(false, |d| d >= base));
+
+            if !already_errored {
+                sd.span_err(sp, "int literal is too large");
+            }
+            0
+        }
     };
 
     // adjust the sign
author	Huon Wilson <dbau.pp+github@gmail.com>	2015-03-31 00:27:13 +1100
committer	Huon Wilson <dbau.pp+github@gmail.com>	2015-03-31 12:16:42 +1100
commit	606f50c46dd9a3852d36456d2015e1ccf832642e (patch)
tree	1c992543bb91d02c72328b9021202a9fdcdba01a /src/libsyntax/parse
parent	f00264074fb8f7e3b603eb3469822813632a6d30 (diff)
download	rust-606f50c46dd9a3852d36456d2015e1ccf832642e.tar.gz rust-606f50c46dd9a3852d36456d2015e1ccf832642e.zip