diff options
Diffstat (limited to 'src/libsyntax/parse')
| -rw-r--r-- | src/libsyntax/parse/lexer/mod.rs | 100 | ||||
| -rw-r--r-- | src/libsyntax/parse/mod.rs | 232 | ||||
| -rw-r--r-- | src/libsyntax/parse/parser.rs | 17 | ||||
| -rw-r--r-- | src/libsyntax/parse/token.rs | 34 |
4 files changed, 307 insertions, 76 deletions
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 7a9051c16ae..e1317e0ed35 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -685,7 +685,7 @@ impl<'a> StringReader<'a> { } - fn scan_numeric_escape(&mut self, n_hex_digits: uint, delim: char) -> char { + fn scan_numeric_escape(&mut self, n_hex_digits: uint, delim: char) -> bool { let mut accum_int = 0u32; let start_bpos = self.last_pos; for _ in range(0, n_hex_digits) { @@ -709,11 +709,11 @@ impl<'a> StringReader<'a> { } match char::from_u32(accum_int) { - Some(x) => x, + Some(_) => true, None => { let last_bpos = self.last_pos; self.err_span_(start_bpos, last_bpos, "illegal numeric character escape"); - '?' + false } } } @@ -721,8 +721,10 @@ impl<'a> StringReader<'a> { /// Scan for a single (possibly escaped) byte or char /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. /// `start` is the position of `first_source_char`, which is already consumed. + /// + /// Returns true if there was a valid char/byte, false otherwise. fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char, - ascii_only: bool, delim: char) -> Option<char> { + ascii_only: bool, delim: char) -> bool { match first_source_char { '\\' => { // '\X' for some X must be a character constant: @@ -732,24 +734,18 @@ impl<'a> StringReader<'a> { match escaped { None => {}, // EOF here is an error that will be checked later. Some(e) => { - return Some(match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', + return match e { + 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true, 'x' => self.scan_numeric_escape(2u, delim), 'u' if !ascii_only => self.scan_numeric_escape(4u, delim), 'U' if !ascii_only => self.scan_numeric_escape(8u, delim), '\n' if delim == '"' => { self.consume_whitespace(); - return None + true }, '\r' if delim == '"' && self.curr_is('\n') => { self.consume_whitespace(); - return None + true } c => { let last_pos = self.last_pos; @@ -758,9 +754,9 @@ impl<'a> StringReader<'a> { if ascii_only { "unknown byte escape" } else { "unknown character escape" }, c); - c + false } - }) + } } } } @@ -771,14 +767,16 @@ impl<'a> StringReader<'a> { if ascii_only { "byte constant must be escaped" } else { "character constant must be escaped" }, first_source_char); + return false; } '\r' => { if self.curr_is('\n') { self.bump(); - return Some('\n'); + return true; } else { self.err_span_(start, self.last_pos, "bare CR not allowed in string, use \\r instead"); + return false; } } _ => if ascii_only && first_source_char > '\x7F' { @@ -787,9 +785,10 @@ impl<'a> StringReader<'a> { start, last_pos, "byte constant must be ASCII. \ Use a \\xHH escape for a non-ASCII byte", first_source_char); + return false; } } - Some(first_source_char) + true } fn binop(&mut self, op: token::BinOp) -> token::Token { @@ -924,7 +923,7 @@ impl<'a> StringReader<'a> { let start = self.last_pos; // the eof will be picked up by the final `'` check below - let mut c2 = self.curr.unwrap_or('\x00'); + let c2 = self.curr.unwrap_or('\x00'); self.bump(); // If the character is an ident start not followed by another single @@ -967,7 +966,7 @@ impl<'a> StringReader<'a> { } // Otherwise it is a character constant: - c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap(); + let valid = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\''); if !self.curr_is('\'') { let last_bpos = self.last_pos; self.fatal_span_verbose( @@ -977,8 +976,9 @@ impl<'a> StringReader<'a> { start - BytePos(1), last_bpos, "unterminated character constant".to_string()); } + let id = if valid { self.ident_from(start) } else { str_to_ident("0") }; self.bump(); // advance curr past token - return token::LIT_CHAR(c2); + return token::LIT_CHAR(id); } 'b' => { self.bump(); @@ -991,8 +991,8 @@ impl<'a> StringReader<'a> { } '"' => { - let mut accum_str = String::new(); let start_bpos = self.last_pos; + let mut valid = true; self.bump(); while !self.curr_is('"') { if self.is_eof() { @@ -1003,11 +1003,13 @@ impl<'a> StringReader<'a> { let ch_start = self.last_pos; let ch = self.curr.unwrap(); self.bump(); - self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"') - .map(|ch| accum_str.push_char(ch)); + valid &= self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"'); } + // adjust for the ACSII " at the start of the literal + let id = if valid { self.ident_from(start_bpos + BytePos(1)) } + else { str_to_ident("??") }; self.bump(); - return token::LIT_STR(str_to_ident(accum_str.as_slice())); + return token::LIT_STR(id); } 'r' => { let start_bpos = self.last_pos; @@ -1032,7 +1034,7 @@ impl<'a> StringReader<'a> { self.bump(); let content_start_bpos = self.last_pos; let mut content_end_bpos; - let mut has_cr = false; + let mut valid = true; 'outer: loop { if self.is_eof() { let last_bpos = self.last_pos; @@ -1055,23 +1057,26 @@ impl<'a> StringReader<'a> { } } break; - } + }, '\r' => { - has_cr = true; + if !self.nextch_is('\n') { + let last_bpos = self.last_pos; + self.err_span_(start_bpos, last_bpos, "bare CR not allowed in raw \ + string, use \\r instead"); + valid = false; + } } _ => () } self.bump(); } self.bump(); - let str_content = self.with_str_from_to(content_start_bpos, content_end_bpos, |string| { - let string = if has_cr { - self.translate_crlf(content_start_bpos, string, - "bare CR not allowed in raw string") - } else { string.into_maybe_owned() }; - str_to_ident(string.as_slice()) - }); - return token::LIT_STR_RAW(str_content, hash_count); + let id = if valid { + self.ident_from_to(content_start_bpos, content_end_bpos) + } else { + str_to_ident("??") + }; + return token::LIT_STR_RAW(id, hash_count); } '-' => { if self.nextch_is('>') { @@ -1145,10 +1150,10 @@ impl<'a> StringReader<'a> { let start = self.last_pos; // the eof will be picked up by the final `'` check below - let mut c2 = self.curr.unwrap_or('\x00'); + let c2 = self.curr.unwrap_or('\x00'); self.bump(); - c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'').unwrap(); + let valid = self.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\''); if !self.curr_is('\'') { // Byte offsetting here is okay because the // character before position `start` are an @@ -1158,14 +1163,17 @@ impl<'a> StringReader<'a> { start - BytePos(2), last_pos, "unterminated byte constant".to_string()); } + + let id = if valid { self.ident_from(start) } else { str_to_ident("??") }; self.bump(); // advance curr past token - return token::LIT_BYTE(c2 as u8); + return token::LIT_BYTE(id); } fn scan_byte_string(&mut self) -> token::Token { self.bump(); let start = self.last_pos; - let mut value = Vec::new(); + let mut valid = true; + while !self.curr_is('"') { if self.is_eof() { let last_pos = self.last_pos; @@ -1176,11 +1184,11 @@ impl<'a> StringReader<'a> { let ch_start = self.last_pos; let ch = self.curr.unwrap(); self.bump(); - self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"') - .map(|ch| value.push(ch as u8)); + valid &= self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"'); } + let id = if valid { self.ident_from(start) } else { str_to_ident("??") }; self.bump(); - return token::LIT_BINARY(Rc::new(value)); + return token::LIT_BINARY(id); } fn scan_raw_byte_string(&mut self) -> token::Token { @@ -1231,10 +1239,8 @@ impl<'a> StringReader<'a> { self.bump(); } self.bump(); - let bytes = self.with_str_from_to(content_start_bpos, - content_end_bpos, - |s| s.as_bytes().to_owned()); - return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count); + return token::LIT_BINARY_RAW(self.ident_from_to(content_start_bpos, content_end_bpos), + hash_count); } } diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index bea8b6a94d4..62750e60bf8 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -272,7 +272,239 @@ pub fn maybe_aborted<T>(result: T, mut p: Parser) -> T { result } +/// Parse a string representing a character literal into its final form. +/// Rather than just accepting/rejecting a given literal, unescapes it as +/// well. Can take any slice prefixed by a character escape. Returns the +/// character and the number of characters consumed. +pub fn char_lit(lit: &str) -> (char, int) { + use std::{num, char}; + + let mut chars = lit.chars(); + let c = match (chars.next(), chars.next()) { + (Some(c), None) if c != '\\' => return (c, 1), + (Some('\\'), Some(c)) => match c { + '"' => Some('"'), + 'n' => Some('\n'), + 'r' => Some('\r'), + 't' => Some('\t'), + '\\' => Some('\\'), + '\'' => Some('\''), + '0' => Some('\0'), + _ => { None } + }, + _ => fail!("lexer accepted invalid char escape `{}`", lit) + }; + + match c { + Some(x) => return (x, 2), + None => { } + } + + let msg = format!("lexer should have rejected a bad character escape {}", lit); + let msg2 = msg.as_slice(); + + let esc: |uint| -> Option<(char, int)> = |len| + num::from_str_radix(lit.slice(2, len), 16) + .and_then(char::from_u32) + .map(|x| (x, len as int)); + + // Unicode escapes + return match lit.as_bytes()[1] as char { + 'x' | 'X' => esc(4), + 'u' => esc(6), + 'U' => esc(10), + _ => None, + }.expect(msg2); +} + +/// Parse a string representing a string literal into its final form. Does +/// unescaping. +pub fn str_lit(lit: &str) -> String { + debug!("parse_str_lit: given {}", lit.escape_default()); + let mut res = String::with_capacity(lit.len()); + + // FIXME #8372: This could be a for-loop if it didn't borrow the iterator + let error = |i| format!("lexer should have rejected {} at {}", lit, i); + + /// Eat everything up to a non-whitespace + fn eat<'a>(it: &mut ::std::iter::Peekable<(uint, char), ::std::str::CharOffsets<'a>>) { + loop { + match it.peek().map(|x| x.val1()) { + Some(' ') | Some('\n') | Some('\r') | Some('\t') => { + it.next(); + }, + _ => { break; } + } + } + } + + let mut chars = lit.char_indices().peekable(); + loop { + match chars.next() { + Some((i, c)) => { + let em = error(i); + match c { + '\\' => { + if chars.peek().expect(em.as_slice()).val1() == '\n' { + eat(&mut chars); + } else if chars.peek().expect(em.as_slice()).val1() == '\r' { + chars.next(); + if chars.peek().expect(em.as_slice()).val1() != '\n' { + fail!("lexer accepted bare CR"); + } + eat(&mut chars); + } else { + // otherwise, a normal escape + let (c, n) = char_lit(lit.slice_from(i)); + for _ in range(0, n - 1) { // we don't need to move past the first \ + chars.next(); + } + res.push_char(c); + } + }, + '\r' => { + if chars.peek().expect(em.as_slice()).val1() != '\n' { + fail!("lexer accepted bare CR"); + } + chars.next(); + res.push_char('\n'); + } + c => res.push_char(c), + } + }, + None => break + } + } + + res.shrink_to_fit(); // probably not going to do anything, unless there was an escape. + debug!("parse_str_lit: returning {}", res); + res +} + +/// Parse a string representing a raw string literal into its final form. The +/// only operation this does is convert embedded CRLF into a single LF. +pub fn raw_str_lit(lit: &str) -> String { + debug!("raw_str_lit: given {}", lit.escape_default()); + let mut res = String::with_capacity(lit.len()); + + // FIXME #8372: This could be a for-loop if it didn't borrow the iterator + let mut chars = lit.chars().peekable(); + loop { + match chars.next() { + Some(c) => { + if c == '\r' { + if *chars.peek().unwrap() != '\n' { + fail!("lexer accepted bare CR"); + } + chars.next(); + res.push_char('\n'); + } else { + res.push_char(c); + } + }, + None => break + } + } + res.shrink_to_fit(); + res +} + +pub fn float_lit(s: &str) -> ast::Lit_ { + debug!("float_lit: {}", s); + // FIXME #2252: bounds checking float literals is defered until trans + let s2 = s.chars().filter(|&c| c != '_').collect::<String>(); + let s = s2.as_slice(); + + let mut ty = None; + + if s.ends_with("f32") { + ty = Some(ast::TyF32); + } else if s.ends_with("f64") { + ty = Some(ast::TyF64); + } + + + match ty { + Some(t) => { + ast::LitFloat(token::intern_and_get_ident(s.slice_to(s.len() - t.suffix_len())), t) + }, + None => ast::LitFloatUnsuffixed(token::intern_and_get_ident(s)) + } +} + +/// Parse a string representing a byte literal into its final form. Similar to `char_lit` +pub fn byte_lit(lit: &str) -> (u8, uint) { + let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i); + + if lit.len() == 1 { + (lit.as_bytes()[0], 1) + } else { + assert!(lit.as_bytes()[0] == b'\\', err(0i)); + let b = match lit.as_bytes()[1] { + b'"' => b'"', + b'n' => b'\n', + b'r' => b'\r', + b't' => b'\t', + b'\\' => b'\\', + b'\'' => b'\'', + b'0' => b'\0', + _ => { + match ::std::num::from_str_radix::<u64>(lit.slice(2, 4), 16) { + Some(c) => + if c > 0xFF { + fail!(err(2)) + } else { + return (c as u8, 4) + }, + None => fail!(err(3)) + } + } + }; + return (b, 2); + } +} + +pub fn binary_lit(lit: &str) -> Rc<Vec<u8>> { + let mut res = Vec::with_capacity(lit.len()); + + // FIXME #8372: This could be a for-loop if it didn't borrow the iterator + let error = |i| format!("lexer should have rejected {} at {}", lit, i); + + // binary literals *must* be ASCII, but the escapes don't have to be + let mut chars = lit.as_bytes().iter().enumerate().peekable(); + loop { + match chars.next() { + Some((i, &c)) => { + if c == b'\\' { + if *chars.peek().expect(error(i).as_slice()).val1() == b'\n' { + loop { + // eat everything up to a non-whitespace + match chars.peek().map(|x| *x.val1()) { + Some(b' ') | Some(b'\n') | Some(b'\r') | Some(b'\t') => { + chars.next(); + }, + _ => { break; } + } + } + } else { + // otherwise, a normal escape + let (c, n) = byte_lit(lit.slice_from(i)); + for _ in range(0, n - 1) { // we don't need to move past the first \ + chars.next(); + } + res.push(c); + } + } else { + res.push(c); + } + }, + None => { break; } + } + } + + Rc::new(res) +} #[cfg(test)] mod test { diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index 3bf88424891..553e685bdde 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -61,6 +61,7 @@ use ast_util::{as_prec, ident_to_path, lit_is_str, operator_prec}; use ast_util; use codemap::{Span, BytePos, Spanned, spanned, mk_sp}; use codemap; +use parse; use parse::attr::ParserAttr; use parse::classify; use parse::common::{SeqSep, seq_sep_none}; @@ -1543,8 +1544,8 @@ impl<'a> Parser<'a> { /// Matches token_lit = LIT_INT | ... pub fn lit_from_token(&mut self, tok: &token::Token) -> Lit_ { match *tok { - token::LIT_BYTE(i) => LitByte(i), - token::LIT_CHAR(i) => LitChar(i), + token::LIT_BYTE(i) => LitByte(parse::byte_lit(i.as_str()).val0()), + token::LIT_CHAR(i) => LitChar(parse::char_lit(i.as_str()).val0()), token::LIT_INT(i, it) => LitInt(i, it), token::LIT_UINT(u, ut) => LitUint(u, ut), token::LIT_INT_UNSUFFIXED(i) => LitIntUnsuffixed(i), @@ -1555,13 +1556,17 @@ impl<'a> Parser<'a> { LitFloatUnsuffixed(self.id_to_interned_str(s)) } token::LIT_STR(s) => { - LitStr(self.id_to_interned_str(s), ast::CookedStr) + LitStr(token::intern_and_get_ident(parse::str_lit(s.as_str()).as_slice()), + ast::CookedStr) } token::LIT_STR_RAW(s, n) => { - LitStr(self.id_to_interned_str(s), ast::RawStr(n)) + LitStr(token::intern_and_get_ident(parse::raw_str_lit(s.as_str()).as_slice()), + ast::RawStr(n)) } - token::LIT_BINARY_RAW(ref v, _) | - token::LIT_BINARY(ref v) => LitBinary(v.clone()), + token::LIT_BINARY(i) => + LitBinary(parse::binary_lit(self.id_to_interned_str(i).get())), + token::LIT_BINARY_RAW(i, _) => + LitBinary(Rc::new(i.as_str().as_bytes().iter().map(|&x| x).collect())), token::LPAREN => { self.expect(&token::RPAREN); LitNil }, _ => { self.unexpected_last(tok); } } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index c7aeae04ba2..bb6183b7e9e 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -79,8 +79,8 @@ pub enum Token { QUESTION, /* Literals */ - LIT_BYTE(u8), - LIT_CHAR(char), + LIT_BYTE(Ident), + LIT_CHAR(Ident), LIT_INT(i64, ast::IntTy), LIT_UINT(u64, ast::UintTy), LIT_INT_UNSUFFIXED(i64), @@ -88,8 +88,8 @@ pub enum Token { LIT_FLOAT_UNSUFFIXED(Ident), LIT_STR(Ident), LIT_STR_RAW(Ident, uint), /* raw str delimited by n hash symbols */ - LIT_BINARY(Rc<Vec<u8>>), - LIT_BINARY_RAW(Rc<Vec<u8>>, uint), /* raw binary str delimited by n hash symbols */ + LIT_BINARY(Ident), + LIT_BINARY_RAW(Ident, uint), /* raw binary str delimited by n hash symbols */ /* Name components */ /// An identifier contains an "is_mod_name" boolean, @@ -201,20 +201,10 @@ pub fn to_string(t: &Token) -> String { /* Literals */ LIT_BYTE(b) => { - let mut res = String::from_str("b'"); - (b as char).escape_default(|c| { - res.push_char(c); - }); - res.push_char('\''); - res + format!("b'{}'", get_ident(b).get()) } LIT_CHAR(c) => { - let mut res = String::from_str("'"); - c.escape_default(|c| { - res.push_char(c); - }); - res.push_char('\''); - res + format!("'{}'", get_ident(c).get()) } LIT_INT(i, t) => ast_util::int_ty_to_string(t, Some(i)), LIT_UINT(u, t) => ast_util::uint_ty_to_string(t, Some(u)), @@ -235,20 +225,18 @@ pub fn to_string(t: &Token) -> String { body } LIT_STR(s) => { - format!("\"{}\"", get_ident(s).get().escape_default()) + format!("\"{}\"", get_ident(s).get()) } LIT_STR_RAW(s, n) => { format!("r{delim}\"{string}\"{delim}", delim="#".repeat(n), string=get_ident(s)) } - LIT_BINARY(ref v) => { - format!( - "b\"{}\"", - v.iter().map(|&b| b as char).collect::<String>().escape_default()) + LIT_BINARY(v) => { + format!("b\"{}\"", get_ident(v).get()) } - LIT_BINARY_RAW(ref s, n) => { + LIT_BINARY_RAW(s, n) => { format!("br{delim}\"{string}\"{delim}", - delim="#".repeat(n), string=s.as_slice().to_ascii().as_str_ascii()) + delim="#".repeat(n), string=get_ident(s).get()) } /* Name components */ |
