diff options
| author | bors <bors@rust-lang.org> | 2014-06-18 02:06:37 +0000 |
|---|---|---|
| committer | bors <bors@rust-lang.org> | 2014-06-18 02:06:37 +0000 |
| commit | d6736a1440d42f6af967a8a20ab8d73522112b72 (patch) | |
| tree | 2a92204127cf1512a60c81ecbb5700288d8f1b92 /src/libsyntax/parse | |
| parent | 5c81a186e9d835ca66865bd9807524b805a06d8d (diff) | |
| parent | 3744d828513092d1ed64c4c6f8cd2536f7a5ff0d (diff) | |
| download | rust-d6736a1440d42f6af967a8a20ab8d73522112b72.tar.gz rust-d6736a1440d42f6af967a8a20ab8d73522112b72.zip | |
auto merge of #14880 : SimonSapin/rust/byte-literals, r=alexcrichton
See #14646 (tracking issue) and rust-lang/rfcs#69. This does not close the tracking issue, as the `bytes!()` macro still needs to be removed. It will be removed later, after a snapshot is made with the changes in this PR, so that the new syntax can be used when bootstrapping the compiler.
Diffstat (limited to 'src/libsyntax/parse')
| -rw-r--r-- | src/libsyntax/parse/lexer/mod.rs | 257 | ||||
| -rw-r--r-- | src/libsyntax/parse/parser.rs | 5 | ||||
| -rw-r--r-- | src/libsyntax/parse/token.rs | 34 |
3 files changed, 213 insertions, 83 deletions
diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index f7eac0b323f..9039f346edb 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -636,6 +636,67 @@ impl<'a> StringReader<'a> { } } + /// Scan for a single (possibly escaped) byte or char + /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. + /// `start` is the position of `first_source_char`, which is already consumed. + fn scan_char_or_byte(&mut self, start: BytePos, first_source_char: char, + ascii_only: bool, delim: char) -> Option<char> { + match first_source_char { + '\\' => { + // '\X' for some X must be a character constant: + let escaped = self.curr; + let escaped_pos = self.last_pos; + self.bump(); + match escaped { + None => {}, // EOF here is an error that will be checked later. + Some(e) => { + return Some(match e { + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '"' => '"', + '0' => '\x00', + 'x' => self.scan_numeric_escape(2u, delim), + 'u' if !ascii_only => self.scan_numeric_escape(4u, delim), + 'U' if !ascii_only => self.scan_numeric_escape(8u, delim), + '\n' if delim == '"' => { + self.consume_whitespace(); + return None + }, + c => { + let last_pos = self.last_pos; + self.err_span_char( + escaped_pos, last_pos, + if ascii_only { "unknown byte escape" } + else { "unknown character escape" }, + c); + c + } + }) + } + } + } + '\t' | '\n' | '\r' | '\'' if delim == '\'' => { + let last_pos = self.last_pos; + self.err_span_char( + start, last_pos, + if ascii_only { "byte constant must be escaped" } + else { "character constant must be escaped" }, + first_source_char); + } + _ => if ascii_only && first_source_char > '\x7F' { + let last_pos = self.last_pos; + self.err_span_char( + start, last_pos, + "byte constant must be ASCII. 
\ + Use a \\xHH escape for a non-ASCII byte", first_source_char); + } + } + Some(first_source_char) + } + fn binop(&mut self, op: token::BinOp) -> token::Token { self.bump(); if self.curr_is('=') { @@ -650,10 +711,15 @@ impl<'a> StringReader<'a> { /// token, and updates the interner fn next_token_inner(&mut self) -> token::Token { let c = self.curr; - if ident_start(c) && !self.nextch_is('"') && !self.nextch_is('#') { + if ident_start(c) && match (c.unwrap(), self.nextch(), self.nextnextch()) { // Note: r as in r" or r#" is part of a raw string literal, - // not an identifier, and is handled further down. - + // b as in b' is part of a byte literal. + // They are not identifiers, and are handled further down. + ('r', Some('"'), _) | ('r', Some('#'), _) | + ('b', Some('"'), _) | ('b', Some('\''), _) | + ('b', Some('r'), Some('"')) | ('b', Some('r'), Some('#')) => false, + _ => true + } { let start = self.last_pos; while ident_continue(self.curr) { self.bump(); @@ -805,43 +871,7 @@ impl<'a> StringReader<'a> { } // Otherwise it is a character constant: - match c2 { - '\\' => { - // '\X' for some X must be a character constant: - let escaped = self.curr; - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - None => {} - Some(e) => { - c2 = match e { - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - '\\' => '\\', - '\'' => '\'', - '"' => '"', - '0' => '\x00', - 'x' => self.scan_numeric_escape(2u, '\''), - 'u' => self.scan_numeric_escape(4u, '\''), - 'U' => self.scan_numeric_escape(8u, '\''), - c2 => { - let last_bpos = self.last_pos; - self.err_span_char(escaped_pos, last_bpos, - "unknown character escape", c2); - c2 - } - } - } - } - } - '\t' | '\n' | '\r' | '\'' => { - let last_bpos = self.last_pos; - self.err_span_char( start, last_bpos, - "character constant must be escaped", c2); - } - _ => {} - } + c2 = self.scan_char_or_byte(start, c2, /* ascii_only = */ false, '\'').unwrap(); if !self.curr_is('\'') { let last_bpos = self.last_pos; 
self.fatal_span_verbose( @@ -854,6 +884,112 @@ impl<'a> StringReader<'a> { self.bump(); // advance curr past token return token::LIT_CHAR(c2); } + 'b' => { + self.bump(); + return match self.curr { + Some('\'') => parse_byte(self), + Some('"') => parse_byte_string(self), + Some('r') => parse_raw_byte_string(self), + _ => unreachable!() // Should have been a token::IDENT above. + }; + + fn parse_byte(self_: &mut StringReader) -> token::Token { + self_.bump(); + let start = self_.last_pos; + + // the eof will be picked up by the final `'` check below + let mut c2 = self_.curr.unwrap_or('\x00'); + self_.bump(); + + c2 = self_.scan_char_or_byte(start, c2, /* ascii_only = */ true, '\'').unwrap(); + if !self_.curr_is('\'') { + // Byte offsetting here is okay because the + // character before position `start` are an + // ascii single quote and ascii 'b'. + let last_pos = self_.last_pos; + self_.fatal_span_verbose( + start - BytePos(2), last_pos, + "unterminated byte constant".to_string()); + } + self_.bump(); // advance curr past token + return token::LIT_BYTE(c2 as u8); + } + + fn parse_byte_string(self_: &mut StringReader) -> token::Token { + self_.bump(); + let start = self_.last_pos; + let mut value = Vec::new(); + while !self_.curr_is('"') { + if self_.is_eof() { + let last_pos = self_.last_pos; + self_.fatal_span(start, last_pos, + "unterminated double quote byte string"); + } + + let ch_start = self_.last_pos; + let ch = self_.curr.unwrap(); + self_.bump(); + self_.scan_char_or_byte(ch_start, ch, /* ascii_only = */ true, '"') + .map(|ch| value.push(ch as u8)); + } + self_.bump(); + return token::LIT_BINARY(Rc::new(value)); + } + + fn parse_raw_byte_string(self_: &mut StringReader) -> token::Token { + let start_bpos = self_.last_pos; + self_.bump(); + let mut hash_count = 0u; + while self_.curr_is('#') { + self_.bump(); + hash_count += 1; + } + + if self_.is_eof() { + let last_pos = self_.last_pos; + self_.fatal_span(start_bpos, last_pos, "unterminated raw string"); 
+ } else if !self_.curr_is('"') { + let last_pos = self_.last_pos; + let ch = self_.curr.unwrap(); + self_.fatal_span_char(start_bpos, last_pos, + "only `#` is allowed in raw string delimitation; \ + found illegal character", + ch); + } + self_.bump(); + let content_start_bpos = self_.last_pos; + let mut content_end_bpos; + 'outer: loop { + match self_.curr { + None => { + let last_pos = self_.last_pos; + self_.fatal_span(start_bpos, last_pos, "unterminated raw string") + }, + Some('"') => { + content_end_bpos = self_.last_pos; + for _ in range(0, hash_count) { + self_.bump(); + if !self_.curr_is('#') { + continue 'outer; + } + } + break; + }, + Some(c) => if c > '\x7F' { + let last_pos = self_.last_pos; + self_.err_span_char( + last_pos, last_pos, "raw byte string must be ASCII", c); + } + } + self_.bump(); + } + self_.bump(); + let bytes = self_.with_str_from_to(content_start_bpos, + content_end_bpos, + |s| s.as_bytes().to_owned()); + return token::LIT_BINARY_RAW(Rc::new(bytes), hash_count); + } + } '"' => { let mut accum_str = String::new(); let start_bpos = self.last_pos; @@ -864,46 +1000,11 @@ impl<'a> StringReader<'a> { self.fatal_span(start_bpos, last_bpos, "unterminated double quote string"); } + let ch_start = self.last_pos; let ch = self.curr.unwrap(); self.bump(); - match ch { - '\\' => { - if self.is_eof() { - let last_bpos = self.last_pos; - self.fatal_span(start_bpos, last_bpos, - "unterminated double quote string"); - } - - let escaped = self.curr.unwrap(); - let escaped_pos = self.last_pos; - self.bump(); - match escaped { - 'n' => accum_str.push_char('\n'), - 'r' => accum_str.push_char('\r'), - 't' => accum_str.push_char('\t'), - '\\' => accum_str.push_char('\\'), - '\'' => accum_str.push_char('\''), - '"' => accum_str.push_char('"'), - '\n' => self.consume_whitespace(), - '0' => accum_str.push_char('\x00'), - 'x' => { - accum_str.push_char(self.scan_numeric_escape(2u, '"')); - } - 'u' => { - accum_str.push_char(self.scan_numeric_escape(4u, '"')); 
- } - 'U' => { - accum_str.push_char(self.scan_numeric_escape(8u, '"')); - } - c2 => { - let last_bpos = self.last_pos; - self.err_span_char(escaped_pos, last_bpos, - "unknown string escape", c2); - } - } - } - _ => accum_str.push_char(ch) - } + self.scan_char_or_byte(ch_start, ch, /* ascii_only = */ false, '"') + .map(|ch| accum_str.push_char(ch)); } self.bump(); return token::LIT_STR(str_to_ident(accum_str.as_slice())); diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index bbe0680ef14..ae2ec216bee 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -33,7 +33,7 @@ use ast::{ForeignItem, ForeignItemStatic, ForeignItemFn, ForeignMod}; use ast::{Ident, NormalFn, Inherited, Item, Item_, ItemStatic}; use ast::{ItemEnum, ItemFn, ItemForeignMod, ItemImpl}; use ast::{ItemMac, ItemMod, ItemStruct, ItemTrait, ItemTy, Lit, Lit_}; -use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar}; +use ast::{LitBool, LitFloat, LitFloatUnsuffixed, LitInt, LitChar, LitByte, LitBinary}; use ast::{LitIntUnsuffixed, LitNil, LitStr, LitUint, Local, LocalLet}; use ast::{MutImmutable, MutMutable, Mac_, MacInvocTT, Matcher, MatchNonterminal}; use ast::{MatchSeq, MatchTok, Method, MutTy, BiMul, Mutability}; @@ -1512,6 +1512,7 @@ impl<'a> Parser<'a> { // matches token_lit = LIT_INT | ... 
pub fn lit_from_token(&mut self, tok: &token::Token) -> Lit_ { match *tok { + token::LIT_BYTE(i) => LitByte(i), token::LIT_CHAR(i) => LitChar(i), token::LIT_INT(i, it) => LitInt(i, it), token::LIT_UINT(u, ut) => LitUint(u, ut), @@ -1528,6 +1529,8 @@ impl<'a> Parser<'a> { token::LIT_STR_RAW(s, n) => { LitStr(self.id_to_interned_str(s), ast::RawStr(n)) } + token::LIT_BINARY_RAW(ref v, _) | + token::LIT_BINARY(ref v) => LitBinary(v.clone()), token::LPAREN => { self.expect(&token::RPAREN); LitNil }, _ => { self.unexpected_last(tok); } } diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index a4a022708d9..a2af417ed79 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -78,6 +78,7 @@ pub enum Token { DOLLAR, /* Literals */ + LIT_BYTE(u8), LIT_CHAR(char), LIT_INT(i64, ast::IntTy), LIT_UINT(u64, ast::UintTy), @@ -86,6 +87,8 @@ pub enum Token { LIT_FLOAT_UNSUFFIXED(ast::Ident), LIT_STR(ast::Ident), LIT_STR_RAW(ast::Ident, uint), /* raw str delimited by n hash symbols */ + LIT_BINARY(Rc<Vec<u8>>), + LIT_BINARY_RAW(Rc<Vec<u8>>, uint), /* raw binary str delimited by n hash symbols */ /* Name components */ // an identifier contains an "is_mod_name" boolean, @@ -193,6 +196,14 @@ pub fn to_str(t: &Token) -> String { DOLLAR => "$".to_string(), /* Literals */ + LIT_BYTE(b) => { + let mut res = String::from_str("b'"); + (b as char).escape_default(|c| { + res.push_char(c); + }); + res.push_char('\''); + res + } LIT_CHAR(c) => { let mut res = String::from_str("'"); c.escape_default(|c| { @@ -222,17 +233,26 @@ pub fn to_str(t: &Token) -> String { body } LIT_STR(s) => { - (format!("\"{}\"", get_ident(s).get().escape_default())).to_string() + format!("\"{}\"", get_ident(s).get().escape_default()) } LIT_STR_RAW(s, n) => { - (format!("r{delim}\"{string}\"{delim}", - delim="#".repeat(n), string=get_ident(s))).to_string() + format!("r{delim}\"{string}\"{delim}", + delim="#".repeat(n), string=get_ident(s)) + } + LIT_BINARY(ref v) => { + 
format!( + "b\"{}\"", + v.iter().map(|&b| b as char).collect::<String>().escape_default()) + } + LIT_BINARY_RAW(ref s, n) => { + format!("br{delim}\"{string}\"{delim}", + delim="#".repeat(n), string=s.as_slice().to_ascii().as_str_ascii()) } /* Name components */ IDENT(s, _) => get_ident(s).get().to_string(), LIFETIME(s) => { - (format!("{}", get_ident(s))).to_string() + format!("{}", get_ident(s)) } UNDERSCORE => "_".to_string(), @@ -273,6 +293,7 @@ pub fn can_begin_expr(t: &Token) -> bool { IDENT(_, _) => true, UNDERSCORE => true, TILDE => true, + LIT_BYTE(_) => true, LIT_CHAR(_) => true, LIT_INT(_, _) => true, LIT_UINT(_, _) => true, @@ -281,6 +302,8 @@ pub fn can_begin_expr(t: &Token) -> bool { LIT_FLOAT_UNSUFFIXED(_) => true, LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, + LIT_BINARY(_) => true, + LIT_BINARY_RAW(_, _) => true, POUND => true, AT => true, NOT => true, @@ -311,6 +334,7 @@ pub fn close_delimiter_for(t: &Token) -> Option<Token> { pub fn is_lit(t: &Token) -> bool { match *t { + LIT_BYTE(_) => true, LIT_CHAR(_) => true, LIT_INT(_, _) => true, LIT_UINT(_, _) => true, @@ -319,6 +343,8 @@ pub fn is_lit(t: &Token) -> bool { LIT_FLOAT_UNSUFFIXED(_) => true, LIT_STR(_) => true, LIT_STR_RAW(_, _) => true, + LIT_BINARY(_) => true, + LIT_BINARY_RAW(_, _) => true, _ => false } } |
