diff options
| author | Alex Crichton <alex@alexcrichton.com> | 2014-05-16 14:23:04 -0700 |
|---|---|---|
| committer | Alex Crichton <alex@alexcrichton.com> | 2014-05-17 01:01:47 -0700 |
| commit | 4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23 (patch) | |
| tree | c01cb3c61732225abd28d5eb8991537ca7ad30b5 /src/libregex/parse/mod.rs | |
| parent | 25c54226c3e7dd6f59cf2e92238a4d79d8b0128d (diff) | |
| download | rust-4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23.tar.gz rust-4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23.zip | |
syntax: Tighten search paths for inner modules
This is an implementation of RFC 16. A module can now only be loaded if the
module declaring `mod name;` "owns" the current directory. A module is
considered as owning its directory if it meets one of the following criteria:
* It is the top-level crate file
* It is a `mod.rs` file
* It was loaded via `#[path]`
* It was loaded via `include!`
* The module was declared via an inline `mod foo { ... }` statement
For example, this directory structure is now invalid
// lib.rs
mod foo;
// foo.rs
mod bar;
// bar.rs;
fn bar() {}
With this change `foo.rs` must be renamed to `foo/mod.rs`, and `bar.rs` must be
renamed to `foo/bar.rs`. This makes it clear that `bar` is a submodule of `foo`,
and can only be accessed through `foo`.
RFC: 0016-module-file-system-hierarchy
Closes #14180
[breaking-change]
Diffstat (limited to 'src/libregex/parse/mod.rs')
| -rw-r--r-- | src/libregex/parse/mod.rs | 1030 |
1 files changed, 1030 insertions, 0 deletions
diff --git a/src/libregex/parse/mod.rs b/src/libregex/parse/mod.rs new file mode 100644 index 00000000000..a695da9fa16 --- /dev/null +++ b/src/libregex/parse/mod.rs @@ -0,0 +1,1030 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::char; +use std::cmp; +use std::fmt; +use std::iter; +use std::num; +use std::str; + +/// Static data containing Unicode ranges for general categories and scripts. +use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW}; +#[allow(visible_private_types)] +pub mod unicode; + +/// The maximum number of repetitions allowed with the `{n,m}` syntax. +static MAX_REPEAT: uint = 1000; + +/// Error corresponds to something that can go wrong while parsing +/// a regular expression. +/// +/// (Once an expression is compiled, it is not possible to produce an error +/// via searching, splitting or replacing.) +pub struct Error { + /// The *approximate* character index of where the error occurred. + pub pos: uint, + /// A message describing the error. + pub msg: StrBuf, +} + +impl fmt::Show for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Regex syntax error near position {}: {}", + self.pos, self.msg) + } +} + +/// Represents the abstract syntax of a regular expression. +/// It is showable so that error messages resulting from a bug can provide +/// useful information. +/// It is cloneable so that expressions can be repeated for the counted +/// repetition feature. (No other copying is done.) +/// +/// Note that this representation prevents one from reproducing the regex as +/// it was typed. (But it could be used to reproduce an equivalent regex.) +#[deriving(Show, Clone)] +pub enum Ast { + Nothing, + Literal(char, Flags), + Dot(Flags), + Class(Vec<(char, char)>, Flags), + Begin(Flags), + End(Flags), + WordBoundary(Flags), + Capture(uint, Option<StrBuf>, Box<Ast>), + // Represent concatenation as a flat vector to avoid blowing the + // stack in the compiler. + Cat(Vec<Ast>), + Alt(Box<Ast>, Box<Ast>), + Rep(Box<Ast>, Repeater, Greed), +} + +#[deriving(Show, Eq, Clone)] +pub enum Repeater { + ZeroOne, + ZeroMore, + OneMore, +} + +#[deriving(Show, Clone)] +pub enum Greed { + Greedy, + Ungreedy, +} + +impl Greed { + pub fn is_greedy(&self) -> bool { + match *self { + Greedy => true, + _ => false, + } + } + + fn swap(self, swapped: bool) -> Greed { + if !swapped { return self } + match self { + Greedy => Ungreedy, + Ungreedy => Greedy, + } + } +} + +/// BuildAst is a regrettable type that represents intermediate state for +/// constructing an abstract syntax tree. Its central purpose is to facilitate +/// parsing groups and alternations while also maintaining a stack of flag +/// state. +#[deriving(Show)] +enum BuildAst { + Ast(Ast), + Paren(Flags, uint, StrBuf), // '(' + Bar, // '|' +} + +impl BuildAst { + fn paren(&self) -> bool { + match *self { + Paren(_, _, _) => true, + _ => false, + } + } + + fn flags(&self) -> Flags { + match *self { + Paren(flags, _, _) => flags, + _ => fail!("Cannot get flags from {}", self), + } + } + + fn capture(&self) -> Option<uint> { + match *self { + Paren(_, 0, _) => None, + Paren(_, c, _) => Some(c), + _ => fail!("Cannot get capture group from {}", self), + } + } + + fn capture_name(&self) -> Option<StrBuf> { + match *self { + Paren(_, 0, _) => None, + Paren(_, _, ref name) => { + if name.len() == 0 { + None + } else { + Some(name.clone()) + } + } + _ => fail!("Cannot get capture name from {}", self), + } + } + + fn bar(&self) -> bool { + match *self { + Bar => true, + _ => false, + } + } + + fn unwrap(self) -> Result<Ast, Error> { + match self { + Ast(x) => Ok(x), + _ => fail!("Tried to unwrap non-AST item: {}", self), + } + } +} + +/// Flags represents all options that can be twiddled by a user in an +/// expression. +pub type Flags = u8; + +pub static FLAG_EMPTY: u8 = 0; +pub static FLAG_NOCASE: u8 = 1 << 0; // i +pub static FLAG_MULTI: u8 = 1 << 1; // m +pub static FLAG_DOTNL: u8 = 1 << 2; // s +pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U +pub static FLAG_NEGATED: u8 = 1 << 4; // char class or not word boundary + +struct Parser<'a> { + // The input, parsed only as a sequence of UTF8 code points. + chars: Vec<char>, + // The index of the current character in the input. + chari: uint, + // The intermediate state representing the AST. + stack: Vec<BuildAst>, + // The current set of flags. + flags: Flags, + // The total number of capture groups. + // Incremented each time an opening left paren is seen (assuming it is + // opening a capture group). + caps: uint, + // A set of all capture group names used only to detect duplicates. + names: Vec<StrBuf>, +} + +pub fn parse(s: &str) -> Result<Ast, Error> { + Parser { + chars: s.chars().collect(), + chari: 0, + stack: vec!(), + flags: FLAG_EMPTY, + caps: 0, + names: vec!(), + }.parse() +} + +impl<'a> Parser<'a> { + fn parse(&mut self) -> Result<Ast, Error> { + loop { + let c = self.cur(); + match c { + '?' | '*' | '+' => try!(self.push_repeater(c)), + '\\' => { + let ast = try!(self.parse_escape()); + self.push(ast) + } + '{' => try!(self.parse_counted()), + '[' => match self.try_parse_ascii() { + None => try!(self.parse_class()), + Some(class) => self.push(class), + }, + '(' => { + if self.peek_is(1, '?') { + try!(self.expect('?')) + try!(self.parse_group_opts()) + } else { + self.caps += 1; + self.stack.push(Paren(self.flags, + self.caps, + "".to_strbuf())) + } + } + ')' => { + let catfrom = try!( + self.pos_last(false, |x| x.paren() || x.bar())); + try!(self.concat(catfrom)); + + let altfrom = try!(self.pos_last(false, |x| x.paren())); + // Before we smush the alternates together and pop off the + // left paren, let's grab the old flags and see if we + // need a capture. + let (cap, cap_name, oldflags) = { + let paren = self.stack.get(altfrom-1); + (paren.capture(), paren.capture_name(), paren.flags()) + }; + try!(self.alternate(altfrom)); + self.flags = oldflags; + + // If this was a capture, pop what we just pushed in + // alternate and make it a capture. + if cap.is_some() { + let ast = try!(self.pop_ast()); + self.push(Capture(cap.unwrap(), cap_name, box ast)); + } + } + '|' => { + let catfrom = try!( + self.pos_last(true, |x| x.paren() || x.bar())); + try!(self.concat(catfrom)); + + self.stack.push(Bar); + } + _ => try!(self.push_literal(c)), + } + if !self.next_char() { + break + } + } + + // Try to improve error handling. At this point, there should be + // no remaining open parens. + if self.stack.iter().any(|x| x.paren()) { + return self.err("Unclosed parenthesis.") + } + let catfrom = try!(self.pos_last(true, |x| x.bar())); + try!(self.concat(catfrom)); + try!(self.alternate(0)); + + assert!(self.stack.len() == 1); + self.pop_ast() + } + + fn noteof(&mut self, expected: &str) -> Result<(), Error> { + match self.next_char() { + true => Ok(()), + false => self.err(format!("Expected {} but got EOF.", expected)), + } + } + + fn expect(&mut self, expected: char) -> Result<(), Error> { + match self.next_char() { + true if self.cur() == expected => Ok(()), + true => self.err(format!("Expected '{}' but got '{}'.", + expected, self.cur())), + false => self.err(format!("Expected '{}' but got EOF.", expected)), + } + } + + fn next_char(&mut self) -> bool { + self.chari += 1; + self.chari < self.chars.len() + } + + fn pop_ast(&mut self) -> Result<Ast, Error> { + match self.stack.pop().unwrap().unwrap() { + Err(e) => Err(e), + Ok(ast) => Ok(ast), + } + } + + fn push(&mut self, ast: Ast) { + self.stack.push(Ast(ast)) + } + + fn push_repeater(&mut self, c: char) -> Result<(), Error> { + if self.stack.len() == 0 { + return self.err( + "A repeat operator must be preceded by a valid expression.") + } + let rep: Repeater = match c { + '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore, + _ => fail!("Not a valid repeater operator."), + }; + + match self.peek(1) { + Some('*') | Some('+') => + return self.err( + "Double repeat operators are not supported."), + _ => {}, + } + let ast = try!(self.pop_ast()); + match ast { + Begin(_) | End(_) | WordBoundary(_) => + return self.err( + "Repeat arguments cannot be empty width assertions."), + _ => {} + } + let greed = try!(self.get_next_greedy()); + self.push(Rep(box ast, rep, greed)); + Ok(()) + } + + fn push_literal(&mut self, c: char) -> Result<(), Error> { + match c { + '.' => { + self.push(Dot(self.flags)) + } + '^' => { + self.push(Begin(self.flags)) + } + '$' => { + self.push(End(self.flags)) + } + _ => { + self.push(Literal(c, self.flags)) + } + } + Ok(()) + } + + // Parses all forms of character classes. + // Assumes that '[' is the current character. + fn parse_class(&mut self) -> Result<(), Error> { + let negated = + if self.peek_is(1, '^') { + try!(self.expect('^')) + FLAG_NEGATED + } else { + FLAG_EMPTY + }; + let mut ranges: Vec<(char, char)> = vec!(); + let mut alts: Vec<Ast> = vec!(); + + if self.peek_is(1, ']') { + try!(self.expect(']')) + ranges.push((']', ']')) + } + while self.peek_is(1, '-') { + try!(self.expect('-')) + ranges.push(('-', '-')) + } + loop { + try!(self.noteof("a closing ']' or a non-empty character class)")) + let mut c = self.cur(); + match c { + '[' => + match self.try_parse_ascii() { + Some(Class(asciis, flags)) => { + alts.push(Class(asciis, flags ^ negated)); + continue + } + Some(ast) => + fail!("Expected Class AST but got '{}'", ast), + // Just drop down and try to add as a regular character. + None => {}, + }, + '\\' => { + match try!(self.parse_escape()) { + Class(asciis, flags) => { + alts.push(Class(asciis, flags ^ negated)); + continue + } + Literal(c2, _) => c = c2, // process below + Begin(_) | End(_) | WordBoundary(_) => + return self.err( + "\\A, \\z, \\b and \\B are not valid escape \ + sequences inside a character class."), + ast => fail!("Unexpected AST item '{}'", ast), + } + } + _ => {}, + } + match c { + ']' => { + if ranges.len() > 0 { + let flags = negated | (self.flags & FLAG_NOCASE); + let mut ast = Class(combine_ranges(ranges), flags); + for alt in alts.move_iter() { + ast = Alt(box alt, box ast) + } + self.push(ast); + } else if alts.len() > 0 { + let mut ast = alts.pop().unwrap(); + for alt in alts.move_iter() { + ast = Alt(box alt, box ast) + } + self.push(ast); + } + return Ok(()) + } + c => { + if self.peek_is(1, '-') && !self.peek_is(2, ']') { + try!(self.expect('-')) + try!(self.noteof("not a ']'")) + let c2 = self.cur(); + if c2 < c { + return self.err(format!( + "Invalid character class range '{}-{}'", c, c2)) + } + ranges.push((c, self.cur())) + } else { + ranges.push((c, c)) + } + } + } + } + } + + // Tries to parse an ASCII character class of the form [:name:]. + // If successful, returns an AST character class corresponding to name + // and moves the parser to the final ']' character. + // If unsuccessful, no state is changed and None is returned. + // Assumes that '[' is the current character. + fn try_parse_ascii(&mut self) -> Option<Ast> { + if !self.peek_is(1, ':') { + return None + } + let closer = + match self.pos(']') { + Some(i) => i, + None => return None, + }; + if *self.chars.get(closer-1) != ':' { + return None + } + if closer - self.chari <= 3 { + return None + } + let mut name_start = self.chari + 2; + let negated = + if self.peek_is(2, '^') { + name_start += 1; + FLAG_NEGATED + } else { + FLAG_EMPTY + }; + let name = self.slice(name_start, closer - 1); + match find_class(ASCII_CLASSES, name.as_slice()) { + None => None, + Some(ranges) => { + self.chari = closer; + let flags = negated | (self.flags & FLAG_NOCASE); + Some(Class(combine_ranges(ranges), flags)) + } + } + } + + // Parses counted repetition. Supports: + // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}? + // Assumes that '{' is the current character. + // Returns either an error or moves the parser to the final '}' character. + // (Or the '?' character if not greedy.) + fn parse_counted(&mut self) -> Result<(), Error> { + // Scan until the closing '}' and grab the stuff in {}. + let start = self.chari; + let closer = + match self.pos('}') { + Some(i) => i, + None => return self.err(format!( + "No closing brace for counted repetition starting at \ + position {}.", start)), + }; + self.chari = closer; + let greed = try!(self.get_next_greedy()); + let inner = str::from_chars( + self.chars.as_slice().slice(start + 1, closer)); + + // Parse the min and max values from the regex. + let (mut min, mut max): (uint, Option<uint>); + if !inner.contains(",") { + min = try!(self.parse_uint(inner)); + max = Some(min); + } else { + let pieces: Vec<&str> = inner.splitn(',', 1).collect(); + let (smin, smax) = (*pieces.get(0), *pieces.get(1)); + if smin.len() == 0 { + return self.err("Max repetitions cannot be specified \ + without min repetitions.") + } + min = try!(self.parse_uint(smin)); + max = + if smax.len() == 0 { + None + } else { + Some(try!(self.parse_uint(smax))) + }; + } + + // Do some bounds checking and make sure max >= min. + if min > MAX_REPEAT { + return self.err(format!( + "{} exceeds maximum allowed repetitions ({})", + min, MAX_REPEAT)); + } + if max.is_some() { + let m = max.unwrap(); + if m > MAX_REPEAT { + return self.err(format!( + "{} exceeds maximum allowed repetitions ({})", + m, MAX_REPEAT)); + } + if m < min { + return self.err(format!( + "Max repetitions ({}) cannot be smaller than min \ + repetitions ({}).", m, min)); + } + } + + // Now manipulate the AST be repeating elements. + if max.is_none() { + // Require N copies of what's on the stack and then repeat it. + let ast = try!(self.pop_ast()); + for _ in iter::range(0, min) { + self.push(ast.clone()) + } + self.push(Rep(box ast, ZeroMore, greed)); + } else { + // Require N copies of what's on the stack and then repeat it + // up to M times optionally. + let ast = try!(self.pop_ast()); + for _ in iter::range(0, min) { + self.push(ast.clone()) + } + if max.is_some() { + for _ in iter::range(min, max.unwrap()) { + self.push(Rep(box ast.clone(), ZeroOne, greed)) + } + } + // It's possible that we popped something off the stack but + // never put anything back on it. To keep things simple, add + // a no-op expression. + if min == 0 && (max.is_none() || max == Some(0)) { + self.push(Nothing) + } + } + Ok(()) + } + + // Parses all escape sequences. + // Assumes that '\' is the current character. + fn parse_escape(&mut self) -> Result<Ast, Error> { + try!(self.noteof("an escape sequence following a '\\'")) + + let c = self.cur(); + if is_punct(c) { + return Ok(Literal(c, FLAG_EMPTY)) + } + match c { + 'a' => Ok(Literal('\x07', FLAG_EMPTY)), + 'f' => Ok(Literal('\x0C', FLAG_EMPTY)), + 't' => Ok(Literal('\t', FLAG_EMPTY)), + 'n' => Ok(Literal('\n', FLAG_EMPTY)), + 'r' => Ok(Literal('\r', FLAG_EMPTY)), + 'v' => Ok(Literal('\x0B', FLAG_EMPTY)), + 'A' => Ok(Begin(FLAG_EMPTY)), + 'z' => Ok(End(FLAG_EMPTY)), + 'b' => Ok(WordBoundary(FLAG_EMPTY)), + 'B' => Ok(WordBoundary(FLAG_NEGATED)), + '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())), + 'x' => Ok(try!(self.parse_hex())), + 'p' | 'P' => Ok(try!(self.parse_unicode_name())), + 'd' | 'D' | 's' | 'S' | 'w' | 'W' => { + let ranges = perl_unicode_class(c); + let mut flags = self.flags & FLAG_NOCASE; + if c.is_uppercase() { flags |= FLAG_NEGATED } + Ok(Class(ranges, flags)) + } + _ => self.err(format!("Invalid escape sequence '\\\\{}'", c)), + } + } + + // Parses a unicode character class name, either of the form \pF where + // F is a one letter unicode class name or of the form \p{name} where + // name is the unicode class name. + // Assumes that \p or \P has been read (and 'p' or 'P' is the current + // character). + fn parse_unicode_name(&mut self) -> Result<Ast, Error> { + let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY }; + let mut name: StrBuf; + if self.peek_is(1, '{') { + try!(self.expect('{')) + let closer = + match self.pos('}') { + Some(i) => i, + None => return self.err(format!( + "Missing '\\}' for unclosed '\\{' at position {}", + self.chari)), + }; + if closer - self.chari + 1 == 0 { + return self.err("No Unicode class name found.") + } + name = self.slice(self.chari + 1, closer); + self.chari = closer; + } else { + if self.chari + 1 >= self.chars.len() { + return self.err("No single letter Unicode class name found.") + } + name = self.slice(self.chari + 1, self.chari + 2); + self.chari += 1; + } + match find_class(UNICODE_CLASSES, name.as_slice()) { + None => return self.err(format!( + "Could not find Unicode class '{}'", name)), + Some(ranges) => { + Ok(Class(ranges, negated | (self.flags & FLAG_NOCASE))) + } + } + } + + // Parses an octal number, up to 3 digits. + // Assumes that \n has been read, where n is the first digit. + fn parse_octal(&mut self) -> Result<Ast, Error> { + let start = self.chari; + let mut end = start + 1; + let (d2, d3) = (self.peek(1), self.peek(2)); + if d2 >= Some('0') && d2 <= Some('7') { + try!(self.noteof("expected octal character in [0-7]")) + end += 1; + if d3 >= Some('0') && d3 <= Some('7') { + try!(self.noteof("expected octal character in [0-7]")) + end += 1; + } + } + let s = self.slice(start, end); + match num::from_str_radix::<u32>(s.as_slice(), 8) { + Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), + None => self.err(format!( + "Could not parse '{}' as octal number.", s)), + } + } + + // Parse a hex number. Either exactly two digits or anything in {}. + // Assumes that \x has been read. + fn parse_hex(&mut self) -> Result<Ast, Error> { + if !self.peek_is(1, '{') { + try!(self.expect('{')) + return self.parse_hex_two() + } + let start = self.chari + 2; + let closer = + match self.pos('}') { + None => return self.err(format!( + "Missing '\\}' for unclosed '\\{' at position {}", start)), + Some(i) => i, + }; + self.chari = closer; + self.parse_hex_digits(self.slice(start, closer).as_slice()) + } + + // Parses a two-digit hex number. + // Assumes that \xn has been read, where n is the first digit and is the + // current character. + // After return, parser will point at the second digit. + fn parse_hex_two(&mut self) -> Result<Ast, Error> { + let (start, end) = (self.chari, self.chari + 2); + let bad = self.slice(start - 2, self.chars.len()); + try!(self.noteof(format!("Invalid hex escape sequence '{}'", bad))) + self.parse_hex_digits(self.slice(start, end).as_slice()) + } + + // Parses `s` as a hexadecimal number. + fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> { + match num::from_str_radix::<u32>(s, 16) { + Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)), + None => self.err(format!( + "Could not parse '{}' as hex number.", s)), + } + } + + // Parses a named capture. + // Assumes that '(?P<' has been consumed and that the current character + // is '<'. + // When done, parser will be at the closing '>' character. + fn parse_named_capture(&mut self) -> Result<(), Error> { + try!(self.noteof("a capture name")) + let closer = + match self.pos('>') { + Some(i) => i, + None => return self.err("Capture name must end with '>'."), + }; + if closer - self.chari == 0 { + return self.err("Capture names must have at least 1 character.") + } + let name = self.slice(self.chari, closer); + if !name.as_slice().chars().all(is_valid_cap) { + return self.err( + "Capture names can only have underscores, letters and digits.") + } + if self.names.contains(&name) { + return self.err(format!("Duplicate capture group name '{}'.", name)) + } + self.names.push(name.clone()); + self.chari = closer; + self.caps += 1; + self.stack.push(Paren(self.flags, self.caps, name)); + Ok(()) + } + + // Parses non-capture groups and options. + // Assumes that '(?' has already been consumed and '?' is the current + // character. + fn parse_group_opts(&mut self) -> Result<(), Error> { + if self.peek_is(1, 'P') && self.peek_is(2, '<') { + try!(self.expect('P')) try!(self.expect('<')) + return self.parse_named_capture() + } + let start = self.chari; + let mut flags = self.flags; + let mut sign = 1; + let mut saw_flag = false; + loop { + try!(self.noteof("expected non-empty set of flags or closing ')'")) + match self.cur() { + 'i' => { flags = flags | FLAG_NOCASE; saw_flag = true}, + 'm' => { flags = flags | FLAG_MULTI; saw_flag = true}, + 's' => { flags = flags | FLAG_DOTNL; saw_flag = true}, + 'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true}, + '-' => { + if sign < 0 { + return self.err(format!( + "Cannot negate flags twice in '{}'.", + self.slice(start, self.chari + 1))) + } + sign = -1; + saw_flag = false; + flags = flags ^ flags; + } + ':' | ')' => { + if sign < 0 { + if !saw_flag { + return self.err(format!( + "A valid flag does not follow negation in '{}'", + self.slice(start, self.chari + 1))) + } + flags = flags ^ flags; + } + if self.cur() == ':' { + // Save the old flags with the opening paren. + self.stack.push(Paren(self.flags, 0, "".to_strbuf())); + } + self.flags = flags; + return Ok(()) + } + _ => return self.err(format!( + "Unrecognized flag '{}'.", self.cur())), + } + } + } + + // Peeks at the next character and returns whether it's ungreedy or not. + // If it is, then the next character is consumed. + fn get_next_greedy(&mut self) -> Result<Greed, Error> { + Ok(if self.peek_is(1, '?') { + try!(self.expect('?')) + Ungreedy + } else { + Greedy + }.swap(self.flags & FLAG_SWAP_GREED > 0)) + } + + // Searches the stack (starting at the top) until it finds an expression + // for which `pred` returns true. The index of that expression in the + // stack is returned. + // If there's no match, then one of two things happens depending on the + // values of `allow_start`. When it's true, then `0` will be returned. + // Otherwise, an error will be returned. + // Generally, `allow_start` is only true when you're *not* expecting an + // opening parenthesis. + fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool) + -> Result<uint, Error> { + let from = match self.stack.iter().rev().position(pred) { + Some(i) => i, + None => { + if allow_start { + self.stack.len() + } else { + return self.err("No matching opening parenthesis.") + } + } + }; + // Adjust index since 'from' is for the reversed stack. + // Also, don't include the '(' or '|'. + Ok(self.stack.len() - from) + } + + // concat starts at `from` in the parser's stack and concatenates all + // expressions up to the top of the stack. The resulting concatenation is + // then pushed on to the stack. + // Usually `from` corresponds to the position of an opening parenthesis, + // a '|' (alternation) or the start of the entire expression. + fn concat(&mut self, from: uint) -> Result<(), Error> { + let ast = try!(self.build_from(from, concat_flatten)); + self.push(ast); + Ok(()) + } + + // concat starts at `from` in the parser's stack and alternates all + // expressions up to the top of the stack. The resulting alternation is + // then pushed on to the stack. + // Usually `from` corresponds to the position of an opening parenthesis + // or the start of the entire expression. + // This will also drop any opening parens or alternation bars found in + // the intermediate AST. + fn alternate(&mut self, mut from: uint) -> Result<(), Error> { + // Unlike in the concatenation case, we want 'build_from' to continue + // all the way to the opening left paren (so it will be popped off and + // thrown away). But be careful with overflow---we can't count on the + // open paren to be there. + if from > 0 { from = from - 1} + let ast = try!(self.build_from(from, |l,r| Alt(box l, box r))); + self.push(ast); + Ok(()) + } + + // build_from combines all AST elements starting at 'from' in the + // parser's stack using 'mk' to combine them. If any such element is not an + // AST then it is popped off the stack and ignored. + fn build_from(&mut self, from: uint, mk: |Ast, Ast| -> Ast) + -> Result<Ast, Error> { + if from >= self.stack.len() { + return self.err("Empty group or alternate not allowed.") + } + + let mut combined = try!(self.pop_ast()); + let mut i = self.stack.len(); + while i > from { + i = i - 1; + match self.stack.pop().unwrap() { + Ast(x) => combined = mk(x, combined), + _ => {}, + } + } + Ok(combined) + } + + fn parse_uint(&self, s: &str) -> Result<uint, Error> { + match from_str::<uint>(s) { + Some(i) => Ok(i), + None => self.err(format!( + "Expected an unsigned integer but got '{}'.", s)), + } + } + + fn char_from_u32(&self, n: u32) -> Result<char, Error> { + match char::from_u32(n) { + Some(c) => Ok(c), + None => self.err(format!( + "Could not decode '{}' to unicode character.", n)), + } + } + + fn pos(&self, c: char) -> Option<uint> { + self.chars.iter() + .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i) + } + + fn err<T>(&self, msg: &str) -> Result<T, Error> { + Err(Error { + pos: self.chari, + msg: msg.to_strbuf(), + }) + } + + fn peek(&self, offset: uint) -> Option<char> { + if self.chari + offset >= self.chars.len() { + return None + } + Some(*self.chars.get(self.chari + offset)) + } + + fn peek_is(&self, offset: uint, is: char) -> bool { + self.peek(offset) == Some(is) + } + + fn cur(&self) -> char { + *self.chars.get(self.chari) + } + + fn slice(&self, start: uint, end: uint) -> StrBuf { + str::from_chars(self.chars.as_slice().slice(start, end)).to_strbuf() + } +} + +// Given an unordered collection of character ranges, combine_ranges returns +// an ordered sequence of character ranges where no two ranges overlap. They +// are ordered from least to greatest (using start position). +fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> { + // Returns true iff the two character classes overlap or share a boundary. + // e.g., ('a', 'g') and ('h', 'm') would return true. + fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool { + cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1 + } + + // This is currently O(n^2), but I think with sufficient cleverness, + // it can be reduced to O(n) **if necessary**. + let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len()); + for (us, ue) in unordered.move_iter() { + let (mut us, mut ue) = (us, ue); + assert!(us <= ue); + let mut which: Option<uint> = None; + for (i, &(os, oe)) in ordered.iter().enumerate() { + if should_merge((us, ue), (os, oe)) { + us = cmp::min(us, os); + ue = cmp::max(ue, oe); + which = Some(i); + break + } + } + match which { + None => ordered.push((us, ue)), + Some(i) => *ordered.get_mut(i) = (us, ue), + } + } + ordered.sort(); + ordered +} + +// Constructs a Unicode friendly Perl character class from \d, \s or \w +// (or any of their negated forms). Note that this does not handle negation. +fn perl_unicode_class(which: char) -> Vec<(char, char)> { + match which.to_lowercase() { + 'd' => Vec::from_slice(PERLD), + 's' => Vec::from_slice(PERLS), + 'w' => Vec::from_slice(PERLW), + _ => unreachable!(), + } +} + +// Returns a concatenation of two expressions. This also guarantees that a +// `Cat` expression will never be a direct child of another `Cat` expression. +fn concat_flatten(x: Ast, y: Ast) -> Ast { + match (x, y) { + (Cat(mut xs), Cat(ys)) => { xs.push_all_move(ys); Cat(xs) } + (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) } + (ast, Cat(mut xs)) => { xs.unshift(ast); Cat(xs) } + (ast1, ast2) => Cat(vec!(ast1, ast2)), + } +} + +pub fn is_punct(c: char) -> bool { + match c { + '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | + '[' | ']' | '{' | '}' | '^' | '$' => true, + _ => false, + } +} + +fn is_valid_cap(c: char) -> bool { + c == '_' || (c >= '0' && c <= '9') + || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> { + match classes.bsearch(|&(s, _)| s.cmp(&name)) { + Some(i) => Some(Vec::from_slice(classes[i].val1())), + None => None, + } +} + +type Class = &'static [(char, char)]; +type NamedClasses = &'static [(&'static str, Class)]; + +static ASCII_CLASSES: NamedClasses = &[ + // Classes must be in alphabetical order so that bsearch works. + // [:alnum:] alphanumeric (== [0-9A-Za-z]) + // [:alpha:] alphabetic (== [A-Za-z]) + // [:ascii:] ASCII (== [\x00-\x7F]) + // [:blank:] blank (== [\t ]) + // [:cntrl:] control (== [\x00-\x1F\x7F]) + // [:digit:] digits (== [0-9]) + // [:graph:] graphical (== [!-~]) + // [:lower:] lower case (== [a-z]) + // [:print:] printable (== [ -~] == [ [:graph:]]) + // [:punct:] punctuation (== [!-/:-@[-`{-~]) + // [:space:] whitespace (== [\t\n\v\f\r ]) + // [:upper:] upper case (== [A-Z]) + // [:word:] word characters (== [0-9A-Za-z_]) + // [:xdigit:] hex digit (== [0-9A-Fa-f]) + // Taken from: http://golang.org/pkg/regex/syntax/ + ("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]), + ("alpha", &[('A', 'Z'), ('a', 'z')]), + ("ascii", &[('\x00', '\x7F')]), + ("blank", &[(' ', ' '), ('\t', '\t')]), + ("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]), + ("digit", &[('0', '9')]), + ("graph", &[('!', '~')]), + ("lower", &[('a', 'z')]), + ("print", &[(' ', '~')]), + ("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]), + ("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'), + ('\r', '\r'), (' ', ' ')]), + ("upper", &[('A', 'Z')]), + ("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]), + ("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]), +]; |
