syntax: Tighten search paths for inner modules

This is an implementation of RFC 16. A module can now only be loaded if the module declaring `mod name;` "owns" the current directory. A module is considered as owning its directory if it meets one of the following criteria: * It is the top-level crate file * It is a `mod.rs` file * It was loaded via `#[path]` * It was loaded via `include!` * The module was declared via an inline `mod foo { ... }` statement For example, this directory structure is now invalid // lib.rs mod foo; // foo.rs mod bar; // bar.rs; fn bar() {} With this change `foo.rs` must be renamed to `foo/mod.rs`, and `bar.rs` must be renamed to `foo/bar.rs`. This makes it clear that `bar` is a submodule of `foo`, and can only be accessed through `foo`. RFC: 0016-module-file-system-hierarchy Closes #14180 [breaking-change]
author: Alex Crichton <alex@alexcrichton.com> 2014-05-16 14:23:04 -0700
committer: Alex Crichton <alex@alexcrichton.com> 2014-05-17 01:01:47 -0700
commit: 4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23 (patch)
tree: c01cb3c61732225abd28d5eb8991537ca7ad30b5 /src/libregex/parse/mod.rs
parent: 25c54226c3e7dd6f59cf2e92238a4d79d8b0128d (diff)
download: rust-4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23.tar.gz
rust-4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23.zip
1 files changed, 1030 insertions, 0 deletions
diff --git a/src/libregex/parse/mod.rs b/src/libregex/parse/mod.rs
new file mode 100644
index 00000000000..a695da9fa16
--- /dev/null
+++ b/src/libregex/parse/mod.rs
@@ -0,0 +1,1030 @@
+// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use std::char;
+use std::cmp;
+use std::fmt;
+use std::iter;
+use std::num;
+use std::str;
+
+/// Static data containing Unicode ranges for general categories and scripts.
+use self::unicode::{UNICODE_CLASSES, PERLD, PERLS, PERLW};
+#[allow(visible_private_types)]
+pub mod unicode;
+
+/// The maximum number of repetitions allowed with the `{n,m}` syntax.
+static MAX_REPEAT: uint = 1000;
+
+/// Error corresponds to something that can go wrong while parsing
+/// a regular expression.
+///
+/// (Once an expression is compiled, it is not possible to produce an error
+/// via searching, splitting or replacing.)
+pub struct Error {
+    /// The *approximate* character index of where the error occurred.
+    pub pos: uint,
+    /// A message describing the error.
+    pub msg: StrBuf,
+}
+
+impl fmt::Show for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "Regex syntax error near position {}: {}",
+               self.pos, self.msg)
+    }
+}
+
+/// Represents the abstract syntax of a regular expression.
+/// It is showable so that error messages resulting from a bug can provide
+/// useful information.
+/// It is cloneable so that expressions can be repeated for the counted
+/// repetition feature. (No other copying is done.)
+///
+/// Note that this representation prevents one from reproducing the regex as
+/// it was typed. (But it could be used to reproduce an equivalent regex.)
+#[deriving(Show, Clone)]
+pub enum Ast {
+    Nothing,
+    Literal(char, Flags),
+    Dot(Flags),
+    Class(Vec<(char, char)>, Flags),
+    Begin(Flags),
+    End(Flags),
+    WordBoundary(Flags),
+    Capture(uint, Option<StrBuf>, Box<Ast>),
+    // Represent concatenation as a flat vector to avoid blowing the
+    // stack in the compiler.
+    Cat(Vec<Ast>),
+    Alt(Box<Ast>, Box<Ast>),
+    Rep(Box<Ast>, Repeater, Greed),
+}
+
+#[deriving(Show, Eq, Clone)]
+pub enum Repeater {
+    ZeroOne,
+    ZeroMore,
+    OneMore,
+}
+
+#[deriving(Show, Clone)]
+pub enum Greed {
+    Greedy,
+    Ungreedy,
+}
+
+impl Greed {
+    pub fn is_greedy(&self) -> bool {
+        match *self {
+            Greedy => true,
+            _ => false,
+        }
+    }
+
+    fn swap(self, swapped: bool) -> Greed {
+        if !swapped { return self }
+        match self {
+            Greedy => Ungreedy,
+            Ungreedy => Greedy,
+        }
+    }
+}
+
+/// BuildAst is a regrettable type that represents intermediate state for
+/// constructing an abstract syntax tree. Its central purpose is to facilitate
+/// parsing groups and alternations while also maintaining a stack of flag
+/// state.
+#[deriving(Show)]
+enum BuildAst {
+    Ast(Ast),
+    Paren(Flags, uint, StrBuf), // '('
+    Bar, // '|'
+}
+
+impl BuildAst {
+    fn paren(&self) -> bool {
+        match *self {
+            Paren(_, _, _) => true,
+            _ => false,
+        }
+    }
+
+    fn flags(&self) -> Flags {
+        match *self {
+            Paren(flags, _, _) => flags,
+            _ => fail!("Cannot get flags from {}", self),
+        }
+    }
+
+    fn capture(&self) -> Option<uint> {
+        match *self {
+            Paren(_, 0, _) => None,
+            Paren(_, c, _) => Some(c),
+            _ => fail!("Cannot get capture group from {}", self),
+        }
+    }
+
+    fn capture_name(&self) -> Option<StrBuf> {
+        match *self {
+            Paren(_, 0, _) => None,
+            Paren(_, _, ref name) => {
+                if name.len() == 0 {
+                    None
+                } else {
+                    Some(name.clone())
+                }
+            }
+            _ => fail!("Cannot get capture name from {}", self),
+        }
+    }
+
+    fn bar(&self) -> bool {
+        match *self {
+            Bar => true,
+            _ => false,
+        }
+    }
+
+    fn unwrap(self) -> Result<Ast, Error> {
+        match self {
+            Ast(x) => Ok(x),
+            _ => fail!("Tried to unwrap non-AST item: {}", self),
+        }
+    }
+}
+
+/// Flags represents all options that can be twiddled by a user in an
+/// expression.
+pub type Flags = u8;
+
+pub static FLAG_EMPTY:      u8 = 0;
+pub static FLAG_NOCASE:     u8 = 1 << 0; // i
+pub static FLAG_MULTI:      u8 = 1 << 1; // m
+pub static FLAG_DOTNL:      u8 = 1 << 2; // s
+pub static FLAG_SWAP_GREED: u8 = 1 << 3; // U
+pub static FLAG_NEGATED:    u8 = 1 << 4; // char class or not word boundary
+
+struct Parser<'a> {
+    // The input, parsed only as a sequence of UTF8 code points.
+    chars: Vec<char>,
+    // The index of the current character in the input.
+    chari: uint,
+    // The intermediate state representing the AST.
+    stack: Vec<BuildAst>,
+    // The current set of flags.
+    flags: Flags,
+    // The total number of capture groups.
+    // Incremented each time an opening left paren is seen (assuming it is
+    // opening a capture group).
+    caps: uint,
+    // A set of all capture group names used only to detect duplicates.
+    names: Vec<StrBuf>,
+}
+
+pub fn parse(s: &str) -> Result<Ast, Error> {
+    Parser {
+        chars: s.chars().collect(),
+        chari: 0,
+        stack: vec!(),
+        flags: FLAG_EMPTY,
+        caps: 0,
+        names: vec!(),
+    }.parse()
+}
+
+impl<'a> Parser<'a> {
+    fn parse(&mut self) -> Result<Ast, Error> {
+        loop {
+            let c = self.cur();
+            match c {
+                '?' | '*' | '+' => try!(self.push_repeater(c)),
+                '\\' => {
+                    let ast = try!(self.parse_escape());
+                    self.push(ast)
+                }
+                '{' => try!(self.parse_counted()),
+                '[' => match self.try_parse_ascii() {
+                    None => try!(self.parse_class()),
+                    Some(class) => self.push(class),
+                },
+                '(' => {
+                    if self.peek_is(1, '?') {
+                        try!(self.expect('?'))
+                        try!(self.parse_group_opts())
+                    } else {
+                        self.caps += 1;
+                        self.stack.push(Paren(self.flags,
+                                              self.caps,
+                                              "".to_strbuf()))
+                    }
+                }
+                ')' => {
+                    let catfrom = try!(
+                        self.pos_last(false, |x| x.paren() || x.bar()));
+                    try!(self.concat(catfrom));
+
+                    let altfrom = try!(self.pos_last(false, |x| x.paren()));
+                    // Before we smush the alternates together and pop off the
+                    // left paren, let's grab the old flags and see if we
+                    // need a capture.
+                    let (cap, cap_name, oldflags) = {
+                        let paren = self.stack.get(altfrom-1);
+                        (paren.capture(), paren.capture_name(), paren.flags())
+                    };
+                    try!(self.alternate(altfrom));
+                    self.flags = oldflags;
+
+                    // If this was a capture, pop what we just pushed in
+                    // alternate and make it a capture.
+                    if cap.is_some() {
+                        let ast = try!(self.pop_ast());
+                        self.push(Capture(cap.unwrap(), cap_name, box ast));
+                    }
+                }
+                '|' => {
+                    let catfrom = try!(
+                        self.pos_last(true, |x| x.paren() || x.bar()));
+                    try!(self.concat(catfrom));
+
+                    self.stack.push(Bar);
+                }
+                _ => try!(self.push_literal(c)),
+            }
+            if !self.next_char() {
+                break
+            }
+        }
+
+        // Try to improve error handling. At this point, there should be
+        // no remaining open parens.
+        if self.stack.iter().any(|x| x.paren()) {
+            return self.err("Unclosed parenthesis.")
+        }
+        let catfrom = try!(self.pos_last(true, |x| x.bar()));
+        try!(self.concat(catfrom));
+        try!(self.alternate(0));
+
+        assert!(self.stack.len() == 1);
+        self.pop_ast()
+    }
+
+    fn noteof(&mut self, expected: &str) -> Result<(), Error> {
+        match self.next_char() {
+            true => Ok(()),
+            false => self.err(format!("Expected {} but got EOF.", expected)),
+        }
+    }
+
+    fn expect(&mut self, expected: char) -> Result<(), Error> {
+        match self.next_char() {
+            true if self.cur() == expected => Ok(()),
+            true => self.err(format!("Expected '{}' but got '{}'.",
+                                     expected, self.cur())),
+            false => self.err(format!("Expected '{}' but got EOF.", expected)),
+        }
+    }
+
+    fn next_char(&mut self) -> bool {
+        self.chari += 1;
+        self.chari < self.chars.len()
+    }
+
+    fn pop_ast(&mut self) -> Result<Ast, Error> {
+        match self.stack.pop().unwrap().unwrap() {
+            Err(e) => Err(e),
+            Ok(ast) => Ok(ast),
+        }
+    }
+
+    fn push(&mut self, ast: Ast) {
+        self.stack.push(Ast(ast))
+    }
+
+    fn push_repeater(&mut self, c: char) -> Result<(), Error> {
+        if self.stack.len() == 0 {
+            return self.err(
+                "A repeat operator must be preceded by a valid expression.")
+        }
+        let rep: Repeater = match c {
+            '?' => ZeroOne, '*' => ZeroMore, '+' => OneMore,
+            _ => fail!("Not a valid repeater operator."),
+        };
+
+        match self.peek(1) {
+            Some('*') | Some('+') =>
+                return self.err(
+                    "Double repeat operators are not supported."),
+            _ => {},
+        }
+        let ast = try!(self.pop_ast());
+        match ast {
+            Begin(_) | End(_) | WordBoundary(_) =>
+                return self.err(
+                    "Repeat arguments cannot be empty width assertions."),
+            _ => {}
+        }
+        let greed = try!(self.get_next_greedy());
+        self.push(Rep(box ast, rep, greed));
+        Ok(())
+    }
+
+    fn push_literal(&mut self, c: char) -> Result<(), Error> {
+        match c {
+            '.' => {
+                self.push(Dot(self.flags))
+            }
+            '^' => {
+                self.push(Begin(self.flags))
+            }
+            '$' => {
+                self.push(End(self.flags))
+            }
+            _ => {
+                self.push(Literal(c, self.flags))
+            }
+        }
+        Ok(())
+    }
+
+    // Parses all forms of character classes.
+    // Assumes that '[' is the current character.
+    fn parse_class(&mut self) -> Result<(), Error> {
+        let negated =
+            if self.peek_is(1, '^') {
+                try!(self.expect('^'))
+                FLAG_NEGATED
+            } else {
+                FLAG_EMPTY
+            };
+        let mut ranges: Vec<(char, char)> = vec!();
+        let mut alts: Vec<Ast> = vec!();
+
+        if self.peek_is(1, ']') {
+            try!(self.expect(']'))
+            ranges.push((']', ']'))
+        }
+        while self.peek_is(1, '-') {
+            try!(self.expect('-'))
+            ranges.push(('-', '-'))
+        }
+        loop {
+            try!(self.noteof("a closing ']' or a non-empty character class)"))
+            let mut c = self.cur();
+            match c {
+                '[' =>
+                    match self.try_parse_ascii() {
+                        Some(Class(asciis, flags)) => {
+                            alts.push(Class(asciis, flags ^ negated));
+                            continue
+                        }
+                        Some(ast) =>
+                            fail!("Expected Class AST but got '{}'", ast),
+                        // Just drop down and try to add as a regular character.
+                        None => {},
+                    },
+                '\\' => {
+                    match try!(self.parse_escape()) {
+                        Class(asciis, flags) => {
+                            alts.push(Class(asciis, flags ^ negated));
+                            continue
+                        }
+                        Literal(c2, _) => c = c2, // process below
+                        Begin(_) | End(_) | WordBoundary(_) =>
+                            return self.err(
+                                "\\A, \\z, \\b and \\B are not valid escape \
+                                 sequences inside a character class."),
+                        ast => fail!("Unexpected AST item '{}'", ast),
+                    }
+                }
+                _ => {},
+            }
+            match c {
+                ']' => {
+                    if ranges.len() > 0 {
+                        let flags = negated | (self.flags & FLAG_NOCASE);
+                        let mut ast = Class(combine_ranges(ranges), flags);
+                        for alt in alts.move_iter() {
+                            ast = Alt(box alt, box ast)
+                        }
+                        self.push(ast);
+                    } else if alts.len() > 0 {
+                        let mut ast = alts.pop().unwrap();
+                        for alt in alts.move_iter() {
+                            ast = Alt(box alt, box ast)
+                        }
+                        self.push(ast);
+                    }
+                    return Ok(())
+                }
+                c => {
+                    if self.peek_is(1, '-') && !self.peek_is(2, ']') {
+                        try!(self.expect('-'))
+                        try!(self.noteof("not a ']'"))
+                        let c2 = self.cur();
+                        if c2 < c {
+                            return self.err(format!(
+                                "Invalid character class range '{}-{}'", c, c2))
+                        }
+                        ranges.push((c, self.cur()))
+                    } else {
+                        ranges.push((c, c))
+                    }
+                }
+            }
+        }
+    }
+
+    // Tries to parse an ASCII character class of the form [:name:].
+    // If successful, returns an AST character class corresponding to name
+    // and moves the parser to the final ']' character.
+    // If unsuccessful, no state is changed and None is returned.
+    // Assumes that '[' is the current character.
+    fn try_parse_ascii(&mut self) -> Option<Ast> {
+        if !self.peek_is(1, ':') {
+            return None
+        }
+        let closer =
+            match self.pos(']') {
+                Some(i) => i,
+                None => return None,
+            };
+        if *self.chars.get(closer-1) != ':' {
+            return None
+        }
+        if closer - self.chari <= 3 {
+            return None
+        }
+        let mut name_start = self.chari + 2;
+        let negated =
+            if self.peek_is(2, '^') {
+                name_start += 1;
+                FLAG_NEGATED
+            } else {
+                FLAG_EMPTY
+            };
+        let name = self.slice(name_start, closer - 1);
+        match find_class(ASCII_CLASSES, name.as_slice()) {
+            None => None,
+            Some(ranges) => {
+                self.chari = closer;
+                let flags = negated | (self.flags & FLAG_NOCASE);
+                Some(Class(combine_ranges(ranges), flags))
+            }
+        }
+    }
+
+    // Parses counted repetition. Supports:
+    // {n}, {n,}, {n,m}, {n}?, {n,}? and {n,m}?
+    // Assumes that '{' is the current character.
+    // Returns either an error or moves the parser to the final '}' character.
+    // (Or the '?' character if not greedy.)
+    fn parse_counted(&mut self) -> Result<(), Error> {
+        // Scan until the closing '}' and grab the stuff in {}.
+        let start = self.chari;
+        let closer =
+            match self.pos('}') {
+                Some(i) => i,
+                None => return self.err(format!(
+                    "No closing brace for counted repetition starting at \
+                     position {}.", start)),
+            };
+        self.chari = closer;
+        let greed = try!(self.get_next_greedy());
+        let inner = str::from_chars(
+            self.chars.as_slice().slice(start + 1, closer));
+
+        // Parse the min and max values from the regex.
+        let (mut min, mut max): (uint, Option<uint>);
+        if !inner.contains(",") {
+            min = try!(self.parse_uint(inner));
+            max = Some(min);
+        } else {
+            let pieces: Vec<&str> = inner.splitn(',', 1).collect();
+            let (smin, smax) = (*pieces.get(0), *pieces.get(1));
+            if smin.len() == 0 {
+                return self.err("Max repetitions cannot be specified \
+                                    without min repetitions.")
+            }
+            min = try!(self.parse_uint(smin));
+            max =
+                if smax.len() == 0 {
+                    None
+                } else {
+                    Some(try!(self.parse_uint(smax)))
+                };
+        }
+
+        // Do some bounds checking and make sure max >= min.
+        if min > MAX_REPEAT {
+            return self.err(format!(
+                "{} exceeds maximum allowed repetitions ({})",
+                min, MAX_REPEAT));
+        }
+        if max.is_some() {
+            let m = max.unwrap();
+            if m > MAX_REPEAT {
+                return self.err(format!(
+                    "{} exceeds maximum allowed repetitions ({})",
+                    m, MAX_REPEAT));
+            }
+            if m < min {
+                return self.err(format!(
+                    "Max repetitions ({}) cannot be smaller than min \
+                     repetitions ({}).", m, min));
+            }
+        }
+
+        // Now manipulate the AST be repeating elements.
+        if max.is_none() {
+            // Require N copies of what's on the stack and then repeat it.
+            let ast = try!(self.pop_ast());
+            for _ in iter::range(0, min) {
+                self.push(ast.clone())
+            }
+            self.push(Rep(box ast, ZeroMore, greed));
+        } else {
+            // Require N copies of what's on the stack and then repeat it
+            // up to M times optionally.
+            let ast = try!(self.pop_ast());
+            for _ in iter::range(0, min) {
+                self.push(ast.clone())
+            }
+            if max.is_some() {
+                for _ in iter::range(min, max.unwrap()) {
+                    self.push(Rep(box ast.clone(), ZeroOne, greed))
+                }
+            }
+            // It's possible that we popped something off the stack but
+            // never put anything back on it. To keep things simple, add
+            // a no-op expression.
+            if min == 0 && (max.is_none() || max == Some(0)) {
+                self.push(Nothing)
+            }
+        }
+        Ok(())
+    }
+
+    // Parses all escape sequences.
+    // Assumes that '\' is the current character.
+    fn parse_escape(&mut self) -> Result<Ast, Error> {
+        try!(self.noteof("an escape sequence following a '\\'"))
+
+        let c = self.cur();
+        if is_punct(c) {
+            return Ok(Literal(c, FLAG_EMPTY))
+        }
+        match c {
+            'a' => Ok(Literal('\x07', FLAG_EMPTY)),
+            'f' => Ok(Literal('\x0C', FLAG_EMPTY)),
+            't' => Ok(Literal('\t', FLAG_EMPTY)),
+            'n' => Ok(Literal('\n', FLAG_EMPTY)),
+            'r' => Ok(Literal('\r', FLAG_EMPTY)),
+            'v' => Ok(Literal('\x0B', FLAG_EMPTY)),
+            'A' => Ok(Begin(FLAG_EMPTY)),
+            'z' => Ok(End(FLAG_EMPTY)),
+            'b' => Ok(WordBoundary(FLAG_EMPTY)),
+            'B' => Ok(WordBoundary(FLAG_NEGATED)),
+            '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => Ok(try!(self.parse_octal())),
+            'x' => Ok(try!(self.parse_hex())),
+            'p' | 'P' => Ok(try!(self.parse_unicode_name())),
+            'd' | 'D' | 's' | 'S' | 'w' | 'W' => {
+                let ranges = perl_unicode_class(c);
+                let mut flags = self.flags & FLAG_NOCASE;
+                if c.is_uppercase() { flags |= FLAG_NEGATED }
+                Ok(Class(ranges, flags))
+            }
+            _ => self.err(format!("Invalid escape sequence '\\\\{}'", c)),
+        }
+    }
+
+    // Parses a unicode character class name, either of the form \pF where
+    // F is a one letter unicode class name or of the form \p{name} where
+    // name is the unicode class name.
+    // Assumes that \p or \P has been read (and 'p' or 'P' is the current
+    // character).
+    fn parse_unicode_name(&mut self) -> Result<Ast, Error> {
+        let negated = if self.cur() == 'P' { FLAG_NEGATED } else { FLAG_EMPTY };
+        let mut name: StrBuf;
+        if self.peek_is(1, '{') {
+            try!(self.expect('{'))
+            let closer =
+                match self.pos('}') {
+                    Some(i) => i,
+                    None => return self.err(format!(
+                        "Missing '\\}' for unclosed '\\{' at position {}",
+                        self.chari)),
+                };
+            if closer - self.chari + 1 == 0 {
+                return self.err("No Unicode class name found.")
+            }
+            name = self.slice(self.chari + 1, closer);
+            self.chari = closer;
+        } else {
+            if self.chari + 1 >= self.chars.len() {
+                return self.err("No single letter Unicode class name found.")
+            }
+            name = self.slice(self.chari + 1, self.chari + 2);
+            self.chari += 1;
+        }
+        match find_class(UNICODE_CLASSES, name.as_slice()) {
+            None => return self.err(format!(
+                "Could not find Unicode class '{}'", name)),
+            Some(ranges) => {
+                Ok(Class(ranges, negated | (self.flags & FLAG_NOCASE)))
+            }
+        }
+    }
+
+    // Parses an octal number, up to 3 digits.
+    // Assumes that \n has been read, where n is the first digit.
+    fn parse_octal(&mut self) -> Result<Ast, Error> {
+        let start = self.chari;
+        let mut end = start + 1;
+        let (d2, d3) = (self.peek(1), self.peek(2));
+        if d2 >= Some('0') && d2 <= Some('7') {
+            try!(self.noteof("expected octal character in [0-7]"))
+            end += 1;
+            if d3 >= Some('0') && d3 <= Some('7') {
+                try!(self.noteof("expected octal character in [0-7]"))
+                end += 1;
+            }
+        }
+        let s = self.slice(start, end);
+        match num::from_str_radix::<u32>(s.as_slice(), 8) {
+            Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
+            None => self.err(format!(
+                "Could not parse '{}' as octal number.", s)),
+        }
+    }
+
+    // Parse a hex number. Either exactly two digits or anything in {}.
+    // Assumes that \x has been read.
+    fn parse_hex(&mut self) -> Result<Ast, Error> {
+        if !self.peek_is(1, '{') {
+            try!(self.expect('{'))
+            return self.parse_hex_two()
+        }
+        let start = self.chari + 2;
+        let closer =
+            match self.pos('}') {
+                None => return self.err(format!(
+                    "Missing '\\}' for unclosed '\\{' at position {}", start)),
+                Some(i) => i,
+            };
+        self.chari = closer;
+        self.parse_hex_digits(self.slice(start, closer).as_slice())
+    }
+
+    // Parses a two-digit hex number.
+    // Assumes that \xn has been read, where n is the first digit and is the
+    // current character.
+    // After return, parser will point at the second digit.
+    fn parse_hex_two(&mut self) -> Result<Ast, Error> {
+        let (start, end) = (self.chari, self.chari + 2);
+        let bad = self.slice(start - 2, self.chars.len());
+        try!(self.noteof(format!("Invalid hex escape sequence '{}'", bad)))
+        self.parse_hex_digits(self.slice(start, end).as_slice())
+    }
+
+    // Parses `s` as a hexadecimal number.
+    fn parse_hex_digits(&self, s: &str) -> Result<Ast, Error> {
+        match num::from_str_radix::<u32>(s, 16) {
+            Some(n) => Ok(Literal(try!(self.char_from_u32(n)), FLAG_EMPTY)),
+            None => self.err(format!(
+                "Could not parse '{}' as hex number.", s)),
+        }
+    }
+
+    // Parses a named capture.
+    // Assumes that '(?P<' has been consumed and that the current character
+    // is '<'.
+    // When done, parser will be at the closing '>' character.
+    fn parse_named_capture(&mut self) -> Result<(), Error> {
+        try!(self.noteof("a capture name"))
+        let closer =
+            match self.pos('>') {
+                Some(i) => i,
+                None => return self.err("Capture name must end with '>'."),
+            };
+        if closer - self.chari == 0 {
+            return self.err("Capture names must have at least 1 character.")
+        }
+        let name = self.slice(self.chari, closer);
+        if !name.as_slice().chars().all(is_valid_cap) {
+            return self.err(
+                "Capture names can only have underscores, letters and digits.")
+        }
+        if self.names.contains(&name) {
+            return self.err(format!("Duplicate capture group name '{}'.", name))
+        }
+        self.names.push(name.clone());
+        self.chari = closer;
+        self.caps += 1;
+        self.stack.push(Paren(self.flags, self.caps, name));
+        Ok(())
+    }
+
+    // Parses non-capture groups and options.
+    // Assumes that '(?' has already been consumed and '?' is the current
+    // character.
+    fn parse_group_opts(&mut self) -> Result<(), Error> {
+        if self.peek_is(1, 'P') && self.peek_is(2, '<') {
+            try!(self.expect('P')) try!(self.expect('<'))
+            return self.parse_named_capture()
+        }
+        let start = self.chari;
+        let mut flags = self.flags;
+        let mut sign = 1;
+        let mut saw_flag = false;
+        loop {
+            try!(self.noteof("expected non-empty set of flags or closing ')'"))
+            match self.cur() {
+                'i' => { flags = flags | FLAG_NOCASE;     saw_flag = true},
+                'm' => { flags = flags | FLAG_MULTI;      saw_flag = true},
+                's' => { flags = flags | FLAG_DOTNL;      saw_flag = true},
+                'U' => { flags = flags | FLAG_SWAP_GREED; saw_flag = true},
+                '-' => {
+                    if sign < 0 {
+                        return self.err(format!(
+                            "Cannot negate flags twice in '{}'.",
+                            self.slice(start, self.chari + 1)))
+                    }
+                    sign = -1;
+                    saw_flag = false;
+                    flags = flags ^ flags;
+                }
+                ':' | ')' => {
+                    if sign < 0 {
+                        if !saw_flag {
+                            return self.err(format!(
+                                "A valid flag does not follow negation in '{}'",
+                                self.slice(start, self.chari + 1)))
+                        }
+                        flags = flags ^ flags;
+                    }
+                    if self.cur() == ':' {
+                        // Save the old flags with the opening paren.
+                        self.stack.push(Paren(self.flags, 0, "".to_strbuf()));
+                    }
+                    self.flags = flags;
+                    return Ok(())
+                }
+                _ => return self.err(format!(
+                    "Unrecognized flag '{}'.", self.cur())),
+            }
+        }
+    }
+
+    // Peeks at the next character and returns whether it's ungreedy or not.
+    // If it is, then the next character is consumed.
+    fn get_next_greedy(&mut self) -> Result<Greed, Error> {
+        Ok(if self.peek_is(1, '?') {
+            try!(self.expect('?'))
+            Ungreedy
+        } else {
+            Greedy
+        }.swap(self.flags & FLAG_SWAP_GREED > 0))
+    }
+
+    // Searches the stack (starting at the top) until it finds an expression
+    // for which `pred` returns true. The index of that expression in the
+    // stack is returned.
+    // If there's no match, then one of two things happens depending on the
+    // values of `allow_start`. When it's true, then `0` will be returned.
+    // Otherwise, an error will be returned.
+    // Generally, `allow_start` is only true when you're *not* expecting an
+    // opening parenthesis.
+    fn pos_last(&self, allow_start: bool, pred: |&BuildAst| -> bool)
+               -> Result<uint, Error> {
+        let from = match self.stack.iter().rev().position(pred) {
+            Some(i) => i,
+            None => {
+                if allow_start {
+                    self.stack.len()
+                } else {
+                    return self.err("No matching opening parenthesis.")
+                }
+            }
+        };
+        // Adjust index since 'from' is for the reversed stack.
+        // Also, don't include the '(' or '|'.
+        Ok(self.stack.len() - from)
+    }
+
+    // concat starts at `from` in the parser's stack and concatenates all
+    // expressions up to the top of the stack. The resulting concatenation is
+    // then pushed on to the stack.
+    // Usually `from` corresponds to the position of an opening parenthesis,
+    // a '|' (alternation) or the start of the entire expression.
+    fn concat(&mut self, from: uint) -> Result<(), Error> {
+        let ast = try!(self.build_from(from, concat_flatten));
+        self.push(ast);
+        Ok(())
+    }
+
+    // concat starts at `from` in the parser's stack and alternates all
+    // expressions up to the top of the stack. The resulting alternation is
+    // then pushed on to the stack.
+    // Usually `from` corresponds to the position of an opening parenthesis
+    // or the start of the entire expression.
+    // This will also drop any opening parens or alternation bars found in
+    // the intermediate AST.
+    fn alternate(&mut self, mut from: uint) -> Result<(), Error> {
+        // Unlike in the concatenation case, we want 'build_from' to continue
+        // all the way to the opening left paren (so it will be popped off and
+        // thrown away). But be careful with overflow---we can't count on the
+        // open paren to be there.
+        if from > 0 { from = from - 1}
+        let ast = try!(self.build_from(from, |l,r| Alt(box l, box r)));
+        self.push(ast);
+        Ok(())
+    }
+
+    // build_from combines all AST elements starting at 'from' in the
+    // parser's stack using 'mk' to combine them. If any such element is not an
+    // AST then it is popped off the stack and ignored.
+    fn build_from(&mut self, from: uint, mk: |Ast, Ast| -> Ast)
+                 -> Result<Ast, Error> {
+        if from >= self.stack.len() {
+            return self.err("Empty group or alternate not allowed.")
+        }
+
+        let mut combined = try!(self.pop_ast());
+        let mut i = self.stack.len();
+        while i > from {
+            i = i - 1;
+            match self.stack.pop().unwrap() {
+                Ast(x) => combined = mk(x, combined),
+                _ => {},
+            }
+        }
+        Ok(combined)
+    }
+
+    fn parse_uint(&self, s: &str) -> Result<uint, Error> {
+        match from_str::<uint>(s) {
+            Some(i) => Ok(i),
+            None => self.err(format!(
+                "Expected an unsigned integer but got '{}'.", s)),
+        }
+    }
+
+    fn char_from_u32(&self, n: u32) -> Result<char, Error> {
+        match char::from_u32(n) {
+            Some(c) => Ok(c),
+            None => self.err(format!(
+                "Could not decode '{}' to unicode character.", n)),
+        }
+    }
+
+    fn pos(&self, c: char) -> Option<uint> {
+        self.chars.iter()
+            .skip(self.chari).position(|&c2| c2 == c).map(|i| self.chari + i)
+    }
+
+    fn err<T>(&self, msg: &str) -> Result<T, Error> {
+        Err(Error {
+            pos: self.chari,
+            msg: msg.to_strbuf(),
+        })
+    }
+
+    fn peek(&self, offset: uint) -> Option<char> {
+        if self.chari + offset >= self.chars.len() {
+            return None
+        }
+        Some(*self.chars.get(self.chari + offset))
+    }
+
+    fn peek_is(&self, offset: uint, is: char) -> bool {
+        self.peek(offset) == Some(is)
+    }
+
+    fn cur(&self) -> char {
+        *self.chars.get(self.chari)
+    }
+
+    fn slice(&self, start: uint, end: uint) -> StrBuf {
+        str::from_chars(self.chars.as_slice().slice(start, end)).to_strbuf()
+    }
+}
+
+// Given an unordered collection of character ranges, combine_ranges returns
+// an ordered sequence of character ranges where no two ranges overlap. They
+// are ordered from least to greatest (using start position).
+fn combine_ranges(unordered: Vec<(char, char)>) -> Vec<(char, char)> {
+    // Returns true iff the two character classes overlap or share a boundary.
+    // e.g., ('a', 'g') and ('h', 'm') would return true.
+    fn should_merge((a, b): (char, char), (x, y): (char, char)) -> bool {
+        cmp::max(a, x) as u32 <= cmp::min(b, y) as u32 + 1
+    }
+
+    // This is currently O(n^2), but I think with sufficient cleverness,
+    // it can be reduced to O(n) **if necessary**.
+    let mut ordered: Vec<(char, char)> = Vec::with_capacity(unordered.len());
+    for (us, ue) in unordered.move_iter() {
+        let (mut us, mut ue) = (us, ue);
+        assert!(us <= ue);
+        let mut which: Option<uint> = None;
+        for (i, &(os, oe)) in ordered.iter().enumerate() {
+            if should_merge((us, ue), (os, oe)) {
+                us = cmp::min(us, os);
+                ue = cmp::max(ue, oe);
+                which = Some(i);
+                break
+            }
+        }
+        match which {
+            None => ordered.push((us, ue)),
+            Some(i) => *ordered.get_mut(i) = (us, ue),
+        }
+    }
+    ordered.sort();
+    ordered
+}
+
+// Constructs a Unicode friendly Perl character class from \d, \s or \w
+// (or any of their negated forms). Note that this does not handle negation.
+fn perl_unicode_class(which: char) -> Vec<(char, char)> {
+    match which.to_lowercase() {
+        'd' => Vec::from_slice(PERLD),
+        's' => Vec::from_slice(PERLS),
+        'w' => Vec::from_slice(PERLW),
+        _ => unreachable!(),
+    }
+}
+
+// Returns a concatenation of two expressions. This also guarantees that a
+// `Cat` expression will never be a direct child of another `Cat` expression.
+fn concat_flatten(x: Ast, y: Ast) -> Ast {
+    match (x, y) {
+        (Cat(mut xs), Cat(ys)) => { xs.push_all_move(ys); Cat(xs) }
+        (Cat(mut xs), ast) => { xs.push(ast); Cat(xs) }
+        (ast, Cat(mut xs)) => { xs.unshift(ast); Cat(xs) }
+        (ast1, ast2) => Cat(vec!(ast1, ast2)),
+    }
+}
+
+pub fn is_punct(c: char) -> bool {
+    match c {
+        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
+        '[' | ']' | '{' | '}' | '^' | '$' => true,
+        _ => false,
+    }
+}
+
+fn is_valid_cap(c: char) -> bool {
+    c == '_' || (c >= '0' && c <= '9')
+    || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+}
+
+fn find_class(classes: NamedClasses, name: &str) -> Option<Vec<(char, char)>> {
+    match classes.bsearch(|&(s, _)| s.cmp(&name)) {
+        Some(i) => Some(Vec::from_slice(classes[i].val1())),
+        None => None,
+    }
+}
+
+type Class = &'static [(char, char)];
+type NamedClasses = &'static [(&'static str, Class)];
+
+static ASCII_CLASSES: NamedClasses = &[
+    // Classes must be in alphabetical order so that bsearch works.
+    // [:alnum:]      alphanumeric (== [0-9A-Za-z])
+    // [:alpha:]      alphabetic (== [A-Za-z])
+    // [:ascii:]      ASCII (== [\x00-\x7F])
+    // [:blank:]      blank (== [\t ])
+    // [:cntrl:]      control (== [\x00-\x1F\x7F])
+    // [:digit:]      digits (== [0-9])
+    // [:graph:]      graphical (== [!-~])
+    // [:lower:]      lower case (== [a-z])
+    // [:print:]      printable (== [ -~] == [ [:graph:]])
+    // [:punct:]      punctuation (== [!-/:-@[-`{-~])
+    // [:space:]      whitespace (== [\t\n\v\f\r ])
+    // [:upper:]      upper case (== [A-Z])
+    // [:word:]       word characters (== [0-9A-Za-z_])
+    // [:xdigit:]     hex digit (== [0-9A-Fa-f])
+    // Taken from: http://golang.org/pkg/regex/syntax/
+    ("alnum", &[('0', '9'), ('A', 'Z'), ('a', 'z')]),
+    ("alpha", &[('A', 'Z'), ('a', 'z')]),
+    ("ascii", &[('\x00', '\x7F')]),
+    ("blank", &[(' ', ' '), ('\t', '\t')]),
+    ("cntrl", &[('\x00', '\x1F'), ('\x7F', '\x7F')]),
+    ("digit", &[('0', '9')]),
+    ("graph", &[('!', '~')]),
+    ("lower", &[('a', 'z')]),
+    ("print", &[(' ', '~')]),
+    ("punct", &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')]),
+    ("space", &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
+                ('\r', '\r'), (' ', ' ')]),
+    ("upper", &[('A', 'Z')]),
+    ("word", &[('0', '9'), ('A', 'Z'), ('a', 'z'), ('_', '_')]),
+    ("xdigit", &[('0', '9'), ('A', 'F'), ('a', 'f')]),
+];
author	Alex Crichton <alex@alexcrichton.com>	2014-05-16 14:23:04 -0700
committer	Alex Crichton <alex@alexcrichton.com>	2014-05-17 01:01:47 -0700
commit	4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23 (patch)
tree	c01cb3c61732225abd28d5eb8991537ca7ad30b5 /src/libregex/parse/mod.rs
parent	25c54226c3e7dd6f59cf2e92238a4d79d8b0128d (diff)
download	rust-4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23.tar.gz rust-4e9e091e91ea2ad8a6f45a9b20ff331d4bca7a23.zip