diff options
Diffstat (limited to 'src/libregex/lib.rs')
| -rw-r--r-- | src/libregex/lib.rs | 426 |
1 files changed, 426 insertions, 0 deletions
diff --git a/src/libregex/lib.rs b/src/libregex/lib.rs new file mode 100644 index 00000000000..cd5d387bfa0 --- /dev/null +++ b/src/libregex/lib.rs @@ -0,0 +1,426 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This crate provides a native implementation of regular expressions that is +//! heavily based on RE2 both in syntax and in implementation. Notably, +//! backreferences and arbitrary lookahead/lookbehind assertions are not +//! provided. In return, regular expression searching provided by this package +//! has excellent worst case performance. The specific syntax supported is +//! documented further down. +//! +//! This crate's documentation provides some simple examples, describes Unicode +//! support and exhaustively lists the supported syntax. For more specific +//! details on the API, please see the documentation for the `Regex` type. +//! +//! # First example: find a date +//! +//! General use of regular expressions in this package involves compiling an +//! expression and then using it to search, split or replace text. For example, +//! to confirm that some text resembles a date: +//! +//! ```rust +//! use regex::Regex; +//! let re = match Regex::new(r"^\d{4}-\d{2}-\d{2}$") { +//! Ok(re) => re, +//! Err(err) => fail!("{}", err), +//! }; +//! assert_eq!(re.is_match("2014-01-01"), true); +//! ``` +//! +//! Notice the use of the `^` and `$` anchors. In this crate, every expression +//! is executed with an implicit `.*?` at the beginning and end, which allows +//! it to match anywhere in the text. Anchors can be used to ensure that the +//! full text matches an expression. +//! +//! This example also demonstrates the utility of raw strings in Rust, which +//! are just like regular strings except they are prefixed with an `r` and do +//! not process any escape sequences. For example, `"\\d"` is the same +//! expression as `r"\d"`. +//! +//! # The `regex!` macro +//! +//! Rust's compile time meta-programming facilities provide a way to write a +//! `regex!` macro which compiles regular expressions *when your program +//! compiles*. Said differently, if you only use `regex!` to build regular +//! expressions in your program, then your program cannot compile with an +//! invalid regular expression. Moreover, the `regex!` macro compiles the +//! given expression to native Rust code, which makes it much faster for +//! searching text. +//! +//! Since `regex!` provides compiled regular expressions that are both safer +//! and faster to use, you should use them whenever possible. The only +//! requirement for using them is that you have a string literal corresponding +//! to your expression. Otherwise, it is indistinguishable from an expression +//! compiled at runtime with `Regex::new`. +//! +//! To use the `regex!` macro, you must enable the `phase` feature and import +//! the `regex_macros` crate as a syntax extension: +//! +//! ```rust +//! #![feature(phase)] +//! #[phase(syntax)] +//! extern crate regex_macros; +//! extern crate regex; +//! +//! fn main() { +//! let re = regex!(r"^\d{4}-\d{2}-\d{2}$"); +//! assert_eq!(re.is_match("2014-01-01"), true); +//! } +//! ``` +//! +//! There are a few things worth mentioning about using the `regex!` macro. +//! Firstly, the `regex!` macro *only* accepts string *literals*. +//! Secondly, the `regex` crate *must* be linked with the name `regex` since +//! the generated code depends on finding symbols in the `regex` crate. +//! +//! The only downside of using the `regex!` macro is that it can increase the +//! size of your program's binary since it generates specialized Rust code. +//! The extra size probably won't be significant for a small number of +//! expressions, but 100+ calls to `regex!` will probably result in a +//! noticeably bigger binary. +//! +//! # Example: iterating over capture groups +//! +//! This crate provides convenient iterators for matching an expression +//! repeatedly against a search string to find successive non-overlapping +//! matches. For example, to find all dates in a string and be able to access +//! them by their component pieces: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(\d{4})-(\d{2})-(\d{2})"); +//! let text = "2012-03-14, 2013-01-01 and 2014-07-05"; +//! for cap in re.captures_iter(text) { +//! println!("Month: {} Day: {} Year: {}", cap.at(2), cap.at(3), cap.at(1)); +//! } +//! // Output: +//! // Month: 03 Day: 14 Year: 2012 +//! // Month: 01 Day: 01 Year: 2013 +//! // Month: 07 Day: 05 Year: 2014 +//! # } +//! ``` +//! +//! Notice that the year is in the capture group indexed at `1`. This is +//! because the *entire match* is stored in the capture group at index `0`. +//! +//! # Example: replacement with named capture groups +//! +//! Building on the previous example, perhaps we'd like to rearrange the date +//! formats. This can be done with text replacement. But to make the code +//! clearer, we can *name* our capture groups and use those names as variables +//! in our replacement text: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})"); +//! let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +//! let after = re.replace_all(before, "$m/$d/$y"); +//! assert_eq!(after.as_slice(), "03/14/2012, 01/01/2013 and 07/05/2014"); +//! # } +//! ``` +//! +//! The `replace` methods are actually polymorphic in the replacement, which +//! provides more flexibility than is seen here. (See the documentation for +//! `Regex::replace` for more details.) +//! +//! # Pay for what you use +//! +//! With respect to searching text with a regular expression, there are three +//! questions that can be asked: +//! +//! 1. Does the text match this expression? +//! 2. If so, where does it match? +//! 3. Where are the submatches? +//! +//! Generally speaking, this crate could provide a function to answer only #3, +//! which would subsume #1 and #2 automatically. However, it can be +//! significantly more expensive to compute the location of submatches, so it's +//! best not to do it if you don't need to. +//! +//! Therefore, only use what you need. For example, don't use `find` if you +//! only need to test if an expression matches a string. (Use `is_match` +//! instead.) +//! +//! # Unicode +//! +//! This implementation executes regular expressions **only** on sequences of +//! UTF8 codepoints while exposing match locations as byte indices. +//! +//! Currently, only naive case folding is supported. Namely, when matching +//! case insensitively, the characters are first converted to their uppercase +//! forms and then compared. +//! +//! Regular expressions themselves are also **only** interpreted as a sequence +//! of UTF8 codepoints. This means you can embed Unicode characters directly +//! into your expression: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(?i)Δ+"); +//! assert_eq!(re.find("ΔδΔ"), Some((0, 6))); +//! # } +//! ``` +//! +//! Finally, Unicode general categories and scripts are available as character +//! classes. For example, you can match a sequence of numerals, Greek or +//! Cherokee letters: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"[\pN\p{Greek}\p{Cherokee}]+"); +//! assert_eq!(re.find("abcΔᎠβⅠᏴγδⅡxyz"), Some((3, 23))); +//! # } +//! ``` +//! +//! # Syntax +//! +//! The syntax supported in this crate is almost in an exact correspondence +//! with the syntax supported by RE2. +//! +//! ## Matching one character +//! +//! <pre class="rust"> +//! . any character except new line (includes new line with s flag) +//! [xyz] A character class matching either x, y or z. +//! [^xyz] A character class matching any character except x, y and z. +//! [a-z] A character class matching any character in range a-z. +//! \d Perl character class ([0-9]) +//! \D Negated Perl character class ([^0-9]) +//! [:alpha:] ASCII character class ([A-Za-z]) +//! [:^alpha:] Negated ASCII character class ([^A-Za-z]) +//! \pN One letter name Unicode character class +//! \p{Greek} Unicode character class (general category or script) +//! \PN Negated one letter name Unicode character class +//! \P{Greek} negated Unicode character class (general category or script) +//! </pre> +//! +//! Any named character class may appear inside a bracketed `[...]` character +//! class. For example, `[\p{Greek}\pN]` matches any Greek or numeral +//! character. +//! +//! ## Composites +//! +//! <pre class="rust"> +//! xy concatenation (x followed by y) +//! x|y alternation (x or y, prefer x) +//! </pre> +//! +//! ## Repetitions +//! +//! <pre class="rust"> +//! x* zero or more of x (greedy) +//! x+ one or more of x (greedy) +//! x? zero or one of x (greedy) +//! x*? zero or more of x (ungreedy) +//! x+? one or more of x (ungreedy) +//! x?? zero or one of x (ungreedy) +//! x{n,m} at least n and at most x (greedy) +//! x{n,} at least n x (greedy) +//! x{n} exactly n x +//! x{n,m}? at least n and at most x (ungreedy) +//! x{n,}? at least n x (ungreedy) +//! x{n}? exactly n x +//! </pre> +//! +//! ## Empty matches +//! +//! <pre class="rust"> +//! ^ the beginning of text (or start-of-line with multi-line mode) +//! $ the end of text (or end-of-line with multi-line mode) +//! \A only the beginning of text (even with multi-line mode enabled) +//! \z only the end of text (even with multi-line mode enabled) +//! \b a Unicode word boundary (\w on one side and \W, \A, or \z on other) +//! \B not a Unicode word boundary +//! </pre> +//! +//! ## Grouping and flags +//! +//! <pre class="rust"> +//! (exp) numbered capture group (indexed by opening parenthesis) +//! (?P<name>exp) named (also numbered) capture group (allowed chars: [_0-9a-zA-Z]) +//! (?:exp) non-capturing group +//! (?flags) set flags within current group +//! (?flags:exp) set flags for exp (non-capturing) +//! </pre> +//! +//! Flags are each a single character. For example, `(?x)` sets the flag `x` +//! and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +//! the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +//! the `x` flag and clears the `y` flag. +//! +//! All flags are by default disabled. They are: +//! +//! <pre class="rust"> +//! i case insensitive +//! m multi-line mode: ^ and $ match begin/end of line +//! s allow . to match \n +//! U swap the meaning of x* and x*? +//! </pre> +//! +//! Here's an example that matches case insensitively for only part of the +//! expression: +//! +//! ```rust +//! # #![feature(phase)] +//! # extern crate regex; #[phase(syntax)] extern crate regex_macros; +//! # fn main() { +//! let re = regex!(r"(?i)a+(?-i)b+"); +//! let cap = re.captures("AaAaAbbBBBb").unwrap(); +//! assert_eq!(cap.at(0), "AaAaAbb"); +//! # } +//! ``` +//! +//! Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +//! `b`. +//! +//! ## Escape sequences +//! +//! <pre class="rust"> +//! \* literal *, works for any punctuation character: \.+*?()|[]{}^$ +//! \a bell (\x07) +//! \f form feed (\x0C) +//! \t horizontal tab +//! \n new line +//! \r carriage return +//! \v vertical tab (\x0B) +//! \123 octal character code (up to three digits) +//! \x7F hex character code (exactly two digits) +//! \x{10FFFF} any hex character code corresponding to a valid UTF8 codepoint +//! </pre> +//! +//! ## Perl character classes (Unicode friendly) +//! +//! <pre class="rust"> +//! \d digit ([0-9] + \p{Nd}) +//! \D not digit +//! \s whitespace ([\t\n\f\r ] + \p{Z}) +//! \S not whitespace +//! \w word character ([0-9A-Za-z_] + \p{L}) +//! \W not word character +//! </pre> +//! +//! ## ASCII character classes +//! +//! <pre class="rust"> +//! [:alnum:] alphanumeric ([0-9A-Za-z]) +//! [:alpha:] alphabetic ([A-Za-z]) +//! [:ascii:] ASCII ([\x00-\x7F]) +//! [:blank:] blank ([\t ]) +//! [:cntrl:] control ([\x00-\x1F\x7F]) +//! [:digit:] digits ([0-9]) +//! [:graph:] graphical ([!-~]) +//! [:lower:] lower case ([a-z]) +//! [:print:] printable ([ -~]) +//! [:punct:] punctuation ([!-/:-@[-`{-~]) +//! [:space:] whitespace ([\t\n\v\f\r ]) +//! [:upper:] upper case ([A-Z]) +//! [:word:] word characters ([0-9A-Za-z_]) +//! [:xdigit:] hex digit ([0-9A-Fa-f]) +//! </pre> +//! +//! # Untrusted input +//! +//! There are two factors to consider here: untrusted regular expressions and +//! untrusted search text. +//! +//! Currently, there are no counter-measures in place to prevent a malicious +//! user from writing an expression that may use a lot of resources. One such +//! example is to repeat counted repetitions: `((a{100}){100}){100}` will try +//! to repeat the `a` instruction `100^3` times. Essentially, this means it's +//! very easy for an attacker to exhaust your system's memory if they are +//! allowed to execute arbitrary regular expressions. A possible solution to +//! this is to impose a hard limit on the size of a compiled expression, but it +//! does not yet exist. +//! +//! The story is a bit better with untrusted search text, since this crate's +//! implementation provides `O(nm)` search where `n` is the number of +//! characters in the search text and `m` is the number of instructions in a +//! compiled expression. + +#![crate_id = "regex#0.11-pre"] +#![crate_type = "rlib"] +#![crate_type = "dylib"] +#![experimental] +#![license = "MIT/ASL2"] +#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", + html_favicon_url = "http://www.rust-lang.org/favicon.ico", + html_root_url = "http://static.rust-lang.org/doc/master")] + +#![feature(macro_rules, phase)] +#![deny(missing_doc)] + +extern crate collections; +#[cfg(test)] +extern crate stdtest = "test"; +#[cfg(test)] +extern crate rand; + +// During tests, this links with the `regex` crate so that the `regex!` macro +// can be tested. +#[cfg(test)] +extern crate regex; + +pub use parse::Error; +pub use re::{Regex, Captures, SubCaptures, SubCapturesPos}; +pub use re::{FindCaptures, FindMatches}; +pub use re::{Replacer, NoExpand, RegexSplits, RegexSplitsN}; +pub use re::{quote, is_match}; + +mod compile; +mod parse; +mod re; +mod vm; + +// FIXME(#13725) windows needs fixing. +#[cfg(test, not(windows))] +mod test; + +/// The `program` module exists to support the `regex!` macro. Do not use. +#[doc(hidden)] +pub mod native { + // Exporting this stuff is bad form, but it's necessary for two reasons. + // Firstly, the `regex!` syntax extension is in a different crate and + // requires access to the representation of a regex (particularly the + // instruction set) in order to compile to native Rust. This could be + // mitigated if `regex!` was defined in the same crate, but this has + // undesirable consequences (such as requiring a dependency on + // `libsyntax`). + // + // Secondly, the code generated generated by `regex!` must *also* be able + // to access various functions in this crate to reduce code duplication + // and to provide a value with precisely the same `Regex` type in this + // crate. This, AFAIK, is impossible to mitigate. + // + // On the bright side, `rustdoc` lets us hide this from the public API + // documentation. + pub use compile::{ + Program, + OneChar, CharClass, Any, Save, Jump, Split, + Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, + }; + pub use parse::{ + FLAG_EMPTY, FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, + FLAG_SWAP_GREED, FLAG_NEGATED, + }; + pub use re::{Dynamic, Native}; + pub use vm::{ + MatchKind, Exists, Location, Submatches, + StepState, StepMatchEarlyReturn, StepMatch, StepContinue, + CharReader, find_prefix, + }; +} |
