diff options
Diffstat (limited to 'src/libregex_macros/lib.rs')
| -rw-r--r-- | src/libregex_macros/lib.rs | 684 | 
1 files changed, 684 insertions, 0 deletions
| diff --git a/src/libregex_macros/lib.rs b/src/libregex_macros/lib.rs new file mode 100644 index 00000000000..72e00deba4d --- /dev/null +++ b/src/libregex_macros/lib.rs @@ -0,0 +1,684 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This crate provides the `regex!` macro. Its use is documented in the +//! `regex` crate. + +#![crate_id = "regex_macros#0.11-pre"] +#![crate_type = "dylib"] +#![experimental] +#![license = "MIT/ASL2"] +#![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", + html_favicon_url = "http://www.rust-lang.org/favicon.ico", + html_root_url = "http://static.rust-lang.org/doc/master")] + +#![feature(macro_registrar, managed_boxes, quote)] + +extern crate regex; +extern crate syntax; + +use syntax::ast; +use syntax::codemap; +use syntax::ext::base::{ + SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult, + NormalTT, BasicMacroExpander, +}; +use syntax::parse; +use syntax::parse::token; +use syntax::print::pprust; + +use regex::Regex; +use regex::native::{ + OneChar, CharClass, Any, Save, Jump, Split, + Match, EmptyBegin, EmptyEnd, EmptyWordBoundary, + Program, Dynamic, Native, + FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED, +}; + +/// For the `regex!` syntax extension. Do not use. +#[macro_registrar] +#[doc(hidden)] +pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) { + let expander = ~BasicMacroExpander { expander: native, span: None }; + register(token::intern("regex"), NormalTT(expander, None)) +} + +/// Generates specialized code for the Pike VM for a particular regular +/// expression. +/// +/// There are two primary differences between the code generated here and the +/// general code in vm.rs. +/// +/// 1. All heap allocation is removed. Sized vector types are used instead. +/// Care must be taken to make sure that these vectors are not copied +/// gratuitously. (If you're not sure, run the benchmarks. They will yell +/// at you if you do.) +/// 2. The main `match instruction { ... }` expressions are replaced with more +/// direct `match pc { ... }`. The generators can be found in +/// `step_insts` and `add_insts`. +/// +/// Other more minor changes include eliding code when possible (although this +/// isn't completely thorough at the moment), and translating character class +/// matching from using a binary search to a simple `match` expression (see +/// `match_class`). +/// +/// It is strongly recommended to read the dynamic implementation in vm.rs +/// first before trying to understand the code generator. The implementation +/// strategy is identical and vm.rs has comments and will be easier to follow. +fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree]) + -> ~MacResult { + let regex = match parse(cx, tts) { + Some(r) => r, + // error is logged in 'parse' with cx.span_err + None => return DummyResult::any(sp), + }; + let re = match Regex::new(regex.to_owned()) { + Ok(re) => re, + Err(err) => { + cx.span_err(sp, err.to_str()); + return DummyResult::any(sp) + } + }; + let prog = match re.p { + Dynamic(ref prog) => prog.clone(), + Native(_) => unreachable!(), + }; + + let mut gen = NfaGen { + cx: &*cx, sp: sp, prog: prog, + names: re.names.clone(), original: re.original.clone(), + }; + MacExpr::new(gen.code()) +} + +struct NfaGen<'a> { + cx: &'a ExtCtxt<'a>, + sp: codemap::Span, + prog: Program, + names: ~[Option<~str>], + original: ~str, +} + +impl<'a> NfaGen<'a> { + fn code(&mut self) -> @ast::Expr { + // Most or all of the following things are used in the quasiquoted + // expression returned. + let num_cap_locs = 2 * self.prog.num_captures(); + let num_insts = self.prog.insts.len(); + let cap_names = self.vec_expr(self.names, + |cx, name| match name { + &Some(ref name) => { + let name = name.as_slice(); + quote_expr!(cx, Some(~$name)) + } + &None => quote_expr!(cx, None), + } + ); + let prefix_anchor = + match self.prog.insts.as_slice()[1] { + EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true, + _ => false, + }; + let init_groups = self.vec_from_fn(num_cap_locs, + |cx| quote_expr!(cx, None)); + let prefix_bytes = self.vec_expr(self.prog.prefix.as_slice().as_bytes(), + |cx, b| quote_expr!(cx, $b)); + let check_prefix = self.check_prefix(); + let step_insts = self.step_insts(); + let add_insts = self.add_insts(); + let regex = self.original.as_slice(); + + quote_expr!(self.cx, { +fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str, + start: uint, end: uint) -> Vec<Option<uint>> { + #![allow(unused_imports)] + use regex::native::{ + MatchKind, Exists, Location, Submatches, + StepState, StepMatchEarlyReturn, StepMatch, StepContinue, + CharReader, find_prefix, + }; + + return Nfa { + which: which, + input: input, + ic: 0, + chars: CharReader::new(input), + }.run(start, end); + + type Captures = [Option<uint>, ..$num_cap_locs]; + + struct Nfa<'t> { + which: MatchKind, + input: &'t str, + ic: uint, + chars: CharReader<'t>, + } + + impl<'t> Nfa<'t> { + #[allow(unused_variable)] + fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> { + let mut matched = false; + let prefix_bytes: &[u8] = &$prefix_bytes; + let mut clist = &mut Threads::new(self.which); + let mut nlist = &mut Threads::new(self.which); + + let mut groups = $init_groups; + + self.ic = start; + let mut next_ic = self.chars.set(start); + while self.ic <= end { + if clist.size == 0 { + if matched { + break + } + $check_prefix + } + if clist.size == 0 || (!$prefix_anchor && !matched) { + self.add(clist, 0, &mut groups) + } + + self.ic = next_ic; + next_ic = self.chars.advance(); + + let mut i = 0; + while i < clist.size { + let pc = clist.pc(i); + let step_state = self.step(&mut groups, nlist, + clist.groups(i), pc); + match step_state { + StepMatchEarlyReturn => + return vec![Some(0u), Some(0u)], + StepMatch => { matched = true; clist.empty() }, + StepContinue => {}, + } + i += 1; + } + ::std::mem::swap(&mut clist, &mut nlist); + nlist.empty(); + } + match self.which { + Exists if matched => vec![Some(0u), Some(0u)], + Exists => vec![None, None], + Location | Submatches => groups.iter().map(|x| *x).collect(), + } + } + + // Sometimes `nlist` is never used (for empty regexes). + #[allow(unused_variable)] + #[inline] + fn step(&self, groups: &mut Captures, nlist: &mut Threads, + caps: &mut Captures, pc: uint) -> StepState { + $step_insts + StepContinue + } + + fn add(&self, nlist: &mut Threads, pc: uint, + groups: &mut Captures) { + if nlist.contains(pc) { + return + } + $add_insts + } + } + + struct Thread { + pc: uint, + groups: Captures, + } + + struct Threads { + which: MatchKind, + queue: [Thread, ..$num_insts], + sparse: [uint, ..$num_insts], + size: uint, + } + + impl Threads { + fn new(which: MatchKind) -> Threads { + Threads { + which: which, + // These unsafe blocks are used for performance reasons, as it + // gives us a zero-cost initialization of a sparse set. The + // trick is described in more detail here: + // http://research.swtch.com/sparse + // The idea here is to avoid initializing threads that never + // need to be initialized, particularly for larger regexs with + // a lot of instructions. + queue: unsafe { ::std::mem::uninit() }, + sparse: unsafe { ::std::mem::uninit() }, + size: 0, + } + } + + #[inline] + fn add(&mut self, pc: uint, groups: &Captures) { + let t = &mut self.queue[self.size]; + t.pc = pc; + match self.which { + Exists => {}, + Location => { + t.groups[0] = groups[0]; + t.groups[1] = groups[1]; + } + Submatches => { + for (slot, val) in t.groups.mut_iter().zip(groups.iter()) { + *slot = *val; + } + } + } + self.sparse[pc] = self.size; + self.size += 1; + } + + #[inline] + fn add_empty(&mut self, pc: uint) { + self.queue[self.size].pc = pc; + self.sparse[pc] = self.size; + self.size += 1; + } + + #[inline] + fn contains(&self, pc: uint) -> bool { + let s = self.sparse[pc]; + s < self.size && self.queue[s].pc == pc + } + + #[inline] + fn empty(&mut self) { + self.size = 0; + } + + #[inline] + fn pc(&self, i: uint) -> uint { + self.queue[i].pc + } + + #[inline] + fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures { + &'r mut self.queue[i].groups + } + } +} + +::regex::Regex { + original: ~$regex, + names: ~$cap_names, + p: ::regex::native::Native(exec), +} + }) + } + + // Generates code for the `add` method, which is responsible for adding + // zero-width states to the next queue of states to visit. + fn add_insts(&self) -> @ast::Expr { + let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { + let nextpc = pc + 1; + let body = match *inst { + EmptyBegin(flags) => { + let nl = '\n'; + let cond = + if flags & FLAG_MULTI > 0 { + quote_expr!(self.cx, + self.chars.is_begin() + || self.chars.prev == Some($nl) + ) + } else { + quote_expr!(self.cx, self.chars.is_begin()) + }; + quote_expr!(self.cx, { + nlist.add_empty($pc); + if $cond { self.add(nlist, $nextpc, &mut *groups) } + }) + } + EmptyEnd(flags) => { + let nl = '\n'; + let cond = + if flags & FLAG_MULTI > 0 { + quote_expr!(self.cx, + self.chars.is_end() + || self.chars.cur == Some($nl) + ) + } else { + quote_expr!(self.cx, self.chars.is_end()) + }; + quote_expr!(self.cx, { + nlist.add_empty($pc); + if $cond { self.add(nlist, $nextpc, &mut *groups) } + }) + } + EmptyWordBoundary(flags) => { + let cond = + if flags & FLAG_NEGATED > 0 { + quote_expr!(self.cx, !self.chars.is_word_boundary()) + } else { + quote_expr!(self.cx, self.chars.is_word_boundary()) + }; + quote_expr!(self.cx, { + nlist.add_empty($pc); + if $cond { self.add(nlist, $nextpc, &mut *groups) } + }) + } + Save(slot) => { + let save = quote_expr!(self.cx, { + let old = groups[$slot]; + groups[$slot] = Some(self.ic); + self.add(nlist, $nextpc, &mut *groups); + groups[$slot] = old; + }); + let add = quote_expr!(self.cx, { + self.add(nlist, $nextpc, &mut *groups); + }); + // If this is saving a submatch location but we request + // existence or only full match location, then we can skip + // right over it every time. + if slot > 1 { + quote_expr!(self.cx, { + nlist.add_empty($pc); + match self.which { + Submatches => $save, + Exists | Location => $add, + } + }) + } else { + quote_expr!(self.cx, { + nlist.add_empty($pc); + match self.which { + Submatches | Location => $save, + Exists => $add, + } + }) + } + } + Jump(to) => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + self.add(nlist, $to, &mut *groups); + }) + } + Split(x, y) => { + quote_expr!(self.cx, { + nlist.add_empty($pc); + self.add(nlist, $x, &mut *groups); + self.add(nlist, $y, &mut *groups); + }) + } + // For Match, OneChar, CharClass, Any + _ => quote_expr!(self.cx, nlist.add($pc, &*groups)), + }; + self.arm_inst(pc, body) + }).collect::<Vec<ast::Arm>>(); + + self.match_insts(arms) + } + + // Generates the code for the `step` method, which processes all states + // in the current queue that consume a single character. + fn step_insts(&self) -> @ast::Expr { + let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| { + let nextpc = pc + 1; + let body = match *inst { + Match => { + quote_expr!(self.cx, { + match self.which { + Exists => { + return StepMatchEarlyReturn + } + Location => { + groups[0] = caps[0]; + groups[1] = caps[1]; + return StepMatch + } + Submatches => { + for (slot, val) in groups.mut_iter().zip(caps.iter()) { + *slot = *val; + } + return StepMatch + } + } + }) + } + OneChar(c, flags) => { + if flags & FLAG_NOCASE > 0 { + let upc = c.to_uppercase(); + quote_expr!(self.cx, { + let upc = self.chars.prev.map(|c| c.to_uppercase()); + if upc == Some($upc) { + self.add(nlist, $nextpc, caps); + } + }) + } else { + quote_expr!(self.cx, { + if self.chars.prev == Some($c) { + self.add(nlist, $nextpc, caps); + } + }) + } + } + CharClass(ref ranges, flags) => { + let negate = flags & FLAG_NEGATED > 0; + let casei = flags & FLAG_NOCASE > 0; + let get_char = + if casei { + quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase()) + } else { + quote_expr!(self.cx, self.chars.prev.unwrap()) + }; + let negcond = + if negate { + quote_expr!(self.cx, !found) + } else { + quote_expr!(self.cx, found) + }; + let mranges = self.match_class(casei, ranges.as_slice()); + quote_expr!(self.cx, { + if self.chars.prev.is_some() { + let c = $get_char; + let found = $mranges; + if $negcond { + self.add(nlist, $nextpc, caps); + } + } + }) + } + Any(flags) => { + if flags & FLAG_DOTNL > 0 { + quote_expr!(self.cx, self.add(nlist, $nextpc, caps)) + } else { + let nl = '\n'; // no char lits allowed? wtf? + quote_expr!(self.cx, { + if self.chars.prev != Some($nl) { + self.add(nlist, $nextpc, caps) + } + }) + } + } + // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split + _ => quote_expr!(self.cx, {}), + }; + self.arm_inst(pc, body) + }).collect::<Vec<ast::Arm>>(); + + self.match_insts(arms) + } + + // Translates a character class into a match expression. + // This avoids a binary search (and is hopefully replaced by a jump + // table). + fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr { + let mut arms = ranges.iter().map(|&(mut start, mut end)| { + if casei { + start = start.to_uppercase(); + end = end.to_uppercase(); + } + ast::Arm { + attrs: vec!(), + pats: vec!(@ast::Pat{ + id: ast::DUMMY_NODE_ID, + span: self.sp, + node: ast::PatRange(quote_expr!(self.cx, $start), + quote_expr!(self.cx, $end)), + }), + guard: None, + body: quote_expr!(self.cx, true), + } + }).collect::<Vec<ast::Arm>>(); + + arms.push(self.wild_arm_expr(quote_expr!(self.cx, false))); + + let match_on = quote_expr!(self.cx, c); + self.dummy_expr(ast::ExprMatch(match_on, arms)) + } + + // Generates code for checking a literal prefix of the search string. + // The code is only generated if the regex *has* a literal prefix. + // Otherwise, a no-op is returned. + fn check_prefix(&self) -> @ast::Expr { + if self.prog.prefix.len() == 0 { + quote_expr!(self.cx, {}) + } else { + quote_expr!(self.cx, + if clist.size == 0 { + let haystack = self.input.as_bytes().slice_from(self.ic); + match find_prefix(prefix_bytes, haystack) { + None => break, + Some(i) => { + self.ic += i; + next_ic = self.chars.set(self.ic); + } + } + } + ) + } + } + + // Builds a `match pc { ... }` expression from a list of arms, specifically + // for matching the current program counter with an instruction. + // A wild-card arm is automatically added that executes a no-op. It will + // never be used, but is added to satisfy the compiler complaining about + // non-exhaustive patterns. + fn match_insts(&self, mut arms: Vec<ast::Arm>) -> @ast::Expr { + let mat_pc = quote_expr!(self.cx, pc); + arms.push(self.wild_arm_expr(quote_expr!(self.cx, {}))); + self.dummy_expr(ast::ExprMatch(mat_pc, arms)) + } + + // Creates a match arm for the instruction at `pc` with the expression + // `body`. + fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm { + ast::Arm { + attrs: vec!(), + pats: vec!(@ast::Pat{ + id: ast::DUMMY_NODE_ID, + span: self.sp, + node: ast::PatLit(quote_expr!(self.cx, $pc)), + }), + guard: None, + body: body, + } + } + + // Creates a wild-card match arm with the expression `body`. + fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm { + ast::Arm { + attrs: vec!(), + pats: vec!(@ast::Pat{ + id: ast::DUMMY_NODE_ID, + span: self.sp, + node: ast::PatWild, + }), + guard: None, + body: body, + } + } + + // Builds a `[a, b, .., len]` expression where each element is the result + // of executing `to_expr`. + fn vec_from_fn(&self, len: uint, to_expr: |&ExtCtxt| -> @ast::Expr) + -> @ast::Expr { + self.vec_expr(Vec::from_elem(len, ()).as_slice(), + |cx, _| to_expr(cx)) + } + + // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr` + // on each element in `xs`. + fn vec_expr<T>(&self, xs: &[T], to_expr: |&ExtCtxt, &T| -> @ast::Expr) + -> @ast::Expr { + let mut exprs = vec!(); + for x in xs.iter() { + exprs.push(to_expr(self.cx, x)) + } + let vec_exprs = self.dummy_expr(ast::ExprVec(exprs)); + quote_expr!(self.cx, $vec_exprs) + } + + // Creates an expression with a dummy node ID given an underlying + // `ast::Expr_`. + fn dummy_expr(&self, e: ast::Expr_) -> @ast::Expr { + @ast::Expr { + id: ast::DUMMY_NODE_ID, + node: e, + span: self.sp, + } + } +} + +// This trait is defined in the quote module in the syntax crate, but I +// don't think it's exported. +// Interestingly, quote_expr! only requires that a 'to_tokens' method be +// defined rather than satisfying a particular trait. +#[doc(hidden)] +trait ToTokens { + fn to_tokens(&self, cx: &ExtCtxt) -> Vec<ast::TokenTree>; +} + +impl ToTokens for char { + fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> { + vec!(ast::TTTok(codemap::DUMMY_SP, token::LIT_CHAR((*self) as u32))) + } +} + +impl ToTokens for bool { + fn to_tokens(&self, _: &ExtCtxt) -> Vec<ast::TokenTree> { + let ident = token::IDENT(token::str_to_ident(self.to_str()), false); + vec!(ast::TTTok(codemap::DUMMY_SP, ident)) + } +} + +/// Looks for a single string literal and returns it. +/// Otherwise, logs an error with cx.span_err and returns None. +fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> { + let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(), + Vec::from_slice(tts)); + let entry = cx.expand_expr(parser.parse_expr()); + let regex = match entry.node { + ast::ExprLit(lit) => { + match lit.node { + ast::LitStr(ref s, _) => s.to_str(), + _ => { + cx.span_err(entry.span, format!( + "expected string literal but got `{}`", + pprust::lit_to_str(lit))); + return None + } + } + } + _ => { + cx.span_err(entry.span, format!( + "expected string literal but got `{}`", + pprust::expr_to_str(entry))); + return None + } + }; + if !parser.eat(&token::EOF) { + cx.span_err(parser.span, "only one string literal allowed"); + return None; + } + Some(regex) +} | 
