diff options
| author | Paul Stansifer <paul.stansifer@gmail.com> | 2012-06-12 10:59:50 -0700 |
|---|---|---|
| committer | Paul Stansifer <paul.stansifer@gmail.com> | 2012-06-25 18:01:37 -0700 |
| commit | 4f104954a67ad736244ce212467290c836394fad (patch) | |
| tree | f7c8af3b048c43d55d8dfbca638faefba9d6105e /src/libsyntax | |
| parent | 650dfe58a3d0b41b561d8a68924f31e93f79d4bc (diff) | |
| download | rust-4f104954a67ad736244ce212467290c836394fad.tar.gz rust-4f104954a67ad736244ce212467290c836394fad.zip | |
parsing for the macro system
Diffstat (limited to 'src/libsyntax')
| -rw-r--r-- | src/libsyntax/ast.rs | 11 | ||||
| -rw-r--r-- | src/libsyntax/ext/earley_parser.rs | 243 | ||||
| -rw-r--r-- | src/libsyntax/parse/comments.rs | 3 | ||||
| -rw-r--r-- | src/libsyntax/parse/common.rs | 2 | ||||
| -rw-r--r-- | src/libsyntax/parse/lexer.rs | 70 | ||||
| -rw-r--r-- | src/libsyntax/parse/parser.rs | 61 | ||||
| -rw-r--r-- | src/libsyntax/parse/token.rs | 6 | ||||
| -rw-r--r-- | src/libsyntax/syntax.rc | 3 |
8 files changed, 370 insertions, 29 deletions
diff --git a/src/libsyntax/ast.rs b/src/libsyntax/ast.rs index fd7c3aa9e71..d6d2d4f3165 100644 --- a/src/libsyntax/ast.rs +++ b/src/libsyntax/ast.rs @@ -378,6 +378,17 @@ enum token_tree { } #[auto_serialize] +type matcher = spanned<matcher_>; + +#[auto_serialize] +enum matcher_ { + mtc_tok(token::token), + /* body, separator, zero ok? : */ + mtc_rep([matcher], option<token::token>, bool), + mtc_bb(ident, ident, uint) +} + +#[auto_serialize] type mac = spanned<mac_>; #[auto_serialize] diff --git a/src/libsyntax/ext/earley_parser.rs b/src/libsyntax/ext/earley_parser.rs new file mode 100644 index 00000000000..b1a2524ca35 --- /dev/null +++ b/src/libsyntax/ext/earley_parser.rs @@ -0,0 +1,243 @@ +// Earley-like parser for macros. +import parse::token; +import parse::token::{token, EOF, to_str, whole_nt}; +import parse::lexer::{reader, tt_reader, tt_reader_as_reader}; +import parse::parser::{parser,SOURCE_FILE}; +import parse::common::parser_common; +import parse::parse_sess; +import dvec::{dvec, extensions}; +import ast::{matcher, mtc_tok, mtc_rep, mtc_bb}; + +/* This is an Earley-like parser, without support for nonterminals. This +means that there are no completer or predictor rules, and therefore no need to +store one column per token: instead, there's a set of current Earley items and +a set of next ones. Instead of NTs, we have a special case for Kleene +star. The big-O, in pathological cases, is worse than traditional Earley +parsing, but it's an easier fit for Macro-by-Example-style rules, and I think +the overhead is lower. */ + + +/* to avoid costly uniqueness checks, we require that `mtc_rep` always has a +nonempty body. */ + +enum matcher_pos_up { /* to break a circularity */ + matcher_pos_up(option<matcher_pos>) +} + +fn is_some(&&mpu: matcher_pos_up) -> bool { + alt mpu { + matcher_pos_up(none) { false } + _ { true } + } +} + +type matcher_pos = ~{ + elts: [ast::matcher], // maybe should be /& ? Need to understand regions. + sep: option<token>, + mut idx: uint, + mut up: matcher_pos_up, // mutable for swapping only + matches: [dvec<@arb_depth>] +}; + +fn copy_up(&& mpu: matcher_pos_up) -> matcher_pos { + alt mpu { + matcher_pos_up(some(mp)) { copy mp } + _ { fail } + } +} + +fn count_names(ms: [matcher]/&) -> uint { + vec::foldl(0u, ms, {|ct, m| + ct + alt m.node { + mtc_tok(_) { 0u } + mtc_rep(more_ms, _, _) { count_names(more_ms) } + mtc_bb(_,_,_) { 1u } + }}) +} + +fn new_matcher_pos(ms: [matcher], sep: option<token>) -> matcher_pos { + ~{elts: ms, sep: sep, mut idx: 0u, mut up: matcher_pos_up(none), + matches: copy vec::from_fn(count_names(ms), {|_i| dvec::dvec()}) } +} + +/* logically, an arb_depth should contain only one kind of nonterminal */ +enum arb_depth { leaf(whole_nt), seq([@arb_depth]) } + +type earley_item = matcher_pos; + + +fn parse(sess: parse_sess, cfg: ast::crate_cfg, rdr: reader, ms: [matcher]) + -> [@arb_depth] { + let mut cur_eis = []; + vec::push(cur_eis, new_matcher_pos(ms, none)); + + loop { + let mut bb_eis = []; // black-box parsed by parser.rs + let mut next_eis = []; // or proceed normally + let mut eof_eis = []; + + let {tok: tok, sp: _} = rdr.peek(); + + /* we append new items to this while we go */ + while cur_eis.len() > 0u { /* for each Earley Item */ + let mut ei = vec::pop(cur_eis); + + let idx = ei.idx; + let len = ei.elts.len(); + + /* at end of sequence */ + if idx >= len { + // can't move out of `alt`s, so: + if is_some(ei.up) { + // hack: a matcher sequence is repeating iff it has a + // parent (the top level is just a container) + + + // disregard separator, try to go up + // (remove this condition to make trailing seps ok) + if idx == len { + // pop from the matcher position + + let new_pos = copy_up(ei.up); + + // update matches (the MBE "parse tree") by appending + // each tree as a subtree. + + // I bet this is a perf problem: we're preemptively + // doing a lot of array work that will get thrown away + // most of the time. + for ei.matches.eachi() { |idx, elt| + new_pos.matches[idx].push(@seq(elt.get())); + } + + new_pos.idx += 1u; + vec::push(cur_eis, new_pos); + } + + // can we go around again? + + // the *_t vars are workarounds for the lack of unary move + alt copy ei.sep { + some(t) if idx == len { // we need a separator + if tok == t { //pass the separator + let ei_t <- ei; + ei_t.idx += 1u; + vec::push(next_eis, ei_t); + } + } + _ { // we don't need a separator + let ei_t <- ei; + ei_t.idx = 0u; + vec::push(cur_eis, ei_t); + } + } + } else { + vec::push(eof_eis, ei); + } + } else { + alt copy ei.elts[idx].node { + /* need to descend into sequence */ + mtc_rep(matchers, sep, zero_ok) { + if zero_ok { + let new_ei = copy ei; + new_ei.idx += 1u; + vec::push(cur_eis, new_ei); + } + + let matches = vec::map(ei.matches, // fresh, same size: + {|_m| dvec::<@arb_depth>()}); + let ei_t <- ei; + vec::push(cur_eis, ~{ + elts: matchers, sep: sep, mut idx: 0u, + mut up: matcher_pos_up(some(ei_t)), + matches: matches + }); + } + mtc_bb(_,_,_) { vec::push(bb_eis, ei) } + mtc_tok(t) { + let ei_t <- ei; + if t == tok { ei_t.idx += 1u; vec::push(next_eis, ei_t)} + } + } + } + } + + /* error messages here could be improved with links to orig. rules */ + if tok == EOF { + if eof_eis.len() == 1u { + let ret_val = vec::map(eof_eis[0u].matches, {|dv| dv.pop()}); + ret ret_val; /* success */ + } else if eof_eis.len() > 1u { + rdr.fatal("Ambiguity: multiple successful parses"); + } else { + rdr.fatal("Unexpected end of macro invocation"); + } + } else { + if (bb_eis.len() > 0u && next_eis.len() > 0u) + || bb_eis.len() > 1u { + let nts = str::connect(vec::map(bb_eis, {|ei| + alt ei.elts[ei.idx].node + { mtc_bb(_,name,_) { *name } _ { fail; } } + }), " or "); + rdr.fatal(#fmt["Local ambiguity: multiple parsing options: \ + built-in NTs %s or %u other options.", + nts, next_eis.len()]); + } else if (bb_eis.len() == 0u && next_eis.len() == 0u) { + rdr.fatal("No rules expected the token " + + to_str(*rdr.interner(), tok)); + } else if (next_eis.len() > 0u) { + /* Now process the next token */ + while(next_eis.len() > 0u) { + vec::push(cur_eis, vec::pop(next_eis)); + } + rdr.next_token(); + } else /* bb_eis.len() == 1 */ { + let rust_parser = parser(sess, cfg, rdr.dup(), SOURCE_FILE); + + let ei = vec::pop(bb_eis); + alt ei.elts[ei.idx].node { + mtc_bb(_, name, idx) { + ei.matches[idx].push(@leaf( + parse_nt(rust_parser, *name))); + ei.idx += 1u; + } + _ { fail; } + } + vec::push(cur_eis,ei); + + /* this would fail if zero-length tokens existed */ + while rdr.peek().sp.lo < rust_parser.span.lo { + rdr.next_token(); + } + } + } + + assert cur_eis.len() > 0u; + } +} + +fn parse_nt(p: parser, name: str) -> whole_nt { + alt name { + "item" { alt p.parse_item([], ast::public) { + some(i) { token::w_item(i) } + none { p.fatal("expected an item keyword") } + }} + "block" { token::w_block(p.parse_block()) } + "stmt" { token::w_stmt(p.parse_stmt([])) } + "pat" { token::w_pat(p.parse_pat()) } + "expr" { token::w_expr(p.parse_expr()) } + "ty" { token::w_ty(p.parse_ty(false /* no need to disambiguate*/)) } + // this could be handled like a token, since it is one + "ident" { token::w_ident(p.parse_ident()) } + "path" { token::w_path(p.parse_path_with_tps(false)) } + _ { p.fatal("Unsupported builtin nonterminal parser: " + name)} + } +} + +// Local Variables: +// mode: rust; +// fill-column: 78; +// indent-tabs-mode: nil +// c-basic-offset: 4 +// buffer-file-coding-system: utf-8-unix +// End: diff --git a/src/libsyntax/parse/comments.rs b/src/libsyntax/parse/comments.rs index f1415cb9e20..54d14f2eaf4 100644 --- a/src/libsyntax/parse/comments.rs +++ b/src/libsyntax/parse/comments.rs @@ -199,8 +199,9 @@ fn gather_comments_and_literals(span_diagnostic: diagnostic::span_handler, let bstart = rdr.pos; + rdr.next_token(); //discard, and look ahead; we're working with internal state - let {tok: tok, sp: sp} = rdr.next_token(); + let {tok: tok, sp: sp} = rdr.peek(); if token::is_lit(tok) { let s = get_str_from(rdr, bstart); vec::push(literals, {lit: s, pos: sp.lo}); diff --git a/src/libsyntax/parse/common.rs b/src/libsyntax/parse/common.rs index 52cb9366df8..1d92561a108 100644 --- a/src/libsyntax/parse/common.rs +++ b/src/libsyntax/parse/common.rs @@ -111,8 +111,8 @@ impl parser_common for parser { if !self.eat_keyword(word) { self.fatal("expecting " + word + ", found " + token_to_str(self.reader, self.token)); + } } -} fn is_restricted_keyword(word: str) -> bool { self.restricted_keywords.contains_key(word) diff --git a/src/libsyntax/parse/lexer.rs b/src/libsyntax/parse/lexer.rs index 02b34e8dc86..5a3dceace8d 100644 --- a/src/libsyntax/parse/lexer.rs +++ b/src/libsyntax/parse/lexer.rs @@ -13,13 +13,17 @@ iface reader { fn is_eof() -> bool; fn next_token() -> {tok: token::token, sp: span}; fn fatal(str) -> !; + fn span_diag() -> diagnostic::span_handler; fn interner() -> @interner::interner<@str>; + fn peek() -> {tok: token::token, sp: span}; + fn dup() -> reader; } enum tt_frame_up { /* to break a circularity */ tt_frame_up(option<tt_frame>) } +/* TODO: figure out how to have a uniquely linked stack, and change to `~` */ #[doc = "an unzipping of `token_tree`s"] type tt_frame = @{ readme: [ast::token_tree], @@ -27,7 +31,7 @@ type tt_frame = @{ up: tt_frame_up }; -type tt_reader = ~{ +type tt_reader = @{ span_diagnostic: diagnostic::span_handler, interner: @interner::interner<@str>, mut cur: tt_frame, @@ -39,7 +43,7 @@ type tt_reader = ~{ fn new_tt_reader(span_diagnostic: diagnostic::span_handler, itr: @interner::interner<@str>, src: [ast::token_tree]) -> tt_reader { - let r = ~{span_diagnostic: span_diagnostic, interner: itr, + let r = @{span_diagnostic: span_diagnostic, interner: itr, mut cur: @{readme: src, mut idx: 0u, up: tt_frame_up(option::none)}, /* dummy values, never read: */ @@ -62,7 +66,7 @@ pure fn dup_tt_frame(&&f: tt_frame) -> tt_frame { } pure fn dup_tt_reader(&&r: tt_reader) -> tt_reader { - ~{span_diagnostic: r.span_diagnostic, interner: r.interner, + @{span_diagnostic: r.span_diagnostic, interner: r.interner, mut cur: dup_tt_frame(r.cur), mut cur_tok: r.cur_tok, mut cur_span: r.cur_span} } @@ -75,13 +79,17 @@ type string_reader = @{ mut curr: char, mut chpos: uint, filemap: codemap::filemap, - interner: @interner::interner<@str> + interner: @interner::interner<@str>, + /* cached: */ + mut peek_tok: token::token, + mut peek_span: span }; fn new_string_reader(span_diagnostic: diagnostic::span_handler, filemap: codemap::filemap, itr: @interner::interner<@str>) -> string_reader { let r = new_low_level_string_reader(span_diagnostic, filemap, itr); + string_advance_token(r); /* fill in peek_* */ ret r; } @@ -93,7 +101,10 @@ fn new_low_level_string_reader(span_diagnostic: diagnostic::span_handler, let r = @{span_diagnostic: span_diagnostic, src: filemap.src, mut col: 0u, mut pos: 0u, mut curr: -1 as char, mut chpos: filemap.start_pos.ch, - filemap: filemap, interner: itr}; + filemap: filemap, interner: itr, + /* dummy values; not read */ + mut peek_tok: token::EOF, + mut peek_span: ast_util::mk_sp(0u,0u)}; if r.pos < (*filemap.src).len() { let next = str::char_range_at(*r.src, r.pos); r.pos = next.next; @@ -102,23 +113,29 @@ fn new_low_level_string_reader(span_diagnostic: diagnostic::span_handler, ret r; } +fn dup_string_reader(&&r: string_reader) -> string_reader { + @{span_diagnostic: r.span_diagnostic, src: r.src, + mut col: r.col, mut pos: r.pos, mut curr: r.curr, mut chpos: r.chpos, + filemap: r.filemap, interner: r.interner, + mut peek_tok: r.peek_tok, mut peek_span: r.peek_span} +} + impl string_reader_as_reader of reader for string_reader { fn is_eof() -> bool { is_eof(self) } fn next_token() -> {tok: token::token, sp: span} { - consume_whitespace_and_comments(self); - let start_chpos = self.chpos; - let tok = if is_eof(self) { - token::EOF - } else { - next_token_inner(self) - }; - ret {tok: tok, sp: ast_util::mk_sp(start_chpos, self.chpos)}; + let ret_val = {tok: self.peek_tok, sp: self.peek_span}; + string_advance_token(self); + ret ret_val; } fn fatal(m: str) -> ! { - self.span_diagnostic.span_fatal( - ast_util::mk_sp(self.chpos, self.chpos), m) + self.span_diagnostic.span_fatal(copy self.peek_span, m) } + fn span_diag() -> diagnostic::span_handler { self.span_diagnostic } fn interner() -> @interner::interner<@str> { self.interner } + fn peek() -> {tok: token::token, sp: span} { + {tok: self.peek_tok, sp: self.peek_span} + } + fn dup() -> reader { dup_string_reader(self) as reader } } impl tt_reader_as_reader of reader for tt_reader { @@ -135,13 +152,25 @@ impl tt_reader_as_reader of reader for tt_reader { fn fatal(m: str) -> ! { self.span_diagnostic.span_fatal(copy self.cur_span, m); } + fn span_diag() -> diagnostic::span_handler { self.span_diagnostic } fn interner() -> @interner::interner<@str> { self.interner } + fn peek() -> {tok: token::token, sp: span} { + { tok: self.cur_tok, sp: self.cur_span } + } + fn dup() -> reader { dup_tt_reader(self) as reader } } fn string_advance_token(&&r: string_reader) { consume_whitespace_and_comments(r); - next_token_inner(r); + if is_eof(r) { + r.peek_tok = token::EOF; + } else { + let start_chpos = r.chpos; + r.peek_tok = next_token_inner(r); + r.peek_span = ast_util::mk_sp(start_chpos, r.chpos); + }; + } fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} { @@ -149,11 +178,11 @@ fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} { if r.cur.idx >= vec::len(r.cur.readme) { /* done with this set; pop */ alt r.cur.up { - tt_frame_up(option::none) { + tt_frame_up(none) { r.cur_tok = token::EOF; ret ret_val; } - tt_frame_up(option::some(tt_f)) { + tt_frame_up(some(tt_f)) { r.cur = tt_f; /* the above `if` would need to be a `while` if we didn't know that the last thing in a `tt_delim` is always a `tt_flat` */ @@ -165,11 +194,10 @@ fn tt_next_token(&&r: tt_reader) -> {tok: token::token, sp: span} { between popping and pushing until we got to an actual `tt_flat` */ loop { /* because it's easiest, this handles `tt_delim` not starting with a `tt_flat`, even though it won't happen */ - alt r.cur.readme[r.cur.idx] { + alt copy r.cur.readme[r.cur.idx] { tt_delim(tts) { - /* TODO: this copy should be a unary move, once they exist */ r.cur = @{readme: tts, mut idx: 0u, - up: tt_frame_up(option::some(copy r.cur)) }; + up: tt_frame_up(option::some(r.cur)) }; } tt_flat(sp, tok) { r.cur_span = sp; r.cur_tok = tok; diff --git a/src/libsyntax/parse/parser.rs b/src/libsyntax/parse/parser.rs index a1043c4ed2f..02d08ab5ae8 100644 --- a/src/libsyntax/parse/parser.rs +++ b/src/libsyntax/parse/parser.rs @@ -803,10 +803,11 @@ class parser { } else if self.token == token::POUND && self.look_ahead(1u) == token::POUND { self.bump(); self.bump(); - let macname = self.parse_path_without_tps(); - let macbody = self.parse_token_tree(); - ret pexpr(self.mk_mac_expr(lo, self.span.hi, - mac_invoc_tt(macname, macbody))); + //let macname = self.parse_path_without_tps(); + //let macbody = self.parse_token_tree(); + //ret pexpr(self.mk_mac_expr(lo, self.span.hi, + // mac_invoc_tt(macname, macbody))); + ret pexpr(self.parse_tt_mac_demo()); } else if self.token == token::POUND && self.look_ahead(1u) == token::LT { self.bump(); self.bump(); @@ -1084,6 +1085,58 @@ class parser { }; } + /* temporary */ + fn parse_tt_mac_demo() -> @expr { + let ms = self.parse_seq(token::LBRACE, token::RBRACE, + common::seq_sep_none(), + {|p| p.parse_matcher(@mut 0u)}).node; + let tt = self.parse_token_tree(); + alt tt { + tt_delim(tts) { + let rdr = lexer::new_tt_reader(self.reader.span_diag(), + self.reader.interner(), tts) + as reader; + ext::earley_parser::parse(self.sess, self.cfg, rdr, ms); + } + _ { fail; } + } + + ret self.mk_expr(0u, 0u, expr_break); + } + + fn parse_matcher(name_idx: @mut uint) -> matcher { + let lo = self.span.lo; + let mut sep = none; + if self.eat_keyword("sep") { sep = some(self.token); self.bump(); } + + let m = if self.is_keyword("many")||self.is_keyword("at_least_one") { + let zero_ok = self.is_keyword("many"); + self.bump(); + let ms = (self.parse_seq(token::LPAREN, token::RPAREN, + common::seq_sep_none(), + {|p| p.parse_matcher(name_idx)}).node); + if ms.len() == 0u { + self.fatal("repetition body must be nonempty"); + } + mtc_rep(ms, sep, zero_ok) + } else if option::is_some(sep) { + self.fatal("`sep <tok>` must preceed `many` or `at_least_one`"); + } else if self.eat_keyword("parse") { + let bound_to = self.parse_ident(); + self.expect(token::EQ); + let nt_name = self.parse_ident(); + + let m = mtc_bb(bound_to, nt_name, *name_idx); + *name_idx += 1u; + m + } else { + let m = mtc_tok(self.token); + self.bump(); + m + }; + ret spanned(lo, self.span.hi, m); + } + fn parse_prefix_expr() -> pexpr { let lo = self.span.lo; diff --git a/src/libsyntax/parse/token.rs b/src/libsyntax/parse/token.rs index 9240e3d7a9f..9d6427912df 100644 --- a/src/libsyntax/parse/token.rs +++ b/src/libsyntax/parse/token.rs @@ -85,7 +85,7 @@ enum token { #[auto_serialize] #[doc = "For interpolation during macro expansion."] -enum whole_nonterminal { +enum whole_nt { w_item(@ast::item), w_block(ast::blk), w_stmt(@ast::stmt), @@ -257,7 +257,9 @@ fn contextual_keyword_table() -> hashmap<str, ()> { "self", "send", "static", "to", "use", - "with" + "with", + /* temp */ + "sep", "many", "at_least_one", "parse" ]; for keys.each {|word| words.insert(word, ()); diff --git a/src/libsyntax/syntax.rc b/src/libsyntax/syntax.rc index 9eaa8af921d..cf4d24b8599 100644 --- a/src/libsyntax/syntax.rc +++ b/src/libsyntax/syntax.rc @@ -29,6 +29,7 @@ mod util { mod parse { export parser; + export common; export lexer; export token; export comments; @@ -64,6 +65,8 @@ mod ext { mod qquote; mod build; + mod earley_parser; + mod fmt; mod env; mod simplext; |
