auto merge of #13700 : BurntSushi/rust/regexp, r=alexcrichton

Implements [RFC 7](https://github.com/rust-lang/rfcs/blob/master/active/0007-regexps.md) and will hopefully resolve #3591. The crate is marked as experimental. It includes a syntax extension for compiling regexps to native Rust code. Embeds and passes the `basic`, `nullsubexpr` and `repetition` tests from [Glenn Fowler's (slightly modified by Russ Cox for leftmost-first semantics) testregex test suite](http://www2.research.att.com/~astopen/testregex/testregex.html). I've also hand written a plethora of other tests that exercise Unicode support, the parser, public API, etc. Also includes a `regex-dna` benchmark for the shootout. I know the addition looks huge at first, but consider these things: 1. More than half the number of lines is dedicated to Unicode character classes. 2. Of the ~4,500 lines remaining, 1,225 of them are comments. 3. Another ~800 are tests. 4. That leaves 2500 lines for the meat. The parser is ~850 of them. The public API, compiler, dynamic VM and code generator (for `regexp!`) make up the rest.
author: bors <bors@rust-lang.org> 2014-04-24 23:41:15 -0700
committer: bors <bors@rust-lang.org> 2014-04-24 23:41:15 -0700
commit: eea4909a8713a54b3c47e871a70baf6c722999a3 (patch)
tree: b01796a9342a4763e0701712c72f8fe22672c789 /src/etc/regex-match-tests.py
parent: 2bb2341a4af75fb54b809a8e1d5aacbca4df56fc (diff)
parent: 7269bc77e1d1f1babfd159db97024cbd535c47a7 (diff)
download: rust-eea4909a8713a54b3c47e871a70baf6c722999a3.tar.gz
rust-eea4909a8713a54b3c47e871a70baf6c722999a3.zip
1 files changed, 109 insertions, 0 deletions
diff --git a/src/etc/regex-match-tests.py b/src/etc/regex-match-tests.py
new file mode 100755
index 00000000000..826af961fce
--- /dev/null
+++ b/src/etc/regex-match-tests.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python2
+
+# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+from __future__ import absolute_import, division, print_function
+import argparse
+import datetime
+import os.path as path
+
+
+def print_tests(tests):
+    print('\n'.join([test_tostr(t) for t in tests]))
+
+
+def read_tests(f):
+    basename, _ = path.splitext(path.basename(f))
+    tests = []
+    for lineno, line in enumerate(open(f), 1):
+        fields = filter(None, map(str.strip, line.split('\t')))
+        if not (4 <= len(fields) <= 5) \
+           or 'E' not in fields[0] or fields[0][0] == '#':
+            continue
+
+        opts, pat, text, sgroups = fields[0:4]
+        groups = []  # groups as integer ranges
+        if sgroups == 'NOMATCH':
+            groups = [None]
+        elif ',' in sgroups:
+            noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
+            for g in noparen:
+                s, e = map(str.strip, g.split(','))
+                if s == '?' and e == '?':
+                    groups.append(None)
+                else:
+                    groups.append((int(s), int(e)))
+        else:
+            # This skips tests that should result in an error.
+            # There aren't many, so I think we can just capture those
+            # manually. Possibly fix this in future.
+            continue
+
+        if pat == 'SAME':
+            pat = tests[-1][1]
+        if '$' in opts:
+            pat = pat.decode('string_escape')
+            text = text.decode('string_escape')
+        if 'i' in opts:
+            pat = '(?i)%s' % pat
+
+        name = '%s_%d' % (basename, lineno)
+        tests.append((name, pat, text, groups))
+    return tests
+
+
+def test_tostr(t):
+    lineno, pat, text, groups = t
+    options = map(group_tostr, groups)
+    return 'mat!(match_%s, r"%s", r"%s", %s)' \
+           % (lineno, pat, '' if text == "NULL" else text, ', '.join(options))
+
+
+def group_tostr(g):
+    if g is None:
+        return 'None'
+    else:
+        return 'Some((%d, %d))' % (g[0], g[1])
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Generate match tests from an AT&T POSIX test file.')
+    aa = parser.add_argument
+    aa('files', nargs='+',
+       help='A list of dat AT&T POSIX test files. See src/libregexp/testdata')
+    args = parser.parse_args()
+
+    tests = []
+    for f in args.files:
+        tests += read_tests(f)
+
+    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// ignore-tidy-linelength
+
+// DO NOT EDIT. Automatically generated by 'src/etc/regexp-match-tests'
+// on {date}.
+'''
+    print(tpl.format(date=str(datetime.datetime.now())))
+
+    for f in args.files:
+        print('// Tests from %s' % path.basename(f))
+        print_tests(read_tests(f))
+        print('')
author	bors <bors@rust-lang.org>	2014-04-24 23:41:15 -0700
committer	bors <bors@rust-lang.org>	2014-04-24 23:41:15 -0700
commit	eea4909a8713a54b3c47e871a70baf6c722999a3 (patch)
tree	b01796a9342a4763e0701712c72f8fe22672c789 /src/etc/regex-match-tests.py
parent	2bb2341a4af75fb54b809a8e1d5aacbca4df56fc (diff)
parent	7269bc77e1d1f1babfd159db97024cbd535c47a7 (diff)
download	rust-eea4909a8713a54b3c47e871a70baf6c722999a3.tar.gz rust-eea4909a8713a54b3c47e871a70baf6c722999a3.zip