Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add regexp crate to Rust distribution (implements RFC 7) #13700

Merged
merged 3 commits into from
Apr 25, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions mk/crates.mk
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@

TARGET_CRATES := libc std green rustuv native flate arena glob term semver \
uuid serialize sync getopts collections num test time rand \
workcache url log
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat
workcache url log regex
HOST_CRATES := syntax rustc rustdoc fourcc hexfloat regex_macros
CRATES := $(TARGET_CRATES) $(HOST_CRATES)
TOOLS := compiletest rustdoc rustc

Expand Down Expand Up @@ -84,6 +84,8 @@ DEPS_rand := std
DEPS_url := std collections
DEPS_workcache := std serialize collections log
DEPS_log := std sync
DEPS_regex := std collections
DEPS_regex_macros = syntax std regex

TOOL_DEPS_compiletest := test green rustuv getopts
TOOL_DEPS_rustdoc := rustdoc native
Expand Down
5 changes: 1 addition & 4 deletions mk/main.mk
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,6 @@ HSREQ$(1)_H_$(3) = $$(HBIN$(1)_H_$(3))/rustc$$(X_$(3))
else
HSREQ$(1)_H_$(3) = \
$$(HBIN$(1)_H_$(3))/rustc$$(X_$(3)) \
$$(HLIB$(1)_H_$(3))/stamp.rustc \
$$(foreach dep,$$(RUST_DEPS_rustc),$$(HLIB$(1)_H_$(3))/stamp.$$(dep)) \
$$(MKFILE_DEPS)
endif

Expand All @@ -334,8 +332,7 @@ SREQ$(1)_T_$(2)_H_$(3) = \
CSREQ$(1)_T_$(2)_H_$(3) = \
$$(TSREQ$(1)_T_$(2)_H_$(3)) \
$$(HBIN$(1)_H_$(3))/rustdoc$$(X_$(3)) \
$$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep)) \
$$(foreach dep,$$(HOST_CRATES),$$(HLIB$(1)_H_$(3))/stamp.$$(dep))
$$(foreach dep,$$(CRATES),$$(TLIB$(1)_T_$(2)_H_$(3))/stamp.$$(dep))

ifeq ($(1),0)
# Don't run the stage0 compiler under valgrind - that ship has sailed
Expand Down
1 change: 1 addition & 0 deletions src/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Source layout:
| `libfourcc/` | Data format identifier library |
| `libgetopts/` | Get command-line-options library |
| `libglob/` | Unix glob patterns library |
| `libregex/` | Regular expressions |
| `libsemver/` | Rust's semantic versioning library |
| `libserialize/` | Encode-Decode types library |
| `libsync/` | Concurrency mechanisms and primitives |
Expand Down
1 change: 1 addition & 0 deletions src/doc/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ li {list-style-type: none; }
* [The `native` 1:1 threading runtime](native/index.html)
* [The `num` arbitrary precision numerics library](num/index.html)
* [The `rand` library for random numbers and distributions](rand/index.html)
* [The `regex` library for regular expressions](regex/index.html)
* [The `rustc` compiler](rustc/index.html)
* [The `rustuv` M:N I/O library](rustuv/index.html)
* [The `semver` version collation library](semver/index.html)
Expand Down
109 changes: 109 additions & 0 deletions src/etc/regex-match-tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
import datetime
import os.path as path


def print_tests(tests):
    """Print every test as a generated `mat!` line, one per output line."""
    rendered = map(test_tostr, tests)
    print('\n'.join(rendered))


def read_tests(f):
    """Parse one AT&T POSIX regex test file into a list of match tests.

    A usable line has 4 or 5 non-empty tab-separated fields: option flags,
    pattern, text to search and expected capture groups. Lines are skipped
    when the flags do not contain 'E', when the flags start with '#', or
    when the expected result is neither 'NOMATCH' nor a list of '(s,e)'
    pairs (i.e. tests that expect a compile error).

    f -- path of the test data file.

    Returns a list of (name, pattern, text, groups) tuples. `name` is
    '<file basename>_<line number>'. `groups` is a list of (start, end)
    integer pairs, with None for a group written as '(?,?)'; an expected
    non-match is represented as [None].
    """
    basename, _ = path.splitext(path.basename(f))
    tests = []
    # A context manager closes the handle even when parsing raises; the
    # bare open() used before leaked it until garbage collection.
    with open(f) as lines:
        for lineno, line in enumerate(lines, 1):
            # Drop empty fields (e.g. produced by consecutive tabs). A real
            # list keeps len() and indexing valid on Python 3 as well as 2.
            fields = [s for s in (raw.strip() for raw in line.split('\t'))
                      if s]
            if not (4 <= len(fields) <= 5) \
                    or 'E' not in fields[0] or fields[0][0] == '#':
                continue

            opts, pat, text, sgroups = fields[0:4]
            groups = []  # groups as integer ranges
            if sgroups == 'NOMATCH':
                groups = [None]
            elif ',' in sgroups:
                # '(0,3)(?,?)' -> '(0,3' and '?,?)' -> '0,3' and '?,?'
                for g in sgroups.split(')('):
                    s, e = [x.strip() for x in g.strip('()').split(',')]
                    if s == '?' and e == '?':
                        groups.append(None)
                    else:
                        groups.append((int(s), int(e)))
            else:
                # This skips tests that should result in an error.
                # There aren't many, so I think we can just capture those
                # manually. Possibly fix this in future.
                continue

            if pat == 'SAME':
                # Reuse the pattern of the most recently accepted test.
                # NOTE(review): that stored pattern already includes any
                # escape decoding / '(?i)' prefix applied to the earlier
                # test, so those are applied again below when the flags
                # repeat -- confirm this is harmless for the data files.
                pat = tests[-1][1]
            if '$' in opts:
                # str.decode('string_escape') exists on Python 2 only.
                pat = pat.decode('string_escape')
                text = text.decode('string_escape')
            if 'i' in opts:
                pat = '(?i)%s' % pat

            name = '%s_%d' % (basename, lineno)
            tests.append((name, pat, text, groups))
    return tests


def test_tostr(t):
    """Render one (name, pattern, text, groups) test as a `mat!` line.

    A text of "NULL" is emitted as the empty string; each group is
    rendered with group_tostr and the results are comma separated.
    """
    name, pat, text, groups = t
    captures = ', '.join(group_tostr(g) for g in groups)
    haystack = '' if text == "NULL" else text
    return 'mat!(match_%s, r"%s", r"%s", %s)' \
        % (name, pat, haystack, captures)


def group_tostr(g):
    """Render a capture group as 'None' or 'Some((start, end))'."""
    return 'None' if g is None else 'Some((%d, %d))' % (g[0], g[1])


# Script entry point: parse the given AT&T POSIX test files and print the
# generated Rust match tests (license header first) to stdout.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    aa = parser.add_argument
    aa('files', nargs='+',
       help='A list of dat AT&T POSIX test files. See src/libregex/testdata')
    args = parser.parse_args()

    # Parse every input exactly once and before anything is printed, so an
    # unreadable file still aborts the run without partial output. The
    # earlier version parsed each file twice and discarded the first result.
    parsed = [(f, read_tests(f)) for f in args.files]

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// ignore-tidy-linelength

// DO NOT EDIT. Automatically generated by 'src/etc/regex-match-tests.py'
// on {date}.
'''
    print(tpl.format(date=str(datetime.datetime.now())))

    for f, tests in parsed:
        print('// Tests from %s' % path.basename(f))
        print_tests(tests)
        print('')
183 changes: 183 additions & 0 deletions src/etc/regex-unicode-tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#!/usr/bin/env python2

# Copyright 2014 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

from __future__ import absolute_import, division, print_function
import argparse
from collections import defaultdict
import csv
import datetime
import urllib2

BASE_URL = 'http://www.unicode.org/Public/6.3.0/ucd/'
DATA = 'UnicodeData.txt'
SCRIPTS = 'Scripts.txt'

# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
#
# Maps each two-letter general category to the grouping categories that
# contain it. 'No' (Other_Number) belongs to the 'N' group like 'Nd' and
# 'Nl'; it was previously mapped to itself, which left 'N' incomplete.
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}


def as_4byte_uni(n):
    r"""Return n as '\U' followed by its hex digits, zero-padded to 8."""
    digits = hex(n)[2:]
    return '\\U' + digits.rjust(8, '0')


def expand_cat(c):
    """Return the grouping categories listed for c, followed by c itself.

    A category missing from expanded_categories yields just [c].
    """
    parents = expanded_categories.get(c, [])
    return parents + [c]


def is_valid_unicode(n):
    """Tell whether n lies in 0..0xD7FF or 0xE000..0x10FFFF."""
    # Outside the overall code space.
    if n < 0 or n > 0x10FFFF:
        return False
    # Inside the gap between the two accepted intervals.
    return not (0xD7FF < n < 0xE000)


def read_cats(f):
    """Read UnicodeData.txt and map each general category to code points.

    f -- an iterable of lines in the ';'-separated UnicodeData.txt layout
         (field 0: hex code point, field 1: name, field 2: category).

    Returns a defaultdict(list) keyed by category. Each code point is
    recorded under its own category and under every grouping category
    returned by expand_cat. Code points rejected by is_valid_unicode are
    left out.

    UnicodeData.txt abbreviates large blocks as a pair of rows whose names
    end in ', First>' and ', Last>'. Every code point between such a pair
    is recorded; previously only the two endpoints were. A ', First>' row
    that is not immediately followed by a ', Last>' row is dropped.
    """
    assigned = defaultdict(list)
    pending_first = None  # start of a First/Last pair awaiting its end
    for row in csv.reader(f, delimiter=';'):
        code, name = int(row[0], 16), row[1]
        if name.endswith(', First>'):
            pending_first = code
            continue
        if pending_first is not None and name.endswith(', Last>'):
            start = pending_first
        else:
            start = code
        pending_first = None

        cats = expand_cat(row[2])
        for point in range(start, code + 1):
            if not is_valid_unicode(point):
                continue
            for cat in cats:
                assigned[cat].append(point)
    return assigned


def read_scripts(f):
    """Read Scripts.txt and map each script name to its code points.

    f -- an iterable of text lines shaped like 'XXXX ; Name # comment' or
         'XXXX..YYYY ; Name # comment' (hex code point or inclusive range).

    Blank lines and comment-only lines are ignored. Returns a
    defaultdict(list) from script name to every listed code point that
    is_valid_unicode accepts.
    """
    assigned = defaultdict(list)
    for line in f:
        # Cut the '# ...' comment off first. Unlike name.index('#'), this
        # also accepts a data line without a trailing comment, which used
        # to raise ValueError.
        data = line.split('#', 1)[0].strip()
        if not data:
            continue
        # Only the first two ';'-separated fields are used. A list
        # comprehension (not a sliced map) works on Python 2 and 3 alike.
        hexes, name = [field.strip() for field in data.split(';')[:2]]
        first, sep, last = hexes.partition('..')
        start = int(first, 16)
        # A single code point is treated as a one-element range.
        end = int(last, 16) if sep else start
        for code in range(start, end + 1):
            if is_valid_unicode(code):
                assigned[name].append(code)
    return assigned


def group(letters):
    """Collapse integers into sorted, inclusive (start, end) ranges.

    letters -- any iterable of integers; duplicates and order are ignored.

    Returns a list of (start, end) tuples covering exactly the given
    values, where consecutive integers share one tuple. An empty input
    yields an empty list (it used to raise IndexError).
    """
    ordered = sorted(set(letters))
    if not ordered:
        return []

    grouped = []
    # sorted(set(...)) guarantees strictly increasing values, so each new
    # value either extends the current run by one or starts a new run.
    cur_start = cur_end = ordered[0]
    for letter in ordered[1:]:
        if letter == cur_end + 1:
            cur_end = letter
        else:
            grouped.append((cur_start, cur_end))
            cur_start = cur_end = letter
    grouped.append((cur_start, cur_end))
    return grouped


def ranges_to_rust(rs):
    """Format (start, end) pairs as quoted tuples of as_4byte_uni escapes.

    The tuples are joined with a comma and a line break.
    """
    pairs = []
    for start, end in rs:
        pairs.append("('%s', '%s')"
                     % (as_4byte_uni(start), as_4byte_uni(end)))
    return ',\n '.join(pairs)


def groups_to_rust(groups):
    """Format a {name: ranges} mapping as named slice entries.

    Entries are emitted in sorted name order, one per mapping key, with
    the ranges rendered by ranges_to_rust.
    """
    return '\n'.join(
        '("%s", &[\n %s\n ]),' % (name, ranges_to_rust(groups[name]))
        for name in sorted(groups))


# Script entry point: load UnicodeData.txt and Scripts.txt (from the CWD or
# over HTTP) and print the generated Rust tables to stdout.
if __name__ == '__main__':
    # Imported here because only the script entry point needs it.
    from contextlib import closing

    parser = argparse.ArgumentParser(
        description='Generate Unicode character class tables.')
    aa = parser.add_argument
    aa('--local', action='store_true',
       help='When set, Scripts.txt and UnicodeData.txt will be read from '
            'the CWD.')
    aa('--base-url', type=str, default=BASE_URL,
       help='The base URL to use for downloading Unicode data files.')
    args = parser.parse_args()

    # Every source is closed as soon as it has been parsed.
    if args.local:
        with open(DATA) as f:
            cats = read_cats(f)
        with open(SCRIPTS) as f:
            scripts = read_scripts(f)
    else:
        # BASE_URL already ends in '/', so strip it before joining to
        # avoid requesting '.../ucd//UnicodeData.txt'.
        base = args.base_url.rstrip('/')
        with closing(urllib2.urlopen(base + '/' + DATA)) as f:
            cats = read_cats(f)
        with closing(urllib2.urlopen(base + '/' + SCRIPTS)) as f:
            scripts = read_scripts(f)

    # Get Rust code for all Unicode general categories and scripts.
    # On a name collision the script entry replaces the category entry.
    combined = dict(cats, **scripts)
    unigroups = groups_to_rust({k: group(letters)
                                for k, letters in combined.items()})

    # Now get Perl character classes that are Unicode friendly.
    perld = list(range(ord('0'), ord('9') + 1))
    dgroups = ranges_to_rust(group(perld + cats['Nd']))

    perls = [ord(c) for c in ['\t', '\n', '\x0C', '\r', ' ']]
    sgroups = ranges_to_rust(group(perls + cats['Z']))

    low = list(range(ord('a'), ord('z') + 1))
    up = list(range(ord('A'), ord('Z') + 1))
    # NOTE(review): unlike PERLD, PERLW only adds the ASCII digits and not
    # cats['Nd'] -- confirm that this asymmetry is intended.
    perlw = [ord('_')] + perld + low + up
    wgroups = ranges_to_rust(group(perlw + cats['L']))

    tpl = '''// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// DO NOT EDIT. Automatically generated by 'src/etc/regex-unicode-tables.py'
// on {date}.

use parse::{{Class, NamedClasses}};

pub static UNICODE_CLASSES: NamedClasses = &[

{groups}

];

pub static PERLD: Class = &[
{dgroups}
];

pub static PERLS: Class = &[
{sgroups}
];

pub static PERLW: Class = &[
{wgroups}
];
'''
    now = datetime.datetime.now()
    print(tpl.format(date=str(now), groups=unigroups,
                     dgroups=dgroups, sgroups=sgroups, wgroups=wgroups))
Loading