from __future__ import with_statement import regex import string from weakref import proxy import unittest import copy from test.test_support import run_unittest import re # _AssertRaisesContext is defined here because the class doesn't exist before # Python 2.7. class _AssertRaisesContext(object): """A context manager used to implement TestCase.assertRaises* methods.""" def __init__(self, expected, test_case, expected_regexp=None): self.expected = expected self.failureException = test_case.failureException self.expected_regexp = expected_regexp def __enter__(self): return self def __exit__(self, exc_type, exc_value, tb): if exc_type is None: try: exc_name = self.expected.__name__ except AttributeError: exc_name = str(self.expected) raise self.failureException( "{0} not raised".format(exc_name)) if not issubclass(exc_type, self.expected): # let unexpected exceptions pass through return False self.exception = exc_value # store for later retrieval if self.expected_regexp is None: return True expected_regexp = self.expected_regexp if isinstance(expected_regexp, basestring): expected_regexp = re.compile(expected_regexp) if not expected_regexp.search(str(exc_value)): raise self.failureException('"%s" does not match "%s"' % (expected_regexp.pattern, str(exc_value))) return True class RegexTests(unittest.TestCase): PATTERN_CLASS = "" FLAGS_WITH_COMPILED_PAT = "can't process flags argument with a compiled pattern" INVALID_GROUP_REF = "invalid group reference" MISSING_GT = "missing >" BAD_GROUP_NAME = "bad group name" MISSING_LT = "missing <" UNKNOWN_GROUP_I = "unknown group" UNKNOWN_GROUP = "unknown group" BAD_ESCAPE = "bad escape" BAD_OCTAL_ESCAPE = "bad octal escape" BAD_SET = "bad set" STR_PAT_ON_BYTES = "can't use a string pattern on a bytes-like object" BYTES_PAT_ON_STR = "can't use a bytes pattern on a string-like object" STR_PAT_BYTES_TEMPL = "expected str instance, bytes found" BYTES_PAT_STR_TEMPL = "expected bytes instance, str found" BYTES_PAT_UNI_FLAG = "can't use UNICODE flag with a bytes pattern" MIXED_FLAGS = "ASCII, LOCALE and UNICODE flags are mutually incompatible" MISSING_RPAREN = "missing \\)" # Need to escape parenthesis for unittest. TRAILING_CHARS = "trailing characters in pattern" BAD_CHAR_RANGE = "bad character range" NOTHING_TO_REPEAT = "nothing to repeat" OPEN_GROUP = "can't refer to an open group" DUPLICATE_GROUP = "duplicate group" CANT_TURN_OFF = "bad inline flags: can't turn flags off" UNDEF_CHAR_NAME = "undefined character name" # assertRaisesRegex is defined here because the method isn't in the # superclass before Python 2.7. def assertRaisesRegex(self, expected_exception, expected_regexp, callable_obj=None, *args, **kwargs): """Asserts that the message in a raised exception matches a regexp. Args: expected_exception: Exception class expected to be raised. expected_regexp: Regexp (re pattern object or string) expected to be found in error message. callable_obj: Function to be called. args: Extra args. kwargs: Extra kwargs. """ context = _AssertRaisesContext(expected_exception, self, expected_regexp) if callable_obj is None: return context with context: callable_obj(*args, **kwargs) def assertTypedEqual(self, actual, expect, msg=None): self.assertEqual(actual, expect, msg) def recurse(actual, expect): if isinstance(expect, (tuple, list)): for x, y in zip(actual, expect): recurse(x, y) else: self.assertIs(type(actual), type(expect), msg) recurse(actual, expect) def test_weakref(self): s = 'QabbbcR' x = regex.compile('ab+c') y = proxy(x) if x.findall('QabbbcR') != y.findall('QabbbcR'): self.fail() def test_search_star_plus(self): self.assertEqual(regex.search('a*', 'xxx').span(0), (0, 0)) self.assertEqual(regex.search('x*', 'axx').span(), (0, 0)) self.assertEqual(regex.search('x+', 'axx').span(0), (1, 3)) self.assertEqual(regex.search('x+', 'axx').span(), (1, 3)) self.assertEqual(regex.search('x', 'aaa'), None) self.assertEqual(regex.match('a*', 'xxx').span(0), (0, 0)) self.assertEqual(regex.match('a*', 'xxx').span(), (0, 0)) self.assertEqual(regex.match('x*', 'xxxa').span(0), (0, 3)) self.assertEqual(regex.match('x*', 'xxxa').span(), (0, 3)) self.assertEqual(regex.match('a+', 'xxx'), None) def bump_num(self, matchobj): int_value = int(matchobj[0]) return str(int_value + 1) def test_basic_regex_sub(self): self.assertEqual(regex.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), '9.3 -3 24x100y') self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), '9.3 -3 23x99y') self.assertEqual(regex.sub('.', lambda m: r"\n", 'x'), "\\n") self.assertEqual(regex.sub('.', r"\n", 'x'), "\n") self.assertEqual(regex.sub('(?Px)', r'\g\g', 'xx'), 'xxxx') self.assertEqual(regex.sub('(?Px)', r'\g\g<1>', 'xx'), 'xxxx') self.assertEqual(regex.sub('(?Px)', r'\g\g', 'xx'), 'xxxx') self.assertEqual(regex.sub('(?Px)', r'\g<1>\g<1>', 'xx'), 'xxxx') self.assertEqual(regex.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D', 'a'), "\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D") self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), "\t\n\v\r\f\a") self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7)) self.assertEqual(regex.sub(r'^\s*', 'X', 'test'), 'Xtest') self.assertEqual(regex.sub(ur"x", ur"\x0A", u"x"), u"\n") self.assertEqual(regex.sub(ur"x", ur"\u000A", u"x"), u"\n") self.assertEqual(regex.sub(ur"x", ur"\U0000000A", u"x"), u"\n") self.assertEqual(regex.sub(ur"x", ur"\N{LATIN CAPITAL LETTER A}", u"x"), u"A") self.assertEqual(regex.sub(r"x", r"\x0A", "x"), "\n") self.assertEqual(regex.sub(r"x", r"\u000A", "x"), "\\u000A") self.assertEqual(regex.sub(r"x", r"\U0000000A", "x"), "\\U0000000A") self.assertEqual(regex.sub(r"x", r"\N{LATIN CAPITAL LETTER A}", "x"), "\\N{LATIN CAPITAL LETTER A}") def test_bug_449964(self): # Fails for group followed by other escape. self.assertEqual(regex.sub(r'(?Px)', r'\g<1>\g<1>\b', 'xx'), "xx\bxx\b") def test_bug_449000(self): # Test for sub() on escaped characters. self.assertEqual(regex.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), "abc\ndef\n") self.assertEqual(regex.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), "abc\ndef\n") self.assertEqual(regex.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), "abc\ndef\n") self.assertEqual(regex.sub('\r\n', '\n', 'abc\r\ndef\r\n'), "abc\ndef\n") def test_bug_1140(self): # regex.sub(x, y, u'') should return u'', not '', and # regex.sub(x, y, '') should return '', not u''. # Also: # regex.sub(x, y, unicode(x)) should return unicode(y), and # regex.sub(x, y, str(x)) should return # str(y) if isinstance(y, str) else unicode(y). for x in 'x', u'x': for y in 'y', u'y': z = regex.sub(x, y, u'') self.assertEqual((type(z), z), (unicode, u'')) z = regex.sub(x, y, '') self.assertEqual((type(z), z), (str, '')) z = regex.sub(x, y, unicode(x)) self.assertEqual((type(z), z), (unicode, unicode(y))) z = regex.sub(x, y, str(x)) self.assertEqual((type(z), z), (type(y), y)) def test_bug_1661(self): # Verify that flags do not get silently ignored with compiled patterns pattern = regex.compile('.') self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, lambda: regex.match(pattern, 'A', regex.I)) self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, lambda: regex.search(pattern, 'A', regex.I)) self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, lambda: regex.findall(pattern, 'A', regex.I)) self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, lambda: regex.compile(pattern, regex.I)) def test_bug_3629(self): # A regex that triggered a bug in the sre-code validator self.assertEqual(repr(type(regex.compile("(?P)(?(quote))"))), self.PATTERN_CLASS) def test_sub_template_numeric_escape(self): # Bug 776311 and friends. self.assertEqual(regex.sub('x', r'\0', 'x'), "\0") self.assertEqual(regex.sub('x', r'\000', 'x'), "\000") self.assertEqual(regex.sub('x', r'\001', 'x'), "\001") self.assertEqual(regex.sub('x', r'\008', 'x'), "\0" + "8") self.assertEqual(regex.sub('x', r'\009', 'x'), "\0" + "9") self.assertEqual(regex.sub('x', r'\111', 'x'), "\111") self.assertEqual(regex.sub('x', r'\117', 'x'), "\117") self.assertEqual(regex.sub('x', r'\1111', 'x'), "\1111") self.assertEqual(regex.sub('x', r'\1111', 'x'), "\111" + "1") self.assertEqual(regex.sub('x', r'\00', 'x'), '\x00') self.assertEqual(regex.sub('x', r'\07', 'x'), '\x07') self.assertEqual(regex.sub('x', r'\08', 'x'), "\0" + "8") self.assertEqual(regex.sub('x', r'\09', 'x'), "\0" + "9") self.assertEqual(regex.sub('x', r'\0a', 'x'), "\0" + "a") self.assertEqual(regex.sub(u'x', ur'\400', u'x'), u"\u0100") self.assertEqual(regex.sub(u'x', ur'\777', u'x'), u"\u01FF") self.assertEqual(regex.sub('x', r'\400', 'x'), "\x00") self.assertEqual(regex.sub('x', r'\777', 'x'), "\xFF") self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\1', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\8', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\9', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\11', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\18', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\1a', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\90', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\99', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\118', 'x')) # r'\11' + '8' self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\11a', 'x')) self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\181', 'x')) # r'\18' + '1' self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF, lambda: regex.sub('x', r'\800', 'x')) # r'\80' + '0' # In Python 2.3 (etc), these loop endlessly in sre_parser.py. self.assertEqual(regex.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') self.assertEqual(regex.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 'xz8') self.assertEqual(regex.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 'xza') def test_qualified_re_sub(self): self.assertEqual(regex.sub('a', 'b', 'aaaaa'), 'bbbbb') self.assertEqual(regex.sub('a', 'b', 'aaaaa', 1), 'baaaa') def test_bug_114660(self): self.assertEqual(regex.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 'hello there') def test_bug_462270(self): # Test for empty sub() behaviour, see SF bug #462270 self.assertEqual(regex.sub('(?V0)x*', '-', 'abxd'), '-a-b-d-') self.assertEqual(regex.sub('(?V1)x*', '-', 'abxd'), '-a-b--d-') self.assertEqual(regex.sub('x+', '-', 'abxd'), 'ab-d') def test_bug_14462(self): # chr(255) is not a valid identifier in Python 2. group_name = u'\xFF' self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: regex.search(ur'(?P<' + group_name + '>a)', u'a')) def test_symbolic_refs(self): self.assertRaisesRegex(regex.error, self.MISSING_GT, lambda: regex.sub('(?Px)', r'\gx)', r'\g<', 'xx')) self.assertRaisesRegex(regex.error, self.MISSING_LT, lambda: regex.sub('(?Px)', r'\g', 'xx')) self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: regex.sub('(?Px)', r'\g', 'xx')) self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: regex.sub('(?Px)', r'\g<1a1>', 'xx')) self.assertRaisesRegex(IndexError, self.UNKNOWN_GROUP_I, lambda: regex.sub('(?Px)', r'\g', 'xx')) # The new behaviour of unmatched but valid groups is to treat them like # empty matches in the replacement template, like in Perl. self.assertEqual(regex.sub('(?Px)|(?Py)', r'\g', 'xx'), '') self.assertEqual(regex.sub('(?Px)|(?Py)', r'\2', 'xx'), '') # The old behaviour was to raise it as an IndexError. self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda: regex.sub('(?Px)', r'\g<-1>', 'xx')) def test_re_subn(self): self.assertEqual(regex.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(regex.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) self.assertEqual(regex.subn("b+", "x", "xyz"), ('xyz', 0)) self.assertEqual(regex.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) self.assertEqual(regex.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) def test_re_split(self): self.assertEqual(regex.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) self.assertEqual(regex.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(regex.split("(:*)", ":a:b::c"), ['', ':', 'a', ':', 'b', '::', 'c']) self.assertEqual(regex.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(regex.split("(:)*", ":a:b::c"), ['', ':', 'a', ':', 'b', ':', 'c']) self.assertEqual(regex.split("([b:]+)", ":a:b::c"), ['', ':', 'a', ':b::', 'c']) self.assertEqual(regex.split("(b)|(:+)", ":a:b::c"), ['', None, ':', 'a', None, ':', '', 'b', None, '', None, '::', 'c']) self.assertEqual(regex.split("(?:b)|(?::+)", ":a:b::c"), ['', 'a', '', '', 'c']) self.assertEqual(regex.split("x", "xaxbxc"), ['', 'a', 'b', 'c']) self.assertEqual([m for m in regex.splititer("x", "xaxbxc")], ['', 'a', 'b', 'c']) self.assertEqual(regex.split("(?r)x", "xaxbxc"), ['c', 'b', 'a', '']) self.assertEqual([m for m in regex.splititer("(?r)x", "xaxbxc")], ['c', 'b', 'a', '']) self.assertEqual(regex.split("(x)|(y)", "xaxbxc"), ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c']) self.assertEqual([m for m in regex.splititer("(x)|(y)", "xaxbxc")], ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c']) self.assertEqual(regex.split("(?r)(x)|(y)", "xaxbxc"), ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, '']) self.assertEqual([m for m in regex.splititer("(?r)(x)|(y)", "xaxbxc")], ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, '']) self.assertEqual(regex.split(r"(?V1)\b", "a b c"), ['', 'a', ' ', 'b', ' ', 'c', '']) self.assertEqual(regex.split(r"(?V1)\m", "a b c"), ['', 'a ', 'b ', 'c']) self.assertEqual(regex.split(r"(?V1)\M", "a b c"), ['a', ' b', ' c', '']) def test_qualified_re_split(self): self.assertEqual(regex.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) self.assertEqual(regex.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) self.assertEqual(regex.split("(:)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) self.assertEqual(regex.split("(:*)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) def test_re_findall(self): self.assertEqual(regex.findall(":+", "abc"), []) self.assertEqual(regex.findall(":+", "a:b::c:::d"), [':', '::', ':::']) self.assertEqual(regex.findall("(:+)", "a:b::c:::d"), [':', '::', ':::']) self.assertEqual(regex.findall("(:)(:*)", "a:b::c:::d"), [(':', ''), (':', ':'), (':', '::')]) self.assertEqual(regex.findall(r"\((?P.{0,5}?TEST)\)", "(MY TEST)"), ["MY TEST"]) self.assertEqual(regex.findall(r"\((?P.{0,3}?TEST)\)", "(MY TEST)"), ["MY TEST"]) self.assertEqual(regex.findall(r"\((?P.{0,3}?T)\)", "(MY T)"), ["MY T"]) self.assertEqual(regex.findall(r"[^a]{2}[A-Z]", "\n S"), [' S']) self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n S"), ['\n S']) self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n S"), [' S']) self.assertEqual(regex.findall(r"X(Y[^Y]+?){1,2}( |Q)+DEF", "XYABCYPPQ\nQ DEF"), [('YPPQ\n', ' ')]) self.assertEqual(regex.findall(r"(\nTest(\n+.+?){0,2}?)?\n+End", "\nTest\nxyz\nxyz\nEnd"), [('\nTest\nxyz\nxyz', '\nxyz')]) def test_bug_117612(self): self.assertEqual(regex.findall(r"(a|(b))", "aba"), [('a', ''), ('b', 'b'), ('a', '')]) def test_re_match(self): self.assertEqual(regex.match('a', 'a')[:], ('a',)) self.assertEqual(regex.match('(a)', 'a')[:], ('a', 'a')) self.assertEqual(regex.match(r'(a)', 'a')[0], 'a') self.assertEqual(regex.match(r'(a)', 'a')[1], 'a') self.assertEqual(regex.match(r'(a)', 'a').group(1, 1), ('a', 'a')) pat = regex.compile('((a)|(b))(c)?') self.assertEqual(pat.match('a')[:], ('a', 'a', 'a', None, None)) self.assertEqual(pat.match('b')[:], ('b', 'b', None, 'b', None)) self.assertEqual(pat.match('ac')[:], ('ac', 'a', 'a', None, 'c')) self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c')) self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c')) # A single group. m = regex.match('(a)', 'a') self.assertEqual(m.group(), 'a') self.assertEqual(m.group(0), 'a') self.assertEqual(m.group(1), 'a') self.assertEqual(m.group(1, 1), ('a', 'a')) pat = regex.compile('(?:(?Pa)|(?Pb))(?Pc)?') self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), (None, 'b', None)) self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) def test_re_groupref_exists(self): self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', '(a)')[:], ('(a)', '(', 'a')) self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', 'a')[:], ('a', None, 'a')) self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', 'a)'), None) self.assertEqual(regex.match(r'^(\()?([^()]+)(?(1)\))$', '(a'), None) self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'ab')[:], ('ab', 'a', 'b')) self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'cd')[:], ('cd', None, 'd')) self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'cd')[:], ('cd', None, 'd')) self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'a')[:], ('a', 'a', '')) # Tests for bug #1177831: exercise groups other than the first group. p = regex.compile('(?Pa)(?Pb)?((?(g2)c|d))') self.assertEqual(p.match('abc')[:], ('abc', 'a', 'b', 'c')) self.assertEqual(p.match('ad')[:], ('ad', 'a', None, 'd')) self.assertEqual(p.match('abd'), None) self.assertEqual(p.match('ac'), None) def test_re_groupref(self): self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a|')[:], ('|a|', '|', 'a')) self.assertEqual(regex.match(r'^(\|)?([^()]+)\1?$', 'a')[:], ('a', None, 'a')) self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', 'a|'), None) self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a'), None) self.assertEqual(regex.match(r'^(?:(a)|c)(\1)$', 'aa')[:], ('aa', 'a', 'a')) self.assertEqual(regex.match(r'^(?:(a)|c)(\1)?$', 'c')[:], ('c', None, None)) self.assertEqual(regex.findall("(?i)(.{1,40}?),(.{1,40}?)(?:;)+(.{1,80}).{1,40}?\\3(\ |;)+(.{1,80}?)\\1", "TEST, BEST; LEST ; Lest 123 Test, Best"), [('TEST', ' BEST', ' LEST', ' ', '123 ')]) def test_groupdict(self): self.assertEqual(regex.match('(?Pfirst) (?Psecond)', 'first second').groupdict(), {'first': 'first', 'second': 'second'}) def test_expand(self): self.assertEqual(regex.match("(?Pfirst) (?Psecond)", "first second").expand(r"\2 \1 \g \g"), 'second first second first') def test_repeat_minmax(self): self.assertEqual(regex.match(r"^(\w){1}$", "abc"), None) self.assertEqual(regex.match(r"^(\w){1}?$", "abc"), None) self.assertEqual(regex.match(r"^(\w){1,2}$", "abc"), None) self.assertEqual(regex.match(r"^(\w){1,2}?$", "abc"), None) self.assertEqual(regex.match(r"^(\w){3}$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){1,3}$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){1,4}$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){3}?$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){1,3}?$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){1,4}?$", "abc")[1], 'c') self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c') self.assertEqual(regex.match("^x{1}$", "xxx"), None) self.assertEqual(regex.match("^x{1}?$", "xxx"), None) self.assertEqual(regex.match("^x{1,2}$", "xxx"), None) self.assertEqual(regex.match("^x{1,2}?$", "xxx"), None) self.assertEqual(regex.match("^x{1}", "xxx")[0], 'x') self.assertEqual(regex.match("^x{1}?", "xxx")[0], 'x') self.assertEqual(regex.match("^x{0,1}", "xxx")[0], 'x') self.assertEqual(regex.match("^x{0,1}?", "xxx")[0], '') self.assertEqual(bool(regex.match("^x{3}$", "xxx")), True) self.assertEqual(bool(regex.match("^x{1,3}$", "xxx")), True) self.assertEqual(bool(regex.match("^x{1,4}$", "xxx")), True) self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True) self.assertEqual(bool(regex.match("^x{3}?$", "xxx")), True) self.assertEqual(bool(regex.match("^x{1,3}?$", "xxx")), True) self.assertEqual(bool(regex.match("^x{1,4}?$", "xxx")), True) self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True) self.assertEqual(regex.match("^x{}$", "xxx"), None) self.assertEqual(bool(regex.match("^x{}$", "x{}")), True) def test_getattr(self): self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)') self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.A | regex.I | regex.DEFAULT_VERSION) self.assertEqual(regex.compile(u"(?i)(a)(b)").flags, regex.I | regex.U | regex.DEFAULT_VERSION) self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2) self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {}) self.assertEqual(regex.compile("(?i)(?Pa)(?Pb)").groupindex, {'first': 1, 'other': 2}) self.assertEqual(regex.match("(a)", "a").pos, 0) self.assertEqual(regex.match("(a)", "a").endpos, 1) self.assertEqual(regex.search("b(c)", "abcdef").pos, 0) self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6) self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3)) self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3)) self.assertEqual(regex.match("(a)", "a").string, 'a') self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1))) self.assertEqual(repr(type(regex.match("(a)", "a").re)), self.PATTERN_CLASS) # Issue 14260. p = regex.compile(r'abc(?Pdef)') p.groupindex["n"] = 0 self.assertEqual(p.groupindex["n"], 1) def test_special_escapes(self): self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx")[1], 'bx') self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd")[1], 'bx') self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx", regex.LOCALE)[1], 'bx') self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd", regex.LOCALE)[1], 'bx') self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx", regex.UNICODE)[1], u'bx') self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd", regex.UNICODE)[1], u'bx') self.assertEqual(regex.search(r"^abc$", "\nabc\n", regex.M)[0], 'abc') self.assertEqual(regex.search(r"^\Aabc\Z$", "abc", regex.M)[0], 'abc') self.assertEqual(regex.search(r"^\Aabc\Z$", "\nabc\n", regex.M), None) self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx")[1], u'bx') self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd")[1], u'bx') self.assertEqual(regex.search(ur"^abc$", u"\nabc\n", regex.M)[0], u'abc') self.assertEqual(regex.search(ur"^\Aabc\Z$", u"abc", regex.M)[0], u'abc') self.assertEqual(regex.search(ur"^\Aabc\Z$", u"\nabc\n", regex.M), None) self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a")[0], '1aa! a') self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a", regex.LOCALE)[0], '1aa! a') self.assertEqual(regex.search(ur"\d\D\w\W\s\S", u"1aa! a", regex.UNICODE)[0], u'1aa! a') def test_bigcharset(self): self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222")[1], u'\u2222') self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222", regex.UNICODE)[1], u'\u2222') self.assertEqual(u"".join(regex.findall(u".", u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') self.assertEqual(u"".join(regex.findall(ur"[e\xe8\xe9\xea\xeb\u0113\u011b\u0117]", u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') self.assertEqual(u"".join(regex.findall(ur"e|\xe8|\xe9|\xea|\xeb|\u0113|\u011b|\u0117", u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') def test_anyall(self): self.assertEqual(regex.match("a.b", "a\nb", regex.DOTALL)[0], "a\nb") self.assertEqual(regex.match("a.*b", "a\n\nb", regex.DOTALL)[0], "a\n\nb") def test_non_consuming(self): self.assertEqual(regex.match(r"(a(?=\s[^a]))", "a b")[1], 'a') self.assertEqual(regex.match(r"(a(?=\s[^a]*))", "a b")[1], 'a') self.assertEqual(regex.match(r"(a(?=\s[abc]))", "a b")[1], 'a') self.assertEqual(regex.match(r"(a(?=\s[abc]*))", "a bc")[1], 'a') self.assertEqual(regex.match(r"(a)(?=\s\1)", "a a")[1], 'a') self.assertEqual(regex.match(r"(a)(?=\s\1*)", "a aa")[1], 'a') self.assertEqual(regex.match(r"(a)(?=\s(abc|a))", "a a")[1], 'a') self.assertEqual(regex.match(r"(a(?!\s[^a]))", "a a")[1], 'a') self.assertEqual(regex.match(r"(a(?!\s[abc]))", "a d")[1], 'a') self.assertEqual(regex.match(r"(a)(?!\s\1)", "a b")[1], 'a') self.assertEqual(regex.match(r"(a)(?!\s(abc|a))", "a b")[1], 'a') def test_ignore_case(self): self.assertEqual(regex.match("abc", "ABC", regex.I)[0], 'ABC') self.assertEqual(regex.match(u"abc", u"ABC", regex.I)[0], u'ABC') self.assertEqual(regex.match(r"(a\s[^a]*)", "a bb", regex.I)[1], 'a bb') self.assertEqual(regex.match(r"(a\s[abc])", "a b", regex.I)[1], 'a b') self.assertEqual(regex.match(r"(a\s[abc]*)", "a bb", regex.I)[1], 'a bb') self.assertEqual(regex.match(r"((a)\s\2)", "a a", regex.I)[1], 'a a') self.assertEqual(regex.match(r"((a)\s\2*)", "a aa", regex.I)[1], 'a aa') self.assertEqual(regex.match(r"((a)\s(abc|a))", "a a", regex.I)[1], 'a a') self.assertEqual(regex.match(r"((a)\s(abc|a)*)", "a aa", regex.I)[1], 'a aa') # Issue 3511. self.assertEqual(regex.match(r"[Z-a]", "_").span(), (0, 1)) self.assertEqual(regex.match(r"(?i)[Z-a]", "_").span(), (0, 1)) self.assertEqual(bool(regex.match(ur"(?iu)nao", u"nAo")), True) self.assertEqual(bool(regex.match(ur"(?iu)n\xE3o", u"n\xC3o")), True) self.assertEqual(bool(regex.match(ur"(?iu)n\xE3o", u"N\xC3O")), True) self.assertEqual(bool(regex.match(ur"(?iu)s", u"\u017F")), True) def test_case_folding(self): self.assertEqual(regex.search(ur"(?fiu)ss", u"SS").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)SS", u"ss").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)SS", u"\N{LATIN SMALL LETTER SHARP S}").span(), (0, 1)) self.assertEqual(regex.search(ur"(?fi)\N{LATIN SMALL LETTER SHARP S}", u"SS").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE ST}", u"ST").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)ST", u"\N{LATIN SMALL LIGATURE ST}").span(), (0, 1)) self.assertEqual(regex.search(ur"(?fiu)ST", u"\N{LATIN SMALL LIGATURE LONG S T}").span(), (0, 1)) self.assertEqual(regex.search(ur"(?fiu)SST", u"\N{LATIN SMALL LETTER SHARP S}t").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)SST", u"s\N{LATIN SMALL LIGATURE LONG S T}").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)SST", u"s\N{LATIN SMALL LIGATURE ST}").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE ST}", u"SST").span(), (1, 3)) self.assertEqual(regex.search(ur"(?fiu)SST", u"s\N{LATIN SMALL LIGATURE ST}").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)FFI", u"\N{LATIN SMALL LIGATURE FFI}").span(), (0, 1)) self.assertEqual(regex.search(ur"(?fiu)FFI", u"\N{LATIN SMALL LIGATURE FF}i").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)FFI", u"f\N{LATIN SMALL LIGATURE FI}").span(), (0, 2)) self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE FFI}", u"FFI").span(), (0, 3)) self.assertEqual(regex.search(ur"(?fiu)\N{LATIN SMALL LIGATURE FF}i", u"FFI").span(), (0, 3)) self.assertEqual(regex.search(ur"(?fiu)f\N{LATIN SMALL LIGATURE FI}", u"FFI").span(), (0, 3)) sigma = u"\u03A3\u03C3\u03C2" for ch1 in sigma: for ch2 in sigma: if not regex.match(ur"(?fiu)" + ch1, ch2): self.fail() self.assertEqual(bool(regex.search(ur"(?iuV1)ff", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)ff", u"\uFB01\uFB00")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)fi", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)fi", u"\uFB01\uFB00")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)fffi", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB03", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)ff", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)fi", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)fffi", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB03", u"\uFB00\uFB01")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB01", u"\uFB00i")), True) self.assertEqual(bool(regex.search(ur"(?iuV1)f\uFB01", u"\uFB00i")), True) self.assertEqual(regex.findall(ur"(?iuV0)\m(?:word){e<=3}\M(?ne", u"affine", options=[u"\N{LATIN SMALL LIGATURE FFI}"]).span(), (0, 6)) self.assertEqual(regex.search(ur"(?fi)a\Lne", u"a\N{LATIN SMALL LIGATURE FFI}ne", options=[u"ffi"]).span(), (0, 4)) def test_category(self): self.assertEqual(regex.match(r"(\s)", " ")[1], ' ') def test_not_literal(self): self.assertEqual(regex.search(r"\s([^a])", " b")[1], 'b') self.assertEqual(regex.search(r"\s([^a]*)", " bb")[1], 'bb') def test_search_coverage(self): self.assertEqual(regex.search(r"\s(b)", " b")[1], 'b') self.assertEqual(regex.search(r"a\s", "a ")[0], 'a ') def test_re_escape(self): p = "" self.assertEqual(regex.escape(p), p) for i in range(0, 256): p += chr(i) self.assertEqual(bool(regex.match(regex.escape(chr(i)), chr(i))), True) self.assertEqual(regex.match(regex.escape(chr(i)), chr(i)).span(), (0, 1)) pat = regex.compile(regex.escape(p)) self.assertEqual(pat.match(p).span(), (0, 256)) def test_constants(self): if regex.I != regex.IGNORECASE: self.fail() if regex.L != regex.LOCALE: self.fail() if regex.M != regex.MULTILINE: self.fail() if regex.S != regex.DOTALL: self.fail() if regex.X != regex.VERBOSE: self.fail() def test_flags(self): for flag in [regex.I, regex.M, regex.X, regex.S, regex.L]: self.assertEqual(repr(type(regex.compile('^pattern$', flag))), self.PATTERN_CLASS) def test_sre_character_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255]: self.assertEqual(bool(regex.match(r"\%03o" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"\%03o0" % i, chr(i) + "0")), True) self.assertEqual(bool(regex.match(r"\%03o8" % i, chr(i) + "8")), True) self.assertEqual(bool(regex.match(r"\x%02x" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"\x%02x0" % i, chr(i) + "0")), True) self.assertEqual(bool(regex.match(r"\x%02xz" % i, chr(i) + "z")), True) self.assertRaisesRegex(regex.error, self.UNKNOWN_GROUP, lambda: regex.match(r"\911", "")) def test_sre_character_class_literals(self): for i in [0, 8, 16, 32, 64, 127, 128, 255]: self.assertEqual(bool(regex.match(r"[\%03o]" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"[\%03o0]" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"[\%03o8]" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"[\x%02x]" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"[\x%02x0]" % i, chr(i))), True) self.assertEqual(bool(regex.match(r"[\x%02xz]" % i, chr(i))), True) self.assertRaisesRegex(regex.error, self.BAD_OCTAL_ESCAPE, lambda: regex.match(r"[\911]", "")) def test_bug_113254(self): self.assertEqual(regex.match(r'(a)|(b)', 'b').start(1), -1) self.assertEqual(regex.match(r'(a)|(b)', 'b').end(1), -1) self.assertEqual(regex.match(r'(a)|(b)', 'b').span(1), (-1, -1)) def test_bug_527371(self): # Bug described in patches 527371/672491. self.assertEqual(regex.match(r'(a)?a','a').lastindex, None) self.assertEqual(regex.match(r'(a)(b)?b','ab').lastindex, 1) self.assertEqual(regex.match(r'(?Pa)(?Pb)?b','ab').lastgroup, 'a') self.assertEqual(regex.match("(?Pa(b))", "ab").lastgroup, 'a') self.assertEqual(regex.match("((a))", "a").lastindex, 1) def test_bug_545855(self): # Bug 545855 -- This pattern failed to cause a compile error as it # should, instead provoking a TypeError. self.assertRaisesRegex(regex.error, self.BAD_SET, lambda: regex.compile('foo[a-')) def test_bug_418626(self): # Bugs 418626 at al. -- Testing Greg Chapman's addition of op code # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of # pattern '*?' on a long string. self.assertEqual(regex.match('.*?c', 10000 * 'ab' + 'cd').end(0), 20001) self.assertEqual(regex.match('.*?cd', 5000 * 'ab' + 'c' + 5000 * 'ab' + 'cde').end(0), 20003) self.assertEqual(regex.match('.*?cd', 20000 * 'abc' + 'de').end(0), 60001) # Non-simple '*?' still used to hit the recursion limit, before the # non-recursive scheme was implemented. self.assertEqual(regex.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0), 20001) def test_bug_612074(self): pat = u"[" + regex.escape(u"\u2039") + u"]" self.assertEqual(regex.compile(pat) and 1, 1) def test_stack_overflow(self): # Nasty cases that used to overflow the straightforward recursive # implementation of repeated groups. self.assertEqual(regex.match('(x)*', 50000 * 'x')[1], 'x') self.assertEqual(regex.match('(x)*y', 50000 * 'x' + 'y')[1], 'x') self.assertEqual(regex.match('(x)*?y', 50000 * 'x' + 'y')[1], 'x') def test_scanner(self): def s_ident(scanner, token): return token def s_operator(scanner, token): return "op%s" % token def s_float(scanner, token): return float(token) def s_int(scanner, token): return int(token) scanner = regex.Scanner([(r"[a-zA-Z_]\w*", s_ident), (r"\d+\.\d*", s_float), (r"\d+", s_int), (r"=|\+|-|\*|/", s_operator), (r"\s+", None), ]) self.assertEqual(repr(type(scanner.scanner.scanner("").pattern)), self.PATTERN_CLASS) self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 'op+', 'bar'], '')) def test_bug_448951(self): # Bug 448951 (similar to 429357, but with single char match). # (Also test greedy matches.) for op in '', '?', '*': self.assertEqual(regex.match(r'((.%s):)?z' % op, 'z')[:], ('z', None, None)) self.assertEqual(regex.match(r'((.%s):)?z' % op, 'a:z')[:], ('a:z', 'a:', 'a')) def test_bug_725106(self): # Capturing groups in alternatives in repeats. self.assertEqual(regex.match('^((a)|b)*', 'abc')[:], ('ab', 'b', 'a')) self.assertEqual(regex.match('^(([ab])|c)*', 'abc')[:], ('abc', 'c', 'b')) self.assertEqual(regex.match('^((d)|[ab])*', 'abc')[:], ('ab', 'b', None)) self.assertEqual(regex.match('^((a)c|[ab])*', 'abc')[:], ('ab', 'b', None)) self.assertEqual(regex.match('^((a)|b)*?c', 'abc')[:], ('abc', 'b', 'a')) self.assertEqual(regex.match('^(([ab])|c)*?d', 'abcd')[:], ('abcd', 'c', 'b')) self.assertEqual(regex.match('^((d)|[ab])*?c', 'abc')[:], ('abc', 'b', None)) self.assertEqual(regex.match('^((a)c|[ab])*?c', 'abc')[:], ('abc', 'b', None)) def test_bug_725149(self): # Mark_stack_base restoring before restoring marks. self.assertEqual(regex.match('(a)(?:(?=(b)*)c)*', 'abb')[:], ('a', 'a', None)) self.assertEqual(regex.match('(a)((?!(b)*))*', 'abb')[:], ('a', 'a', None, None)) def test_bug_764548(self): # Bug 764548, regex.compile() barfs on str/unicode subclasses. class my_unicode(str): pass pat = regex.compile(my_unicode("abc")) self.assertEqual(pat.match("xyz"), None) def test_finditer(self): it = regex.finditer(r":+", "a:b::c:::d") self.assertEqual([item[0] for item in it], [':', '::', ':::']) def test_bug_926075(self): if regex.compile('bug_926075') is regex.compile(u'bug_926075'): self.fail() def test_bug_931848(self): pattern = u"[\u002E\u3002\uFF0E\uFF61]" self.assertEqual(regex.compile(pattern).split("a.b.c"), ['a', 'b', 'c']) def test_bug_581080(self): it = regex.finditer(r"\s", "a b") self.assertEqual(it.next().span(), (1, 2)) self.assertRaises(StopIteration, lambda: it.next()) scanner = regex.compile(r"\s").scanner("a b") self.assertEqual(scanner.search().span(), (1, 2)) self.assertEqual(scanner.search(), None) def test_bug_817234(self): it = regex.finditer(r".*", "asdf") self.assertEqual(it.next().span(), (0, 4)) self.assertEqual(it.next().span(), (4, 4)) self.assertRaises(StopIteration, lambda: it.next()) def test_empty_array(self): # SF buf 1647541. import array for typecode in 'cbBuhHiIlLfd': a = array.array(typecode) self.assertEqual(regex.compile("bla").match(a), None) self.assertEqual(regex.compile("").match(a)[1 : ], ()) def test_inline_flags(self): # Bug #1700. upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Below lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Below p = regex.compile(upper_char, regex.I | regex.U) self.assertEqual(bool(p.match(lower_char)), True) p = regex.compile(lower_char, regex.I | regex.U) self.assertEqual(bool(p.match(upper_char)), True) p = regex.compile('(?i)' + upper_char, regex.U) self.assertEqual(bool(p.match(lower_char)), True) p = regex.compile('(?i)' + lower_char, regex.U) self.assertEqual(bool(p.match(upper_char)), True) p = regex.compile('(?iu)' + upper_char) self.assertEqual(bool(p.match(lower_char)), True) p = regex.compile('(?iu)' + lower_char) self.assertEqual(bool(p.match(upper_char)), True) self.assertEqual(bool(regex.match(r"(?i)a", "A")), True) self.assertEqual(bool(regex.match(r"a(?i)", "A")), True) self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True) self.assertEqual(regex.match(r"a(?iV1)", "A"), None) def test_dollar_matches_twice(self): # $ matches the end of string, and just before the terminating \n. pattern = regex.compile('$') self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') pattern = regex.compile('$', regex.MULTILINE) self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#') self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') self.assertEqual(pattern.sub('#', '\n'), '#\n#') def test_ascii_and_unicode_flag(self): # Unicode patterns. for flags in (0, regex.UNICODE): pat = regex.compile(u'\xc0', flags | regex.IGNORECASE) self.assertEqual(bool(pat.match(u'\xe0')), True) pat = regex.compile(u'\w', flags) self.assertEqual(bool(pat.match(u'\xe0')), True) pat = regex.compile(u'\xc0', regex.ASCII | regex.IGNORECASE) self.assertEqual(pat.match(u'\xe0'), None) pat = regex.compile(u'(?a)\xc0', regex.IGNORECASE) self.assertEqual(pat.match(u'\xe0'), None) pat = regex.compile(u'\w', regex.ASCII) self.assertEqual(pat.match(u'\xe0'), None) pat = regex.compile(u'(?a)\w') self.assertEqual(pat.match(u'\xe0'), None) # String patterns. for flags in (0, regex.ASCII): pat = regex.compile('\xc0', flags | regex.IGNORECASE) self.assertEqual(pat.match('\xe0'), None) pat = regex.compile('\w') self.assertEqual(pat.match('\xe0'), None) self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda: regex.compile('(?au)\w')) def test_subscripting_match(self): m = regex.match(r'(?\w)', 'xy') if not m: self.fail("Failed: expected match but returned None") elif not m or m[0] != m.group(0) or m[1] != m.group(1): self.fail("Failed") if not m: self.fail("Failed: expected match but returned None") elif m[:] != ('x', 'x'): self.fail("Failed: expected \"('x', 'x')\" but got %s instead" % repr(m[:])) def test_new_named_groups(self): m0 = regex.match(r'(?P\w)', 'x') m1 = regex.match(r'(?\w)', 'x') if not (m0 and m1 and m0[:] == m1[:]): self.fail("Failed") def test_properties(self): self.assertEqual(regex.match('(?i)\xC0', '\xE0'), None) self.assertEqual(regex.match(r'(?i)\xC0', '\xE0'), None) self.assertEqual(regex.match(r'\w', '\xE0'), None) self.assertEqual(bool(regex.match(ur'(?u)\w', u'\xE0')), True) # Dropped the following test. It's not possible to determine what the # correct result should be in the general case. # self.assertEqual(bool(regex.match(r'(?L)\w', '\xE0')), # '\xE0'.isalnum()) self.assertEqual(bool(regex.match(r'(?L)\d', '0')), True) self.assertEqual(bool(regex.match(r'(?L)\s', ' ')), True) self.assertEqual(bool(regex.match(r'(?L)\w', 'a')), True) self.assertEqual(regex.match(r'(?L)\d', '?'), None) self.assertEqual(regex.match(r'(?L)\s', '?'), None) self.assertEqual(regex.match(r'(?L)\w', '?'), None) self.assertEqual(regex.match(r'(?L)\D', '0'), None) self.assertEqual(regex.match(r'(?L)\S', ' '), None) self.assertEqual(regex.match(r'(?L)\W', 'a'), None) self.assertEqual(bool(regex.match(r'(?L)\D', '?')), True) self.assertEqual(bool(regex.match(r'(?L)\S', '?')), True) self.assertEqual(bool(regex.match(r'(?L)\W', '?')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{Cyrillic}', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{IsCyrillic}', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{Script=Cyrillic}', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{InCyrillic}', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{Block=Cyrillic}', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:Cyrillic:]]', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:IsCyrillic:]]', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:Script=Cyrillic:]]', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:InCyrillic:]]', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:Block=Cyrillic:]]', u'\N{CYRILLIC CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\P{Cyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\P{IsCyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\P{Script=Cyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\P{InCyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\P{Block=Cyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{^Cyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{^IsCyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{^Script=Cyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{^InCyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{^Block=Cyrillic}', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:^Cyrillic:]]', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:^IsCyrillic:]]', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:^Script=Cyrillic:]]', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:^InCyrillic:]]', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)[[:^Block=Cyrillic:]]', u'\N{LATIN CAPITAL LETTER A}')), True) self.assertEqual(bool(regex.match(ur'(?u)\d', u'0')), True) self.assertEqual(bool(regex.match(ur'(?u)\s', u' ')), True) self.assertEqual(bool(regex.match(ur'(?u)\w', u'A')), True) self.assertEqual(regex.match(ur"(?u)\d", u"?"), None) self.assertEqual(regex.match(ur"(?u)\s", u"?"), None) self.assertEqual(regex.match(ur"(?u)\w", u"?"), None) self.assertEqual(regex.match(ur"(?u)\D", u"0"), None) self.assertEqual(regex.match(ur"(?u)\S", u" "), None) self.assertEqual(regex.match(ur"(?u)\W", u"A"), None) self.assertEqual(bool(regex.match(ur'(?u)\D', u'?')), True) self.assertEqual(bool(regex.match(ur'(?u)\S', u'?')), True) self.assertEqual(bool(regex.match(ur'(?u)\W', u'?')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{L}', u'A')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{L}', u'a')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{Lu}', u'A')), True) self.assertEqual(bool(regex.match(ur'(?u)\p{Ll}', u'a')), True) self.assertEqual(bool(regex.match(ur'(?u)(?i)a', u'a')), True) self.assertEqual(bool(regex.match(ur'(?u)(?i)a', u'A')), True) self.assertEqual(bool(regex.match(ur'(?u)\w', u'0')), True) self.assertEqual(bool(regex.match(ur'(?u)\w', u'a')), True) self.assertEqual(bool(regex.match(ur'(?u)\w', u'_')), True) self.assertEqual(regex.match(ur"(?u)\X", u"\xE0").span(), (0, 1)) self.assertEqual(regex.match(ur"(?u)\X", u"a\u0300").span(), (0, 2)) self.assertEqual(regex.findall(ur"(?u)\X", u"a\xE0a\u0300e\xE9e\u0301"), [u'a', u'\xe0', u'a\u0300', u'e', u'\xe9', u'e\u0301']) self.assertEqual(regex.findall(ur"(?u)\X{3}", u"a\xE0a\u0300e\xE9e\u0301"), [u'a\xe0a\u0300', u'e\xe9e\u0301']) self.assertEqual(regex.findall(ur"(?u)\X", u"\r\r\n\u0301A\u0301"), [u'\r', u'\r\n', u'\u0301', u'A\u0301']) self.assertEqual(bool(regex.match(ur'(?u)\p{Ll}', u'a')), True) chars_u = u"-09AZaz_\u0393\u03b3" chars_b = "-09AZaz_" word_set = set("Ll Lm Lo Lt Lu Mc Me Mn Nd Nl No Pc".split()) tests = [ (ur"(?u)\w", chars_u, u"09AZaz_\u0393\u03b3"), (ur"(?u)[[:word:]]", chars_u, u"09AZaz_\u0393\u03b3"), (ur"(?u)\W", chars_u, u"-"), (ur"(?u)[[:^word:]]", chars_u, u"-"), (ur"(?u)\d", chars_u, u"09"), (ur"(?u)[[:digit:]]", chars_u, u"09"), (ur"(?u)\D", chars_u, u"-AZaz_\u0393\u03b3"), (ur"(?u)[[:^digit:]]", chars_u, u"-AZaz_\u0393\u03b3"), (ur"(?u)[[:alpha:]]", chars_u, u"AZaz\u0393\u03b3"), (ur"(?u)[[:^alpha:]]", chars_u, u"-09_"), (ur"(?u)[[:alnum:]]", chars_u, u"09AZaz\u0393\u03b3"), (ur"(?u)[[:^alnum:]]", chars_u, u"-_"), (ur"(?u)[[:xdigit:]]", chars_u, u"09Aa"), (ur"(?u)[[:^xdigit:]]", chars_u, u"-Zz_\u0393\u03b3"), (ur"(?u)\p{InBasicLatin}", u"a\xE1", u"a"), (ur"(?u)\P{InBasicLatin}", u"a\xE1", u"\xE1"), (ur"(?iu)\p{InBasicLatin}", u"a\xE1", u"a"), (ur"(?iu)\P{InBasicLatin}", u"a\xE1", u"\xE1"), (r"(?L)\w", chars_b, "09AZaz_"), (r"(?L)[[:word:]]", chars_b, "09AZaz_"), (r"(?L)\W", chars_b, "-"), (r"(?L)[[:^word:]]", chars_b, "-"), (r"(?L)\d", chars_b, "09"), (r"(?L)[[:digit:]]", chars_b, "09"), (r"(?L)\D", chars_b, "-AZaz_"), (r"(?L)[[:^digit:]]", chars_b, "-AZaz_"), (r"(?L)[[:alpha:]]", chars_b, "AZaz"), (r"(?L)[[:^alpha:]]", chars_b, "-09_"), (r"(?L)[[:alnum:]]", chars_b, "09AZaz"), (r"(?L)[[:^alnum:]]", chars_b, "-_"), (r"(?L)[[:xdigit:]]", chars_b, "09Aa"), (r"(?L)[[:^xdigit:]]", chars_b, "-Zz_"), (r"\w", chars_b, "09AZaz_"), (r"[[:word:]]", chars_b, "09AZaz_"), (r"\W", chars_b, "-"), (r"[[:^word:]]", chars_b, "-"), (r"\d", chars_b, "09"), (r"[[:digit:]]", chars_b, "09"), (r"\D", chars_b, "-AZaz_"), (r"[[:^digit:]]", chars_b, "-AZaz_"), (r"[[:alpha:]]", chars_b, "AZaz"), (r"[[:^alpha:]]", chars_b, "-09_"), (r"[[:alnum:]]", chars_b, "09AZaz"), (r"[[:^alnum:]]", chars_b, "-_"), (r"[[:xdigit:]]", chars_b, "09Aa"), (r"[[:^xdigit:]]", chars_b, "-Zz_"), ] for pattern, chars, expected in tests: try: if chars[ : 0].join(regex.findall(pattern, chars)) != expected: self.fail("Failed: %s" % pattern) except Exception, e: self.fail("Failed: %s raised %s" % (pattern, repr(e))) self.assertEqual(bool(regex.match(ur"(?u)\p{NumericValue=0}", u"0")), True) self.assertEqual(bool(regex.match(ur"(?u)\p{NumericValue=1/2}", u"\N{VULGAR FRACTION ONE HALF}")), True) self.assertEqual(bool(regex.match(ur"(?u)\p{NumericValue=0.5}", u"\N{VULGAR FRACTION ONE HALF}")), True) def test_word_class(self): self.assertEqual(regex.findall(ur"(?u)\w+", u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u'\u0939\u093f\u0928\u094d\u0926\u0940']) self.assertEqual(regex.findall(ur"(?u)\W+", u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', u',']) self.assertEqual(regex.split(ur"(?uV1)\b", u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', u'\u0939\u093f\u0928\u094d\u0926\u0940', u',']) self.assertEqual(regex.split(ur"(?uV1)\B", u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u'', u' \u0939', u'\u093f', u'\u0928', u'\u094d', u'\u0926', u'\u0940,', u'']) def test_search_anchor(self): self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd']) def test_search_reverse(self): self.assertEqual(regex.findall(r"(?r).", "abc"), ['c', 'b', 'a']) self.assertEqual(regex.findall(r"(?r).", "abc", overlapped=True), ['c', 'b', 'a']) self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc']) self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True), ['de', 'cd', 'bc', 'ab']) self.assertEqual(regex.findall(r"(?r)(.)(-)(.)", "a-b-c", overlapped=True), [("b", "-", "c"), ("a", "-", "b")]) self.assertEqual([m[0] for m in regex.finditer(r"(?r).", "abc")], ['c', 'b', 'a']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", overlapped=True)], ['de', 'cd', 'bc', 'ab']) self.assertEqual([m[0] for m in regex.finditer(r"(?r).", "abc")], ['c', 'b', 'a']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", overlapped=True)], ['de', 'cd', 'bc', 'ab']) self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', 'bar']) self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo', 'bar']) self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo', '']) self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', 'foo', '']) self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")], ['', 'foo', 'bar']) self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+", "foo bar")], ['', 'foo', 'bar']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", "foo bar")], ['bar', 'foo', '']) self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+", "foo bar")], ['bar', 'foo', '']) self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd']) self.assertEqual(regex.findall(r".{2}(?<=\G.*)", "abcd"), ['ab', 'cd']) self.assertEqual(regex.findall(r"(?r)\G\w{2}", "abcd ef"), []) self.assertEqual(regex.findall(r"(?r)\w{2}\G", "abcd ef"), ['ef']) self.assertEqual(regex.findall(r"q*", "qqwe"), ['qq', '', '', '']) self.assertEqual(regex.findall(r"(?V1)q*", "qqwe"), ['qq', '', '', '']) self.assertEqual(regex.findall(r"(?r)q*", "qqwe"), ['', '', 'qq', '']) self.assertEqual(regex.findall(r"(?rV1)q*", "qqwe"), ['', '', 'qq', '']) self.assertEqual(regex.findall(".", "abcd", pos=1, endpos=3), ['b', 'c']) self.assertEqual(regex.findall(".", "abcd", pos=1, endpos=-1), ['b', 'c']) self.assertEqual([m[0] for m in regex.finditer(".", "abcd", pos=1, endpos=3)], ['b', 'c']) self.assertEqual([m[0] for m in regex.finditer(".", "abcd", pos=1, endpos=-1)], ['b', 'c']) self.assertEqual([m[0] for m in regex.finditer("(?r).", "abcd", pos=1, endpos=3)], ['c', 'b']) self.assertEqual([m[0] for m in regex.finditer("(?r).", "abcd", pos=1, endpos=-1)], ['c', 'b']) self.assertEqual(regex.findall("(?r).", "abcd", pos=1, endpos=3), ['c', 'b']) self.assertEqual(regex.findall("(?r).", "abcd", pos=1, endpos=-1), ['c', 'b']) self.assertEqual(regex.findall(r"[ab]", "aB", regex.I), ['a', 'B']) self.assertEqual(regex.findall(r"(?r)[ab]", "aB", regex.I), ['B', 'a']) self.assertEqual(regex.findall(r"(?r).{2}", "abc"), ['bc']) self.assertEqual(regex.findall(r"(?r).{2}", "abc", overlapped=True), ['bc', 'ab']) self.assertEqual(regex.findall(r"(\w+) (\w+)", "first second third fourth fifth"), [('first', 'second'), ('third', 'fourth')]) self.assertEqual(regex.findall(r"(?r)(\w+) (\w+)", "first second third fourth fifth"), [('fourth', 'fifth'), ('second', 'third')]) self.assertEqual([m[0] for m in regex.finditer(r"(?r).{2}", "abc")], ['bc']) self.assertEqual([m[0] for m in regex.finditer(r"(?r).{2}", "abc", overlapped=True)], ['bc', 'ab']) self.assertEqual([m[0] for m in regex.finditer(r"(\w+) (\w+)", "first second third fourth fifth")], ['first second', 'third fourth']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)(\w+) (\w+)", "first second third fourth fifth")], ['fourth fifth', 'second third']) self.assertEqual(regex.search("abcdef", "abcdef").span(), (0, 6)) self.assertEqual(regex.search("(?r)abcdef", "abcdef").span(), (0, 6)) self.assertEqual(regex.search("(?i)abcdef", "ABCDEF").span(), (0, 6)) self.assertEqual(regex.search("(?ir)abcdef", "ABCDEF").span(), (0, 6)) self.assertEqual(regex.sub(r"(.)", r"\1", "abc"), 'abc') self.assertEqual(regex.sub(r"(?r)(.)", r"\1", "abc"), 'abc') def test_atomic(self): # Issue 433030. self.assertEqual(regex.search(r"(?>a*)a", "aa"), None) def test_possessive(self): # Single-character non-possessive. self.assertEqual(regex.search(r"a?a", "a").span(), (0, 1)) self.assertEqual(regex.search(r"a*a", "aaa").span(), (0, 3)) self.assertEqual(regex.search(r"a+a", "aaa").span(), (0, 3)) self.assertEqual(regex.search(r"a{1,3}a", "aaa").span(), (0, 3)) # Multiple-character non-possessive. self.assertEqual(regex.search(r"(?:ab)?ab", "ab").span(), (0, 2)) self.assertEqual(regex.search(r"(?:ab)*ab", "ababab").span(), (0, 6)) self.assertEqual(regex.search(r"(?:ab)+ab", "ababab").span(), (0, 6)) self.assertEqual(regex.search(r"(?:ab){1,3}ab", "ababab").span(), (0, 6)) # Single-character possessive. self.assertEqual(regex.search(r"a?+a", "a"), None) self.assertEqual(regex.search(r"a*+a", "aaa"), None) self.assertEqual(regex.search(r"a++a", "aaa"), None) self.assertEqual(regex.search(r"a{1,3}+a", "aaa"), None) # Multiple-character possessive. self.assertEqual(regex.search(r"(?:ab)?+ab", "ab"), None) self.assertEqual(regex.search(r"(?:ab)*+ab", "ababab"), None) self.assertEqual(regex.search(r"(?:ab)++ab", "ababab"), None) self.assertEqual(regex.search(r"(?:ab){1,3}+ab", "ababab"), None) def test_zerowidth(self): # Issue 3262. self.assertEqual(regex.split(r"\b", "a b"), ['a b']) self.assertEqual(regex.split(r"(?V1)\b", "a b"), ['', 'a', ' ', 'b', '']) # Issue 1647489. self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', 'bar']) self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")], ['', 'foo', 'bar']) self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo', '']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", "foo bar")], ['bar', 'foo', '']) self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo', 'bar']) self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+", "foo bar")], ['', 'foo', 'bar']) self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', 'foo', '']) self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+", "foo bar")], ['bar', 'foo', '']) self.assertEqual(regex.split("", "xaxbxc"), ['xaxbxc']) self.assertEqual([m for m in regex.splititer("", "xaxbxc")], ['xaxbxc']) self.assertEqual(regex.split("(?r)", "xaxbxc"), ['xaxbxc']) self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")], ['xaxbxc']) self.assertEqual(regex.split("(?V1)", "xaxbxc"), ['', 'x', 'a', 'x', 'b', 'x', 'c', '']) self.assertEqual([m for m in regex.splititer("(?V1)", "xaxbxc")], ['', 'x', 'a', 'x', 'b', 'x', 'c', '']) self.assertEqual(regex.split("(?rV1)", "xaxbxc"), ['', 'c', 'x', 'b', 'x', 'a', 'x', '']) self.assertEqual([m for m in regex.splititer("(?rV1)", "xaxbxc")], ['', 'c', 'x', 'b', 'x', 'a', 'x', '']) def test_scoped_and_inline_flags(self): # Issues 433028, 433024, 433027. self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2)) self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2)) self.assertEqual(regex.search(r"A(?i)b", "ab").span(), (0, 2)) self.assertEqual(regex.search(r"A(?iV1)b", "ab"), None) self.assertRaisesRegex(regex.error, self.CANT_TURN_OFF, lambda: regex.search(r"(?V0-i)Ab", "ab", flags=regex.I)) self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None) self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None) self.assertEqual(regex.search(r"(?V1-i)Ab", "ab", flags=regex.I), None) self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None) self.assertEqual(regex.search(r"A(?V1-i)b", "ab", flags=regex.I).span(), (0, 2)) def test_repeated_repeats(self): # Issue 2537. self.assertEqual(regex.search(r"(?:a+)+", "aaa").span(), (0, 3)) self.assertEqual(regex.search(r"(?:(?:ab)+c)+", "abcabc").span(), (0, 6)) def test_lookbehind(self): self.assertEqual(regex.search(r"123(?<=a\d+)", "a123").span(), (1, 4)) self.assertEqual(regex.search(r"123(?<=a\d+)", "b123"), None) self.assertEqual(regex.search(r"123(?[ \t]+\r*$)|(?P(?<=[^\n])\Z)') self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', 'foobar '), ('foobar', 1)) self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ', '']) pat = regex.compile(r'(?mV1)(?P[ \t]+\r*$)|(?P(?<=[^\n])\Z)') self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>', 'foobar '), ('foobar', 2)) self.assertEqual([m.group() for m in pat.finditer('foobar ')], [' ', '']) def test_overlapped(self): self.assertEqual(regex.findall(r"..", "abcde"), ['ab', 'cd']) self.assertEqual(regex.findall(r"..", "abcde", overlapped=True), ['ab', 'bc', 'cd', 'de']) self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc']) self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True), ['de', 'cd', 'bc', 'ab']) self.assertEqual(regex.findall(r"(.)(-)(.)", "a-b-c", overlapped=True), [("a", "-", "b"), ("b", "-", "c")]) self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde")], ['ab', 'cd']) self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde", overlapped=True)], ['ab', 'bc', 'cd', 'de']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde")], ['de', 'bc']) self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde", overlapped=True)], ['de', 'cd', 'bc', 'ab']) self.assertEqual([m.groups() for m in regex.finditer(r"(.)(-)(.)", "a-b-c", overlapped=True)], [("a", "-", "b"), ("b", "-", "c")]) self.assertEqual([m.groups() for m in regex.finditer(r"(?r)(.)(-)(.)", "a-b-c", overlapped=True)], [("b", "-", "c"), ("a", "-", "b")]) def test_splititer(self): self.assertEqual(regex.split(r",", "a,b,,c,"), ['a', 'b', '', 'c', '']) self.assertEqual([m for m in regex.splititer(r",", "a,b,,c,")], ['a', 'b', '', 'c', '']) def test_grapheme(self): self.assertEqual(regex.match(ur"(?u)\X", u"\xE0").span(), (0, 1)) self.assertEqual(regex.match(ur"(?u)\X", u"a\u0300").span(), (0, 2)) self.assertEqual(regex.findall(ur"(?u)\X", u"a\xE0a\u0300e\xE9e\u0301"), [u'a', u'\xe0', u'a\u0300', u'e', u'\xe9', u'e\u0301']) self.assertEqual(regex.findall(ur"(?u)\X{3}", u"a\xE0a\u0300e\xE9e\u0301"), [u'a\xe0a\u0300', u'e\xe9e\u0301']) self.assertEqual(regex.findall(ur"(?u)\X", u"\r\r\n\u0301A\u0301"), [u'\r', u'\r\n', u'\u0301', u'A\u0301']) def test_word_boundary(self): text = u'The quick ("brown") fox can\'t jump 32.3 feet, right?' self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'The', u' ', u'quick', u' ("', u'brown', u'") ', u'fox', u' ', u'can', u"'", u't', u' ', u'jump', u' ', u'32', u'.', u'3', u' ', u'feet', u', ', u'right', u'?']) self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u'The', u' ', u'quick', u' ', u'(', u'"', u'brown', u'"', u')', u' ', u'fox', u' ', u"can't", u' ', u'jump', u' ', u'32.3', u' ', u'feet', u',', u' ', u'right', u'?', u'']) text = u"The fox" self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'The', u' ', u'fox', u'']) self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u'The', u' ', u' ', u'fox', u'']) text = u"can't aujourd'hui l'objectif" self.assertEqual(regex.split(ur'(?V1)\b', text), [u'', u'can', u"'", u't', u' ', u'aujourd', u"'", u'hui', u' ', u'l', u"'", u'objectif', u'']) self.assertEqual(regex.split(ur'(?V1w)\b', text), [u'', u"can't", u' ', u"aujourd'hui", u' ', u"l'", u'objectif', u'']) def test_line_boundary(self): self.assertEqual(regex.findall(r".+", "Line 1\nLine 2\n"), ["Line 1", "Line 2"]) self.assertEqual(regex.findall(r".+", "Line 1\rLine 2\r"), ["Line 1\rLine 2\r"]) self.assertEqual(regex.findall(r".+", "Line 1\r\nLine 2\r\n"), ["Line 1\r", "Line 2\r"]) self.assertEqual(regex.findall(r"(?w).+", "Line 1\nLine 2\n"), ["Line 1", "Line 2"]) self.assertEqual(regex.findall(r"(?w).+", "Line 1\rLine 2\r"), ["Line 1", "Line 2"]) self.assertEqual(regex.findall(r"(?w).+", "Line 1\r\nLine 2\r\n"), ["Line 1", "Line 2"]) self.assertEqual(regex.search(r"^abc", "abc").start(), 0) self.assertEqual(regex.search(r"^abc", "\nabc"), None) self.assertEqual(regex.search(r"^abc", "\rabc"), None) self.assertEqual(regex.search(r"(?w)^abc", "abc").start(), 0) self.assertEqual(regex.search(r"(?w)^abc", "\nabc"), None) self.assertEqual(regex.search(r"(?w)^abc", "\rabc"), None) self.assertEqual(regex.search(r"abc$", "abc").start(), 0) self.assertEqual(regex.search(r"abc$", "abc\n").start(), 0) self.assertEqual(regex.search(r"abc$", "abc\r"), None) self.assertEqual(regex.search(r"(?w)abc$", "abc").start(), 0) self.assertEqual(regex.search(r"(?w)abc$", "abc\n").start(), 0) self.assertEqual(regex.search(r"(?w)abc$", "abc\r").start(), 0) self.assertEqual(regex.search(r"(?m)^abc", "abc").start(), 0) self.assertEqual(regex.search(r"(?m)^abc", "\nabc").start(), 1) self.assertEqual(regex.search(r"(?m)^abc", "\rabc"), None) self.assertEqual(regex.search(r"(?mw)^abc", "abc").start(), 0) self.assertEqual(regex.search(r"(?mw)^abc", "\nabc").start(), 1) self.assertEqual(regex.search(r"(?mw)^abc", "\rabc").start(), 1) self.assertEqual(regex.search(r"(?m)abc$", "abc").start(), 0) self.assertEqual(regex.search(r"(?m)abc$", "abc\n").start(), 0) self.assertEqual(regex.search(r"(?m)abc$", "abc\r"), None) self.assertEqual(regex.search(r"(?mw)abc$", "abc").start(), 0) self.assertEqual(regex.search(r"(?mw)abc$", "abc\n").start(), 0) self.assertEqual(regex.search(r"(?mw)abc$", "abc\r").start(), 0) def test_branch_reset(self): self.assertEqual(regex.match(r"(?:(a)|(b))(c)", "ac").groups(), ('a', None, 'c')) self.assertEqual(regex.match(r"(?:(a)|(b))(c)", "bc").groups(), (None, 'b', 'c')) self.assertEqual(regex.match(r"(?:(?a)|(?b))(?c)", "ac").groups(), ('a', None, 'c')) self.assertEqual(regex.match(r"(?:(?a)|(?b))(?c)", "bc").groups(), (None, 'b', 'c')) self.assertEqual(regex.match(r"(?a)(?:(?b)|(?c))(?d)", "abd").groups(), ('a', 'b', None, 'd')) self.assertEqual(regex.match(r"(?a)(?:(?b)|(?c))(?d)", "acd").groups(), ('a', None, 'c', 'd')) self.assertEqual(regex.match(r"(a)(?:(b)|(c))(d)", "abd").groups(), ('a', 'b', None, 'd')) self.assertEqual(regex.match(r"(a)(?:(b)|(c))(d)", "acd").groups(), ('a', None, 'c', 'd')) self.assertEqual(regex.match(r"(a)(?|(b)|(b))(d)", "abd").groups(), ('a', 'b', 'd')) self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "ac").groups(), ('a', None, 'c')) self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "bc").groups(), (None, 'b', 'c')) self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "ac").groups(), ('a', 'c')) self.assertEqual(regex.match(r"(?|(?a)|(?b))(c)", "bc").groups(), ('b', 'c')) self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(?d))(e)", "abe").groups(), ('a', 'b', 'e')) self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(?d))(e)", "cde").groups(), ('d', 'c', 'e')) self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(d))(e)", "abe").groups(), ('a', 'b', 'e')) self.assertEqual(regex.match(r"(?|(?a)(?b)|(?c)(d))(e)", "cde").groups(), ('d', 'c', 'e')) self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(d))(e)", "abe").groups(), ('a', 'b', 'e')) self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(d))(e)", "cde").groups(), ('c', 'd', 'e')) # Hg issue 87. self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", "abe").groups(), ("a", "b", "e")) self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", "abe").capturesdict(), {"a": ["a"], "b": ["b"]}) self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", "cde").groups(), ("d", None, "e")) self.assertEqual(regex.match(r"(?|(?a)(?b)|(c)(?d))(e)", "cde").capturesdict(), {"a": ["c", "d"], "b": []}) def test_set(self): self.assertEqual(regex.match(r"[a]", "a").span(), (0, 1)) self.assertEqual(regex.match(r"(?i)[a]", "A").span(), (0, 1)) self.assertEqual(regex.match(r"[a-b]", r"a").span(), (0, 1)) self.assertEqual(regex.match(r"(?i)[a-b]", r"A").span(), (0, 1)) self.assertEqual(regex.sub(r"(?V0)([][])", r"-", "a[b]c"), "a-b-c") self.assertEqual(regex.findall(ur"[\p{Alpha}]", u"a0"), [u"a"]) self.assertEqual(regex.findall(ur"(?i)[\p{Alpha}]", u"A0"), [u"A"]) self.assertEqual(regex.findall(ur"[a\p{Alpha}]", u"ab0"), [u"a", u"b"]) self.assertEqual(regex.findall(ur"[a\P{Alpha}]", u"ab0"), [u"a", u"0"]) self.assertEqual(regex.findall(ur"(?i)[a\p{Alpha}]", u"ab0"), [u"a", u"b"]) self.assertEqual(regex.findall(ur"(?i)[a\P{Alpha}]", u"ab0"), [u"a", u"0"]) self.assertEqual(regex.findall(ur"[a-b\p{Alpha}]", u"abC0"), [u"a", u"b", u"C"]) self.assertEqual(regex.findall(ur"(?i)[a-b\p{Alpha}]", u"AbC0"), [u"A", u"b", u"C"]) self.assertEqual(regex.findall(ur"[\p{Alpha}]", u"a0"), [u"a"]) self.assertEqual(regex.findall(ur"[\P{Alpha}]", u"a0"), [u"0"]) self.assertEqual(regex.findall(ur"[^\p{Alpha}]", u"a0"), [u"0"]) self.assertEqual(regex.findall(ur"[^\P{Alpha}]", u"a0"), [u"a"]) self.assertEqual("".join(regex.findall(r"[^\d-h]", "a^b12c-h")), 'a^bc') self.assertEqual("".join(regex.findall(r"[^\dh]", "a^b12c-h")), 'a^bc-') self.assertEqual("".join(regex.findall(r"[^h\s\db]", "a^b 12c-h")), 'a^c-') self.assertEqual("".join(regex.findall(r"[^b\w]", "a b")), ' ') self.assertEqual("".join(regex.findall(r"[^b\S]", "a b")), ' ') self.assertEqual("".join(regex.findall(r"[^8\d]", "a 1b2")), 'a b') all_chars = u"".join(unichr(c) for c in range(0x100)) self.assertEqual(len(regex.findall(ur"(?u)\p{ASCII}", all_chars)), 128) self.assertEqual(len(regex.findall(ur"(?u)\p{Letter}", all_chars)), 117) self.assertEqual(len(regex.findall(ur"(?u)\p{Digit}", all_chars)), 10) # Set operators self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Letter}]", all_chars)), 52) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Alnum}&&\p{Letter}]", all_chars)), 52) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Alnum}&&\p{Digit}]", all_chars)), 10) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Cc}]", all_chars)), 33) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}&&\p{Graph}]", all_chars)), 94) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{ASCII}--\p{Cc}]", all_chars)), 95) self.assertEqual(len(regex.findall(ur"(?u)[\p{Letter}\p{Digit}]", all_chars)), 127) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{Letter}||\p{Digit}]", all_chars)), 127) self.assertEqual(len(regex.findall(ur"(?u)\p{HexDigit}", all_chars)), 22) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{HexDigit}~~\p{Digit}]", all_chars)), 12) self.assertEqual(len(regex.findall(ur"(?uV1)[\p{Digit}~~\p{HexDigit}]", all_chars)), 12) self.assertEqual(repr(type(regex.compile(r"(?V0)([][-])"))), self.PATTERN_CLASS) self.assertEqual(regex.findall(r"(?V1)[[a-z]--[aei]]", "abc"), ["b", "c"]) self.assertEqual(regex.findall(r"(?iV1)[[a-z]--[aei]]", "abc"), ["b", "c"]) self.assertEqual(regex.findall("(?V1)[\w--a]","abc"), ["b", "c"]) self.assertEqual(regex.findall("(?iV1)[\w--a]","abc"), ["b", "c"]) def test_various(self): tests = [ # Test ?P< and ?P= extensions. ('(?Pa)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with a digit. ('(?Pa)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with an illegal char. ('(?Pa)', '', '', regex.error, self.BAD_GROUP_NAME), # Begins with an illegal char. # Same tests, for the ?P= form. ('(?Pa)(?P=foo_123', 'aa', '', regex.error, self.MISSING_RPAREN), ('(?Pa)(?P=1)', 'aa', '', regex.error, self.BAD_GROUP_NAME), ('(?Pa)(?P=!)', 'aa', '', regex.error, self.BAD_GROUP_NAME), ('(?Pa)(?P=foo_124)', 'aa', '', regex.error, self.UNKNOWN_GROUP), # Backref to undefined group. ('(?Pa)', 'a', '1', repr('a')), ('(?Pa)(?P=foo_123)', 'aa', '1', repr('a')), # Mal-formed \g in pattern treated as literal for compatibility. (r'(?a)\ga)\g<1>', 'aa', '1', repr('a')), (r'(?a)\g', 'aa', '', repr(None)), (r'(?a)\g', 'aa', '', regex.error, self.UNKNOWN_GROUP), # Backref to undefined group. ('(?a)', 'a', '1', repr('a')), (r'(?a)\g', 'aa', '1', repr('a')), # Test octal escapes. ('\\1', 'a', '', regex.error, self.UNKNOWN_GROUP), # Backreference. ('[\\1]', '\1', '0', "'\\x01'"), # Character. ('\\09', chr(0) + '9', '0', repr(chr(0) + '9')), ('\\141', 'a', '0', repr('a')), ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', '0,11', repr(('abcdefghijklk9', 'k'))), # Test \0 is handled everywhere. (r'\0', '\0', '0', repr('\0')), (r'[\0a]', '\0', '0', repr('\0')), (r'[a\0]', '\0', '0', repr('\0')), (r'[^a\0]', '\0', '', repr(None)), # Test various letter escapes. (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', '0', repr('\a\b\f\n\r\t\v')), (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', '0', repr('\a\b\f\n\r\t\v')), (r'\c\e\g\h\i\j\k\o\p\q\y\z', 'ceghijkopqyz', '0', repr('ceghijkopqyz')), (r'\xff', '\377', '0', repr(chr(255))), # New \x semantics. (r'\x00ffffffffffffff', '\377', '', repr(None)), (r'\x00f', '\017', '', repr(None)), (r'\x00fe', '\376', '', repr(None)), (r'\x00ff', '\377', '', repr(None)), (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', '0', repr('\t\n\v\r\f\ag')), ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', '0', repr('\t\n\v\r\f\ag')), (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', '0', repr(chr(9) + chr(10) + chr(11) + chr(13) + chr(12) + chr(7))), (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', '0', repr('\t\n\v\r\f\b')), (r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", '0', repr("SRC=eval.c g.c blah blah blah \\\\")), # Test that . only matches \n in DOTALL mode. ('a.b', 'acb', '0', repr('acb')), ('a.b', 'a\nb', '', repr(None)), ('a.*b', 'acc\nccb', '', repr(None)), ('a.{4,5}b', 'acc\nccb', '', repr(None)), ('a.b', 'a\rb', '0', repr('a\rb')), # The new behaviour is that the inline flag affects only what follows. ('a.b(?s)', 'a\nb', '0', repr('a\nb')), ('a.b(?sV1)', 'a\nb', '', repr(None)), ('(?s)a.b', 'a\nb', '0', repr('a\nb')), ('a.*(?s)b', 'acc\nccb', '0', repr('acc\nccb')), ('a.*(?sV1)b', 'acc\nccb', '', repr(None)), ('(?s)a.*b', 'acc\nccb', '0', repr('acc\nccb')), ('(?s)a.{4,5}b', 'acc\nccb', '0', repr('acc\nccb')), (')', '', '', regex.error, self.TRAILING_CHARS), # Unmatched right bracket. ('', '', '0', "''"), # Empty pattern. ('abc', 'abc', '0', repr('abc')), ('abc', 'xbc', '', repr(None)), ('abc', 'axc', '', repr(None)), ('abc', 'abx', '', repr(None)), ('abc', 'xabcy', '0', repr('abc')), ('abc', 'ababc', '0', repr('abc')), ('ab*c', 'abc', '0', repr('abc')), ('ab*bc', 'abc', '0', repr('abc')), ('ab*bc', 'abbc', '0', repr('abbc')), ('ab*bc', 'abbbbc', '0', repr('abbbbc')), ('ab+bc', 'abbc', '0', repr('abbc')), ('ab+bc', 'abc', '', repr(None)), ('ab+bc', 'abq', '', repr(None)), ('ab+bc', 'abbbbc', '0', repr('abbbbc')), ('ab?bc', 'abbc', '0', repr('abbc')), ('ab?bc', 'abc', '0', repr('abc')), ('ab?bc', 'abbbbc', '', repr(None)), ('ab?c', 'abc', '0', repr('abc')), ('^abc$', 'abc', '0', repr('abc')), ('^abc$', 'abcc', '', repr(None)), ('^abc', 'abcc', '0', repr('abc')), ('^abc$', 'aabc', '', repr(None)), ('abc$', 'aabc', '0', repr('abc')), ('^', 'abc', '0', repr('')), ('$', 'abc', '0', repr('')), ('a.c', 'abc', '0', repr('abc')), ('a.c', 'axc', '0', repr('axc')), ('a.*c', 'axyzc', '0', repr('axyzc')), ('a.*c', 'axyzd', '', repr(None)), ('a[bc]d', 'abc', '', repr(None)), ('a[bc]d', 'abd', '0', repr('abd')), ('a[b-d]e', 'abd', '', repr(None)), ('a[b-d]e', 'ace', '0', repr('ace')), ('a[b-d]', 'aac', '0', repr('ac')), ('a[-b]', 'a-', '0', repr('a-')), ('a[\\-b]', 'a-', '0', repr('a-')), ('a[b-]', 'a-', '0', repr('a-')), ('a[]b', '-', '', regex.error, self.BAD_SET), ('a[', '-', '', regex.error, self.BAD_SET), ('a\\', '-', '', regex.error, self.BAD_ESCAPE), ('abc)', '-', '', regex.error, self.TRAILING_CHARS), ('(abc', '-', '', regex.error, self.MISSING_RPAREN), ('a]', 'a]', '0', repr('a]')), ('a[]]b', 'a]b', '0', repr('a]b')), ('a[]]b', 'a]b', '0', repr('a]b')), ('a[^bc]d', 'aed', '0', repr('aed')), ('a[^bc]d', 'abd', '', repr(None)), ('a[^-b]c', 'adc', '0', repr('adc')), ('a[^-b]c', 'a-c', '', repr(None)), ('a[^]b]c', 'a]c', '', repr(None)), ('a[^]b]c', 'adc', '0', repr('adc')), ('\\ba\\b', 'a-', '0', repr('a')), ('\\ba\\b', '-a', '0', repr('a')), ('\\ba\\b', '-a-', '0', repr('a')), ('\\by\\b', 'xy', '', repr(None)), ('\\by\\b', 'yz', '', repr(None)), ('\\by\\b', 'xyz', '', repr(None)), ('x\\b', 'xyz', '', repr(None)), ('x\\B', 'xyz', '0', repr('x')), ('\\Bz', 'xyz', '0', repr('z')), ('z\\B', 'xyz', '', repr(None)), ('\\Bx', 'xyz', '', repr(None)), ('\\Ba\\B', 'a-', '', repr(None)), ('\\Ba\\B', '-a', '', repr(None)), ('\\Ba\\B', '-a-', '', repr(None)), ('\\By\\B', 'xy', '', repr(None)), ('\\By\\B', 'yz', '', repr(None)), ('\\By\\b', 'xy', '0', repr('y')), ('\\by\\B', 'yz', '0', repr('y')), ('\\By\\B', 'xyz', '0', repr('y')), ('ab|cd', 'abc', '0', repr('ab')), ('ab|cd', 'abcd', '0', repr('ab')), ('()ef', 'def', '0,1', repr(('ef', ''))), ('$b', 'b', '', repr(None)), ('a\\(b', 'a(b', '', repr(('a(b',))), ('a\\(*b', 'ab', '0', repr('ab')), ('a\\(*b', 'a((b', '0', repr('a((b')), ('a\\\\b', 'a\\b', '0', repr('a\\b')), ('((a))', 'abc', '0,1,2', repr(('a', 'a', 'a'))), ('(a)b(c)', 'abc', '0,1,2', repr(('abc', 'a', 'c'))), ('a+b+c', 'aabbabc', '0', repr('abc')), ('(a+|b)*', 'ab', '0,1', repr(('ab', 'b'))), ('(a+|b)+', 'ab', '0,1', repr(('ab', 'b'))), ('(a+|b)?', 'ab', '0,1', repr(('a', 'a'))), (')(', '-', '', regex.error, self.TRAILING_CHARS), ('[^ab]*', 'cde', '0', repr('cde')), ('abc', '', '', repr(None)), ('a*', '', '0', repr('')), ('a|b|c|d|e', 'e', '0', repr('e')), ('(a|b|c|d|e)f', 'ef', '0,1', repr(('ef', 'e'))), ('abcd*efg', 'abcdefg', '0', repr('abcdefg')), ('ab*', 'xabyabbbz', '0', repr('ab')), ('ab*', 'xayabbbz', '0', repr('a')), ('(ab|cd)e', 'abcde', '0,1', repr(('cde', 'cd'))), ('[abhgefdc]ij', 'hij', '0', repr('hij')), ('^(ab|cd)e', 'abcde', '', repr(None)), ('(abc|)ef', 'abcdef', '0,1', repr(('ef', ''))), ('(a|b)c*d', 'abcd', '0,1', repr(('bcd', 'b'))), ('(ab|ab*)bc', 'abc', '0,1', repr(('abc', 'a'))), ('a([bc]*)c*', 'abc', '0,1', repr(('abc', 'bc'))), ('a([bc]*)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), ('a([bc]+)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), ('a([bc]*)(c+d)', 'abcd', '0,1,2', repr(('abcd', 'b', 'cd'))), ('a[bcd]*dcdcde', 'adcdcde', '0', repr('adcdcde')), ('a[bcd]+dcdcde', 'adcdcde', '', repr(None)), ('(ab|a)b*c', 'abc', '0,1', repr(('abc', 'ab'))), ('((a)(b)c)(d)', 'abcd', '1,2,3,4', repr(('abc', 'a', 'b', 'd'))), ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', '0', repr('alpha')), ('^a(bc+|b[eh])g|.h$', 'abh', '0,1', repr(('bh', None))), ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', '0,1,2', repr(('effgz', 'effgz', None))), ('(bc+d$|ef*g.|h?i(j|k))', 'ij', '0,1,2', repr(('ij', 'ij', 'j'))), ('(bc+d$|ef*g.|h?i(j|k))', 'effg', '', repr(None)), ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', '', repr(None)), ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', '0,1,2', repr(('effgz', 'effgz', None))), ('(((((((((a)))))))))', 'a', '0', repr('a')), ('multiple words of text', 'uh-uh', '', repr(None)), ('multiple words', 'multiple words, yeah', '0', repr('multiple words')), ('(.*)c(.*)', 'abcde', '0,1,2', repr(('abcde', 'ab', 'de'))), ('\\((.*), (.*)\\)', '(a, b)', '2,1', repr(('b', 'a'))), ('[k]', 'ab', '', repr(None)), ('a[-]?c', 'ac', '0', repr('ac')), ('(abc)\\1', 'abcabc', '1', repr('abc')), ('([a-c]*)\\1', 'abcabc', '1', repr('abc')), ('^(.+)?B', 'AB', '1', repr('A')), ('(a+).\\1$', 'aaaaa', '0,1', repr(('aaaaa', 'aa'))), ('^(a+).\\1$', 'aaaa', '', repr(None)), ('(abc)\\1', 'abcabc', '0,1', repr(('abcabc', 'abc'))), ('([a-c]+)\\1', 'abcabc', '0,1', repr(('abcabc', 'abc'))), ('(a)\\1', 'aa', '0,1', repr(('aa', 'a'))), ('(a+)\\1', 'aa', '0,1', repr(('aa', 'a'))), ('(a+)+\\1', 'aa', '0,1', repr(('aa', 'a'))), ('(a).+\\1', 'aba', '0,1', repr(('aba', 'a'))), ('(a)ba*\\1', 'aba', '0,1', repr(('aba', 'a'))), ('(aa|a)a\\1$', 'aaa', '0,1', repr(('aaa', 'a'))), ('(a|aa)a\\1$', 'aaa', '0,1', repr(('aaa', 'a'))), ('(a+)a\\1$', 'aaa', '0,1', repr(('aaa', 'a'))), ('([abc]*)\\1', 'abcabc', '0,1', repr(('abcabc', 'abc'))), ('(a)(b)c|ab', 'ab', '0,1,2', repr(('ab', None, None))), ('(a)+x', 'aaax', '0,1', repr(('aaax', 'a'))), ('([ac])+x', 'aacx', '0,1', repr(('aacx', 'c'))), ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', '0,1', repr(('d:msgs/tdir/sub1/', 'tdir/'))), ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', '0,1,2,3', repr(('track1.title:TBlah blah blah', 'track1', 'title', 'Blah blah blah'))), ('([^N]*N)+', 'abNNxyzN', '0,1', repr(('abNNxyzN', 'xyzN'))), ('([^N]*N)+', 'abNNxyz', '0,1', repr(('abNN', 'N'))), ('([abc]*)x', 'abcx', '0,1', repr(('abcx', 'abc'))), ('([abc]*)x', 'abc', '', repr(None)), ('([xyz]*)x', 'abcx', '0,1', repr(('x', ''))), ('(a)+b|aac', 'aac', '0,1', repr(('aac', None))), # Test symbolic groups. ('(?Paaa)a', 'aaaa', '', regex.error, self.BAD_GROUP_NAME), ('(?Paaa)a', 'aaaa', '0,id', repr(('aaaa', 'aaa'))), ('(?Paa)(?P=id)', 'aaaa', '0,id', repr(('aaaa', 'aa'))), ('(?Paa)(?P=xd)', 'aaaa', '', regex.error, self.UNKNOWN_GROUP), # Character properties. (ur"\g", u"g", '0', repr(u'g')), (ur"\g<1>", u"g", '', regex.error, self.UNKNOWN_GROUP), (ur"(.)\g<1>", u"gg", '0', repr(u'gg')), (ur"(.)\g<1>", u"gg", '', repr((u'gg', u'g'))), (ur"\N", u"N", '0', repr(u'N')), (ur"\N{LATIN SMALL LETTER A}", u"a", '0', repr(u'a')), (ur"\p", u"p", '0', repr(u'p')), (ur"\p{Ll}", u"a", '0', repr(u'a')), (ur"\P", u"P", '0', repr(u'P')), (ur"\P{Lu}", u"p", '0', repr(u'p')), # All tests from Perl. ('abc', 'abc', '0', repr('abc')), ('abc', 'xbc', '', repr(None)), ('abc', 'axc', '', repr(None)), ('abc', 'abx', '', repr(None)), ('abc', 'xabcy', '0', repr('abc')), ('abc', 'ababc', '0', repr('abc')), ('ab*c', 'abc', '0', repr('abc')), ('ab*bc', 'abc', '0', repr('abc')), ('ab*bc', 'abbc', '0', repr('abbc')), ('ab*bc', 'abbbbc', '0', repr('abbbbc')), ('ab{0,}bc', 'abbbbc', '0', repr('abbbbc')), ('ab+bc', 'abbc', '0', repr('abbc')), ('ab+bc', 'abc', '', repr(None)), ('ab+bc', 'abq', '', repr(None)), ('ab{1,}bc', 'abq', '', repr(None)), ('ab+bc', 'abbbbc', '0', repr('abbbbc')), ('ab{1,}bc', 'abbbbc', '0', repr('abbbbc')), ('ab{1,3}bc', 'abbbbc', '0', repr('abbbbc')), ('ab{3,4}bc', 'abbbbc', '0', repr('abbbbc')), ('ab{4,5}bc', 'abbbbc', '', repr(None)), ('ab?bc', 'abbc', '0', repr('abbc')), ('ab?bc', 'abc', '0', repr('abc')), ('ab{0,1}bc', 'abc', '0', repr('abc')), ('ab?bc', 'abbbbc', '', repr(None)), ('ab?c', 'abc', '0', repr('abc')), ('ab{0,1}c', 'abc', '0', repr('abc')), ('^abc$', 'abc', '0', repr('abc')), ('^abc$', 'abcc', '', repr(None)), ('^abc', 'abcc', '0', repr('abc')), ('^abc$', 'aabc', '', repr(None)), ('abc$', 'aabc', '0', repr('abc')), ('^', 'abc', '0', repr('')), ('$', 'abc', '0', repr('')), ('a.c', 'abc', '0', repr('abc')), ('a.c', 'axc', '0', repr('axc')), ('a.*c', 'axyzc', '0', repr('axyzc')), ('a.*c', 'axyzd', '', repr(None)), ('a[bc]d', 'abc', '', repr(None)), ('a[bc]d', 'abd', '0', repr('abd')), ('a[b-d]e', 'abd', '', repr(None)), ('a[b-d]e', 'ace', '0', repr('ace')), ('a[b-d]', 'aac', '0', repr('ac')), ('a[-b]', 'a-', '0', repr('a-')), ('a[b-]', 'a-', '0', repr('a-')), ('a[b-a]', '-', '', regex.error, self.BAD_CHAR_RANGE), ('a[]b', '-', '', regex.error, self.BAD_SET), ('a[', '-', '', regex.error, self.BAD_SET), ('a]', 'a]', '0', repr('a]')), ('a[]]b', 'a]b', '0', repr('a]b')), ('a[^bc]d', 'aed', '0', repr('aed')), ('a[^bc]d', 'abd', '', repr(None)), ('a[^-b]c', 'adc', '0', repr('adc')), ('a[^-b]c', 'a-c', '', repr(None)), ('a[^]b]c', 'a]c', '', repr(None)), ('a[^]b]c', 'adc', '0', repr('adc')), ('ab|cd', 'abc', '0', repr('ab')), ('ab|cd', 'abcd', '0', repr('ab')), ('()ef', 'def', '0,1', repr(('ef', ''))), ('*a', '-', '', regex.error, self.NOTHING_TO_REPEAT), ('(*)b', '-', '', regex.error, self.NOTHING_TO_REPEAT), ('$b', 'b', '', repr(None)), ('a\\', '-', '', regex.error, self.BAD_ESCAPE), ('a\\(b', 'a(b', '', repr(('a(b',))), ('a\\(*b', 'ab', '0', repr('ab')), ('a\\(*b', 'a((b', '0', repr('a((b')), ('a\\\\b', 'a\\b', '0', repr('a\\b')), ('abc)', '-', '', regex.error, self.TRAILING_CHARS), ('(abc', '-', '', regex.error, self.MISSING_RPAREN), ('((a))', 'abc', '0,1,2', repr(('a', 'a', 'a'))), ('(a)b(c)', 'abc', '0,1,2', repr(('abc', 'a', 'c'))), ('a+b+c', 'aabbabc', '0', repr('abc')), ('a{1,}b{1,}c', 'aabbabc', '0', repr('abc')), ('a**', '-', '', regex.error, self.NOTHING_TO_REPEAT), ('a.+?c', 'abcabc', '0', repr('abc')), ('(a+|b)*', 'ab', '0,1', repr(('ab', 'b'))), ('(a+|b){0,}', 'ab', '0,1', repr(('ab', 'b'))), ('(a+|b)+', 'ab', '0,1', repr(('ab', 'b'))), ('(a+|b){1,}', 'ab', '0,1', repr(('ab', 'b'))), ('(a+|b)?', 'ab', '0,1', repr(('a', 'a'))), ('(a+|b){0,1}', 'ab', '0,1', repr(('a', 'a'))), (')(', '-', '', regex.error, self.TRAILING_CHARS), ('[^ab]*', 'cde', '0', repr('cde')), ('abc', '', '', repr(None)), ('a*', '', '0', repr('')), ('([abc])*d', 'abbbcd', '0,1', repr(('abbbcd', 'c'))), ('([abc])*bcd', 'abcd', '0,1', repr(('abcd', 'a'))), ('a|b|c|d|e', 'e', '0', repr('e')), ('(a|b|c|d|e)f', 'ef', '0,1', repr(('ef', 'e'))), ('abcd*efg', 'abcdefg', '0', repr('abcdefg')), ('ab*', 'xabyabbbz', '0', repr('ab')), ('ab*', 'xayabbbz', '0', repr('a')), ('(ab|cd)e', 'abcde', '0,1', repr(('cde', 'cd'))), ('[abhgefdc]ij', 'hij', '0', repr('hij')), ('^(ab|cd)e', 'abcde', '', repr(None)), ('(abc|)ef', 'abcdef', '0,1', repr(('ef', ''))), ('(a|b)c*d', 'abcd', '0,1', repr(('bcd', 'b'))), ('(ab|ab*)bc', 'abc', '0,1', repr(('abc', 'a'))), ('a([bc]*)c*', 'abc', '0,1', repr(('abc', 'bc'))), ('a([bc]*)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), ('a([bc]+)(c*d)', 'abcd', '0,1,2', repr(('abcd', 'bc', 'd'))), ('a([bc]*)(c+d)', 'abcd', '0,1,2', repr(('abcd', 'b', 'cd'))), ('a[bcd]*dcdcde', 'adcdcde', '0', repr('adcdcde')), ('a[bcd]+dcdcde', 'adcdcde', '', repr(None)), ('(ab|a)b*c', 'abc', '0,1', repr(('abc', 'ab'))), ('((a)(b)c)(d)', 'abcd', '1,2,3,4', repr(('abc', 'a', 'b', 'd'))), ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', '0', repr('alpha')), ('^a(bc+|b[eh])g|.h$', 'abh', '0,1', repr(('bh', None))), ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', '0,1,2', repr(('effgz', 'effgz', None))), ('(bc+d$|ef*g.|h?i(j|k))', 'ij', '0,1,2', repr(('ij', 'ij', 'j'))), ('(bc+d$|ef*g.|h?i(j|k))', 'effg', '', repr(None)), ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', '', repr(None)), ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', '0,1,2', repr(('effgz', 'effgz', None))), ('((((((((((a))))))))))', 'a', '10', repr('a')), ('((((((((((a))))))))))\\10', 'aa', '0', repr('aa')), # Python does not have the same rules for \\41 so this is a syntax error # ('((((((((((a))))))))))\\41', 'aa', '', repr(None)), # ('((((((((((a))))))))))\\41', 'a!', '0', repr('a!')), ('((((((((((a))))))))))\\41', '', '', regex.error, self.UNKNOWN_GROUP), ('(?i)((((((((((a))))))))))\\41', '', '', regex.error, self.UNKNOWN_GROUP), ('(((((((((a)))))))))', 'a', '0', repr('a')), ('multiple words of text', 'uh-uh', '', repr(None)), ('multiple words', 'multiple words, yeah', '0', repr('multiple words')), ('(.*)c(.*)', 'abcde', '0,1,2', repr(('abcde', 'ab', 'de'))), ('\\((.*), (.*)\\)', '(a, b)', '2,1', repr(('b', 'a'))), ('[k]', 'ab', '', repr(None)), ('a[-]?c', 'ac', '0', repr('ac')), ('(abc)\\1', 'abcabc', '1', repr('abc')), ('([a-c]*)\\1', 'abcabc', '1', repr('abc')), ('(?i)abc', 'ABC', '0', repr('ABC')), ('(?i)abc', 'XBC', '', repr(None)), ('(?i)abc', 'AXC', '', repr(None)), ('(?i)abc', 'ABX', '', repr(None)), ('(?i)abc', 'XABCY', '0', repr('ABC')), ('(?i)abc', 'ABABC', '0', repr('ABC')), ('(?i)ab*c', 'ABC', '0', repr('ABC')), ('(?i)ab*bc', 'ABC', '0', repr('ABC')), ('(?i)ab*bc', 'ABBC', '0', repr('ABBC')), ('(?i)ab*?bc', 'ABBBBC', '0', repr('ABBBBC')), ('(?i)ab{0,}?bc', 'ABBBBC', '0', repr('ABBBBC')), ('(?i)ab+?bc', 'ABBC', '0', repr('ABBC')), ('(?i)ab+bc', 'ABC', '', repr(None)), ('(?i)ab+bc', 'ABQ', '', repr(None)), ('(?i)ab{1,}bc', 'ABQ', '', repr(None)), ('(?i)ab+bc', 'ABBBBC', '0', repr('ABBBBC')), ('(?i)ab{1,}?bc', 'ABBBBC', '0', repr('ABBBBC')), ('(?i)ab{1,3}?bc', 'ABBBBC', '0', repr('ABBBBC')), ('(?i)ab{3,4}?bc', 'ABBBBC', '0', repr('ABBBBC')), ('(?i)ab{4,5}?bc', 'ABBBBC', '', repr(None)), ('(?i)ab??bc', 'ABBC', '0', repr('ABBC')), ('(?i)ab??bc', 'ABC', '0', repr('ABC')), ('(?i)ab{0,1}?bc', 'ABC', '0', repr('ABC')), ('(?i)ab??bc', 'ABBBBC', '', repr(None)), ('(?i)ab??c', 'ABC', '0', repr('ABC')), ('(?i)ab{0,1}?c', 'ABC', '0', repr('ABC')), ('(?i)^abc$', 'ABC', '0', repr('ABC')), ('(?i)^abc$', 'ABCC', '', repr(None)), ('(?i)^abc', 'ABCC', '0', repr('ABC')), ('(?i)^abc$', 'AABC', '', repr(None)), ('(?i)abc$', 'AABC', '0', repr('ABC')), ('(?i)^', 'ABC', '0', repr('')), ('(?i)$', 'ABC', '0', repr('')), ('(?i)a.c', 'ABC', '0', repr('ABC')), ('(?i)a.c', 'AXC', '0', repr('AXC')), ('(?i)a.*?c', 'AXYZC', '0', repr('AXYZC')), ('(?i)a.*c', 'AXYZD', '', repr(None)), ('(?i)a[bc]d', 'ABC', '', repr(None)), ('(?i)a[bc]d', 'ABD', '0', repr('ABD')), ('(?i)a[b-d]e', 'ABD', '', repr(None)), ('(?i)a[b-d]e', 'ACE', '0', repr('ACE')), ('(?i)a[b-d]', 'AAC', '0', repr('AC')), ('(?i)a[-b]', 'A-', '0', repr('A-')), ('(?i)a[b-]', 'A-', '0', repr('A-')), ('(?i)a[b-a]', '-', '', regex.error, self.BAD_CHAR_RANGE), ('(?i)a[]b', '-', '', regex.error, self.BAD_SET), ('(?i)a[', '-', '', regex.error, self.BAD_SET), ('(?i)a]', 'A]', '0', repr('A]')), ('(?i)a[]]b', 'A]B', '0', repr('A]B')), ('(?i)a[^bc]d', 'AED', '0', repr('AED')), ('(?i)a[^bc]d', 'ABD', '', repr(None)), ('(?i)a[^-b]c', 'ADC', '0', repr('ADC')), ('(?i)a[^-b]c', 'A-C', '', repr(None)), ('(?i)a[^]b]c', 'A]C', '', repr(None)), ('(?i)a[^]b]c', 'ADC', '0', repr('ADC')), ('(?i)ab|cd', 'ABC', '0', repr('AB')), ('(?i)ab|cd', 'ABCD', '0', repr('AB')), ('(?i)()ef', 'DEF', '0,1', repr(('EF', ''))), ('(?i)*a', '-', '', regex.error, self.NOTHING_TO_REPEAT), ('(?i)(*)b', '-', '', regex.error, self.NOTHING_TO_REPEAT), ('(?i)$b', 'B', '', repr(None)), ('(?i)a\\', '-', '', regex.error, self.BAD_ESCAPE), ('(?i)a\\(b', 'A(B', '', repr(('A(B',))), ('(?i)a\\(*b', 'AB', '0', repr('AB')), ('(?i)a\\(*b', 'A((B', '0', repr('A((B')), ('(?i)a\\\\b', 'A\\B', '0', repr('A\\B')), ('(?i)abc)', '-', '', regex.error, self.TRAILING_CHARS), ('(?i)(abc', '-', '', regex.error, self.MISSING_RPAREN), ('(?i)((a))', 'ABC', '0,1,2', repr(('A', 'A', 'A'))), ('(?i)(a)b(c)', 'ABC', '0,1,2', repr(('ABC', 'A', 'C'))), ('(?i)a+b+c', 'AABBABC', '0', repr('ABC')), ('(?i)a{1,}b{1,}c', 'AABBABC', '0', repr('ABC')), ('(?i)a**', '-', '', regex.error, self.NOTHING_TO_REPEAT), ('(?i)a.+?c', 'ABCABC', '0', repr('ABC')), ('(?i)a.*?c', 'ABCABC', '0', repr('ABC')), ('(?i)a.{0,5}?c', 'ABCABC', '0', repr('ABC')), ('(?i)(a+|b)*', 'AB', '0,1', repr(('AB', 'B'))), ('(?i)(a+|b){0,}', 'AB', '0,1', repr(('AB', 'B'))), ('(?i)(a+|b)+', 'AB', '0,1', repr(('AB', 'B'))), ('(?i)(a+|b){1,}', 'AB', '0,1', repr(('AB', 'B'))), ('(?i)(a+|b)?', 'AB', '0,1', repr(('A', 'A'))), ('(?i)(a+|b){0,1}', 'AB', '0,1', repr(('A', 'A'))), ('(?i)(a+|b){0,1}?', 'AB', '0,1', repr(('', None))), ('(?i))(', '-', '', regex.error, self.TRAILING_CHARS), ('(?i)[^ab]*', 'CDE', '0', repr('CDE')), ('(?i)abc', '', '', repr(None)), ('(?i)a*', '', '0', repr('')), ('(?i)([abc])*d', 'ABBBCD', '0,1', repr(('ABBBCD', 'C'))), ('(?i)([abc])*bcd', 'ABCD', '0,1', repr(('ABCD', 'A'))), ('(?i)a|b|c|d|e', 'E', '0', repr('E')), ('(?i)(a|b|c|d|e)f', 'EF', '0,1', repr(('EF', 'E'))), ('(?i)abcd*efg', 'ABCDEFG', '0', repr('ABCDEFG')), ('(?i)ab*', 'XABYABBBZ', '0', repr('AB')), ('(?i)ab*', 'XAYABBBZ', '0', repr('A')), ('(?i)(ab|cd)e', 'ABCDE', '0,1', repr(('CDE', 'CD'))), ('(?i)[abhgefdc]ij', 'HIJ', '0', repr('HIJ')), ('(?i)^(ab|cd)e', 'ABCDE', '', repr(None)), ('(?i)(abc|)ef', 'ABCDEF', '0,1', repr(('EF', ''))), ('(?i)(a|b)c*d', 'ABCD', '0,1', repr(('BCD', 'B'))), ('(?i)(ab|ab*)bc', 'ABC', '0,1', repr(('ABC', 'A'))), ('(?i)a([bc]*)c*', 'ABC', '0,1', repr(('ABC', 'BC'))), ('(?i)a([bc]*)(c*d)', 'ABCD', '0,1,2', repr(('ABCD', 'BC', 'D'))), ('(?i)a([bc]+)(c*d)', 'ABCD', '0,1,2', repr(('ABCD', 'BC', 'D'))), ('(?i)a([bc]*)(c+d)', 'ABCD', '0,1,2', repr(('ABCD', 'B', 'CD'))), ('(?i)a[bcd]*dcdcde', 'ADCDCDE', '0', repr('ADCDCDE')), ('(?i)a[bcd]+dcdcde', 'ADCDCDE', '', repr(None)), ('(?i)(ab|a)b*c', 'ABC', '0,1', repr(('ABC', 'AB'))), ('(?i)((a)(b)c)(d)', 'ABCD', '1,2,3,4', repr(('ABC', 'A', 'B', 'D'))), ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', '0', repr('ALPHA')), ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', '0,1', repr(('BH', None))), ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', '0,1,2', repr(('EFFGZ', 'EFFGZ', None))), ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', '0,1,2', repr(('IJ', 'IJ', 'J'))), ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', '', repr(None)), ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', '', repr(None)), ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', '0,1,2', repr(('EFFGZ', 'EFFGZ', None))), ('(?i)((((((((((a))))))))))', 'A', '10', repr('A')), ('(?i)((((((((((a))))))))))\\10', 'AA', '0', repr('AA')), #('(?i)((((((((((a))))))))))\\41', 'AA', '', repr(None)), #('(?i)((((((((((a))))))))))\\41', 'A!', '0', repr('A!')), ('(?i)(((((((((a)))))))))', 'A', '0', repr('A')), ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', '1', repr('A')), ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', '1', repr('C')), ('(?i)multiple words of text', 'UH-UH', '', repr(None)), ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', '0', repr('MULTIPLE WORDS')), ('(?i)(.*)c(.*)', 'ABCDE', '0,1,2', repr(('ABCDE', 'AB', 'DE'))), ('(?i)\\((.*), (.*)\\)', '(A, B)', '2,1', repr(('B', 'A'))), ('(?i)[k]', 'AB', '', repr(None)), # ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', repr(ABCD-$&-\\ABCD)), # ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', repr(BC-$1-\\BC)), ('(?i)a[-]?c', 'AC', '0', repr('AC')), ('(?i)(abc)\\1', 'ABCABC', '1', repr('ABC')), ('(?i)([a-c]*)\\1', 'ABCABC', '1', repr('ABC')), ('a(?!b).', 'abad', '0', repr('ad')), ('a(?=d).', 'abad', '0', repr('ad')), ('a(?=c|d).', 'abad', '0', repr('ad')), ('a(?:b|c|d)(.)', 'ace', '1', repr('e')), ('a(?:b|c|d)*(.)', 'ace', '1', repr('e')), ('a(?:b|c|d)+?(.)', 'ace', '1', repr('e')), ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', '1,2', repr(('c', 'e'))), # Lookbehind: split by : but not if it is escaped by -. ('(?]*?b', 'a>b', '', repr(None)), # Bug 490573: minimizing repeat problem. (r'^a*?$', 'foo', '', repr(None)), # Bug 470582: nested groups problem. (r'^((a)c)?(ab)$', 'ab', '1,2,3', repr((None, None, 'ab'))), # Another minimizing repeat problem (capturing groups in assertions). ('^([ab]*?)(?=(b)?)c', 'abc', '1,2', repr(('ab', None))), ('^([ab]*?)(?!(b))c', 'abc', '1,2', repr(('ab', None))), ('^([ab]*?)(?(.){0,2})d", "abcd").captures(1), ['b', 'c']) self.assertEqual(regex.search(r"(.)+", "a").captures(1), ['a']) def test_guards(self): m = regex.search(r"(X.*?Y\s*){3}(X\s*)+AB:", "XY\nX Y\nX Y\nXY\nXX AB:") self.assertEqual(m.span(0, 1, 2), ((3, 21), (12, 15), (16, 18))) m = regex.search(r"(X.*?Y\s*){3,}(X\s*)+AB:", "XY\nX Y\nX Y\nXY\nXX AB:") self.assertEqual(m.span(0, 1, 2), ((0, 21), (12, 15), (16, 18))) m = regex.search(r'\d{4}(\s*\w)?\W*((?!\d)\w){2}', "9999XX") self.assertEqual(m.span(0, 1, 2), ((0, 6), (-1, -1), (5, 6))) m = regex.search(r'A\s*?.*?(\n+.*?\s*?){0,2}\(X', 'A\n1\nS\n1 (X') self.assertEqual(m.span(0, 1), ((0, 10), (5, 8))) m = regex.search('Derde\s*:', 'aaaaaa:\nDerde:') self.assertEqual(m.span(), (8, 14)) m = regex.search('Derde\s*:', 'aaaaa:\nDerde:') self.assertEqual(m.span(), (7, 13)) def test_turkic(self): # Turkish has dotted and dotless I/i. pairs = u"I=i;I=\u0131;i=\u0130" all_chars = set() matching = set() for pair in pairs.split(";"): ch1, ch2 = pair.split("=") all_chars.update((ch1, ch2)) matching.add((ch1, ch1)) matching.add((ch1, ch2)) matching.add((ch2, ch1)) matching.add((ch2, ch2)) for ch1 in all_chars: for ch2 in all_chars: m = regex.match(ur"(?iu)\A" + ch1 + ur"\Z", ch2) if m: if (ch1, ch2) not in matching: self.fail("%s matching %s" % (repr(ch1), repr(ch2))) else: if (ch1, ch2) in matching: self.fail("%s not matching %s" % (repr(ch1), repr(ch2))) def test_named_lists(self): options = [u"one", u"two", u"three"] self.assertEqual(regex.match(ur"333\L444", u"333one444", bar=options).group(), u"333one444") self.assertEqual(regex.match(ur"(?i)333\L444", u"333TWO444", bar=options).group(), u"333TWO444") self.assertEqual(regex.match(ur"333\L444", u"333four444", bar=options), None) options = ["one", "two", "three"] self.assertEqual(regex.match(r"333\L444", "333one444", bar=options).group(), "333one444") self.assertEqual(regex.match(r"(?i)333\L444", "333TWO444", bar=options).group(), "333TWO444") self.assertEqual(regex.match(r"333\L444", "333four444", bar=options), None) self.assertEqual(repr(type(regex.compile(r"3\L4\L+5", bar=["one", "two", "three"]))), self.PATTERN_CLASS) self.assertEqual(regex.findall(r"^\L", "solid QWERT", options=set(['good', 'brilliant', '+s\\ol[i}d'])), []) self.assertEqual(regex.findall(r"^\L", "+solid QWERT", options=set(['good', 'brilliant', '+solid'])), ['+solid']) options = [u"STRASSE"] self.assertEqual(regex.match(ur"(?fiu)\L", u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0, 6)) options = [u"STRASSE", u"stress"] self.assertEqual(regex.match(ur"(?fiu)\L", u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0, 6)) options = [u"stra\N{LATIN SMALL LETTER SHARP S}e"] self.assertEqual(regex.match(ur"(?fiu)\L", u"STRASSE", words=options).span(), (0, 7)) options = ["kit"] self.assertEqual(regex.search(ur"(?iu)\L", u"SKITS", words=options).span(), (1, 4)) self.assertEqual(regex.search(ur"(?iu)\L", u"SK\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}TS", words=options).span(), (1, 4)) self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b", u" stra\N{LATIN SMALL LETTER SHARP S}e STRASSE ").span(), (1, 15)) self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b", u" STRASSE stra\N{LATIN SMALL LETTER SHARP S}e ").span(), (1, 15)) self.assertEqual(regex.search(r"^\L$", "", options=[]).span(), (0, 0)) def test_fuzzy(self): # Some tests borrowed from TRE library tests. self.assertEqual(repr(type(regex.compile('(fou){s,e<=1}'))), self.PATTERN_CLASS) self.assertEqual(repr(type(regex.compile('(fuu){s}'))), self.PATTERN_CLASS) self.assertEqual(repr(type(regex.compile('(fuu){s,e}'))), self.PATTERN_CLASS) self.assertEqual(repr(type(regex.compile('(anaconda){1i+1d<1,s<=1}'))), self.PATTERN_CLASS) self.assertEqual(repr(type(regex.compile('(anaconda){1i+1d<1,s<=1,e<=10}'))), self.PATTERN_CLASS) self.assertEqual(repr(type(regex.compile('(anaconda){s<=1,e<=1,1i+1d<1}'))), self.PATTERN_CLASS) text = 'molasses anaconda foo bar baz smith anderson ' self.assertEqual(regex.search('(znacnda){s<=1,e<=3,1i+1d<1}', text), None) self.assertEqual(regex.search('(znacnda){s<=1,e<=3,1i+1d<2}', text).span(0, 1), ((9, 17), (9, 17))) self.assertEqual(regex.search('(ananda){1i+1d<2}', text), None) self.assertEqual(regex.search(r"(?:\bznacnda){e<=2}", text)[0], "anaconda") self.assertEqual(regex.search(r"(?:\bnacnda){e<=2}", text)[0], "anaconda") text = 'anaconda foo bar baz smith anderson' self.assertEqual(regex.search('(fuu){i<=3,d<=3,e<=5}', text).span(0, 1), ((0, 0), (0, 0))) self.assertEqual(regex.search('(?b)(fuu){i<=3,d<=3,e<=5}', text).span(0, 1), ((9, 10), (9, 10))) self.assertEqual(regex.search('(fuu){i<=2,d<=2,e<=5}', text).span(0, 1), ((7, 10), (7, 10))) self.assertEqual(regex.search('(?e)(fuu){i<=2,d<=2,e<=5}', text).span(0, 1), ((9, 10), (9, 10))) self.assertEqual(regex.search('(fuu){i<=3,d<=3,e}', text).span(0, 1), ((0, 0), (0, 0))) self.assertEqual(regex.search('(?b)(fuu){i<=3,d<=3,e}', text).span(0, 1), ((9, 10), (9, 10))) self.assertEqual(repr(type(regex.compile('(approximate){s<=3,1i+1d<3}'))), self.PATTERN_CLASS) # No cost limit. self.assertEqual(regex.search('(foobar){e}', 'xirefoabralfobarxie').span(0, 1), ((0, 6), (0, 6))) self.assertEqual(regex.search('(?e)(foobar){e}', 'xirefoabralfobarxie').span(0, 1), ((0, 3), (0, 3))) self.assertEqual(regex.search('(?b)(foobar){e}', 'xirefoabralfobarxie').span(0, 1), ((11, 16), (11, 16))) # At most two errors. self.assertEqual(regex.search('(foobar){e<=2}', 'xirefoabrzlfd').span(0, 1), ((4, 9), (4, 9))) self.assertEqual(regex.search('(foobar){e<=2}', 'xirefoabzlfd'), None) # At most two inserts or substitutions and max two errors total. self.assertEqual(regex.search('(foobar){i<=2,s<=2,e<=2}', 'oobargoobaploowap').span(0, 1), ((5, 11), (5, 11))) # Find best whole word match for "foobar". self.assertEqual(regex.search('\\b(foobar){e}\\b', 'zfoobarz').span(0, 1), ((0, 8), (0, 8))) self.assertEqual(regex.search('\\b(foobar){e}\\b', 'boing zfoobarz goobar woop').span(0, 1), ((0, 6), (0, 6))) self.assertEqual(regex.search('(?b)\\b(foobar){e}\\b', 'boing zfoobarz goobar woop').span(0, 1), ((15, 21), (15, 21))) # Match whole string, allow only 1 error. self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobar').span(0, 1), ((0, 6), (0, 6))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoobar').span(0, 1), ((0, 7), (0, 7))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobarx').span(0, 1), ((0, 7), (0, 7))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'fooxbar').span(0, 1), ((0, 7), (0, 7))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'foxbar').span(0, 1), ((0, 6), (0, 6))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'xoobar').span(0, 1), ((0, 6), (0, 6))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobax').span(0, 1), ((0, 6), (0, 6))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'oobar').span(0, 1), ((0, 5), (0, 5))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'fobar').span(0, 1), ((0, 5), (0, 5))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'fooba').span(0, 1), ((0, 5), (0, 5))) self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoobarx'), None) self.assertEqual(regex.search('^(foobar){e<=1}$', 'foobarxx'), None) self.assertEqual(regex.search('^(foobar){e<=1}$', 'xxfoobar'), None) self.assertEqual(regex.search('^(foobar){e<=1}$', 'xfoxbar'), None) self.assertEqual(regex.search('^(foobar){e<=1}$', 'foxbarx'), None) # At most one insert, two deletes, and three substitutions. # Additionally, deletes cost two and substitutes one, and total # cost must be less than 4. self.assertEqual(regex.search('(foobar){i<=1,d<=2,s<=3,2d+1s<4}', '3oifaowefbaoraofuiebofasebfaobfaorfeoaro').span(0, 1), ((6, 13), (6, 13))) self.assertEqual(regex.search('(?b)(foobar){i<=1,d<=2,s<=3,2d+1s<4}', '3oifaowefbaoraofuiebofasebfaobfaorfeoaro').span(0, 1), ((26, 33), (26, 33))) # Partially fuzzy matches. self.assertEqual(regex.search('foo(bar){e<=1}zap', 'foobarzap').span(0, 1), ((0, 9), (3, 6))) self.assertEqual(regex.search('foo(bar){e<=1}zap', 'fobarzap'), None) self.assertEqual(regex.search('foo(bar){e<=1}zap', 'foobrzap').span(0, 1), ((0, 8), (3, 5))) text = ('www.cnn.com 64.236.16.20\nwww.slashdot.org 66.35.250.150\n' 'For useful information, use www.slashdot.org\nthis is demo data!\n') self.assertEqual(regex.search(r'(?s)^.*(dot.org){e}.*$', text).span(0, 1), ((0, 120), (120, 120))) self.assertEqual(regex.search(r'(?es)^.*(dot.org){e}.*$', text).span(0, 1), ((0, 120), (93, 100))) self.assertEqual(regex.search(r'^.*(dot.org){e}.*$', text).span(0, 1), ((0, 119), (24, 101))) # Behaviour is unexpected, but arguably not wrong. It first finds the # best match, then the best in what follows, etc. self.assertEqual(regex.findall(r"\b\L{e<=1}\b", " book cot dog desk ", words="cat dog".split()), ["cot", "dog"]) self.assertEqual(regex.findall(r"\b\L{e<=1}\b", " book dog cot desk ", words="cat dog".split()), [" dog", "cot"]) self.assertEqual(regex.findall(r"(?e)\b\L{e<=1}\b", " book dog cot desk ", words="cat dog".split()), ["dog", "cot"]) self.assertEqual(regex.findall(r"(?r)\b\L{e<=1}\b", " book cot dog desk ", words="cat dog".split()), ["dog ", "cot"]) self.assertEqual(regex.findall(r"(?er)\b\L{e<=1}\b", " book cot dog desk ", words="cat dog".split()), ["dog", "cot"]) self.assertEqual(regex.findall(r"(?r)\b\L{e<=1}\b", " book dog cot desk ", words="cat dog".split()), ["cot", "dog"]) self.assertEqual(regex.findall(ur"\b\L{e<=1}\b", u" book cot dog desk ", words=u"cat dog".split()), [u"cot", u"dog"]) self.assertEqual(regex.findall(ur"\b\L{e<=1}\b", u" book dog cot desk ", words=u"cat dog".split()), [u" dog", u"cot"]) self.assertEqual(regex.findall(ur"(?e)\b\L{e<=1}\b", u" book dog cot desk ", words=u"cat dog".split()), [u"dog", u"cot"]) self.assertEqual(regex.findall(ur"(?r)\b\L{e<=1}\b", u" book cot dog desk ", words=u"cat dog".split()), [u"dog ", u"cot"]) self.assertEqual(regex.findall(ur"(?er)\b\L{e<=1}\b", u" book cot dog desk ", words=u"cat dog".split()), [u"dog", u"cot"]) self.assertEqual(regex.findall(ur"(?r)\b\L{e<=1}\b", u" book dog cot desk ", words=u"cat dog".split()), [u"cot", u"dog"]) self.assertEqual(regex.search(r"(\w+) (\1{e<=1})", "foo fou").groups(), ("foo", "fou")) self.assertEqual(regex.search(r"(?r)(\2{e<=1}) (\w+)", "foo fou").groups(), ("foo", "fou")) self.assertEqual(regex.search(ur"(\w+) (\1{e<=1})", u"foo fou").groups(), (u"foo", u"fou")) self.assertEqual(regex.findall(r"(?:(?:QR)+){e}","abcde"), ["abcde", ""]) self.assertEqual(regex.findall(r"(?:Q+){e}","abc"), ["abc", ""]) # Hg issue 41. self.assertEqual(regex.match(r"(?:service detection){0[^()]+)|(?R))*\)", "(ab(cd)ef)")[ : ], ("(ab(cd)ef)", "ef")) self.assertEqual(regex.search(r"\(((?>[^()]+)|(?R))*\)", "(ab(cd)ef)").captures(1), ["ab", "cd", "(cd)", "ef"]) self.assertEqual(regex.search(r"(?r)\(((?R)|(?>[^()]+))*\)", "(ab(cd)ef)")[ : ], ("(ab(cd)ef)", "ab")) self.assertEqual(regex.search(r"(?r)\(((?R)|(?>[^()]+))*\)", "(ab(cd)ef)").captures(1), ["ef", "cd", "(cd)", "ab"]) self.assertEqual(regex.search(r"\(([^()]+|(?R))*\)", "some text (a(b(c)d)e) more text")[ : ], ("(a(b(c)d)e)", "e")) self.assertEqual(regex.search(r"(?r)\(((?R)|[^()]+)*\)", "some text (a(b(c)d)e) more text")[ : ], ("(a(b(c)d)e)", "a")) self.assertEqual(regex.search(r"(foo(\(((?:(?>[^()]+)|(?2))*)\)))", "foo(bar(baz)+baz(bop))")[ : ], ("foo(bar(baz)+baz(bop))", "foo(bar(baz)+baz(bop))", "(bar(baz)+baz(bop))", "bar(baz)+baz(bop)")) self.assertEqual(regex.search(r"(?r)(foo(\(((?:(?2)|(?>[^()]+))*)\)))", "foo(bar(baz)+baz(bop))")[ : ], ("foo(bar(baz)+baz(bop))", "foo(bar(baz)+baz(bop))", "(bar(baz)+baz(bop))", "bar(baz)+baz(bop)")) rgx = regex.compile(r"""^\s*(<\s*([a-zA-Z:]+)(?:\s*[a-zA-Z:]*\s*=\s*(?:'[^']*'|"[^"]*"))*\s*(/\s*)?>(?:[^<>]*|(?1))*(?(3)|<\s*/\s*\2\s*>))\s*$""") self.assertEqual(bool(rgx.search('')), True) self.assertEqual(bool(rgx.search('')), False) self.assertEqual(bool(rgx.search('')), True) self.assertEqual(bool(rgx.search('')), False) self.assertEqual(bool(rgx.search('')), False) self.assertEqual(bool(rgx.search('')), False) self.assertEqual(bool(rgx.search('')), True) self.assertEqual(bool(rgx.search('< fooo / >')), True) # The next regex should and does match. Perl 5.14 agrees. #self.assertEqual(bool(rgx.search('foo')), False) self.assertEqual(bool(rgx.search('foo')), False) self.assertEqual(bool(rgx.search('foo')), True) self.assertEqual(bool(rgx.search('foo')), True) self.assertEqual(bool(rgx.search('')), True) def test_copy(self): # PatternObjects are immutable, therefore there's no need to clone them. r = regex.compile("a") self.assert_(copy.copy(r) is r) self.assert_(copy.deepcopy(r) is r) # MatchObjects are normally mutable because the target string can be # detached. However, after the target string has been detached, a # MatchObject becomes immutable, so there's no need to clone it. m = r.match("a") self.assert_(copy.copy(m) is not m) self.assert_(copy.deepcopy(m) is not m) self.assert_(m.string is not None) m2 = copy.copy(m) m2.detach_string() self.assert_(m.string is not None) self.assert_(m2.string is None) # The following behaviour matches that of the re module. it = regex.finditer(".", "ab") it2 = copy.copy(it) self.assertEqual(it.next().group(), "a") self.assertEqual(it2.next().group(), "b") # The following behaviour matches that of the re module. it = regex.finditer(".", "ab") it2 = copy.deepcopy(it) self.assertEqual(it.next().group(), "a") self.assertEqual(it2.next().group(), "b") # The following behaviour is designed to match that of copying 'finditer'. it = regex.splititer(" ", "a b") it2 = copy.copy(it) self.assertEqual(it.next(), "a") self.assertEqual(it2.next(), "b") # The following behaviour is designed to match that of copying 'finditer'. it = regex.splititer(" ", "a b") it2 = copy.deepcopy(it) self.assertEqual(it.next(), "a") self.assertEqual(it2.next(), "b") def test_format(self): self.assertEqual(regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar"), "foo bar => bar foo") self.assertEqual(regex.subf(r"(?\w+) (?\w+)", "{word2} {word1}", "foo bar"), "bar foo") self.assertEqual(regex.subfn(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar"), ("foo bar => bar foo", 1)) self.assertEqual(regex.subfn(r"(?\w+) (?\w+)", "{word2} {word1}", "foo bar"), ("bar foo", 1)) self.assertEqual(regex.match(r"(\w+) (\w+)", "foo bar").expandf("{0} => {2} {1}"), "foo bar => bar foo") def test_fullmatch(self): self.assertEqual(bool(regex.fullmatch(r"abc", "abc")), True) self.assertEqual(bool(regex.fullmatch(r"abc", "abcx")), False) self.assertEqual(bool(regex.fullmatch(r"abc", "abcx", endpos=3)), True) self.assertEqual(bool(regex.fullmatch(r"abc", "xabc", pos=1)), True) self.assertEqual(bool(regex.fullmatch(r"abc", "xabcy", pos=1)), False) self.assertEqual(bool(regex.fullmatch(r"abc", "xabcy", pos=1, endpos=4)), True) self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abc")), True) self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abcx")), False) self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "abcx", endpos=3)), True) self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabc", pos=1)), True) self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabcy", pos=1)), False) self.assertEqual(bool(regex.fullmatch(r"(?r)abc", "xabcy", pos=1, endpos=4)), True) def test_hg_bugs(self): # Hg issue 28. self.assertEqual(bool(regex.compile("(?>b)", flags=regex.V1)), True) # Hg issue 29. self.assertEqual(bool(regex.compile("^((?>\w+)|(?>\s+))*$", flags=regex.V1)), True) # Hg issue 31. self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", "a(bcd(e)f)g(h)"), ['(bcd(e)f)', '(h)']) self.assertEqual(regex.findall(r"\((?:(?:[^()]+)|(?R))*\)", "a(bcd(e)f)g(h)"), ['(bcd(e)f)', '(h)']) self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", "a(b(cd)e)f)g)h"), ['(b(cd)e)']) self.assertEqual(regex.findall(r"\((?:(?>[^()]+)|(?R))*\)", "a(bc(d(e)f)gh"), ['(d(e)f)']) self.assertEqual(regex.findall(r"(?r)\((?:(?>[^()]+)|(?R))*\)", "a(bc(d(e)f)gh"), ['(d(e)f)']) self.assertEqual([m.group() for m in regex.finditer(r"\((?:[^()]*+|(?0))*\)", "a(b(c(de)fg)h")], ['(c(de)fg)']) # Hg issue 32. self.assertEqual(regex.search("a(bc)d", "abcd", regex.I | regex.V1).group(0), "abcd") # Hg issue 33. self.assertEqual(regex.search("([\da-f:]+)$", "E", regex.I | regex.V1).group(0), "E") self.assertEqual(regex.search("([\da-f:]+)$", "e", regex.I | regex.V1).group(0), "e") # Hg issue 34. self.assertEqual(regex.search("^(?=ab(de))(abd)(e)", "abde").groups(), ('de', 'abd', 'e')) # Hg issue 35. self.assertEqual(bool(regex.match(r"\ ", " ", flags=regex.X)), True) # Hg issue 36. self.assertEqual(regex.search(r"^(a|)\1{2}b", "b").group(0, 1), ('b', '')) # Hg issue 37. self.assertEqual(regex.search("^(a){0,0}", "abc").group(0, 1), ('', None)) # Hg issue 38. self.assertEqual(regex.search("(?>.*/)b", "a/b").group(0), "a/b") # Hg issue 39. self.assertEqual(regex.search(r"(?V0)((?i)blah)\s+\1", "blah BLAH").group(0, 1), ("blah BLAH", "blah")) self.assertEqual(regex.search(r"(?V1)((?i)blah)\s+\1", "blah BLAH"), None) # Hg issue 40. self.assertEqual(regex.search(r"(\()?[^()]+(?(1)\)|)", "(abcd").group(0), "abcd") # Hg issue 42. self.assertEqual(regex.search("(a*)*", "a").span(1), (1, 1)) self.assertEqual(regex.search("(a*)*", "aa").span(1), (2, 2)) self.assertEqual(regex.search("(a*)*", "aaa").span(1), (3, 3)) # Hg issue 43. self.assertEqual(regex.search("a(?#xxx)*", "aaa").group(), "aaa") # Hg issue 44. self.assertEqual(regex.search("(?=abc){3}abc", "abcabcabc").span(), (0, 3)) # Hg issue 45. self.assertEqual(regex.search("^(?:a(?:(?:))+)+", "a").span(), (0, 1)) self.assertEqual(regex.search("^(?:a(?:(?:))+)+", "aa").span(), (0, 2)) # Hg issue 46. self.assertEqual(regex.search("a(?x: b c )d", "abcd").group(0), "abcd") # Hg issue 47. self.assertEqual(regex.search("a#comment\n*", "aaa", flags=regex.X).group(0), "aaa") # Hg issue 48. self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){1}", "aaaaaaaaaa").span(0, 1), ((0, 1), (0, 1))) self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){2}", "aaaaaaaaaa").span(0, 1), ((0, 3), (1, 3))) self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){3}", "aaaaaaaaaa").span(0, 1), ((0, 6), (3, 6))) self.assertEqual(regex.search(r"(?V1)(a(?(1)\1)){4}", "aaaaaaaaaa").span(0, 1), ((0, 10), (6, 10))) # Hg issue 49. self.assertEqual(regex.search("(?V1)(a)(?<=b(?1))", "baz").group(0), "a") # Hg issue 50. self.assertEqual(regex.findall(ur'(?fi)\L', u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05', keywords=['post','pos']), [u'POST', u'Post', u'post', u'po\u017Ft', u'po\uFB06', u'po\uFB05']) self.assertEqual(regex.findall(ur'(?fi)pos|post', u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), [u'POS', u'Pos', u'pos', u'po\u017F', u'po\uFB06', u'po\uFB05']) self.assertEqual(regex.findall(ur'(?fi)post|pos', u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), [u'POST', u'Post', u'post', u'po\u017Ft', u'po\uFB06', u'po\uFB05']) self.assertEqual(regex.findall(ur'(?fi)post|another', u'POST, Post, post, po\u017Ft, po\uFB06, and po\uFB05'), [u'POST', u'Post', u'post', u'po\u017Ft', u'po\uFB06', u'po\uFB05']) # Hg issue 51. self.assertEqual(regex.search("(?V1)((a)(?1)|(?2))", "a").group(0, 1, 2), ('a', 'a', None)) # Hg issue 52. self.assertEqual(regex.search(r"(?V1)(\1xx|){6}", "xx").span(0, 1), ((0, 2), (2, 2))) # Hg issue 53. self.assertEqual(regex.search("(a|)+", "a").group(0, 1), ("a", "")) # Hg issue 54. self.assertEqual(regex.search(r"(a|)*\d", "a" * 80), None) # Hg issue 55. self.assertEqual(regex.search("^(?:a?b?)*$", "ac"), None) # Hg issue 58. self.assertRaisesRegex(regex.error, self.UNDEF_CHAR_NAME, lambda: regex.compile("\\N{1}")) # Hg issue 59. self.assertEqual(regex.search("\\Z", "a\na\n").span(0), (4, 4)) # Hg issue 60. self.assertEqual(regex.search("(q1|.)*(q2|.)*(x(a|bc)*y){2,}", "xayxay").group(0), "xayxay") # Hg issue 61. self.assertEqual(regex.search("(?i)[^a]", "A"), None) # Hg issue 63. self.assertEqual(regex.search(u"(?iu)[[:ascii:]]", u"\N{KELVIN SIGN}"), None) # Hg issue 66. self.assertEqual(regex.search("((a|b(?1)c){3,5})", "baaaaca").group(0, 1, 2), ('aaaa', 'aaaa', 'a')) # Hg issue 71. self.assertEqual(regex.findall(r"(?<=:\S+ )\w+", ":9 abc :10 def"), ['abc', 'def']) self.assertEqual(regex.findall(r"(?<=:\S* )\w+", ":9 abc :10 def"), ['abc', 'def']) self.assertEqual(regex.findall(r"(?<=:\S+? )\w+", ":9 abc :10 def"), ['abc', 'def']) self.assertEqual(regex.findall(r"(?<=:\S*? )\w+", ":9 abc :10 def"), ['abc', 'def']) # Hg issue 73. self.assertEqual(regex.search(r"(?:fe)?male", "female").group(), "female") self.assertEqual([m.group() for m in regex.finditer(r"(fe)?male: h(?(1)(er)|(is)) (\w+)", "female: her dog; male: his cat. asdsasda")], ['female: her dog', 'male: his cat']) # Hg issue 78. self.assertEqual(regex.search(r'(?\((?:[^()]++|(?&rec))*\))', 'aaa(((1+0)+1)+1)bbb').captures('rec'), ['(1+0)', '((1+0)+1)', '(((1+0)+1)+1)']) # Hg issue 80. self.assertRaisesRegex(regex.error, self.BAD_ESCAPE, lambda: regex.sub('x', '\\', 'x'), ) # Hg issue 82. fz = "(CAGCCTCCCATTTCAGAATATACATCC){1a(?b))', "ab").spans("x"), [(1, 2), (0, 2)]) # Hg issue 91. # Check that the replacement cache works. self.assertEqual(regex.sub(r'(-)', lambda m: m.expand(r'x'), 'a-b-c'), 'axbxc') # Hg issue 94. rx = regex.compile(r'\bt(est){i<2}', flags=regex.V1) self.assertEqual(rx.search("Some text"), None) self.assertEqual(rx.findall("Some text"), []) # Hg issue 95. self.assertRaisesRegex(regex.error, '^nothing to repeat at position 3$', lambda: regex.compile(r'.???')) # Hg issue 97. self.assertEquals(regex.escape(u'foo!?'), u'foo\\!\\?') self.assertEquals(regex.escape(u'foo!?', special_only=True), u'foo!\\?') self.assertEquals(regex.escape('foo!?'), 'foo\\!\\?') self.assertEquals(regex.escape('foo!?', special_only=True), 'foo!\\?') # Hg issue 100. self.assertEquals(regex.search('^([^z]*(?:WWWi|W))?$', 'WWWi').groups(), ('WWWi', )) self.assertEquals(regex.search('^([^z]*(?:WWWi|w))?$', 'WWWi').groups(), ('WWWi', )) self.assertEquals(regex.search('^([^z]*?(?:WWWi|W))?$', 'WWWi').groups(), ('WWWi', )) # Hg issue 101. pat = regex.compile(r'xxx', flags=regex.FULLCASE | regex.UNICODE) self.assertEquals([x.group() for x in pat.finditer('yxxx')], ['xxx']) self.assertEquals(pat.findall('yxxx'), ['xxx']) raw = 'yxxx' self.assertEquals([x.group() for x in pat.finditer(raw)], ['xxx']) self.assertEquals(pat.findall(raw), ['xxx']) pat = regex.compile(r'xxx', flags=regex.FULLCASE | regex.IGNORECASE | regex.UNICODE) self.assertEquals([x.group() for x in pat.finditer('yxxx')], ['xxx']) self.assertEquals(pat.findall('yxxx'), ['xxx']) raw = 'yxxx' self.assertEquals([x.group() for x in pat.finditer(raw)], ['xxx']) self.assertEquals(pat.findall(raw), ['xxx']) # Hg issue 106. self.assertEquals(regex.sub('(?V0).*', 'x', 'test'), 'x') self.assertEquals(regex.sub('(?V1).*', 'x', 'test'), 'xx') self.assertEquals(regex.sub('(?V0).*?', '|', 'test'), '|t|e|s|t|') self.assertEquals(regex.sub('(?V1).*?', '|', 'test'), '|||||||||') # Hg issue 112. self.assertEquals(regex.sub(r'^(@)\n(?!.*?@)(.*)', r'\1\n==========\n\2', '@\n', flags=regex.DOTALL), '@\n==========\n') # Hg issue 109. self.assertEquals(regex.match(r'(?:cats|cat){e<=1}', 'caz').fuzzy_counts, (1, 0, 0)) self.assertEquals(regex.match(r'(?e)(?:cats|cat){e<=1}', 'caz').fuzzy_counts, (1, 0, 0)) self.assertEquals(regex.match(r'(?b)(?:cats|cat){e<=1}', 'caz').fuzzy_counts, (1, 0, 0)) self.assertEquals(regex.match(r'(?:cat){e<=1}', 'caz').fuzzy_counts, (1, 0, 0)) self.assertEquals(regex.match(r'(?e)(?:cat){e<=1}', 'caz').fuzzy_counts, (1, 0, 0)) self.assertEquals(regex.match(r'(?b)(?:cat){e<=1}', 'caz').fuzzy_counts, (1, 0, 0)) self.assertEquals(regex.match(r'(?:cats){e<=2}', 'c ats').fuzzy_counts, (1, 1, 0)) self.assertEquals(regex.match(r'(?e)(?:cats){e<=2}', 'c ats').fuzzy_counts, (0, 1, 0)) self.assertEquals(regex.match(r'(?b)(?:cats){e<=2}', 'c ats').fuzzy_counts, (0, 1, 0)) self.assertEquals(regex.match(r'(?:cats){e<=2}', 'c a ts').fuzzy_counts, (0, 2, 0)) self.assertEquals(regex.match(r'(?e)(?:cats){e<=2}', 'c a ts').fuzzy_counts, (0, 2, 0)) self.assertEquals(regex.match(r'(?b)(?:cats){e<=2}', 'c a ts').fuzzy_counts, (0, 2, 0)) self.assertEquals(regex.match(r'(?:cats){e<=1}', 'c ats').fuzzy_counts, (0, 1, 0)) self.assertEquals(regex.match(r'(?e)(?:cats){e<=1}', 'c ats').fuzzy_counts, (0, 1, 0)) self.assertEquals(regex.match(r'(?b)(?:cats){e<=1}', 'c ats').fuzzy_counts, (0, 1, 0)) if not hasattr(str, "format"): # Strings don't have the .format method (below Python 2.6). del RegexTests.test_format def test_main(): run_unittest(RegexTests) if __name__ == "__main__": test_main()