r"""
Parse string to create Regex object.

TODO:
 - Support \: \001, \x00, \0, \ \[, \(, \{, etc.
 - Support Python extensions: (?:...), (?P<name>...), etc.
 - Support \<, \>, \s, \S, \w, \W, \Z <=> $, \d, \D, \A <=> ^, \b, \B,
   [[:space:]], etc.
"""

from hachoir.regex import (RegexString, RegexEmpty, RegexRepeat,
                           RegexDot, RegexWord, RegexStart, RegexEnd,
                           RegexRange, RegexRangeItem, RegexRangeCharacter)
import re

# Characters with a special meaning in a pattern; anything else is a literal.
REGEX_COMMAND_CHARACTERS = '.^$[](){}|+?*\\'


def parseRange(text, start):
    r"""
    Parse a character class such as ``[a-z]`` or ``[^abc-]``.

    *start* is the index of the first character after the opening ``[``.
    Return ``(RegexRange, index)`` where *index* is the position just after
    the closing ``]``.

    Raise SyntaxError if the closing ``]`` is missing (including when the
    text stops right after ``[`` or ``[^``).

    The examples only show the returned index:

    >>> parseRange('[a]b', 1)[1]
    3
    >>> parseRange('[a-z]b', 1)[1]
    5
    >>> parseRange('[^a-z-]b', 1)[1]
    7
    >>> parseRange('[^]-]b', 1)[1]
    5
    >>> parseRange(r'[\]abc]', 1)[1]
    7
    >>> parseRange(r'[a\-x]', 1)[1]
    6
    """
    size = len(text)
    index = start
    char_range = []
    exclude = False
    # The bounds checks let a truncated pattern ('[' or '[^') fall through
    # to the "Invalid range" SyntaxError below instead of an IndexError.
    if index < size and text[index] == '^':
        exclude = True
        index += 1
    # A ']' in first position is a literal, not the end of the class.
    if index < size and text[index] == ']':
        char_range.append(RegexRangeCharacter(']'))
        index += 1
    while index < size and text[index] != ']':
        if index + 1 < size and text[index] == '\\':
            # Escaped character: taken literally, whatever it is.
            char_range.append(RegexRangeCharacter(text[index + 1]))
            index += 2
        elif index + 1 < size \
                and text[index] == '-' and text[index + 1] == ']':
            # Trailing '-' just before ']': handled after the loop.
            break
        elif index + 3 < size \
                and text[index + 1] == '-' \
                and text[index + 2] != ']':
            # 'x-y' interval.
            char_range.append(RegexRangeItem(
                ord(text[index]), ord(text[index + 2])))
            index += 3
        else:
            char_range.append(RegexRangeCharacter(text[index]))
            index += 1
    if index < size and text[index] == '-':
        char_range.append(RegexRangeCharacter('-'))
        index += 1
    if index == size or text[index] != ']':
        raise SyntaxError('Invalid range: %s' % text[start - 1:index])
    return RegexRange(char_range, exclude), index + 1


def parseOr(text, start):
    """
    Parse a parenthesized group, with optional ``|`` alternatives.

    *start* is the index of the first character after the opening ``(``.
    A leading ``?:`` (Python non-capturing prefix) is skipped. Return
    ``(regex, index)`` where *index* is the position just after the
    closing ``)``.

    Raise NotImplementedError for any other ``(?...)`` extension and
    SyntaxError if the closing parenthesis is missing.

    The examples only show the returned index:

    >>> parseOr('(a)', 1)[1]
    3
    >>> parseOr('(a|c)', 1)[1]
    5
    >>> parseOr(' (a|[bc]|d)', 2)[1]
    11
    """
    index = start
    # (?:...): Skip Python prefix '?:'
    if text[index:index + 2] == '?:':
        index += 2
    # Slicing (rather than indexing) keeps a truncated '(' from raising
    # IndexError; it is reported as a missing parenthesis below.
    if text[index:index + 1] == '?':
        raise NotImplementedError("Doesn't support Python extension (?...)")
    regex = None
    while True:
        new_regex, index = _parse(text, index, "|)")
        if regex is None:
            regex = new_regex
        else:
            regex = regex | new_regex
        if len(text) <= index:
            raise SyntaxError('Missing closing parenthesis')
        if text[index] == ')':
            break
        # text[index] is '|': skip it and parse the next alternative.
        index += 1
    # Skip the closing ')'.
    index += 1
    return regex, index


REPEAT_REGEX = re.compile("([0-9]+)(,[0-9]*)?}")


def parseRepeat(text, start):
    """
    Parse the content of a ``{min}``, ``{min,}`` or ``{min,max}`` repetition.

    *start* is the index of the first character after the opening ``{``.
    Return ``(rmin, rmax, index)`` where *rmax* is None when the upper
    bound is omitted (``{min,}``) and *index* is the position just after
    the closing ``}``.

    Raise SyntaxError if the text at *start* is not a valid repetition.

    >>> parseRepeat('a{0,1}b', 2)
    (0, 1, 6)
    >>> parseRepeat('a{12}', 2)
    (12, 12, 5)
    >>> parseRepeat('a{3,}', 2)
    (3, None, 5)
    """
    match = REPEAT_REGEX.match(text, start)
    if not match:
        raise SyntaxError('Unable to parse repetition ' + text[start:])
    rmin = int(match.group(1))
    upper = match.group(2)
    if upper:
        # Group 2 is ",<digits>" or just ",": drop the comma.
        digits = upper[1:]
        rmax = int(digits) if digits else None
    else:
        rmax = rmin
    return (rmin, rmax, match.end(0))


# Opening character => sub-parser called with the index after that character.
CHAR_TO_FUNC = {'[': parseRange, '(': parseOr}
# Single character => Regex class instantiated without argument.
CHAR_TO_CLASS = {'.': RegexDot, '^': RegexStart, '$': RegexEnd}
# Repetition operator => (min, max) where max=None means "no upper bound".
CHAR_TO_REPEAT = {'*': (0, None), '?': (0, 1), '+': (1, None)}


def _parseEscape(text, index):
    """
    Parse the character escaped by a backslash.

    *index* is the position just after the backslash. Return
    ``(regex, index)`` with *index* just after the escaped character.
    Raise SyntaxError if the text ends after the backslash or if the
    escape is not supported.
    """
    if index == len(text):
        raise SyntaxError(
            "Antislash (\\) without escaped character")
    char = text[index]
    if char == 'b':
        return RegexWord(), index + 1
    if not (char in REGEX_COMMAND_CHARACTERS or char in " '"):
        raise SyntaxError(
            "Operator '\\%s' is not supported" % char)
    return RegexString(char), index + 1


def _parse(text, start=0, until=None):
    """
    Parse *text* from index *start* and build the matching regex.

    Parsing stops at the end of the text or, if *until* is set, at the
    first unescaped top-level character contained in *until*. Return
    ``(regex, index)`` where *index* is the position where parsing stopped.

    Raise SyntaxError on malformed input and NotImplementedError on an
    unsupported operator.
    """
    if len(text) == start:
        # Return 'start' (not 0): parseOr() continues from the returned
        # index, and resetting it to 0 made an unterminated alternative
        # such as '(a|' loop forever.
        return RegexEmpty(), start
    index = start
    regex = RegexEmpty()
    # Most recent atom, kept apart from 'regex' until we know whether a
    # repetition operator applies to it.
    last = None
    while index < len(text):
        char = text[index]
        if until and char in until:
            break
        if char in REGEX_COMMAND_CHARACTERS:
            if char in CHAR_TO_FUNC:
                new_regex, index = CHAR_TO_FUNC[char](text, index + 1)
            elif char in CHAR_TO_CLASS:
                new_regex = CHAR_TO_CLASS[char]()
                index += 1
            elif char == '{' or char in CHAR_TO_REPEAT:
                if last is None:
                    raise SyntaxError(
                        'Repetition character (%s) without previous expression'
                        % char)
                if char == '{':
                    rmin, rmax, index = parseRepeat(text, index + 1)
                else:
                    rmin, rmax = CHAR_TO_REPEAT[char]
                    index += 1
                new_regex = RegexRepeat(last, rmin, rmax)
                # 'last' is now wrapped in the repetition: don't add it twice.
                last = None
            elif char == "\\":
                new_regex, index = _parseEscape(text, index + 1)
            else:
                raise NotImplementedError(
                    "Operator '%s' is not supported" % char)
        else:
            new_regex = RegexString(char)
            index += 1
        if last is not None:
            regex = regex + last
        last = new_regex
    if last is not None:
        regex = regex + last
    return regex, index


def parse(text):
    r"""
    Parse the regular expression *text* and return the Regex object.

    Raise SyntaxError if the pattern is malformed and NotImplementedError
    if it uses an unsupported operator.

    >>> regex = parse('')
    >>> regex = parse('abc')
    >>> regex = parse("chats?")
    >>> regex = parse('[bc]d')
    >>> regex = parse("\\.")
    """
    regex, index = _parse(text)
    # Internal invariant: without 'until', _parse() consumes the whole text.
    assert index == len(text)
    return regex


if __name__ == "__main__":
    import doctest
    doctest.testmod()