mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-22 21:05:05 +00:00
209 lines
6.2 KiB
Python
209 lines
6.2 KiB
Python
r"""
|
|
Parse string to create Regex object.
|
|
|
|
TODO:
|
|
- Support \: \001, \x00, \0, \ \[, \(, \{, etc.
|
|
- Support Python extensions: (?:...), (?P<name>...), etc.
|
|
- Support \<, \>, \s, \S, \w, \W, \Z <=> $, \d, \D, \A <=> ^, \b, \B, [[:space:]], etc.
|
|
"""
|
|
|
|
from hachoir.regex import (RegexString, RegexEmpty, RegexRepeat,
|
|
RegexDot, RegexWord, RegexStart, RegexEnd,
|
|
RegexRange, RegexRangeItem, RegexRangeCharacter)
|
|
import re
|
|
|
|
REGEX_COMMAND_CHARACTERS = '.^$[](){}|+?*\\'
|
|
|
|
|
|
def parseRange(text, start):
|
|
r"""
|
|
>>> parseRange('[a]b', 1)
|
|
(<RegexRange '[a]'>, 3)
|
|
>>> parseRange('[a-z]b', 1)
|
|
(<RegexRange '[a-z]'>, 5)
|
|
>>> parseRange('[^a-z-]b', 1)
|
|
(<RegexRange '[^a-z-]'>, 7)
|
|
>>> parseRange('[^]-]b', 1)
|
|
(<RegexRange '[^]-]'>, 5)
|
|
>>> parseRange(r'[\]abc]', 1)
|
|
(<RegexRange '[]a-c]'>, 7)
|
|
>>> parseRange(r'[a\-x]', 1)
|
|
(<RegexRange '[ax-]'>, 6)
|
|
"""
|
|
index = start
|
|
char_range = []
|
|
exclude = False
|
|
if text[index] == '^':
|
|
exclude = True
|
|
index += 1
|
|
if text[index] == ']':
|
|
char_range.append(RegexRangeCharacter(']'))
|
|
index += 1
|
|
while index < len(text) and text[index] != ']':
|
|
if index + 1 < len(text) \
|
|
and text[index] == '\\':
|
|
char_range.append(RegexRangeCharacter(text[index + 1]))
|
|
index += 2
|
|
elif index + 1 < len(text) \
|
|
and text[index] == '-' and text[index + 1] == ']':
|
|
break
|
|
elif index + 3 < len(text) \
|
|
and text[index + 1] == '-' \
|
|
and text[index + 2] != ']':
|
|
char_range.append(RegexRangeItem(
|
|
ord(text[index]), ord(text[index + 2])))
|
|
index += 3
|
|
else:
|
|
char_range.append(RegexRangeCharacter(text[index]))
|
|
index += 1
|
|
if index < len(text) and text[index] == '-':
|
|
char_range.append(RegexRangeCharacter('-'))
|
|
index += 1
|
|
if index == len(text) or text[index] != ']':
|
|
raise SyntaxError('Invalid range: %s' % text[start - 1:index])
|
|
return RegexRange(char_range, exclude), index + 1
|
|
|
|
|
|
def parseOr(text, start):
|
|
"""
|
|
>>> parseOr('(a)', 1)
|
|
(<RegexString 'a'>, 3)
|
|
>>> parseOr('(a|c)', 1)
|
|
(<RegexRange '[ac]'>, 5)
|
|
>>> parseOr(' (a|[bc]|d)', 2)
|
|
(<RegexRange '[a-d]'>, 11)
|
|
"""
|
|
index = start
|
|
# (?:...): Skip Python prefix '?:'
|
|
if text[index:index + 2] == '?:':
|
|
index += 2
|
|
if text[index] == '?':
|
|
raise NotImplementedError("Doesn't support Python extension (?...)")
|
|
regex = None
|
|
while True:
|
|
new_regex, index = _parse(text, index, "|)")
|
|
if regex:
|
|
regex = regex | new_regex
|
|
else:
|
|
regex = new_regex
|
|
if len(text) <= index:
|
|
raise SyntaxError('Missing closing parenthesis')
|
|
if text[index] == ')':
|
|
break
|
|
index += 1
|
|
index += 1
|
|
if regex is None:
|
|
regex = RegexEmpty()
|
|
return regex, index
|
|
|
|
|
|
REPEAT_REGEX = re.compile("([0-9]+)(,[0-9]*)?}")
|
|
|
|
|
|
def parseRepeat(text, start):
|
|
"""
|
|
>>> parseRepeat('a{0,1}b', 2)
|
|
(0, 1, 6)
|
|
>>> parseRepeat('a{12}', 2)
|
|
(12, 12, 5)
|
|
"""
|
|
match = REPEAT_REGEX.match(text, start)
|
|
if not match:
|
|
raise SyntaxError('Unable to parse repetition ' + text[start:])
|
|
rmin = int(match.group(1))
|
|
if match.group(2):
|
|
text = match.group(2)[1:]
|
|
if text:
|
|
rmax = int(text)
|
|
else:
|
|
rmax = None
|
|
else:
|
|
rmax = rmin
|
|
return (rmin, rmax, match.end(0))
|
|
|
|
|
|
CHAR_TO_FUNC = {'[': parseRange, '(': parseOr}
|
|
CHAR_TO_CLASS = {'.': RegexDot, '^': RegexStart, '$': RegexEnd}
|
|
CHAR_TO_REPEAT = {'*': (0, None), '?': (0, 1), '+': (1, None)}
|
|
|
|
|
|
def _parse(text, start=0, until=None):
|
|
if len(text) == start:
|
|
return RegexEmpty(), 0
|
|
index = start
|
|
regex = RegexEmpty()
|
|
last = None
|
|
while index < len(text):
|
|
char = text[index]
|
|
if until and char in until:
|
|
break
|
|
if char in REGEX_COMMAND_CHARACTERS:
|
|
if char in CHAR_TO_FUNC:
|
|
new_regex, index = CHAR_TO_FUNC[char](text, index + 1)
|
|
elif char in CHAR_TO_CLASS:
|
|
new_regex = CHAR_TO_CLASS[char]()
|
|
index += 1
|
|
elif char == '{':
|
|
rmin, rmax, index = parseRepeat(text, index + 1)
|
|
new_regex = RegexRepeat(last, rmin, rmax)
|
|
last = None
|
|
elif char in CHAR_TO_REPEAT:
|
|
rmin, rmax = CHAR_TO_REPEAT[char]
|
|
if last is None:
|
|
raise SyntaxError(
|
|
'Repetition character (%s) without previous expression' % text[index])
|
|
new_regex = RegexRepeat(last, rmin, rmax)
|
|
last = None
|
|
index += 1
|
|
elif char == "\\":
|
|
index += 1
|
|
if index == len(text):
|
|
raise SyntaxError(
|
|
"Antislash (\\) without escaped character")
|
|
char = text[index]
|
|
if char == 'b':
|
|
new_regex = RegexWord()
|
|
else:
|
|
if not (char in REGEX_COMMAND_CHARACTERS or char in " '"):
|
|
raise SyntaxError(
|
|
"Operator '\\%s' is not supported" % char)
|
|
new_regex = RegexString(char)
|
|
index += 1
|
|
else:
|
|
raise NotImplementedError(
|
|
"Operator '%s' is not supported" % char)
|
|
if last:
|
|
regex = regex + last
|
|
last = new_regex
|
|
else:
|
|
subtext = text[index]
|
|
index += 1
|
|
if last:
|
|
regex = regex + last
|
|
last = RegexString(subtext)
|
|
if last:
|
|
regex = regex + last
|
|
return regex, index
|
|
|
|
|
|
def parse(text):
|
|
r"""
|
|
>>> parse('')
|
|
<RegexEmpty ''>
|
|
>>> parse('abc')
|
|
<RegexString 'abc'>
|
|
>>> parse("chats?")
|
|
<RegexAnd 'chats?'>
|
|
>>> parse('[bc]d')
|
|
<RegexAnd '[bc]d'>
|
|
>>> parse("\\.")
|
|
<RegexString '\.'>
|
|
"""
|
|
regex, index = _parse(text)
|
|
assert index == len(text)
|
|
return regex
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import doctest
|
|
doctest.testmod()
|