mirror of
https://github.com/SickGear/SickGear.git
synced 2025-01-10 20:13:37 +00:00
2a73990c82
Removes Levenshtein requirement with direct use of rapidfuzz instead Fallback to old fuzzywuzzy for pure python implementation
160 lines
4.9 KiB
Python
160 lines
4.9 KiB
Python
#!/usr/bin/env python
|
|
|
|
from rapidfuzz.fuzz import (
|
|
ratio as _ratio,
|
|
partial_ratio as _partial_ratio,
|
|
token_set_ratio as _token_set_ratio,
|
|
token_sort_ratio as _token_sort_ratio,
|
|
partial_token_set_ratio as _partial_token_set_ratio,
|
|
partial_token_sort_ratio as _partial_token_sort_ratio,
|
|
WRatio as _WRatio,
|
|
QRatio as _QRatio,
|
|
)
|
|
|
|
from . import utils
|
|
|
|
###########################
|
|
# Basic Scoring Functions #
|
|
###########################
|
|
|
|
|
|
def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
|
|
"""
|
|
wrapper around rapidfuzz function to be compatible with the API of thefuzz
|
|
"""
|
|
if full_process:
|
|
if s1 is None or s2 is None:
|
|
return 0
|
|
|
|
s1 = utils.full_process(s1, force_ascii=force_ascii)
|
|
s2 = utils.full_process(s2, force_ascii=force_ascii)
|
|
|
|
return int(round(scorer(s1, s2)))
|
|
|
|
|
|
def ratio(s1, s2):
    """
    Return the rapidfuzz ``ratio`` score of the two strings as an integer.

    The inputs are handed to the scorer unchanged: preprocessing through
    utils.full_process is disabled for this function.
    """
    return _rapidfuzz_scorer(
        _ratio, s1, s2, force_ascii=False, full_process=False
    )
|
|
|
|
|
|
def partial_ratio(s1, s2):
    """
    Score the best-matching substring of the two inputs.

    The result is a number between 0 and 100. The inputs are handed to the
    scorer unchanged: preprocessing through utils.full_process is disabled.
    """
    return _rapidfuzz_scorer(
        _partial_ratio, s1, s2, force_ascii=False, full_process=False
    )
|
|
|
|
|
|
##############################
|
|
# Advanced Scoring Functions #
|
|
##############################
|
|
|
|
# Sorted Token
|
|
# find all alphanumeric tokens in the string
|
|
# sort those tokens and take ratio of resulting joined strings
|
|
# controls for unordered string elements
|
|
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Measure the similarity of two sequences after sorting their tokens.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to utils.full_process (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity between 0 and 100
    """
    return _rapidfuzz_scorer(
        _token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
|
|
|
|
|
|
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the best-matching substring after sorting the tokens.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to utils.full_process (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity between 0 and 100
    """
    return _rapidfuzz_scorer(
        _partial_token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
|
|
|
|
|
|
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return the rapidfuzz ``token_set_ratio`` score of the two strings as an integer.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to utils.full_process (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: the scorer's result converted to an integer; 0 if preprocessing
        is enabled and either input is None
    """
    return _rapidfuzz_scorer(
        _token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
|
|
|
|
|
|
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return the rapidfuzz ``partial_token_set_ratio`` score as an integer.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to utils.full_process (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: the scorer's result converted to an integer; 0 if preprocessing
        is enabled and either input is None
    """
    return _rapidfuzz_scorer(
        _partial_token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
|
|
|
|
|
|
###################
|
|
# Combination API #
|
|
###################
|
|
|
|
# q is for quick
|
|
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick ratio comparison between two strings.

    When ``full_process`` is true, both strings are first run through
    full_process from utils. The comparison short circuits if either
    string is empty after processing.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ASCII characters (Default: True)
    :param full_process: Process inputs, used here to avoid double processing
        in extract functions (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _QRatio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
|
|
|
|
|
|
def UQRatio(s1, s2, full_process=True):
    """
    Unicode quick ratio.

    Delegates to QRatio with ``force_ascii`` switched off.

    :param s1: first string to compare
    :param s2: second string to compare
    :param full_process: forwarded unchanged to QRatio (Default: True)
    :return: similarity ratio
    """
    # Positional form of QRatio(s1, s2, force_ascii=False, full_process=...).
    return QRatio(s1, s2, False, full_process)
|
|
|
|
|
|
# w is for weighted
|
|
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Weighted similarity of two sequences, between 0 and 100, combining
    several algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings (when ``full_process``
       is true)
    #. Short circuit if this makes either string empty
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Compare the lengths of the strings

       * If one string is more than 1.5 times as long as the other,
         switch to partial_ratio comparisons and scale partial results
         by 0.9 (so that only full results can return 100)
       * If one string is over 8 times as long as the other, scale by
         0.6 instead

    #. Run the other ratio functions

       * when partial comparisons are in use, call partial_ratio,
         partial_token_sort_ratio and partial_token_set_ratio, and scale
         each by the length-based ratio
       * otherwise call token_sort_ratio and token_set_ratio
       * every token based comparison is scaled by 0.95 (on top of any
         partial scalars)

    #. Take the highest of these results, round it and return it as an
       integer.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :param full_process: Process inputs, used here to avoid double processing
        in extract functions (Default: True)
    :return: weighted similarity as an integer
    """
    return _rapidfuzz_scorer(
        _WRatio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
|
|
|
|
|
|
def UWRatio(s1, s2, full_process=True):
    """
    Weighted similarity of two sequences, between 0 and 100, using
    different algorithms. Identical to WRatio except that unicode is
    preserved (``force_ascii`` is switched off).

    :param s1: first string to compare
    :param s2: second string to compare
    :param full_process: forwarded unchanged to WRatio (Default: True)
    :return: weighted similarity as an integer
    """
    # Positional form of WRatio(s1, s2, force_ascii=False, full_process=...).
    return WRatio(s1, s2, False, full_process)
|