SickGear/lib/thefuzz/fuzz.py
Prinz23 2a73990c82 Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520)
Removes Levenshtein requirement with direct use of rapidfuzz instead
Fallback to old fuzzywuzzy for pure python implementation
2023-09-12 14:53:50 +01:00

160 lines
4.9 KiB
Python

#!/usr/bin/env python
from rapidfuzz.fuzz import (
ratio as _ratio,
partial_ratio as _partial_ratio,
token_set_ratio as _token_set_ratio,
token_sort_ratio as _token_sort_ratio,
partial_token_set_ratio as _partial_token_set_ratio,
partial_token_sort_ratio as _partial_token_sort_ratio,
WRatio as _WRatio,
QRatio as _QRatio,
)
from . import utils
###########################
# Basic Scoring Functions #
###########################
def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
    """
    Adapt a rapidfuzz scoring callable to the calling convention of thefuzz.

    :param scorer: callable taking two values and returning a numeric score
    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to ``utils.full_process`` when preprocessing
    :param full_process: when truthy, run ``utils.full_process`` on both inputs
        before scoring; when falsy, hand the inputs to ``scorer`` untouched
    :return: the scorer result rounded to the nearest integer, or 0 when
        preprocessing is requested and either input is ``None``
    """
    # No preprocessing requested: score the raw inputs directly.
    if not full_process:
        return int(round(scorer(s1, s2)))

    # Preprocessing is skipped for a missing value; the score is 0 instead.
    if s1 is None or s2 is None:
        return 0

    left = utils.full_process(s1, force_ascii=force_ascii)
    right = utils.full_process(s2, force_ascii=force_ascii)
    return int(round(scorer(left, right)))
def ratio(s1, s2):
    """
    Return the rapidfuzz ``ratio`` of the two inputs as a rounded integer.

    The inputs are scored as given: no preprocessing and no ASCII forcing.

    :param s1: first value to compare
    :param s2: second value to compare
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _ratio, s1, s2, force_ascii=False, full_process=False
    )
def partial_ratio(s1, s2):
    """
    Score the best-matching substring, as a number between 0 and 100.

    The inputs are scored as given: no preprocessing and no ASCII forcing.

    :param s1: first value to compare
    :param s2: second value to compare
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _partial_ratio, s1, s2, force_ascii=False, full_process=False
    )
##############################
# Advanced Scoring Functions #
##############################
# Sorted Token
# find all alphanumeric tokens in the string
# sort those tokens and take ratio of resulting joined strings
# controls for unordered string elements
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Measure similarity between 0 and 100 after sorting the tokens of each
    input, so that word order does not affect the result.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the best-matching substring between 0 and 100, with the tokens of
    each input sorted before the comparison.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _partial_token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the two inputs with rapidfuzz's ``token_set_ratio``.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the two inputs with rapidfuzz's ``partial_token_set_ratio``.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _partial_token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
###################
# Combination API #
###################
# q is for quick
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick ratio comparison between two strings.

    By default both strings go through ``utils.full_process`` first; the
    comparison short circuits if either string is empty after processing.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ASCII characters (Default: True)
    :param full_process: Process inputs, used here to avoid double processing
        in extract functions (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _QRatio, s1, s2, force_ascii=force_ascii, full_process=full_process
    )
def UQRatio(s1, s2, full_process=True):
    """
    Unicode quick ratio.

    Delegates to QRatio with force_ascii switched off.

    :param s1: first string to compare
    :param s2: second string to compare
    :param full_process: passed through to QRatio unchanged (Default: True)
    :return: similarity ratio
    """
    keep_unicode = False  # value handed to QRatio's force_ascii parameter
    return QRatio(s1, s2, keep_unicode, full_process)
# w is for weighted
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Weighted similarity between 0 and 100, combining several algorithms.

    **Steps in the order they occur**

    #. Both strings are run through full_process from utils
    #. If that leaves either string empty, short circuit
    #. The plain ratio of the processed strings is taken (fuzz.ratio)
    #. The string lengths are compared

       * When one string is more than 1.5 times as long as the other,
         partial_ratio comparisons are used and partial results are scaled
         by 0.9 (so that only full results can return 100)
       * When one string is over 8 times as long as the other, the scale
         factor is 0.6 instead

    #. The remaining ratio functions are run

       * With partial comparisons: partial_ratio, partial_token_sort_ratio
         and partial_token_set_ratio, each scaled by the length-based factor
       * Otherwise: token_sort_ratio and token_set_ratio
       * Every token based comparison is additionally scaled by 0.95
         (on top of any partial scalars)

    #. The highest of these results is rounded and returned as an integer

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :param full_process: Process inputs, used here to avoid double processing
        in extract functions (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _WRatio, s1, s2, force_ascii=force_ascii, full_process=full_process
    )
def UWRatio(s1, s2, full_process=True):
    """
    Weighted similarity between 0 and 100 that preserves unicode.

    Delegates to WRatio with force_ascii switched off.

    :param s1: first string to compare
    :param s2: second string to compare
    :param full_process: passed through to WRatio unchanged (Default: True)
    :return: similarity score as an integer
    """
    keep_unicode = False  # value handed to WRatio's force_ascii parameter
    return WRatio(s1, s2, keep_unicode, full_process)