SickGear/lib/thefuzz/fuzz.py

#!/usr/bin/env python

from rapidfuzz.fuzz import (
    ratio as _ratio,
    partial_ratio as _partial_ratio,
    token_set_ratio as _token_set_ratio,
    token_sort_ratio as _token_sort_ratio,
    partial_token_set_ratio as _partial_token_set_ratio,
    partial_token_sort_ratio as _partial_token_sort_ratio,
    WRatio as _WRatio,
    QRatio as _QRatio,
)

from . import utils

###########################
# Basic Scoring Functions #
###########################


def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
    """
    wrapper around rapidfuzz function to be compatible with the API of thefuzz
    """
    if full_process:
        if s1 is None or s2 is None:
            return 0

        s1 = utils.full_process(s1, force_ascii=force_ascii)
        s2 = utils.full_process(s2, force_ascii=force_ascii)

    return int(round(scorer(s1, s2)))


def ratio(s1, s2):
    """
    Return the plain similarity ratio of the two inputs as an integer.

    The inputs are passed to the scorer as given: neither preprocessing
    nor ASCII forcing is requested.
    """
    return _rapidfuzz_scorer(
        _ratio, s1, s2, force_ascii=False, full_process=False
    )


def partial_ratio(s1, s2):
    """
    Score the best-matching substring, as a number between 0 and 100.

    The inputs are passed to the scorer as given: neither preprocessing
    nor ASCII forcing is requested.
    """
    return _rapidfuzz_scorer(
        _partial_ratio, s1, s2, force_ascii=False, full_process=False
    )


##############################
# Advanced Scoring Functions #
##############################

# Sorted Token
#   find all alphanumeric tokens in the string
#   sort those tokens and take ratio of resulting joined strings
#   controls for unordered string elements
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the similarity of the two sequences between 0 and 100,
    with tokens sorted before the comparison is made.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess the inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the best-matching substring between 0 and 100, with tokens
    sorted before the comparison is made.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess the inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _partial_token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the two inputs with rapidfuzz's ``token_set_ratio`` scorer.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess the inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the two inputs with rapidfuzz's ``partial_token_set_ratio`` scorer.

    :param s1: first value to compare
    :param s2: second value to compare
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess the inputs before scoring (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _partial_token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


###################
# Combination API #
###################

# q is for quick
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick similarity ratio of two strings.

    By default both strings are first run through full_process from utils.
    The comparison short circuits when either string is empty after
    processing.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ASCII characters (Default: True)
    :param full_process: Process inputs; pass False to avoid double
        processing in extract functions (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _QRatio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def UQRatio(s1, s2, full_process=True):
    """
    Unicode-preserving quick ratio.

    Delegates to QRatio with force_ascii fixed to False.

    :param s1: first string to compare
    :param s2: second string to compare
    :param full_process: forwarded unchanged to QRatio (Default: True)
    :return: similarity ratio
    """
    score = QRatio(s1, s2, force_ascii=False, full_process=full_process)
    return score


# w is for weighted
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Weighted similarity of the two sequences, between 0 and 100, combining
    several algorithms.

    The scoring itself is delegated to rapidfuzz's WRatio; its documented
    strategy is, in order:

    #. Run full_process from utils on both strings
    #. Short circuit if either string ends up empty
    #. Compute the plain ratio of the processed strings (fuzz.ratio)
    #. Compare the lengths of the strings
        * When one string is more than 1.5 times as long as the other,
          switch to partial_ratio comparisons and scale partial results
          by 0.9 (so only full results can reach 100)
        * When one string is over 8 times as long as the other,
          scale by 0.6 instead

    #. Run the remaining ratio functions
        * in partial mode: partial_ratio, partial_token_sort_ratio and
          partial_token_set_ratio, each scaled by the length-based factor
        * otherwise: token_sort_ratio and token_set_ratio
        * every token based comparison is additionally scaled by 0.95

    #. Keep the highest of these results, rounded and returned as an integer.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :param full_process: Process inputs; pass False to avoid double
        processing in extract functions (Default: True)
    :return: similarity score as an integer
    """
    return _rapidfuzz_scorer(
        _WRatio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def UWRatio(s1, s2, full_process=True):
    """
    Unicode-preserving weighted ratio.

    Delegates to WRatio with force_ascii fixed to False, yielding the same
    0 to 100 similarity measure without forcing ASCII.

    :param s1: first string to compare
    :param s2: second string to compare
    :param full_process: forwarded unchanged to WRatio (Default: True)
    :return: similarity score as an integer
    """
    score = WRatio(s1, s2, force_ascii=False, full_process=full_process)
    return score
Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520) Removes Levenshtein requirement with direct use of rapidfuzz instead Fallback to old fuzzywuzzy for pure python implementation 2023-09-07 22:20:37 +00:00			`#!/usr/bin/env python`

			`from rapidfuzz.fuzz import (`
			`ratio as _ratio,`
			`partial_ratio as _partial_ratio,`
			`token_set_ratio as _token_set_ratio,`
			`token_sort_ratio as _token_sort_ratio,`
			`partial_token_set_ratio as _partial_token_set_ratio,`
			`partial_token_sort_ratio as _partial_token_sort_ratio,`
			`WRatio as _WRatio,`
			`QRatio as _QRatio,`
			`)`

			`from . import utils`

			`###########################`
			`# Basic Scoring Functions #`
			`###########################`


			`def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):`
			`"""`
			`wrapper around rapidfuzz function to be compatible with the API of thefuzz`
			`"""`
			`if full_process:`
			`if s1 is None or s2 is None:`
			`return 0`

			`s1 = utils.full_process(s1, force_ascii=force_ascii)`
			`s2 = utils.full_process(s2, force_ascii=force_ascii)`

			`return int(round(scorer(s1, s2)))`


			`def ratio(s1, s2):`
			`return _rapidfuzz_scorer(_ratio, s1, s2, False, False)`


			`def partial_ratio(s1, s2):`
			`"""`
			`Return the ratio of the most similar substring`
			`as a number between 0 and 100.`
			`"""`
			`return _rapidfuzz_scorer(_partial_ratio, s1, s2, False, False)`


			`##############################`
			`# Advanced Scoring Functions #`
			`##############################`

			`# Sorted Token`
			`# find all alphanumeric tokens in the string`
			`# sort those tokens and take ratio of resulting joined strings`
			`# controls for unordered string elements`
			`def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):`
			`"""`
			`Return a measure of the sequences' similarity between 0 and 100`
			`but sorting the token before comparing.`
			`"""`
			`return _rapidfuzz_scorer(_token_sort_ratio, s1, s2, force_ascii, full_process)`


			`def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):`
			`"""`
			`Return the ratio of the most similar substring as a number between`
			`0 and 100 but sorting the token before comparing.`
			`"""`
			`return _rapidfuzz_scorer(`
			`_partial_token_sort_ratio, s1, s2, force_ascii, full_process`
			`)`


			`def token_set_ratio(s1, s2, force_ascii=True, full_process=True):`
			`return _rapidfuzz_scorer(_token_set_ratio, s1, s2, force_ascii, full_process)`


			`def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):`
			`return _rapidfuzz_scorer(`
			`_partial_token_set_ratio, s1, s2, force_ascii, full_process`
			`)`


			`###################`
			`# Combination API #`
			`###################`

			`# q is for quick`
			`def QRatio(s1, s2, force_ascii=True, full_process=True):`
			`"""`
			`Quick ratio comparison between two strings.`

			`Runs full_process from utils on both strings`
			`Short circuits if either of the strings is empty after processing.`

			`:param s1:`
			`:param s2:`
			`:param force_ascii: Allow only ASCII characters (Default: True)`
			`:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)`
			`:return: similarity ratio`
			`"""`
			`return _rapidfuzz_scorer(_QRatio, s1, s2, force_ascii, full_process)`


			`def UQRatio(s1, s2, full_process=True):`
			`"""`
			`Unicode quick ratio`

			`Calls QRatio with force_ascii set to False`

			`:param s1:`
			`:param s2:`
			`:return: similarity ratio`
			`"""`
			`return QRatio(s1, s2, force_ascii=False, full_process=full_process)`


			`# w is for weighted`
			`def WRatio(s1, s2, force_ascii=True, full_process=True):`
			`"""`
			`Return a measure of the sequences' similarity between 0 and 100, using different algorithms.`

			`Steps in the order they occur`

			`#. Run full_process from utils on both strings`
			`#. Short circuit if this makes either string empty`
			`#. Take the ratio of the two processed strings (fuzz.ratio)`
			`#. Run checks to compare the length of the strings`
			`* If one of the strings is more than 1.5 times as long as the other`
			`use partial_ratio comparisons - scale partial results by 0.9`
			`(this makes sure only full results can return 100)`
			`* If one of the strings is over 8 times as long as the other`
			`instead scale by 0.6`

			`#. Run the other ratio functions`
			`* if using partial ratio functions call partial_ratio,`
			`partial_token_sort_ratio and partial_token_set_ratio`
			`scale all of these by the ratio based on length`
			`* otherwise call token_sort_ratio and token_set_ratio`
			`* all token based comparisons are scaled by 0.95`
			`(on top of any partial scalars)`

			`#. Take the highest value from these results`
			`round it and return it as an integer.`

			`:param s1:`
			`:param s2:`
			`:param force_ascii: Allow only ascii characters`
			`:type force_ascii: bool`
			`:full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)`
			`:return:`
			`"""`
			`return _rapidfuzz_scorer(_WRatio, s1, s2, force_ascii, full_process)`


			`def UWRatio(s1, s2, full_process=True):`
			`"""`
			`Return a measure of the sequences' similarity between 0 and 100,`
			`using different algorithms. Same as WRatio but preserving unicode.`
			`"""`
			`return WRatio(s1, s2, force_ascii=False, full_process=full_process)`