SickGear/lib/thefuzz/fuzz.py

#!/usr/bin/env python

from rapidfuzz.fuzz import (
    ratio as _ratio,
    partial_ratio as _partial_ratio,
    token_set_ratio as _token_set_ratio,
    token_sort_ratio as _token_sort_ratio,
    partial_token_set_ratio as _partial_token_set_ratio,
    partial_token_sort_ratio as _partial_token_sort_ratio,
    WRatio as _WRatio,
    QRatio as _QRatio,
)

from . import utils

###########################
# Basic Scoring Functions #
###########################


def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
    """
    Adapt a rapidfuzz scorer so that it follows the thefuzz calling
    convention and returns an integer score.

    :param scorer: callable taking the two inputs and returning a number
    :param s1: first input
    :param s2: second input
    :param force_ascii: forwarded to ``utils.full_process`` when the
        inputs are preprocessed
    :param full_process: when true, run ``utils.full_process`` on both
        inputs before scoring
    :return: the scorer's result rounded to the nearest integer, or 0 when
        preprocessing is requested and either input is ``None``
    """
    if not full_process:
        # Preprocessing was not requested: score the inputs exactly as given.
        return int(round(scorer(s1, s2)))

    if s1 is None or s2 is None:
        # A missing input scores 0 without calling the scorer.
        return 0

    cleaned_first = utils.full_process(s1, force_ascii=force_ascii)
    cleaned_second = utils.full_process(s2, force_ascii=force_ascii)
    return int(round(scorer(cleaned_first, cleaned_second)))


def ratio(s1, s2):
    """
    Return the rapidfuzz ``ratio`` score of the two inputs,
    rounded to the nearest integer.

    The inputs are scored as given; no preprocessing is applied.
    """
    return _rapidfuzz_scorer(
        _ratio, s1, s2, force_ascii=False, full_process=False
    )


def partial_ratio(s1, s2):
    """
    Score the best-matching substring of the two inputs,
    as a number between 0 and 100.

    The inputs are scored as given; no preprocessing is applied.
    """
    return _rapidfuzz_scorer(
        _partial_ratio, s1, s2, force_ascii=False, full_process=False
    )


##############################
# Advanced Scoring Functions #
##############################

# Sorted Token
#   find all alphanumeric tokens in the string
#   sort those tokens and take ratio of resulting joined strings
#   controls for unordered string elements
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Compare the two sequences after sorting their tokens and return
    the similarity as a number between 0 and 100.

    :param s1:
    :param s2:
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the best-matching substring, with tokens sorted before the
    comparison, as a number between 0 and 100.

    :param s1:
    :param s2:
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity ratio
    """
    score = _rapidfuzz_scorer(
        _partial_token_sort_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
    return score


def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return the rapidfuzz ``token_set_ratio`` score of the two inputs,
    rounded to the nearest integer.

    :param s1:
    :param s2:
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return the rapidfuzz ``partial_token_set_ratio`` score of the two
    inputs, rounded to the nearest integer.

    :param s1:
    :param s2:
    :param force_ascii: forwarded to the preprocessing step (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity ratio
    """
    score = _rapidfuzz_scorer(
        _partial_token_set_ratio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )
    return score


###################
# Combination API #
###################

# q is for quick
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick similarity ratio of two strings.

    Unless ``full_process`` is disabled, both strings are first run through
    full_process from utils, and a ``None`` input gives a score of 0.
    Strings that are empty after processing short circuit the comparison.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ASCII characters (Default: True)
    :param full_process: Process inputs, used here to avoid double
        processing in extract functions (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _QRatio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def UQRatio(s1, s2, full_process=True):
    """
    Unicode-preserving quick ratio.

    Delegates to QRatio with force_ascii switched off.

    :param s1:
    :param s2:
    :param full_process: passed through to QRatio unchanged (Default: True)
    :return: similarity ratio
    """
    return QRatio(s1, s2, full_process=full_process, force_ascii=False)


# w is for weighted
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Weighted similarity of two sequences, between 0 and 100, obtained by
    combining several scoring algorithms.

    **What happens, in order**

    #. Both strings go through full_process from utils
    #. If either string is empty afterwards, short circuit
    #. The plain ratio of the processed strings is computed (fuzz.ratio)
    #. The string lengths are compared
        * When one string is more than 1.5 times as long as the other,
          partial_ratio comparisons are used and partial results are
          scaled by 0.9 (so that only full results can reach 100)
        * When one string is over 8 times as long as the other,
          the scale is 0.6 instead

    #. The remaining ratio functions are run
        * with partial comparisons: partial_ratio,
          partial_token_sort_ratio and partial_token_set_ratio,
          each scaled by the length-based ratio
        * otherwise: token_sort_ratio and token_set_ratio
        * every token based comparison is scaled by 0.95
          (in addition to any partial scalars)

    #. The highest of these results is rounded
       and returned as an integer.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :param full_process: Process inputs, used here to avoid double
        processing in extract functions (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _WRatio,
        s1,
        s2,
        force_ascii=force_ascii,
        full_process=full_process,
    )


def UWRatio(s1, s2, full_process=True):
    """
    Unicode-preserving weighted ratio: a similarity measure between
    0 and 100 built from several algorithms.

    Delegates to WRatio with force_ascii switched off.

    :param s1:
    :param s2:
    :param full_process: passed through to WRatio unchanged (Default: True)
    :return: similarity ratio
    """
    return WRatio(s1, s2, full_process=full_process, force_ascii=False)