From 2a73990c82f35bb94a853bebd3a51ed4df408c58 Mon Sep 17 00:00:00 2001
From: Prinz23 <Prinz2311@gmail.com>
Date: Fri, 8 Sep 2023 00:20:37 +0200
Subject: [PATCH] Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback
 to fuzzywuzzy 0.18.0 (2188520)

Removes Levenshtein requirement with direct use of rapidfuzz instead
Fallback to old fuzzywuzzy for pure python implementation
---
 CHANGES.md              |   1 +
 lib/thefuzz/__init__.py |   1 +
 lib/thefuzz/fuzz.py     | 160 +++++++++++++++++++++
 lib/thefuzz/fuzz.pyi    |  10 ++
 lib/thefuzz/process.py  | 309 ++++++++++++++++++++++++++++++++++++++++
 lib/thefuzz/process.pyi |  17 +++
 lib/thefuzz/utils.py    |  22 +++
 lib/thefuzz/utils.pyi   |   3 +
 recommended.txt         |   1 -
 sickgear/tv.py          |   5 +-
 sickgear/webserve.py    |   5 +-
 11 files changed, 531 insertions(+), 3 deletions(-)
 create mode 100644 lib/thefuzz/__init__.py
 create mode 100644 lib/thefuzz/fuzz.py
 create mode 100644 lib/thefuzz/fuzz.pyi
 create mode 100644 lib/thefuzz/process.py
 create mode 100644 lib/thefuzz/process.pyi
 create mode 100644 lib/thefuzz/utils.py
 create mode 100644 lib/thefuzz/utils.pyi

diff --git a/CHANGES.md b/CHANGES.md
index d70debd4..cf9d9fd1 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -6,6 +6,7 @@
 * Update package resource API 67.5.1 (f51eccd) to 68.1.2 (1ef36f2)
 * Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
 * Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
+* Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520)
 * Fix regex that was not using py312 notation
 * Change sort backlog and manual segment search results episode number
 * Change sort episodes when set to wanted on display show page
diff --git a/lib/thefuzz/__init__.py b/lib/thefuzz/__init__.py
new file mode 100644
index 00000000..482e4a19
--- /dev/null
+++ b/lib/thefuzz/__init__.py
@@ -0,0 +1 @@
+__version__ = '0.19.0'
diff --git a/lib/thefuzz/fuzz.py b/lib/thefuzz/fuzz.py
new file mode 100644
index 00000000..7d86e483
--- /dev/null
+++ b/lib/thefuzz/fuzz.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+from rapidfuzz.fuzz import (
+    ratio as _ratio,
+    partial_ratio as _partial_ratio,
+    token_set_ratio as _token_set_ratio,
+    token_sort_ratio as _token_sort_ratio,
+    partial_token_set_ratio as _partial_token_set_ratio,
+    partial_token_sort_ratio as _partial_token_sort_ratio,
+    WRatio as _WRatio,
+    QRatio as _QRatio,
+)
+
+from . import utils
+
+###########################
+# Basic Scoring Functions #
+###########################
+
+
+def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
+    """
+    wrapper around rapidfuzz function to be compatible with the API of thefuzz
+    """
+    if full_process:
+        if s1 is None or s2 is None:
+            return 0
+
+        s1 = utils.full_process(s1, force_ascii=force_ascii)
+        s2 = utils.full_process(s2, force_ascii=force_ascii)
+
+    return int(round(scorer(s1, s2)))
+
+
+def ratio(s1, s2):
+    return _rapidfuzz_scorer(_ratio, s1, s2, False, False)
+
+
+def partial_ratio(s1, s2):
+    """
+    Return the ratio of the most similar substring
+    as a number between 0 and 100.
+    """
+    return _rapidfuzz_scorer(_partial_ratio, s1, s2, False, False)
+
+
+##############################
+# Advanced Scoring Functions #
+##############################
+
+# Sorted Token
+#   find all alphanumeric tokens in the string
+#   sort those tokens and take ratio of resulting joined strings
+#   controls for unordered string elements
+def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
+    """
+    Return a measure of the sequences' similarity between 0 and 100
+    but sorting the token before comparing.
+    """
+    return _rapidfuzz_scorer(_token_sort_ratio, s1, s2, force_ascii, full_process)
+
+
+def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
+    """
+    Return the ratio of the most similar substring as a number between
+    0 and 100 but sorting the token before comparing.
+    """
+    return _rapidfuzz_scorer(
+        _partial_token_sort_ratio, s1, s2, force_ascii, full_process
+    )
+
+
+def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
+    return _rapidfuzz_scorer(_token_set_ratio, s1, s2, force_ascii, full_process)
+
+
+def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
+    return _rapidfuzz_scorer(
+        _partial_token_set_ratio, s1, s2, force_ascii, full_process
+    )
+
+
+###################
+# Combination API #
+###################
+
+# q is for quick
+def QRatio(s1, s2, force_ascii=True, full_process=True):
+    """
+    Quick ratio comparison between two strings.
+
+    Runs full_process from utils on both strings
+    Short circuits if either of the strings is empty after processing.
+
+    :param s1:
+    :param s2:
+    :param force_ascii: Allow only ASCII characters (Default: True)
+    :param full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
+    :return: similarity ratio
+    """
+    return _rapidfuzz_scorer(_QRatio, s1, s2, force_ascii, full_process)
+
+
+def UQRatio(s1, s2, full_process=True):
+    """
+    Unicode quick ratio
+
+    Calls QRatio with force_ascii set to False
+
+    :param s1:
+    :param s2:
+    :return: similarity ratio
+    """
+    return QRatio(s1, s2, force_ascii=False, full_process=full_process)
+
+
+# w is for weighted
+def WRatio(s1, s2, force_ascii=True, full_process=True):
+    """
+    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.
+
+    **Steps in the order they occur**
+
+    #. Run full_process from utils on both strings
+    #. Short circuit if this makes either string empty
+    #. Take the ratio of the two processed strings (fuzz.ratio)
+    #. Run checks to compare the length of the strings
+        * If one of the strings is more than 1.5 times as long as the other
+          use partial_ratio comparisons - scale partial results by 0.9
+          (this makes sure only full results can return 100)
+        * If one of the strings is over 8 times as long as the other
+          instead scale by 0.6
+
+    #. Run the other ratio functions
+        * if using partial ratio functions call partial_ratio,
+          partial_token_sort_ratio and partial_token_set_ratio
+          scale all of these by the ratio based on length
+        * otherwise call token_sort_ratio and token_set_ratio
+        * all token based comparisons are scaled by 0.95
+          (on top of any partial scalars)
+
+    #. Take the highest value from these results
+       round it and return it as an integer.
+
+    :param s1:
+    :param s2:
+    :param force_ascii: Allow only ascii characters
+    :type force_ascii: bool
+    :param full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
+    :return:
+    """
+    return _rapidfuzz_scorer(_WRatio, s1, s2, force_ascii, full_process)
+
+
+def UWRatio(s1, s2, full_process=True):
+    """
+    Return a measure of the sequences' similarity between 0 and 100,
+    using different algorithms. Same as WRatio but preserving unicode.
+    """
+    return WRatio(s1, s2, force_ascii=False, full_process=full_process)
diff --git a/lib/thefuzz/fuzz.pyi b/lib/thefuzz/fuzz.pyi
new file mode 100644
index 00000000..86916a2b
--- /dev/null
+++ b/lib/thefuzz/fuzz.pyi
@@ -0,0 +1,10 @@
+def ratio(s1: str, s2: str) -> int: ...
+def partial_ratio(s1: str, s2: str) -> int: ...
+def token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
+def partial_token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
+def token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
+def partial_token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
+def QRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
+def UQRatio(s1: str, s2: str, full_process: bool = ...) -> int: ...
+def WRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
+def UWRatio(s1: str, s2: str, full_process: bool = ...) -> int: ...
diff --git a/lib/thefuzz/process.py b/lib/thefuzz/process.py
new file mode 100644
index 00000000..f6b15eaa
--- /dev/null
+++ b/lib/thefuzz/process.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+from . import fuzz
+from . import utils
+import logging
+from rapidfuzz import fuzz as rfuzz
+from rapidfuzz import process as rprocess
+
+_logger = logging.getLogger(__name__)
+
+default_scorer = fuzz.WRatio
+default_processor = utils.full_process
+
+
+def _get_processor(processor, scorer):
+    """
+    thefuzz runs both the default preprocessing of the function and the preprocessing
+    function passed into process.* while rapidfuzz only runs the one passed into
+    process.*. This function wraps the processor to mimic this behavior
+    """
+    if scorer not in (fuzz.WRatio, fuzz.QRatio,
+                      fuzz.token_set_ratio, fuzz.token_sort_ratio,
+                      fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
+                      fuzz.UWRatio, fuzz.UQRatio):
+        return processor
+
+    if not processor:
+        return utils.full_process
+
+    def wrapper(s):
+        return utils.full_process(processor(s))
+
+    return wrapper
+
+
+# this allows lowering the scorers back to the scorers used in rapidfuzz
+# this allows rapidfuzz to perform more optimizations behind the scenes.
+# These mapped scorers are the same with two exceptions
+# - default processor
+# - result is not rounded
+# these two exceptions need to be taken into account in the implementation
+_scorer_lowering = {
+    fuzz.ratio: rfuzz.ratio,
+    fuzz.partial_ratio: rfuzz.partial_ratio,
+    fuzz.token_set_ratio: rfuzz.token_set_ratio,
+    fuzz.token_sort_ratio: rfuzz.token_sort_ratio,
+    fuzz.partial_token_set_ratio: rfuzz.partial_token_set_ratio,
+    fuzz.partial_token_sort_ratio: rfuzz.partial_token_sort_ratio,
+    fuzz.WRatio: rfuzz.WRatio,
+    fuzz.QRatio: rfuzz.QRatio,
+    fuzz.UWRatio: rfuzz.WRatio,
+    fuzz.UQRatio: rfuzz.QRatio,
+}
+
+
+def _get_scorer(scorer):
+    """
+    rapidfuzz scorers require the score_cutoff argument to be available
+    This generates a compatible wrapper function
+    """
+    def wrapper(s1, s2, score_cutoff=0):
+        return scorer(s1, s2)
+
+    return _scorer_lowering.get(scorer, wrapper)
+
+
+def _preprocess_query(query, processor):
+    processed_query = processor(query) if processor else query
+    if len(processed_query) == 0:
+        _logger.warning("Applied processor reduces input query to empty string, "
+                        "all comparisons will have score 0. "
+                        f"[Query: \'{query}\']")
+
+    return processed_query
+
+
+def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
+    """
+    Select the best match in a list or dictionary of choices.
+
+    Find best matches in a list or dictionary of choices, return a
+    generator of tuples containing the match and its score. If a dictionary
+    is used, also returns the key for each match.
+
+    Arguments:
+        query: An object representing the thing we want to find.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query. Dictionary arguments of
+            {key: value} pairs will attempt to match the query against
+            each value.
+        processor: Optional function of the form f(a) -> b, where a is the query or
+            individual choice and b is the choice to be used in matching.
+
+            This can be used to match against, say, the first element of
+            a list:
+
+            lambda x: x[0]
+
+            Defaults to thefuzz.utils.full_process().
+        scorer: Optional function for scoring matches between the query and
+            an individual processed choice. This should be a function
+            of the form f(query, choice) -> int.
+
+            By default, fuzz.WRatio() is used and expects both query and
+            choice to be strings.
+        score_cutoff: Optional argument for score threshold. No matches with
+            a score less than this number will be returned. Defaults to 0.
+
+    Returns:
+        Generator of tuples containing the match and its score.
+
+        If a list is used for choices, then the result will be 2-tuples.
+        If a dictionary is used, then the result will be 3-tuples containing
+        the key for each match.
+
+        For example, searching for 'bird' in the dictionary
+
+        {'bard': 'train', 'dog': 'man'}
+
+        may return
+
+        ('train', 22, 'bard'), ('man', 0, 'dog')
+    """
+    is_mapping = hasattr(choices, "items")
+    is_lowered = scorer in _scorer_lowering
+
+    query = _preprocess_query(query, processor)
+    it = rprocess.extract_iter(
+        query, choices,
+        processor=_get_processor(processor, scorer),
+        scorer=_get_scorer(scorer),
+        score_cutoff=score_cutoff
+    )
+
+    for choice, score, key in it:
+        if is_lowered:
+            score = int(round(score))
+
+        yield (choice, score, key) if is_mapping else (choice, score)
+
+
+def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5):
+    """
+    Select the best match in a list or dictionary of choices.
+
+    Find best matches in a list or dictionary of choices, return a
+    list of tuples containing the match and its score. If a dictionary
+    is used, also returns the key for each match.
+
+    Arguments:
+        query: An object representing the thing we want to find.
+        choices: An iterable or dictionary-like object containing choices
+            to be matched against the query. Dictionary arguments of
+            {key: value} pairs will attempt to match the query against
+            each value.
+        processor: Optional function of the form f(a) -> b, where a is the query or
+            individual choice and b is the choice to be used in matching.
+
+            This can be used to match against, say, the first element of
+            a list:
+
+            lambda x: x[0]
+
+            Defaults to thefuzz.utils.full_process().
+        scorer: Optional function for scoring matches between the query and
+            an individual processed choice. This should be a function
+            of the form f(query, choice) -> int.
+            By default, fuzz.WRatio() is used and expects both query and
+            choice to be strings.
+        limit: Optional maximum for the number of elements returned. Defaults
+            to 5.
+
+    Returns:
+        List of tuples containing the match and its score.
+
+        If a list is used for choices, then the result will be 2-tuples.
+        If a dictionary is used, then the result will be 3-tuples containing
+        the key for each match.
+
+        For example, searching for 'bird' in the dictionary
+
+        {'bard': 'train', 'dog': 'man'}
+
+        may return
+
+        [('train', 22, 'bard'), ('man', 0, 'dog')]
+    """
+    return extractBests(query, choices, processor=processor, scorer=scorer, limit=limit)
+
+
+def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
+    """
+    Get a list of the best matches to a collection of choices.
+
+    Convenience function for getting the choices with best scores.
+
+    Args:
+        query: A string to match against
+        choices: A list or dictionary of choices, suitable for use with
+            extract().
+        processor: Optional function for transforming choices before matching.
+            See extract().
+        scorer: Scoring function for extract().
+        score_cutoff: Optional argument for score threshold. No matches with
+            a score less than this number will be returned. Defaults to 0.
+        limit: Optional maximum for the number of elements returned. Defaults
+            to 5.
+
+    Returns: A list of (match, score) tuples.
+    """
+    is_mapping = hasattr(choices, "items")
+    is_lowered = scorer in _scorer_lowering
+
+    query = _preprocess_query(query, processor)
+    results = rprocess.extract(
+        query, choices,
+        processor=_get_processor(processor, scorer),
+        scorer=_get_scorer(scorer),
+        score_cutoff=score_cutoff,
+        limit=limit
+    )
+
+    for i, (choice, score, key) in enumerate(results):
+        if is_lowered:
+            score = int(round(score))
+
+        results[i] = (choice, score, key) if is_mapping else (choice, score)
+
+    return results
+
+
+def extractOne(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
+    """
+    Find the single best match above a score in a list of choices.
+
+    This is a convenience method which returns the single best choice.
+    See extract() for the full arguments list.
+
+    Args:
+        query: A string to match against
+        choices: A list or dictionary of choices, suitable for use with
+            extract().
+        processor: Optional function for transforming choices before matching.
+            See extract().
+        scorer: Scoring function for extract().
+        score_cutoff: Optional argument for score threshold. If the best
+            match is found, but it is not greater than this number, then
+            return None anyway ("not a good enough match").  Defaults to 0.
+
+    Returns:
+        A tuple containing a single match and its score, if a match
+        was found that was above score_cutoff. Otherwise, returns None.
+    """
+    is_mapping = hasattr(choices, "items")
+    is_lowered = scorer in _scorer_lowering
+
+    query = _preprocess_query(query, processor)
+    res = rprocess.extractOne(
+        query, choices,
+        processor=_get_processor(processor, scorer),
+        scorer=_get_scorer(scorer),
+        score_cutoff=score_cutoff
+    )
+
+    if res is None:
+        return res
+
+    choice, score, key = res
+
+    if is_lowered:
+        score = int(round(score))
+
+    return (choice, score, key) if is_mapping else (choice, score)
+
+
+def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
+    """
+    This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
+    and remove duplicates. Specifically, it uses process.extract to identify duplicates that
+    score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list
+    since we assume this item contains the most entity information and returns that. It breaks string
+    length ties on an alphabetical sort.
+
+    Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
+        returned deduplicated list will likely be shorter. Raise the threshold for dedupe to be less
+        sensitive.
+
+    Args:
+        contains_dupes: A list of strings that we would like to dedupe.
+        threshold: the numerical value (0,100) point at which we expect to find duplicates.
+            Defaults to 70 out of 100
+        scorer: Optional function for scoring matches between the query and
+            an individual processed choice. This should be a function
+            of the form f(query, choice) -> int.
+            By default, fuzz.token_set_ratio() is used and expects both query and
+            choice to be strings.
+
+    Returns:
+        A deduplicated list. For example:
+
+            In: contains_dupes = ['Frodo Baggin', 'Frodo Baggins', 'F. Baggins', 'Samwise G.', 'Gandalf', 'Bilbo Baggins']
+            In: dedupe(contains_dupes)
+            Out: ['Frodo Baggins', 'Samwise G.', 'Bilbo Baggins', 'Gandalf']
+    """
+    deduped = set()
+    for item in contains_dupes:
+        matches = extractBests(item, contains_dupes, scorer=scorer, score_cutoff=threshold, limit=None)
+        deduped.add(max(matches, key=lambda x: (len(x[0]), x[0]))[0])
+
+    return list(deduped) if len(deduped) != len(contains_dupes) else contains_dupes
diff --git a/lib/thefuzz/process.pyi b/lib/thefuzz/process.pyi
new file mode 100644
index 00000000..f5cd33f0
--- /dev/null
+++ b/lib/thefuzz/process.pyi
@@ -0,0 +1,17 @@
+from collections.abc import Mapping
+import typing
+from typing import Any, Callable, Union, Tuple, Generator, TypeVar, Sequence
+
+
+ChoicesT = Union[Mapping[str, str], Sequence[str]]
+T = TypeVar('T')
+ProcessorT = Union[Callable[[str, bool], str], Callable[[Any], Any]]
+ScorerT = Callable[[str, str, bool, bool], int]
+
+
+@typing.overload
+def extractWithoutOrder(query: str, choices: Mapping[str, str], processor: ProcessorT, scorer: ScorerT, score_cutoff: int = ...) -> Generator[Tuple[str, int, str], None, None]: ...
+
+
+@typing.overload
+def extractWithoutOrder(query: str, choices: Sequence[str], processor: ProcessorT, scorer: ScorerT, score_cutoff: int = ...) -> Generator[Tuple[str, int], None, None]: ...
diff --git a/lib/thefuzz/utils.py b/lib/thefuzz/utils.py
new file mode 100644
index 00000000..bcda741d
--- /dev/null
+++ b/lib/thefuzz/utils.py
@@ -0,0 +1,22 @@
+from rapidfuzz.utils import default_process as _default_process
+
+translation_table = {i: None for i in range(128, 256)}  # ascii dammit!
+
+
+def ascii_only(s):
+    return s.translate(translation_table)
+
+
+def full_process(s, force_ascii=False):
+    """
+    Process string by
+    -- removing all but letters and numbers
+    -- trim whitespace
+    -- force to lower case
+    if force_ascii == True, force convert to ascii
+    """
+
+    if force_ascii:
+        s = ascii_only(str(s))
+
+    return _default_process(s)
diff --git a/lib/thefuzz/utils.pyi b/lib/thefuzz/utils.pyi
new file mode 100644
index 00000000..2c15b143
--- /dev/null
+++ b/lib/thefuzz/utils.pyi
@@ -0,0 +1,3 @@
+
+def ascii_only(s: str) -> str: ...
+def full_process(s: str, force_ascii: bool = ...) -> str: ...
diff --git a/recommended.txt b/recommended.txt
index 2ff506f6..5f60addf 100644
--- a/recommended.txt
+++ b/recommended.txt
@@ -10,7 +10,6 @@ lxml; 'Windows' != platform_system
 orjson; 'Windows' == platform_system
 orjson; 'Linux' == platform_system and ('x86_64' == platform_machine or 'aarch64' == platform_machine or 'armv7l' == platform_machine)
 pip
-Levenshtein
 rapidfuzz < 4.0.0
 regex
 setuptools
diff --git a/sickgear/tv.py b/sickgear/tv.py
index 6e631d61..2108073a 100644
--- a/sickgear/tv.py
+++ b/sickgear/tv.py
@@ -56,7 +56,10 @@ from .tv_base import TVEpisodeBase, TVShowBase
 from lib import imdbpie, subliminal
 from lib.dateutil import tz
 from lib.dateutil.parser import parser as du_parser
-from lib.fuzzywuzzy import fuzz
+try:
+    from lib.thefuzz import fuzz
+except ImportError as e:
+    from lib.fuzzywuzzy import fuzz
 from lib.tvinfo_base import RoleTypes, TVINFO_FACEBOOK, TVINFO_INSTAGRAM, TVINFO_SLUG, TVINFO_TWITTER, \
     TVINFO_WIKIPEDIA, TVINFO_TIKTOK, TVINFO_FANSITE, TVINFO_YOUTUBE, TVINFO_REDDIT, TVINFO_LINKEDIN, TVINFO_WIKIDATA
 from lib.tvinfo_base.exceptions import *
diff --git a/sickgear/webserve.py b/sickgear/webserve.py
index 6be4c120..e76b2f45 100644
--- a/sickgear/webserve.py
+++ b/sickgear/webserve.py
@@ -88,7 +88,10 @@ from lib import subliminal
 from lib.cfscrape import CloudflareScraper
 from lib.dateutil import tz, zoneinfo
 from lib.dateutil.relativedelta import relativedelta
-from lib.fuzzywuzzy import fuzz
+try:
+    from lib.thefuzz import fuzz
+except ImportError as e:
+    from lib.fuzzywuzzy import fuzz
 from lib.api_trakt import TraktAPI
 from lib.api_trakt.exceptions import TraktException, TraktAuthException
 from lib.tvinfo_base import TVInfoEpisode