From 2a73990c82f35bb94a853bebd3a51ed4df408c58 Mon Sep 17 00:00:00 2001 From: Prinz23 Date: Fri, 8 Sep 2023 00:20:37 +0200 Subject: [PATCH] Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520) Removes Levenshtein requirement with direct use of rapidfuzz instead Fallback to old fuzzywuzzy for pure python implementation --- CHANGES.md | 1 + lib/thefuzz/__init__.py | 1 + lib/thefuzz/fuzz.py | 160 +++++++++++++++++++++ lib/thefuzz/fuzz.pyi | 10 ++ lib/thefuzz/process.py | 309 ++++++++++++++++++++++++++++++++++++++++ lib/thefuzz/process.pyi | 17 +++ lib/thefuzz/utils.py | 22 +++ lib/thefuzz/utils.pyi | 3 + recommended.txt | 1 - sickgear/tv.py | 5 +- sickgear/webserve.py | 5 +- 11 files changed, 531 insertions(+), 3 deletions(-) create mode 100644 lib/thefuzz/__init__.py create mode 100644 lib/thefuzz/fuzz.py create mode 100644 lib/thefuzz/fuzz.pyi create mode 100644 lib/thefuzz/process.py create mode 100644 lib/thefuzz/process.pyi create mode 100644 lib/thefuzz/utils.py create mode 100644 lib/thefuzz/utils.pyi diff --git a/CHANGES.md b/CHANGES.md index d70debd4..cf9d9fd1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,7 @@ * Update package resource API 67.5.1 (f51eccd) to 68.1.2 (1ef36f2) * Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb) * Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984) +* Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520) * Fix regex that was not using py312 notation * Change sort backlog and manual segment search results episode number * Change sort episodes when set to wanted on display show page diff --git a/lib/thefuzz/__init__.py b/lib/thefuzz/__init__.py new file mode 100644 index 00000000..482e4a19 --- /dev/null +++ b/lib/thefuzz/__init__.py @@ -0,0 +1 @@ +__version__ = '0.19.0' diff --git a/lib/thefuzz/fuzz.py b/lib/thefuzz/fuzz.py new file mode 100644 index 00000000..7d86e483 --- /dev/null +++ b/lib/thefuzz/fuzz.py @@ -0,0 +1,160 @@ 
+#!/usr/bin/env python + +from rapidfuzz.fuzz import ( + ratio as _ratio, + partial_ratio as _partial_ratio, + token_set_ratio as _token_set_ratio, + token_sort_ratio as _token_sort_ratio, + partial_token_set_ratio as _partial_token_set_ratio, + partial_token_sort_ratio as _partial_token_sort_ratio, + WRatio as _WRatio, + QRatio as _QRatio, +) + +from . import utils + +########################### +# Basic Scoring Functions # +########################### + + +def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process): + """ + wrapper around rapidfuzz function to be compatible with the API of thefuzz + """ + if full_process: + if s1 is None or s2 is None: + return 0 + + s1 = utils.full_process(s1, force_ascii=force_ascii) + s2 = utils.full_process(s2, force_ascii=force_ascii) + + return int(round(scorer(s1, s2))) + + +def ratio(s1, s2): + return _rapidfuzz_scorer(_ratio, s1, s2, False, False) + + +def partial_ratio(s1, s2): + """ + Return the ratio of the most similar substring + as a number between 0 and 100. + """ + return _rapidfuzz_scorer(_partial_ratio, s1, s2, False, False) + + +############################## +# Advanced Scoring Functions # +############################## + +# Sorted Token +# find all alphanumeric tokens in the string +# sort those tokens and take ratio of resulting joined strings +# controls for unordered string elements +def token_sort_ratio(s1, s2, force_ascii=True, full_process=True): + """ + Return a measure of the sequences' similarity between 0 and 100 + but sorting the token before comparing. + """ + return _rapidfuzz_scorer(_token_sort_ratio, s1, s2, force_ascii, full_process) + + +def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True): + """ + Return the ratio of the most similar substring as a number between + 0 and 100 but sorting the token before comparing. 
+ """ + return _rapidfuzz_scorer( + _partial_token_sort_ratio, s1, s2, force_ascii, full_process + ) + + +def token_set_ratio(s1, s2, force_ascii=True, full_process=True): + return _rapidfuzz_scorer(_token_set_ratio, s1, s2, force_ascii, full_process) + + +def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True): + return _rapidfuzz_scorer( + _partial_token_set_ratio, s1, s2, force_ascii, full_process + ) + + +################### +# Combination API # +################### + +# q is for quick +def QRatio(s1, s2, force_ascii=True, full_process=True): + """ + Quick ratio comparison between two strings. + + Runs full_process from utils on both strings + Short circuits if either of the strings is empty after processing. + + :param s1: + :param s2: + :param force_ascii: Allow only ASCII characters (Default: True) + :full_process: Process inputs, used here to avoid double processing in extract functions (Default: True) + :return: similarity ratio + """ + return _rapidfuzz_scorer(_QRatio, s1, s2, force_ascii, full_process) + + +def UQRatio(s1, s2, full_process=True): + """ + Unicode quick ratio + + Calls QRatio with force_ascii set to False + + :param s1: + :param s2: + :return: similarity ratio + """ + return QRatio(s1, s2, force_ascii=False, full_process=full_process) + + +# w is for weighted +def WRatio(s1, s2, force_ascii=True, full_process=True): + """ + Return a measure of the sequences' similarity between 0 and 100, using different algorithms. + + **Steps in the order they occur** + + #. Run full_process from utils on both strings + #. Short circuit if this makes either string empty + #. Take the ratio of the two processed strings (fuzz.ratio) + #. 
Run checks to compare the length of the strings + * If one of the strings is more than 1.5 times as long as the other + use partial_ratio comparisons - scale partial results by 0.9 + (this makes sure only full results can return 100) + * If one of the strings is over 8 times as long as the other + instead scale by 0.6 + + #. Run the other ratio functions + * if using partial ratio functions call partial_ratio, + partial_token_sort_ratio and partial_token_set_ratio + scale all of these by the ratio based on length + * otherwise call token_sort_ratio and token_set_ratio + * all token based comparisons are scaled by 0.95 + (on top of any partial scalars) + + #. Take the highest value from these results + round it and return it as an integer. + + :param s1: + :param s2: + :param force_ascii: Allow only ascii characters + :type force_ascii: bool + :full_process: Process inputs, used here to avoid double processing in extract functions (Default: True) + :return: + """ + return _rapidfuzz_scorer(_WRatio, s1, s2, force_ascii, full_process) + + +def UWRatio(s1, s2, full_process=True): + """ + Return a measure of the sequences' similarity between 0 and 100, + using different algorithms. Same as WRatio but preserving unicode. + """ + return WRatio(s1, s2, force_ascii=False, full_process=full_process) diff --git a/lib/thefuzz/fuzz.pyi b/lib/thefuzz/fuzz.pyi new file mode 100644 index 00000000..86916a2b --- /dev/null +++ b/lib/thefuzz/fuzz.pyi @@ -0,0 +1,10 @@ +def ratio(s1: str, s2: str) -> int: ... +def partial_ratio(s1: str, s2: str) -> int: ... +def token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ... +def partial_token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ... +def token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ... +def partial_token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ... 
+def QRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ... +def UQRatio(s1: str, s2: str, full_process: bool = ...) -> int: ... +def WRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ... +def UWRatio(s1: str, s2: str, full_process: bool = ...) -> int: ... diff --git a/lib/thefuzz/process.py b/lib/thefuzz/process.py new file mode 100644 index 00000000..f6b15eaa --- /dev/null +++ b/lib/thefuzz/process.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python +from . import fuzz +from . import utils +import logging +from rapidfuzz import fuzz as rfuzz +from rapidfuzz import process as rprocess + +_logger = logging.getLogger(__name__) + +default_scorer = fuzz.WRatio +default_processor = utils.full_process + + +def _get_processor(processor, scorer): + """ + thefuzz runs both the default preprocessing of the function and the preprocessing + function passed into process.* while rapidfuzz only runs the one passed into + process.*. This function wraps the processor to mimic this behavior + """ + if scorer not in (fuzz.WRatio, fuzz.QRatio, + fuzz.token_set_ratio, fuzz.token_sort_ratio, + fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio, + fuzz.UWRatio, fuzz.UQRatio): + return processor + + if not processor: + return utils.full_process + + def wrapper(s): + return utils.full_process(processor(s)) + + return wrapper + + +# this allows lowering the scorers back to the scorers used in rapidfuzz +# this allows rapidfuzz to perform more optimizations behind the scenes. 
+# These mapped scorers are the same with two exceptions +# - default processor +# - result is not rounded +# these two exceptions need to be taken into account in the implementation +_scorer_lowering = { + fuzz.ratio: rfuzz.ratio, + fuzz.partial_ratio: rfuzz.partial_ratio, + fuzz.token_set_ratio: rfuzz.token_set_ratio, + fuzz.token_sort_ratio: rfuzz.token_sort_ratio, + fuzz.partial_token_set_ratio: rfuzz.partial_token_set_ratio, + fuzz.partial_token_sort_ratio: rfuzz.partial_token_sort_ratio, + fuzz.WRatio: rfuzz.WRatio, + fuzz.QRatio: rfuzz.QRatio, + fuzz.UWRatio: rfuzz.WRatio, + fuzz.UQRatio: rfuzz.QRatio, +} + + +def _get_scorer(scorer): + """ + rapidfuzz scorers require the score_cutoff argument to be available + This generates a compatible wrapper function + """ + def wrapper(s1, s2, score_cutoff=0): + return scorer(s1, s2) + + return _scorer_lowering.get(scorer, wrapper) + + +def _preprocess_query(query, processor): + processed_query = processor(query) if processor else query + if len(processed_query) == 0: + _logger.warning("Applied processor reduces input query to empty string, " + "all comparisons will have score 0. " + f"[Query: \'{query}\']") + + return processed_query + + +def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0): + """ + Select the best match in a list or dictionary of choices. + + Find best matches in a list or dictionary of choices, return a + generator of tuples containing the match and its score. If a dictionary + is used, also returns the key for each match. + + Arguments: + query: An object representing the thing we want to find. + choices: An iterable or dictionary-like object containing choices + to be matched against the query. Dictionary arguments of + {key: value} pairs will attempt to match the query against + each value. + processor: Optional function of the form f(a) -> b, where a is the query or + individual choice and b is the choice to be used in matching. 
+ + This can be used to match against, say, the first element of + a list: + + lambda x: x[0] + + Defaults to thefuzz.utils.full_process(). + scorer: Optional function for scoring matches between the query and + an individual processed choice. This should be a function + of the form f(query, choice) -> int. + + By default, fuzz.WRatio() is used and expects both query and + choice to be strings. + score_cutoff: Optional argument for score threshold. No matches with + a score less than this number will be returned. Defaults to 0. + + Returns: + Generator of tuples containing the match and its score. + + If a list is used for choices, then the result will be 2-tuples. + If a dictionary is used, then the result will be 3-tuples containing + the key for each match. + + For example, searching for 'bird' in the dictionary + + {'bard': 'train', 'dog': 'man'} + + may return + + ('train', 22, 'bard'), ('man', 0, 'dog') + """ + is_mapping = hasattr(choices, "items") + is_lowered = scorer in _scorer_lowering + + query = _preprocess_query(query, processor) + it = rprocess.extract_iter( + query, choices, + processor=_get_processor(processor, scorer), + scorer=_get_scorer(scorer), + score_cutoff=score_cutoff + ) + + for choice, score, key in it: + if is_lowered: + score = int(round(score)) + + yield (choice, score, key) if is_mapping else (choice, score) + + +def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5): + """ + Select the best match in a list or dictionary of choices. + + Find best matches in a list or dictionary of choices, return a + list of tuples containing the match and its score. If a dictionary + is used, also returns the key for each match. + + Arguments: + query: An object representing the thing we want to find. + choices: An iterable or dictionary-like object containing choices + to be matched against the query. Dictionary arguments of + {key: value} pairs will attempt to match the query against + each value. 
+ processor: Optional function of the form f(a) -> b, where a is the query or + individual choice and b is the choice to be used in matching. + + This can be used to match against, say, the first element of + a list: + + lambda x: x[0] + + Defaults to thefuzz.utils.full_process(). + scorer: Optional function for scoring matches between the query and + an individual processed choice. This should be a function + of the form f(query, choice) -> int. + By default, fuzz.WRatio() is used and expects both query and + choice to be strings. + limit: Optional maximum for the number of elements returned. Defaults + to 5. + + Returns: + List of tuples containing the match and its score. + + If a list is used for choices, then the result will be 2-tuples. + If a dictionary is used, then the result will be 3-tuples containing + the key for each match. + + For example, searching for 'bird' in the dictionary + + {'bard': 'train', 'dog': 'man'} + + may return + + [('train', 22, 'bard'), ('man', 0, 'dog')] + """ + return extractBests(query, choices, processor=processor, scorer=scorer, limit=limit) + + +def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5): + """ + Get a list of the best matches to a collection of choices. + + Convenience function for getting the choices with best scores. + + Args: + query: A string to match against + choices: A list or dictionary of choices, suitable for use with + extract(). + processor: Optional function for transforming choices before matching. + See extract(). + scorer: Scoring function for extract(). + score_cutoff: Optional argument for score threshold. No matches with + a score less than this number will be returned. Defaults to 0. + limit: Optional maximum for the number of elements returned. Defaults + to 5. + + Returns: A list of (match, score) tuples. 
+ """ + is_mapping = hasattr(choices, "items") + is_lowered = scorer in _scorer_lowering + + query = _preprocess_query(query, processor) + results = rprocess.extract( + query, choices, + processor=_get_processor(processor, scorer), + scorer=_get_scorer(scorer), + score_cutoff=score_cutoff, + limit=limit + ) + + for i, (choice, score, key) in enumerate(results): + if is_lowered: + score = int(round(score)) + + results[i] = (choice, score, key) if is_mapping else (choice, score) + + return results + + +def extractOne(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0): + """ + Find the single best match above a score in a list of choices. + + This is a convenience method which returns the single best choice. + See extract() for the full arguments list. + + Args: + query: A string to match against + choices: A list or dictionary of choices, suitable for use with + extract(). + processor: Optional function for transforming choices before matching. + See extract(). + scorer: Scoring function for extract(). + score_cutoff: Optional argument for score threshold. If the best + match is found, but it is not greater than this number, then + return None anyway ("not a good enough match"). Defaults to 0. + + Returns: + A tuple containing a single match and its score, if a match + was found that was above score_cutoff. Otherwise, returns None. 
+ """ + is_mapping = hasattr(choices, "items") + is_lowered = scorer in _scorer_lowering + + query = _preprocess_query(query, processor) + res = rprocess.extractOne( + query, choices, + processor=_get_processor(processor, scorer), + scorer=_get_scorer(scorer), + score_cutoff=score_cutoff + ) + + if res is None: + return res + + choice, score, key = res + + if is_lowered: + score = int(round(score)) + + return (choice, score, key) if is_mapping else (choice, score) + + +def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio): + """ + This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify + and remove duplicates. Specifically, it uses process.extract to identify duplicates that + score greater than a user defined threshold. Then, it looks for the longest item in the duplicate list + since we assume this item contains the most entity information and returns that. It breaks string + length ties on an alphabetical sort. + + Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the + returned deduplicated list will likely be shorter. Raise the threshold for dedupe to be less + sensitive. + + Args: + contains_dupes: A list of strings that we would like to dedupe. + threshold: the numerical value (0,100) point at which we expect to find duplicates. + Defaults to 70 out of 100 + scorer: Optional function for scoring matches between the query and + an individual processed choice. This should be a function + of the form f(query, choice) -> int. + By default, fuzz.token_set_ratio() is used and expects both query and + choice to be strings. + + Returns: + A deduplicated list. For example: + + In: contains_dupes = ['Frodo Baggin', 'Frodo Baggins', 'F. 
Baggins', 'Samwise G.', 'Gandalf', 'Bilbo Baggins'] + In: dedupe(contains_dupes) + Out: ['Frodo Baggins', 'Samwise G.', 'Bilbo Baggins', 'Gandalf'] + """ + deduped = set() + for item in contains_dupes: + matches = extractBests(item, contains_dupes, scorer=scorer, score_cutoff=threshold, limit=None) + deduped.add(max(matches, key=lambda x: (len(x[0]), x[0]))[0]) + + return list(deduped) if len(deduped) != len(contains_dupes) else contains_dupes diff --git a/lib/thefuzz/process.pyi b/lib/thefuzz/process.pyi new file mode 100644 index 00000000..f5cd33f0 --- /dev/null +++ b/lib/thefuzz/process.pyi @@ -0,0 +1,17 @@ +from collections.abc import Mapping +import typing +from typing import Any, Callable, Union, Tuple, Generator, TypeVar, Sequence + + +ChoicesT = Union[Mapping[str, str], Sequence[str]] +T = TypeVar('T') +ProcessorT = Union[Callable[[str, bool], str], Callable[[Any], Any]] +ScorerT = Callable[[str, str, bool, bool], int] + + +@typing.overload +def extractWithoutOrder(query: str, choices: Mapping[str, str], processor: ProcessorT, scorer: ScorerT, score_cutoff: int = ...) -> Generator[Tuple[str, int, str], None, None]: ... + + +@typing.overload +def extractWithoutOrder(query: str, choices: Sequence[str], processor: ProcessorT, scorer: ScorerT, score_cutoff: int = ...) -> Generator[Tuple[str, int], None, None]: ... diff --git a/lib/thefuzz/utils.py b/lib/thefuzz/utils.py new file mode 100644 index 00000000..bcda741d --- /dev/null +++ b/lib/thefuzz/utils.py @@ -0,0 +1,22 @@ +from rapidfuzz.utils import default_process as _default_process + +translation_table = {i: None for i in range(128, 256)} # ascii dammit! 
+ + +def ascii_only(s): + return s.translate(translation_table) + + +def full_process(s, force_ascii=False): + """ + Process string by + -- removing all but letters and numbers + -- trim whitespace + -- force to lower case + if force_ascii == True, force convert to ascii + """ + + if force_ascii: + s = ascii_only(str(s)) + + return _default_process(s) diff --git a/lib/thefuzz/utils.pyi b/lib/thefuzz/utils.pyi new file mode 100644 index 00000000..2c15b143 --- /dev/null +++ b/lib/thefuzz/utils.pyi @@ -0,0 +1,3 @@ + +def ascii_only(s: str) -> str: ... +def full_process(s: str, force_ascii: bool = ...) -> str: ... diff --git a/recommended.txt b/recommended.txt index 2ff506f6..5f60addf 100644 --- a/recommended.txt +++ b/recommended.txt @@ -10,7 +10,6 @@ lxml; 'Windows' != platform_system orjson; 'Windows' == platform_system orjson; 'Linux' == platform_system and ('x86_64' == platform_machine or 'aarch64' == platform_machine or 'armv7l' == platform_machine) pip -Levenshtein rapidfuzz < 4.0.0 regex setuptools diff --git a/sickgear/tv.py b/sickgear/tv.py index 6e631d61..2108073a 100644 --- a/sickgear/tv.py +++ b/sickgear/tv.py @@ -56,7 +56,10 @@ from .tv_base import TVEpisodeBase, TVShowBase from lib import imdbpie, subliminal from lib.dateutil import tz from lib.dateutil.parser import parser as du_parser -from lib.fuzzywuzzy import fuzz +try: + from lib.thefuzz import fuzz +except ImportError as e: + from lib.fuzzywuzzy import fuzz from lib.tvinfo_base import RoleTypes, TVINFO_FACEBOOK, TVINFO_INSTAGRAM, TVINFO_SLUG, TVINFO_TWITTER, \ TVINFO_WIKIPEDIA, TVINFO_TIKTOK, TVINFO_FANSITE, TVINFO_YOUTUBE, TVINFO_REDDIT, TVINFO_LINKEDIN, TVINFO_WIKIDATA from lib.tvinfo_base.exceptions import * diff --git a/sickgear/webserve.py b/sickgear/webserve.py index 6be4c120..e76b2f45 100644 --- a/sickgear/webserve.py +++ b/sickgear/webserve.py @@ -88,7 +88,10 @@ from lib import subliminal from lib.cfscrape import CloudflareScraper from lib.dateutil import tz, zoneinfo from 
lib.dateutil.relativedelta import relativedelta -from lib.fuzzywuzzy import fuzz +try: + from lib.thefuzz import fuzz +except ImportError as e: + from lib.fuzzywuzzy import fuzz from lib.api_trakt import TraktAPI from lib.api_trakt.exceptions import TraktException, TraktAuthException from lib.tvinfo_base import TVInfoEpisode