mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-15 01:15:05 +00:00
Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520)
Removes Levenshtein requirement with direct use of rapidfuzz instead. Falls back to old fuzzywuzzy for pure python implementation.
This commit is contained in:
parent
af37db4aad
commit
2a73990c82
11 changed files with 531 additions and 3 deletions
|
@ -6,6 +6,7 @@
|
||||||
* Update package resource API 67.5.1 (f51eccd) to 68.1.2 (1ef36f2)
|
* Update package resource API 67.5.1 (f51eccd) to 68.1.2 (1ef36f2)
|
||||||
* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
|
* Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb)
|
||||||
* Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
|
* Update Tornado Web Server 6.3.2 (e3aa6c5) to 6.3.3 (e4d6984)
|
||||||
|
* Add thefuzz 0.19.0 (c2cd4f4) as a replacement with fallback to fuzzywuzzy 0.18.0 (2188520)
|
||||||
* Fix regex that was not using py312 notation
|
* Fix regex that was not using py312 notation
|
||||||
* Change sort backlog and manual segment search results episode number
|
* Change sort backlog and manual segment search results episode number
|
||||||
* Change sort episodes when set to wanted on display show page
|
* Change sort episodes when set to wanted on display show page
|
||||||
|
|
1
lib/thefuzz/__init__.py
Normal file
1
lib/thefuzz/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
__version__ = '0.19.0'
|
160
lib/thefuzz/fuzz.py
Normal file
160
lib/thefuzz/fuzz.py
Normal file
|
@ -0,0 +1,160 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
from rapidfuzz.fuzz import (
|
||||||
|
ratio as _ratio,
|
||||||
|
partial_ratio as _partial_ratio,
|
||||||
|
token_set_ratio as _token_set_ratio,
|
||||||
|
token_sort_ratio as _token_sort_ratio,
|
||||||
|
partial_token_set_ratio as _partial_token_set_ratio,
|
||||||
|
partial_token_sort_ratio as _partial_token_sort_ratio,
|
||||||
|
WRatio as _WRatio,
|
||||||
|
QRatio as _QRatio,
|
||||||
|
)
|
||||||
|
|
||||||
|
from . import utils
|
||||||
|
|
||||||
|
###########################
|
||||||
|
# Basic Scoring Functions #
|
||||||
|
###########################
|
||||||
|
|
||||||
|
|
||||||
|
def _rapidfuzz_scorer(scorer, s1, s2, force_ascii, full_process):
|
||||||
|
"""
|
||||||
|
wrapper around rapidfuzz function to be compatible with the API of thefuzz
|
||||||
|
"""
|
||||||
|
if full_process:
|
||||||
|
if s1 is None or s2 is None:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
s1 = utils.full_process(s1, force_ascii=force_ascii)
|
||||||
|
s2 = utils.full_process(s2, force_ascii=force_ascii)
|
||||||
|
|
||||||
|
return int(round(scorer(s1, s2)))
|
||||||
|
|
||||||
|
|
||||||
|
def ratio(s1, s2):
    """
    Return rapidfuzz's ratio score for the two strings, rounded to an integer.

    The inputs are scored as given; no preprocessing is applied.
    """
    return _rapidfuzz_scorer(
        _ratio, s1, s2, force_ascii=False, full_process=False
    )
|
||||||
|
|
||||||
|
|
||||||
|
def partial_ratio(s1, s2):
    """
    Return the ratio of the most similar substring
    as a number between 0 and 100.

    The inputs are scored as given; no preprocessing is applied.
    """
    return _rapidfuzz_scorer(
        _partial_ratio, s1, s2, force_ascii=False, full_process=False
    )
|
||||||
|
|
||||||
|
|
||||||
|
##############################
|
||||||
|
# Advanced Scoring Functions #
|
||||||
|
##############################
|
||||||
|
|
||||||
|
# Sorted Token
|
||||||
|
# find all alphanumeric tokens in the string
|
||||||
|
# sort those tokens and take ratio of resulting joined strings
|
||||||
|
# controls for unordered string elements
|
||||||
|
def token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return a measure of the sequences' similarity between 0 and 100
    but sorting the token before comparing.

    :param s1: first string
    :param s2: second string
    :param force_ascii: passed to utils.full_process when preprocessing (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score rounded to an integer
    """
    return _rapidfuzz_scorer(
        _token_sort_ratio, s1, s2,
        force_ascii=force_ascii, full_process=full_process
    )
|
||||||
|
|
||||||
|
|
||||||
|
def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return the ratio of the most similar substring as a number between
    0 and 100 but sorting the token before comparing.

    :param s1: first string
    :param s2: second string
    :param force_ascii: passed to utils.full_process when preprocessing (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score rounded to an integer
    """
    return _rapidfuzz_scorer(
        _partial_token_sort_ratio, s1, s2,
        force_ascii=force_ascii, full_process=full_process
    )
|
||||||
|
|
||||||
|
|
||||||
|
def token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return rapidfuzz's token_set_ratio score for the two strings,
    rounded to an integer.

    :param s1: first string
    :param s2: second string
    :param force_ascii: passed to utils.full_process when preprocessing (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score rounded to an integer
    """
    return _rapidfuzz_scorer(
        _token_set_ratio, s1, s2,
        force_ascii=force_ascii, full_process=full_process
    )
|
||||||
|
|
||||||
|
|
||||||
|
def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Return rapidfuzz's partial_token_set_ratio score for the two strings,
    rounded to an integer.

    :param s1: first string
    :param s2: second string
    :param force_ascii: passed to utils.full_process when preprocessing (Default: True)
    :param full_process: preprocess both inputs before scoring (Default: True)
    :return: similarity score rounded to an integer
    """
    return _rapidfuzz_scorer(
        _partial_token_set_ratio, s1, s2,
        force_ascii=force_ascii, full_process=full_process
    )
|
||||||
|
|
||||||
|
|
||||||
|
###################
|
||||||
|
# Combination API #
|
||||||
|
###################
|
||||||
|
|
||||||
|
# q is for quick
|
||||||
|
def QRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Quick ratio comparison between two strings.

    When full_process is true, both strings are run through full_process
    from utils before being scored, and a None input scores 0.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ASCII characters (Default: True)
    :full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
    :return: similarity ratio
    """
    return _rapidfuzz_scorer(
        _QRatio, s1, s2,
        force_ascii=force_ascii, full_process=full_process
    )
|
||||||
|
|
||||||
|
|
||||||
|
def UQRatio(s1, s2, full_process=True):
    """
    Unicode quick ratio

    Delegates to QRatio with force_ascii switched off.

    :param s1:
    :param s2:
    :full_process: forwarded unchanged to QRatio (Default: True)
    :return: similarity ratio
    """
    # positional order of QRatio is (s1, s2, force_ascii, full_process)
    return QRatio(s1, s2, False, full_process)
|
||||||
|
|
||||||
|
|
||||||
|
# w is for weighted
|
||||||
|
def WRatio(s1, s2, force_ascii=True, full_process=True):
    """
    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

    The weighting itself is delegated to rapidfuzz's WRatio; this function
    only adds the optional preprocessing and rounds the result.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit if this makes either string empty
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Run checks to compare the length of the strings
        * If one of the strings is more than 1.5 times as long as the other
          use partial_ratio comparisons - scale partial results by 0.9
          (this makes sure only full results can return 100)
        * If one of the strings is over 8 times as long as the other
          instead scale by 0.6

    #. Run the other ratio functions
        * if using partial ratio functions call partial_ratio,
          partial_token_sort_ratio and partial_token_set_ratio
          scale all of these by the ratio based on length
        * otherwise call token_sort_ratio and token_set_ratio
        * all token based comparisons are scaled by 0.95
          (on top of any partial scalars)

    #. Take the highest value from these results
       round it and return it as an integer.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :full_process: Process inputs, used here to avoid double processing in extract functions (Default: True)
    :return: similarity score rounded to an integer
    """
    return _rapidfuzz_scorer(
        _WRatio, s1, s2,
        force_ascii=force_ascii, full_process=full_process
    )
|
||||||
|
|
||||||
|
|
||||||
|
def UWRatio(s1, s2, full_process=True):
    """
    Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms. Same as WRatio but preserving unicode.

    :param s1:
    :param s2:
    :full_process: forwarded unchanged to WRatio (Default: True)
    :return: similarity score rounded to an integer
    """
    # positional order of WRatio is (s1, s2, force_ascii, full_process)
    return WRatio(s1, s2, False, full_process)
|
10
lib/thefuzz/fuzz.pyi
Normal file
10
lib/thefuzz/fuzz.pyi
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
def ratio(s1: str, s2: str) -> int: ...
|
||||||
|
def partial_ratio(s1: str, s2: str) -> int: ...
|
||||||
|
def token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
|
||||||
|
def partial_token_sort_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
|
||||||
|
def token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
|
||||||
|
def partial_token_set_ratio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
|
||||||
|
def QRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
|
||||||
|
def UQRatio(s1: str, s2: str, full_process: bool = ...) -> int: ...
|
||||||
|
def WRatio(s1: str, s2: str, force_ascii: bool = ..., full_process: bool = ...) -> int: ...
|
||||||
|
def UWRatio(s1: str, s2: str, full_process: bool = ...) -> int: ...
|
309
lib/thefuzz/process.py
Normal file
309
lib/thefuzz/process.py
Normal file
|
@ -0,0 +1,309 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
from . import fuzz
|
||||||
|
from . import utils
|
||||||
|
import logging
|
||||||
|
from rapidfuzz import fuzz as rfuzz
|
||||||
|
from rapidfuzz import process as rprocess
|
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
default_scorer = fuzz.WRatio
|
||||||
|
default_processor = utils.full_process
|
||||||
|
|
||||||
|
|
||||||
|
def _get_processor(processor, scorer):
    """
    thefuzz runs both the default preprocessing of the function and the preprocessing
    function passed into process.* while rapidfuzz only runs the one passed into
    process.*. This function wraps the processor to mimic this behavior

    The default preprocessing honours the scorer's own force_ascii default:
    the U* scorers (UWRatio, UQRatio) call their counterpart with
    force_ascii=False, every other preprocessing scorer defaults to
    force_ascii=True. Without this, process.* would score non-ASCII input
    differently from calling the same scorer directly.

    :param processor: user supplied processor, may be None/falsy
    :param scorer: scorer that process.* was called with
    :return: processor to hand to rapidfuzz; the user processor unchanged when
        the scorer does no preprocessing of its own
    """
    if scorer not in (fuzz.WRatio, fuzz.QRatio,
                      fuzz.token_set_ratio, fuzz.token_sort_ratio,
                      fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
                      fuzz.UWRatio, fuzz.UQRatio):
        return processor

    force_ascii = scorer not in (fuzz.UWRatio, fuzz.UQRatio)

    def pre_processor(s):
        return utils.full_process(s, force_ascii=force_ascii)

    # running full_process on top of itself adds nothing, so apply the
    # scorer's preprocessing only once
    if not processor or processor == utils.full_process:
        return pre_processor

    def wrapper(s):
        return pre_processor(processor(s))

    return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
# this allows lowering the scorers back to the scorers used in rapidfuzz
|
||||||
|
# this allows rapidfuzz to perform more optimizations behind the scenes.
|
||||||
|
# These mapped scorers are the same with two exceptions
|
||||||
|
# - default processor
|
||||||
|
# - result is not rounded
|
||||||
|
# these two exceptions need to be taken into account in the implementation
|
||||||
|
_scorer_lowering = {
|
||||||
|
fuzz.ratio: rfuzz.ratio,
|
||||||
|
fuzz.partial_ratio: rfuzz.partial_ratio,
|
||||||
|
fuzz.token_set_ratio: rfuzz.token_set_ratio,
|
||||||
|
fuzz.token_sort_ratio: rfuzz.token_sort_ratio,
|
||||||
|
fuzz.partial_token_set_ratio: rfuzz.partial_token_set_ratio,
|
||||||
|
fuzz.partial_token_sort_ratio: rfuzz.partial_token_sort_ratio,
|
||||||
|
fuzz.WRatio: rfuzz.WRatio,
|
||||||
|
fuzz.QRatio: rfuzz.QRatio,
|
||||||
|
fuzz.UWRatio: rfuzz.WRatio,
|
||||||
|
fuzz.UQRatio: rfuzz.QRatio,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _get_scorer(scorer):
    """
    rapidfuzz scorers require the score_cutoff argument to be available
    This generates a compatible wrapper function

    Scorers listed in _scorer_lowering are replaced by their mapped rapidfuzz
    scorer; any other callable is wrapped so that it accepts (and ignores)
    score_cutoff.
    """
    def _accepts_cutoff(s1, s2, score_cutoff=0):
        # score_cutoff is accepted for signature compatibility only
        return scorer(s1, s2)

    lowered = _scorer_lowering.get(scorer)
    if lowered is not None:
        return lowered
    return _accepts_cutoff
|
||||||
|
|
||||||
|
|
||||||
|
def _preprocess_query(query, processor):
|
||||||
|
processed_query = processor(query) if processor else query
|
||||||
|
if len(processed_query) == 0:
|
||||||
|
_logger.warning("Applied processor reduces input query to empty string, "
|
||||||
|
"all comparisons will have score 0. "
|
||||||
|
f"[Query: \'{query}\']")
|
||||||
|
|
||||||
|
return processed_query
|
||||||
|
|
||||||
|
|
||||||
|
def extractWithoutOrder(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
    """
    Select the best match in a list or dictionary of choices.

    Find best matches in a list or dictionary of choices, return a
    generator of tuples containing the match and its score. If a dictionary
    is used, also returns the key for each match. Results are yielded as
    rapidfuzz produces them; no sorting is done here.

    Arguments:
        query: An object representing the thing we want to find.
        choices: An iterable or dictionary-like object containing choices
            to be matched against the query. Dictionary arguments of
            {key: value} pairs will attempt to match the query against
            each value.
        processor: Optional function of the form f(a) -> b, where a is the query or
            individual choice and b is the choice to be used in matching.

            This can be used to match against, say, the first element of
            a list:

            lambda x: x[0]

            Defaults to thefuzz.utils.full_process().
        scorer: Optional function for scoring matches between the query and
            an individual processed choice. This should be a function
            of the form f(query, choice) -> int.

            By default, fuzz.WRatio() is used and expects both query and
            choice to be strings.
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.

    Returns:
        Generator of tuples containing the match and its score.

        If a list is used for choices, then the result will be 2-tuples.
        If a dictionary is used, then the result will be 3-tuples containing
        the key for each match.

        For example, searching for 'bird' in the dictionary

        {'bard': 'train', 'dog': 'man'}

        may return

        ('train', 22, 'bard'), ('man', 0, 'dog')
    """
    yields_keys = hasattr(choices, "items")
    # lowered scorers return unrounded rapidfuzz scores that must be rounded here
    needs_rounding = scorer in _scorer_lowering

    processed_query = _preprocess_query(query, processor)
    scored = rprocess.extract_iter(
        processed_query, choices,
        processor=_get_processor(processor, scorer),
        scorer=_get_scorer(scorer),
        score_cutoff=score_cutoff
    )

    for choice, score, key in scored:
        if needs_rounding:
            score = int(round(score))

        if yields_keys:
            yield choice, score, key
        else:
            yield choice, score
|
||||||
|
|
||||||
|
|
||||||
|
def extract(query, choices, processor=default_processor, scorer=default_scorer, limit=5):
    """
    Select the best match in a list or dictionary of choices.

    Find best matches in a list or dictionary of choices, return a
    list of tuples containing the match and its score. If a dictionary
    is used, also returns the key for each match. This is extractBests()
    called with its default score_cutoff.

    Arguments:
        query: An object representing the thing we want to find.
        choices: An iterable or dictionary-like object containing choices
            to be matched against the query. Dictionary arguments of
            {key: value} pairs will attempt to match the query against
            each value.
        processor: Optional function of the form f(a) -> b, where a is the query or
            individual choice and b is the choice to be used in matching.

            This can be used to match against, say, the first element of
            a list:

            lambda x: x[0]

            Defaults to thefuzz.utils.full_process().
        scorer: Optional function for scoring matches between the query and
            an individual processed choice. This should be a function
            of the form f(query, choice) -> int.
            By default, fuzz.WRatio() is used and expects both query and
            choice to be strings.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5.

    Returns:
        List of tuples containing the match and its score.

        If a list is used for choices, then the result will be 2-tuples.
        If a dictionary is used, then the result will be 3-tuples containing
        the key for each match.

        For example, searching for 'bird' in the dictionary

        {'bard': 'train', 'dog': 'man'}

        may return

        [('train', 22, 'bard'), ('man', 0, 'dog')]
    """
    best_matches = extractBests(
        query, choices,
        processor=processor,
        scorer=scorer,
        limit=limit
    )
    return best_matches
|
||||||
|
|
||||||
|
|
||||||
|
def extractBests(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0, limit=5):
    """
    Get a list of the best matches to a collection of choices.

    Convenience function for getting the choices with best scores.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with
            extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold. No matches with
            a score less than this number will be returned. Defaults to 0.
        limit: Optional maximum for the number of elements returned. Defaults
            to 5.

    Returns: A list of (match, score) tuples, or (match, score, key) tuples
        when choices is dictionary-like.
    """
    keyed = hasattr(choices, "items")
    # lowered scorers return unrounded rapidfuzz scores that must be rounded here
    lowered = scorer in _scorer_lowering

    processed_query = _preprocess_query(query, processor)
    hits = rprocess.extract(
        processed_query, choices,
        processor=_get_processor(processor, scorer),
        scorer=_get_scorer(scorer),
        score_cutoff=score_cutoff,
        limit=limit
    )

    def _shape(choice, score, key):
        if lowered:
            score = int(round(score))
        return (choice, score, key) if keyed else (choice, score)

    return [_shape(*hit) for hit in hits]
|
||||||
|
|
||||||
|
|
||||||
|
def extractOne(query, choices, processor=default_processor, scorer=default_scorer, score_cutoff=0):
    """
    Find the single best match above a score in a list of choices.

    This is a convenience method which returns the single best choice.
    See extract() for the full arguments list.

    Args:
        query: A string to match against
        choices: A list or dictionary of choices, suitable for use with
            extract().
        processor: Optional function for transforming choices before matching.
            See extract().
        scorer: Scoring function for extract().
        score_cutoff: Optional argument for score threshold, forwarded to
            rapidfuzz. If no choice satisfies it, None is returned
            ("not a good enough match"). Defaults to 0.

    Returns:
        A tuple containing a single match and its score (plus its key when
        choices is dictionary-like), if a match was found that satisfied
        score_cutoff. Otherwise, returns None.
    """
    keyed = hasattr(choices, "items")
    # lowered scorers return unrounded rapidfuzz scores that must be rounded here
    lowered = scorer in _scorer_lowering

    processed_query = _preprocess_query(query, processor)
    best = rprocess.extractOne(
        processed_query, choices,
        processor=_get_processor(processor, scorer),
        scorer=_get_scorer(scorer),
        score_cutoff=score_cutoff
    )

    if best is None:
        return None

    choice, score, key = best
    final_score = int(round(score)) if lowered else score

    if keyed:
        return choice, final_score, key
    return choice, final_score
|
||||||
|
|
||||||
|
|
||||||
|
def dedupe(contains_dupes, threshold=70, scorer=fuzz.token_set_ratio):
    """
    This convenience function takes a list of strings containing duplicates and uses fuzzy matching to identify
    and remove duplicates. Specifically, it uses process.extractBests to identify duplicates that
    score at least a user defined threshold. Then, it looks for the longest item in the duplicate list
    since we assume this item contains the most entity information and returns that. It breaks string
    length ties on an alphabetical sort.

    Note: as the threshold DECREASES the number of duplicates that are found INCREASES. This means that the
        returned deduplicated list will likely be shorter. Raise the threshold for dedupe to be less
        sensitive.

    An item for which nothing reaches the threshold (not even the item itself,
    e.g. a string that preprocesses to empty) is kept as is instead of raising.

    Args:
        contains_dupes: A list of strings that we would like to dedupe.
        threshold: the numerical value (0,100) point at which we expect to find duplicates.
            Defaults to 70 out of 100
        scorer: Optional function for scoring matches between the query and
            an individual processed choice. This should be a function
            of the form f(query, choice) -> int.
            By default, fuzz.token_set_ratio() is used and expects both query and
            choice to be strings.

    Returns:
        A deduplicated list. For example:

            In: contains_dupes = ['Frodo Baggin', 'Frodo Baggins', 'F. Baggins', 'Samwise G.', 'Gandalf', 'Bilbo Baggins']
            In: dedupe(contains_dupes)
            Out: ['Frodo Baggins', 'Samwise G.', 'Bilbo Baggins', 'Gandalf']

        The input object itself is returned when no duplicates were found.
    """
    deduped = set()
    for item in contains_dupes:
        matches = extractBests(item, contains_dupes, scorer=scorer, score_cutoff=threshold, limit=None)
        if not matches:
            # max() raises ValueError on an empty sequence; with no candidate
            # at or above the threshold the item can only represent itself
            deduped.add(item)
            continue

        # canonical entry: longest match, ties broken by the greater string
        deduped.add(max(matches, key=lambda x: (len(x[0]), x[0]))[0])

    return list(deduped) if len(deduped) != len(contains_dupes) else contains_dupes
|
17
lib/thefuzz/process.pyi
Normal file
17
lib/thefuzz/process.pyi
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
from collections.abc import Mapping
|
||||||
|
import typing
|
||||||
|
from typing import Any, Callable, Union, Tuple, Generator, TypeVar, Sequence
|
||||||
|
|
||||||
|
|
||||||
|
ChoicesT = Union[Mapping[str, str], Sequence[str]]
|
||||||
|
T = TypeVar('T')
|
||||||
|
ProcessorT = Union[Callable[[str, bool], str], Callable[[Any], Any]]
|
||||||
|
ScorerT = Callable[[str, str, bool, bool], int]
|
||||||
|
|
||||||
|
|
||||||
|
@typing.overload
|
||||||
|
def extractWithoutOrder(query: str, choices: Mapping[str, str], processor: ProcessorT, scorer: ScorerT, score_cutoff: int = ...) -> Generator[Tuple[str, int, str], None, None]: ...
|
||||||
|
|
||||||
|
|
||||||
|
@typing.overload
|
||||||
|
def extractWithoutOrder(query: str, choices: Sequence[str], processor: ProcessorT, scorer: ScorerT, score_cutoff: int = ...) -> Generator[Tuple[str, int], None, None]: ...
|
22
lib/thefuzz/utils.py
Normal file
22
lib/thefuzz/utils.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
from rapidfuzz.utils import default_process as _default_process
|
||||||
|
|
||||||
|
translation_table = {i: None for i in range(128, 256)} # ascii dammit!
|
||||||
|
|
||||||
|
|
||||||
|
def ascii_only(s):
    """
    Return s with every character listed in translation_table removed.

    NOTE(review): translation_table only covers code points 128-255, so
    characters above U+00FF pass through unchanged.
    """
    stripped = s.translate(translation_table)
    return stripped
|
||||||
|
|
||||||
|
|
||||||
|
def full_process(s, force_ascii=False):
    """
    Process string by
    -- removing all but letters and numbers
    -- trim whitespace
    -- force to lower case
    if force_ascii == True, force convert to ascii

    The normalisation itself is done by rapidfuzz's default_process; when
    force_ascii is set the input is first converted with str() and passed
    through ascii_only.
    """
    text = ascii_only(str(s)) if force_ascii else s
    return _default_process(text)
|
3
lib/thefuzz/utils.pyi
Normal file
3
lib/thefuzz/utils.pyi
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
|
||||||
|
def ascii_only(s: str) -> str: ...
|
||||||
|
def full_process(s: str, force_ascii: bool = ...) -> str: ...
|
|
@ -10,7 +10,6 @@ lxml; 'Windows' != platform_system
|
||||||
orjson; 'Windows' == platform_system
|
orjson; 'Windows' == platform_system
|
||||||
orjson; 'Linux' == platform_system and ('x86_64' == platform_machine or 'aarch64' == platform_machine or 'armv7l' == platform_machine)
|
orjson; 'Linux' == platform_system and ('x86_64' == platform_machine or 'aarch64' == platform_machine or 'armv7l' == platform_machine)
|
||||||
pip
|
pip
|
||||||
Levenshtein
|
|
||||||
rapidfuzz < 4.0.0
|
rapidfuzz < 4.0.0
|
||||||
regex
|
regex
|
||||||
setuptools
|
setuptools
|
||||||
|
|
|
@ -56,7 +56,10 @@ from .tv_base import TVEpisodeBase, TVShowBase
|
||||||
from lib import imdbpie, subliminal
|
from lib import imdbpie, subliminal
|
||||||
from lib.dateutil import tz
|
from lib.dateutil import tz
|
||||||
from lib.dateutil.parser import parser as du_parser
|
from lib.dateutil.parser import parser as du_parser
|
||||||
from lib.fuzzywuzzy import fuzz
|
try:
|
||||||
|
from lib.thefuzz import fuzz
|
||||||
|
except ImportError as e:
|
||||||
|
from lib.fuzzywuzzy import fuzz
|
||||||
from lib.tvinfo_base import RoleTypes, TVINFO_FACEBOOK, TVINFO_INSTAGRAM, TVINFO_SLUG, TVINFO_TWITTER, \
|
from lib.tvinfo_base import RoleTypes, TVINFO_FACEBOOK, TVINFO_INSTAGRAM, TVINFO_SLUG, TVINFO_TWITTER, \
|
||||||
TVINFO_WIKIPEDIA, TVINFO_TIKTOK, TVINFO_FANSITE, TVINFO_YOUTUBE, TVINFO_REDDIT, TVINFO_LINKEDIN, TVINFO_WIKIDATA
|
TVINFO_WIKIPEDIA, TVINFO_TIKTOK, TVINFO_FANSITE, TVINFO_YOUTUBE, TVINFO_REDDIT, TVINFO_LINKEDIN, TVINFO_WIKIDATA
|
||||||
from lib.tvinfo_base.exceptions import *
|
from lib.tvinfo_base.exceptions import *
|
||||||
|
|
|
@ -88,7 +88,10 @@ from lib import subliminal
|
||||||
from lib.cfscrape import CloudflareScraper
|
from lib.cfscrape import CloudflareScraper
|
||||||
from lib.dateutil import tz, zoneinfo
|
from lib.dateutil import tz, zoneinfo
|
||||||
from lib.dateutil.relativedelta import relativedelta
|
from lib.dateutil.relativedelta import relativedelta
|
||||||
from lib.fuzzywuzzy import fuzz
|
try:
|
||||||
|
from lib.thefuzz import fuzz
|
||||||
|
except ImportError as e:
|
||||||
|
from lib.fuzzywuzzy import fuzz
|
||||||
from lib.api_trakt import TraktAPI
|
from lib.api_trakt import TraktAPI
|
||||||
from lib.api_trakt.exceptions import TraktException, TraktAuthException
|
from lib.api_trakt.exceptions import TraktException, TraktAuthException
|
||||||
from lib.tvinfo_base import TVInfoEpisode
|
from lib.tvinfo_base import TVInfoEpisode
|
||||||
|
|
Loading…
Reference in a new issue