#!/usr/bin/env python # encoding: utf-8 from __future__ import unicode_literals import platform import warnings try: from .StringMatcher import StringMatcher as SequenceMatcher except ImportError: if platform.python_implementation() != "PyPy": warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') from difflib import SequenceMatcher from . import utils ########################### # Basic Scoring Functions # ########################### @utils.check_for_none @utils.check_for_equivalence @utils.check_empty_string def ratio(s1, s2): s1, s2 = utils.make_type_consistent(s1, s2) m = SequenceMatcher(None, s1, s2) return utils.intr(100 * m.ratio()) @utils.check_for_none @utils.check_for_equivalence @utils.check_empty_string def partial_ratio(s1, s2): """"Return the ratio of the most similar substring as a number between 0 and 100.""" s1, s2 = utils.make_type_consistent(s1, s2) if len(s1) <= len(s2): shorter = s1 longer = s2 else: shorter = s2 longer = s1 m = SequenceMatcher(None, shorter, longer) blocks = m.get_matching_blocks() # each block represents a sequence of matching characters in a string # of the form (idx_1, idx_2, len) # the best partial match will block align with at least one of those blocks # e.g. shorter = "abcd", longer = XXXbcdeEEE # block = (1,3,3) # best score === ratio("abcd", "Xbcd") scores = [] for block in blocks: long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0 long_end = long_start + len(shorter) long_substr = longer[long_start:long_end] m2 = SequenceMatcher(None, shorter, long_substr) r = m2.ratio() if r > .995: return 100 else: scores.append(r) return utils.intr(100 * max(scores)) ############################## # Advanced Scoring Functions # ############################## def _process_and_sort(s, force_ascii, full_process=True): """Return a cleaned string with token sorted.""" # pull tokens ts = utils.full_process(s, force_ascii=force_ascii) if full_process else s tokens = ts.split() # sort tokens and join sorted_string = u" ".join(sorted(tokens)) return sorted_string.strip() # Sorted Token # find all alphanumeric tokens in the string # sort those tokens and take ratio of resulting joined strings # controls for unordered string elements @utils.check_for_none def _token_sort(s1, s2, partial=True, force_ascii=True, full_process=True): sorted1 = _process_and_sort(s1, force_ascii, full_process=full_process) sorted2 = _process_and_sort(s2, force_ascii, full_process=full_process) if partial: return partial_ratio(sorted1, sorted2) else: return ratio(sorted1, sorted2) def token_sort_ratio(s1, s2, force_ascii=True, full_process=True): """Return a measure of the sequences' similarity between 0 and 100 but sorting the token before comparing. """ return _token_sort(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process) def partial_token_sort_ratio(s1, s2, force_ascii=True, full_process=True): """Return the ratio of the most similar substring as a number between 0 and 100 but sorting the token before comparing. """ return _token_sort(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process) @utils.check_for_none def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True): """Find all alphanumeric tokens in each string... - treat them as a set - construct two strings of the form: <sorted_intersection><sorted_remainder> - take ratios of those two strings - controls for unordered partial matches""" if not full_process and s1 == s2: return 100 p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1 p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2 if not utils.validate_string(p1): return 0 if not utils.validate_string(p2): return 0 # pull tokens tokens1 = set(p1.split()) tokens2 = set(p2.split()) intersection = tokens1.intersection(tokens2) diff1to2 = tokens1.difference(tokens2) diff2to1 = tokens2.difference(tokens1) sorted_sect = " ".join(sorted(intersection)) sorted_1to2 = " ".join(sorted(diff1to2)) sorted_2to1 = " ".join(sorted(diff2to1)) combined_1to2 = sorted_sect + " " + sorted_1to2 combined_2to1 = sorted_sect + " " + sorted_2to1 # strip sorted_sect = sorted_sect.strip() combined_1to2 = combined_1to2.strip() combined_2to1 = combined_2to1.strip() if partial: ratio_func = partial_ratio else: ratio_func = ratio pairwise = [ ratio_func(sorted_sect, combined_1to2), ratio_func(sorted_sect, combined_2to1), ratio_func(combined_1to2, combined_2to1) ] return max(pairwise) def token_set_ratio(s1, s2, force_ascii=True, full_process=True): return _token_set(s1, s2, partial=False, force_ascii=force_ascii, full_process=full_process) def partial_token_set_ratio(s1, s2, force_ascii=True, full_process=True): return _token_set(s1, s2, partial=True, force_ascii=force_ascii, full_process=full_process) ################### # Combination API # ################### # q is for quick def QRatio(s1, s2, force_ascii=True, full_process=True): """ Quick ratio comparison between two strings. Runs full_process from utils on both strings Short circuits if either of the strings is empty after processing. :param s1: :param s2: :param force_ascii: Allow only ASCII characters (Default: True) :full_process: Process inputs, used here to avoid double processing in extract functions (Default: True) :return: similarity ratio """ if full_process: p1 = utils.full_process(s1, force_ascii=force_ascii) p2 = utils.full_process(s2, force_ascii=force_ascii) else: p1 = s1 p2 = s2 if not utils.validate_string(p1): return 0 if not utils.validate_string(p2): return 0 return ratio(p1, p2) def UQRatio(s1, s2, full_process=True): """ Unicode quick ratio Calls QRatio with force_ascii set to False :param s1: :param s2: :return: similarity ratio """ return QRatio(s1, s2, force_ascii=False, full_process=full_process) # w is for weighted def WRatio(s1, s2, force_ascii=True, full_process=True): """ Return a measure of the sequences' similarity between 0 and 100, using different algorithms. **Steps in the order they occur** #. Run full_process from utils on both strings #. Short circuit if this makes either string empty #. Take the ratio of the two processed strings (fuzz.ratio) #. Run checks to compare the length of the strings * If one of the strings is more than 1.5 times as long as the other use partial_ratio comparisons - scale partial results by 0.9 (this makes sure only full results can return 100) * If one of the strings is over 8 times as long as the other instead scale by 0.6 #. Run the other ratio functions * if using partial ratio functions call partial_ratio, partial_token_sort_ratio and partial_token_set_ratio scale all of these by the ratio based on length * otherwise call token_sort_ratio and token_set_ratio * all token based comparisons are scaled by 0.95 (on top of any partial scalars) #. Take the highest value from these results round it and return it as an integer. :param s1: :param s2: :param force_ascii: Allow only ascii characters :type force_ascii: bool :full_process: Process inputs, used here to avoid double processing in extract functions (Default: True) :return: """ if full_process: p1 = utils.full_process(s1, force_ascii=force_ascii) p2 = utils.full_process(s2, force_ascii=force_ascii) else: p1 = s1 p2 = s2 if not utils.validate_string(p1): return 0 if not utils.validate_string(p2): return 0 # should we look at partials? try_partial = True unbase_scale = .95 partial_scale = .90 base = ratio(p1, p2) len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2)) # if strings are similar length, don't use partials if len_ratio < 1.5: try_partial = False # if one string is much much shorter than the other if len_ratio > 8: partial_scale = .6 if try_partial: partial = partial_ratio(p1, p2) * partial_scale ptsor = partial_token_sort_ratio(p1, p2, full_process=False) \ * unbase_scale * partial_scale ptser = partial_token_set_ratio(p1, p2, full_process=False) \ * unbase_scale * partial_scale return utils.intr(max(base, partial, ptsor, ptser)) else: tsor = token_sort_ratio(p1, p2, full_process=False) * unbase_scale tser = token_set_ratio(p1, p2, full_process=False) * unbase_scale return utils.intr(max(base, tsor, tser)) def UWRatio(s1, s2, full_process=True): """Return a measure of the sequences' similarity between 0 and 100, using different algorithms. Same as WRatio but preserving unicode. """ return WRatio(s1, s2, force_ascii=False, full_process=full_process)