SickGear/lib/imdb/parser/sql/cutils.c

270 lines
7 KiB
C
Raw Normal View History

/*
* cutils.c module.
*
* Miscellaneous functions to speed up the IMDbPY package.
*
* Contents:
* - pyratcliff():
* Function that implements the Ratcliff-Obershelp comparison
* amongst Python strings.
*
* - pysoundex():
* Return a soundex code string, for the given string.
*
* Copyright 2004-2009 Davide Alberani <da@erlug.linux.it>
* Released under the GPL license.
*
* NOTE: The Ratcliff-Obershelp part was heavily based on code from the
* "simil" Python module.
* The "simil" module is copyright of Luca Montecchiani <cbm64 _at_ inwind.it>
* and can be found here: http://spazioinwind.libero.it/montecchiani/
* It was released under the GPL license; the original comments are left
* below.
*
*/
/*========== Ratcliff-Obershelp ==========*/
/*****************************************************************************
*
* Stolen code from :
*
* [Python-Dev] Why is soundex marked obsolete?
* by Eric S. Raymond [4]esr@thyrsus.com
* on Sun, 14 Jan 2001 14:09:01 -0500
*
*****************************************************************************/
/*****************************************************************************
*
* Ratcliff-Obershelp common-subpattern similarity.
*
* This code first appeared in a letter to the editor in Dr. Dobb's
* Journal, 11/1988. The original article on the algorithm,
* "Pattern Matching by Gestalt" by John Ratcliff, had appeared in the
* July 1988 issue (#181) but the algorithm was presented in assembly.
* The main drawback of the Ratcliff-Obershelp algorithm is the cost
* of the pairwise comparisons. It is significantly more expensive
* than stemming, Hamming distance, soundex, and the like.
*
* Running time quadratic in the data size, memory usage constant.
*
*****************************************************************************/
#include <Python.h>
/* Result codes of strings_check(); ratcliff() returns the first two
 * unchanged as the similarity value. */
/* One of the strings is empty, or their lengths differ too much. */
#define DONTCOMPARE_NULL 0.0
/* The two strings are identical. */
#define DONTCOMPARE_SAME 1.0
/* The strings must go through the full Ratcliff-Obershelp comparison. */
#define COMPARE 2.0
/* Minimum shorter/longer length ratio; below it strings_check() refuses
 * to compare the strings. */
#define STRING_MAXLENDIFFER 0.7
/* As of 05 Mar 2008, the longest title is ~600 chars. */
/* Capacity (excluding the terminating NUL) of the fixed-size string
 * buffers declared in this file. */
#define MXLINELEN 1023
/* Larger of two values; each argument may be evaluated twice. */
#define MAX(a,b) ((a) > (b) ? (a) : (b))
/*
 * Preliminary check: decide whether two strings are worth a full
 * Ratcliff-Obershelp comparison.
 *
 * Returns:
 *   DONTCOMPARE_NULL  if either string is empty, or if the ratio
 *                     shorter length / longer length is below
 *                     STRING_MAXLENDIFFER;
 *   DONTCOMPARE_SAME  if the strings are identical;
 *   COMPARE           otherwise.
 */
static float
strings_check(char const *s, char const *t)
{
    size_t s_len = strlen(s);   /* length of s */
    size_t t_len = strlen(t);   /* length of t */
    float ratio;                /* shorter length / longer length */

    /* Empty strings?  Test each length on its own: a product of the two
     * lengths can overflow and wrap around to zero. */
    if (s_len == 0 || t_len == 0)
        return (DONTCOMPARE_NULL);
    /* The same? */
    if (strcmp(s, t) == 0)
        return (DONTCOMPARE_SAME);
    /* We don't want to compare strings of too different lengths. */
    if (s_len < t_len)
        ratio = (float) s_len / (float) t_len;
    else
        ratio = (float) t_len / (float) s_len;
    if (ratio < STRING_MAXLENDIFFER)
        return (DONTCOMPARE_NULL);
    /* Proceed with the real comparison. */
    return (COMPARE);
}
/*
 * Total number of characters shared by the ranges [st1, end1) and
 * [st2, end2), computed the Ratcliff-Obershelp way: find the longest
 * common substring, then recurse on the text to its right and on the
 * text to its left, summing the lengths found.
 *
 * Both ranges are half-open: the end pointers point one past the last
 * character.  Returns 0 when either range is empty, and also when both
 * ranges hold exactly one character.
 */
static int
RatcliffObershelp(char *st1, char *end1, char *st2, char *end2)
{
    char *a1, *a2;              /* candidate match start in each range */
    char *b1, *b2;              /* last useful start positions (exclusive) */
    char *s1 = st1, *s2 = st2;  /* start of the longest match found so far */
    int max, i;

    if (end1 <= st1 || end2 <= st2)
        return (0);
    if (end1 == st1 + 1 && end2 == st2 + 1)
        return (0);
    max = 0;
    b1 = end1;
    b2 = end2;
    for (a1 = st1; a1 < b1; a1++) {
        for (a2 = st2; a2 < b2; a2++) {
            if (*a1 != *a2)
                continue;
            /* Determine the length of the common substring.  The match
             * must stop at the end of BOTH ranges: when this call handles
             * a left-hand piece, the characters after end1/end2 belong to
             * a match that was already counted by the caller. */
            for (i = 1;
                 a1 + i < end1 && a2 + i < end2 && a1[i] == a2[i];
                 i++)
                continue;
            if (i > max) {
                max = i;
                s1 = a1;
                s2 = a2;
                /* A start position closer than max to the end of its
                 * range cannot yield a longer match: stop before it. */
                b1 = end1 - max;
                b2 = end2 - max;
            }
        }
    }
    if (!max)
        return (0);
    max += RatcliffObershelp(s1 + max, end1, s2 + max, end2);   /* rhs */
    max += RatcliffObershelp(st1, s1, st2, s2);                 /* lhs */
    return max;
}
/*
 * Ratcliff-Obershelp similarity of two strings.
 *
 * strings_check() is consulted first; unless it answers COMPARE its
 * verdict is returned as is.  Otherwise the result is twice the number of
 * characters matched by RatcliffObershelp(), divided by the sum of the
 * two string lengths.
 */
static float
ratcliff(char *s1, char *s2)
{
    float verdict = strings_check(s1, s2);
    int len1, len2, common;

    /* Shortcut: nothing to compute for this pair. */
    if (verdict != COMPARE)
        return verdict;

    len1 = strlen(s1);
    len2 = strlen(s2);
    common = RatcliffObershelp(s1, s1 + len1, s2, s2 + len2);
    return 2.0 * common / (len1 + len2);
}
/*
 * Change a NUL-terminated string to lowercase, in place, one byte at a
 * time with tolower().
 */
static void
strtolower(char *s1)
{
    char *p;

    /* Walk up to the terminator instead of calling strlen() on every
     * iteration.  Each byte goes through unsigned char because tolower()
     * is undefined for negative values, which plain char can hold for
     * non-ASCII bytes. */
    for (p = s1; *p != '\0'; p++)
        *p = (char) tolower((unsigned char) *p);
}
/*
 * Ratcliff-Obershelp for two python strings; returns a python float.
 *
 * Python signature: ratcliff(s1, s2[, ignored]).
 * The comparison is done on lowercased copies of the arguments; only the
 * first MXLINELEN bytes of each string are taken into account.
 * Returns NULL (with the exception set by PyArg_ParseTuple) when the
 * arguments cannot be parsed.
 */
static PyObject*
pyratcliff(PyObject *self, PyObject *pArgs)
{
    char *s1 = NULL;
    char *s2 = NULL;
    PyObject *discard = NULL;
    char s1copy[MXLINELEN+1];
    char s2copy[MXLINELEN+1];

    /* The optional PyObject parameter is here to be compatible
     * with the pure python implementation, which uses a
     * difflib.SequenceMatcher object. */
    if (!PyArg_ParseTuple(pArgs, "ss|O", &s1, &s2, &discard))
        return NULL;

    /* Work on copies.  strncpy() does not write a terminator when the
     * source fills the whole count, so terminate explicitly. */
    strncpy(s1copy, s1, MXLINELEN);
    s1copy[MXLINELEN] = '\0';
    strncpy(s2copy, s2, MXLINELEN);
    s2copy[MXLINELEN] = '\0';

    strtolower(s1copy);
    strtolower(s2copy);
    return Py_BuildValue("f", ratcliff(s1copy, s2copy));
}
/*========== soundex ==========*/
/* Max length of the soundex code to output (an uppercase char and
 * _at most_ 4 digits). */
#define SOUNDEX_LEN 5
/* Group number lookup table, indexed by (uppercase letter - 'A').
 * Each entry is the digit character of the letter's group; 0 marks the
 * letters that contribute no digit to the code.  The table is read-only. */
static const char soundTable[26] = {
    /* A */ 0,   /* B */ '1', /* C */ '2', /* D */ '3', /* E */ 0,
    /* F */ '1', /* G */ '2', /* H */ 0,   /* I */ 0,   /* J */ '2',
    /* K */ '2', /* L */ '4', /* M */ '5', /* N */ '5', /* O */ 0,
    /* P */ '1', /* Q */ '2', /* R */ '6', /* S */ '2', /* T */ '3',
    /* U */ 0,   /* V */ '1', /* W */ 0,   /* X */ '2', /* Y */ 0,
    /* Z */ '2'
};
/*
 * Soundex code for a python string; returns a python string, or None
 * when the argument contains no ASCII letter.
 *
 * Only the ASCII letters of the input are considered, uppercased.  The
 * code is the first letter followed by the soundTable digits of the
 * following letters, where letters with no digit are skipped and a digit
 * equal to the previously emitted one is dropped.  The code holds at most
 * SOUNDEX_LEN characters.
 * Returns NULL (with the exception set by PyArg_ParseTuple) when the
 * argument cannot be parsed.
 */
static PyObject*
pysoundex(PyObject *self, PyObject *pArgs)
{
    int j = 0;                      /* characters written to soundCode */
    char *s = NULL;
    char soundCode[SOUNDEX_LEN+1];

    if (!PyArg_ParseTuple(pArgs, "s", &s))
        return NULL;

    /* Consume the input directly: no intermediate copy of the letters is
     * kept, so there is no fixed-size buffer an arbitrarily long argument
     * could overflow. */
    for (; *s != '\0' && j < SOUNDEX_LEN; s++) {
        /* toupper() is undefined for negative values: go through
         * unsigned char. */
        int c = toupper((unsigned char) *s);
        char digit;

        /* Exclude everything but ASCII letters. */
        if (c < 'A' || c > 'Z')
            continue;
        /* The first letter is copied verbatim. */
        if (j == 0) {
            soundCode[j++] = (char) c;
            continue;
        }
        digit = soundTable[c - 'A'];
        /* Compact zeroes and equal consecutive digits
         * ("12234112"->"123412"). */
        if (digit != 0 && digit != soundCode[j-1])
            soundCode[j++] = digit;
    }
    if (j == 0) {
        /* If the string holds no letter, returns None. */
        return Py_BuildValue("");
    }
    soundCode[j] = '\0';
    return Py_BuildValue("s", soundCode);
}
/* Functions exported by the module.  Each row gives the Python-visible
 * name, the C function implementing it, the calling convention flag and
 * the docstring. */
static PyMethodDef cutils_methods[] = {
    {"ratcliff", pyratcliff, METH_VARARGS, "Ratcliff-Obershelp similarity."},
    {"soundex", pysoundex, METH_VARARGS, "Soundex code for strings."},
    {NULL, NULL, 0, NULL}   /* all-zero row ending the table */
};
void
initcutils(void)
{
Py_InitModule("cutils", cutils_methods);
}