#ifndef similcmp_c
#define similcmp_c 1
#ifdef LIB_STRINGS
#include
#else
#include
#endif
/*
File: similcmp.c
Authors: John Ratcliffe, and an anonymous coder.
Purpose: Better approximate string equality test.
Functions exported: simil
Internal functions: GCsubstr
Description:
See below. I'll replace this one eventually with my own
algorithm which does sophisticated compound-grapheme analysis
and returns a degree of phonetic similarity and a probability
that the transformations made were valid.
'ghoti==fish' (Match = 90%, Prob = 1%) ;-)
lauGH = f match 100% prob 30%
wOmen = i match 90% prob 10%
staTIon = sh match 100% prob 5%
*/
/* Ratcliff/Obershelp pattern matching
*
* Approximate word matching: takes two words and returns a
* similarity score based on co-occurrence of subpatterns.
*
* This code appeared in a letter to the editor in DDJ, 11/88.
* The original article on the pattern matching, "Pattern Matching
* by Gestalt" by John Ratcliff, appeared in the July 1988
* issue (#181) but the algorithm was presented in assembly.
*
* The 11/88 issue also contained another verison in C which was
* a more faithful translation of the original; it has the
* advantage of not being recursive.
*
* The algorithm seems to work nicely for a variety of test cases.
* The main drawback of the algorithm is the cost of the pairwise
* comparisons. It is significantly more expensive than stemming,
* soundex, and the like. Might consider using this as a second
* phase...
*/
int
#ifdef PROTOTYPES
GCsubstr(char *st1, char *end1, char *st2, char *end2)
#else
GCsubstr(st1, end1, st2, end2)
char *st1;
char *end1;
char *st2;
char *end2;
#endif
{
register char *a1, *a2;
char *b1, *s1, *b2, *s2;
short max, i;
if (end1 <= st1 || end2 <= st2) return(0);
if (end1 == st1 + 1 && end2 == st2 + 1) return(0);
max = 0; b1 = end1; b2 = end2;
for (a1 = st1; a1 < b1; a1++) {
for (a2 = st2; a2 < b2; a2++) {
if (*a1 == *a2) {
/* determine length of common substring */
for (i = 1; a1[i] && (a1[i] == a2[i]); i++) /* do nothing */;
if (i > max) {
max = i; s1 = a1; s2 = a2;
b1 = end1 - max; b2 = end2 - max;
}
}
}
}
if (!max) return(0);
max += GCsubstr(s1 + max, end1, s2 + max, end2); /* rhs */
max += GCsubstr(st1, s1, st2, s2); /* lhs */
return(max);
}
int
#ifdef PROTOTYPES
simil(char *s1, char *s2)
#else
simil(s1, s2)
char *s1;
char *s2;
#endif
{
int l1 = strlen(s1), l2 = strlen(s2);
if (strcmp(s1, s2) == 0) return(100); /* exact match end-case */
return(200 * GCsubstr(s1, s1 + l1, s2, s2 + l2) / (l1 + l2));
}
#endif /* similcmp_c */