#ifndef similcmp_c
#define similcmp_c 1

#ifdef LIB_STRINGS
#include <strings.h>
#else
#include <string.h>
#endif

/*

    File:    similcmp.c
    Authors: John Ratcliffe, and an anonymous coder.
    Purpose: Better approximate string equality test.
    Functions exported:  simil
    Internal functions:  GCsubstr

Description:
  See below.  I'll replace this one eventually with my own
  algorithm which does sophisticated compound-grapheme analysis
  and returns a degree of phonetic similarity and a probability
  that the transformations made were valid.


  'ghoti==fish' (Match = 90%, Prob = 1%) ;-)
  lauGH = f       match 100% prob 30%
  wOmen = i       match  90% prob 10%
  staTIon = sh    match 100% prob 5%

*/

/* Ratcliff/Obershelp pattern matching
 *
 *      Approximate word matching: takes two words and returns a
 *      similarity score based on co-occurrence of subpatterns.
 *
 *      This code appeared in a letter to the editor in DDJ, 11/88.
 *      The original article on the pattern matching, "Pattern Matching
 *      by Gestalt" by John Ratcliff, appeared in the July 1988
 *      issue (#181) but the algorithm was presented in assembly.  
 *
 *      The 11/88 issue also contained another verison in C which was
 *      a more faithful translation of the original; it has the
 *      advantage of not being recursive.
 *
 *      The algorithm seems to work nicely for a variety of test cases.
 *      The main drawback of the algorithm is the cost of the pairwise
 *      comparisons.  It is significantly more expensive than stemming,
 *      soundex, and the like.  Might consider using this as a second
 *      phase...
 */

int
#ifdef PROTOTYPES
GCsubstr(char *st1, char *end1, char *st2, char *end2)
#else
GCsubstr(st1, end1, st2, end2)
  char *st1;
  char *end1;
  char *st2;
  char *end2;
#endif
{
register char *a1, *a2;
char *b1, *s1, *b2, *s2;
short max, i;

  if (end1 <= st1 || end2 <= st2) return(0);
  if (end1 == st1 + 1 && end2 == st2 + 1) return(0);
                
  max = 0; b1 = end1; b2 = end2;
        
  for (a1 = st1; a1 < b1; a1++) {
    for (a2 = st2; a2 < b2; a2++) {
      if (*a1 == *a2) {
        /* determine length of common substring */
        for (i = 1; a1[i] && (a1[i] == a2[i]); i++) /* do nothing */;
        if (i > max) {
          max = i; s1 = a1; s2 = a2;
          b1 = end1 - max; b2 = end2 - max;
        }
      }
    }
  }
  if (!max) return(0);
  max += GCsubstr(s1 + max, end1, s2 + max, end2);        /* rhs */
  max += GCsubstr(st1, s1, st2, s2);                      /* lhs */
  return(max);
}

int
#ifdef PROTOTYPES
simil(char *s1, char *s2)
#else
simil(s1, s2)
 char *s1;
 char *s2;
#endif
{
int l1 = strlen(s1), l2 = strlen(s2);
 if (strcmp(s1, s2) == 0) return(100); /* exact match end-case */
 return(200 * GCsubstr(s1, s1 + l1, s2, s2 + l2) / (l1 + l2));
}
#endif /* similcmp_c */

