/*

   Keywords: spelling checker, spell check, dictionary check,
   fast password checking algorithm, hash table, word lists,
   probabilistic, probability, dictionary compression, scrabble,
   boggle, word games, small memory, small ram, handhelds, palmtops,
   download, C source.

   A probabilistic dictionary checking algorithm using a hash table.
   Graham Toal, Texas, 10 July 98.  (*19*98, for those of you reading
   the old archives)


   There is an old algorithm for checking to see if a word exists
   which uses a single bit boolean flag in a hash table, set if the
   word was entered into the table.  Everyone reinvents this algorithm
   every few years and then forgets about it, because much better
   algorithms exist which don't give false positives on the occasional
   invalid word.  However, for a computer that's cramped for RAM (like
   the ones I was brought up on, and nowadays on handhelds such as
   the PalmPilot) there's a use for this code again.

   In particular, if writing a Scrabble for the PalmPilot you might want
   to have the complete official dictionary present for verifying if a
   play by the opponent is an allowed word or not.  There may not be enough
   RAM to have the whole dictionary in store in a way that the computer
   could extract the words to play with, but by using a probabilistic
   hash table, you can check what is played to 99% accuracy with a passive
   vocabulary - and use a smaller dictionary in some other format (like
   a DAWG) for the active vocabulary with which the computer plays.

   This algorithm generates false positives only, no false negatives, so
   when it says 'that is not a valid word' it is correct (assuming it was
   fed an authoritative word list).

   It will only say 'that *was* a valid word' when in fact it was *not*
   1 in ten times, so using this check strongly discourages human opponents
   from cheating.  (The 1 in 10 factor can be made higher by increasing
   the size of the hash table proportionally)

   I.e. it is 100% accurate for valid plays and 90% accurate for invalid plays.

   The reason I'm putting this on the web (and mailing it to the word-game
   writers list) is that I wasted several hours on the net searching for
   such a program in vain before I finally decided it was quicker just to
   go write it...  I thought I'd save the next person the wasted searching
   time... (Also, although most of the old hands here know this algorithm,
   it might be new to our younger readers who have been spoiled with
   16Mb systems from the age they could type ;-) - this is how we did
   things in the old days when code & data had to fit into 16K, not 16M...)

   You can try tweaking the 1 in 10 number yourself.  1 in 9 is still
   usable at a pinch, but not recommended.  1 in 8 is very bad.  You can
   go higher than 1 in 10 but I can't see the point.

   (Note: this algorithm could be improved at little extra cost by adding
   a 2D array of valid digraph flags.)

   By the way, This was just research - I'm not actually writing a Palm
   Pilot version of Scrabble.  Not yet anyway, and if I did I don't think
   I'd use this algorithm, because...

   for one dictionary of about 1050K of text, the hash table takes up
   about 147K bytes.  This is bang on ballpark for Scrabble on the PalmPilot.
   But the same data in dawg format would be about 323K, so this is only needed
   where you really are counting the bytes in RAM.  I'm sure a dedicated
   Scrabbler would be happy to give up all the RAM on his PalmPilot to
   the dictionary :-)  I'd only do this for a cut down game for beginners
   where you didn't mind the fact that the active vocabulary that the
   computer used was heavily reduced


   This code (c) 10 July 1998  Graham Toal <gtoal@gtoal.com>

   This code may be used in any application either freeware or commercial
   with no restrictions and no obligations.  Contact me if your lawyers
   need that in writing.

 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BITS 10

/* Trivial hash with end-around carry.  No thought went into this.
   You can probably do better.  This worked well enough for me on
   the first try I didn't see a need to improve it.

   Return a hash of the string between 0 and max-1.  A small degree
   of care was exerted so that a, aa, aaa and aaaa etc didn't all hash to 0.
 */

int hashfun(char *s, int max)
{
  int c;
  int rem;
  int h = 0;

  for (;;) {
    c = *s++;
    if (c == '\0') break;
    if (isalpha(c)) c = c - (isupper(c) ? 'A' : 'a');
    h = (h + (c&31) + 1) * 26;
    for (;;) {
      /* I suspect this can only be executed twice at most but
         I'm lazy and paranoid, hence a loop */
      rem = h / max;
      h = (h % max) + rem;
      if (h < max) break;
    }
  }
  return(h);
}

int main(int argc, char **argv)
{
  FILE *dict;
  FILE *data;
  int i;
  char *s;
  char line[128];

  int N;                   /* Determine size of text (no of words) */
  int dictsize;

  int hashsize, hashpos;   /* Allocate hash table of 10 * N bits */
  unsigned char *hashtable;

  /* Files are just words, one per line.  Nothing fancy.  Standard newlines. */

  if (argc != 3) {
    fprintf(stderr, "syntax: %s dictfile testwords\n", argv[0]);
    exit(1);
  }

  /* First arg is the large dictionary file */
  dict = fopen(argv[1], "r");
  if (dict == NULL) {
    fprintf(stderr, "Cannot open %s (master dictionary)\n", argv[1]);
    exit(1);
  }

  /* Second arg is a short file of words to check - real words or bogus */
  data = fopen(argv[2], "r");
  if (data == NULL) {
    fprintf(stderr, "Cannot open %s (list of words to check)\n", argv[2]);
    exit(1);
  }

  /* First pass - count dict size.  Saves so much hassle. */
  N = 0; dictsize = 0;
  for (;;) {
    s = fgets(line, 127, dict);
    if (s == NULL) break;
    N += 1;
    dictsize += strlen(line);
  }
  fclose(dict);
  dict = fopen(argv[1], "r");
  if (dict == NULL) {
    fprintf(stderr, "Cannot reopen %s (master dictionary)\n", argv[1]);
    exit(1);
  }

  /* Allocate hash table of 10 * N bits */
  hashsize = (N * BITS) >> 3;
  hashtable = malloc(hashsize+1);
  if (hashtable == NULL) {
    fprintf(stderr, "Not enough ram for a %d byte hash table\?\?\?\n",
      hashsize+1);
    exit(1);
  }
  for (i = 0; i < hashsize+1; i++) hashtable[i] = 0;

  /* Add words to table. */
  for (;;) {
    s = fgets(line, 127, dict);
    if (s == NULL) break;
    s = strchr(s, '\n'); *s = '\0';
    hashpos = hashfun(line, N * BITS);
    hashtable[hashpos >> 3] |= (1 << (hashpos & 7));
  }
  fclose(dict);

  /* Take test file, confirm: all real words pass, 90% of bogus words fail */
  for (;;) {
    s = fgets(line, 127, data);
    if (s == NULL) break;
    s = strchr(s, '\n'); *s = '\0';
    hashpos = hashfun(line, N * BITS);
    fprintf(stdout, "%s %s at %d [%02x/%02x]\n",
     line,
     (((hashtable[hashpos >> 3] & (1 << (hashpos & 7))) != 0)
       ? "OK" : "not found"),
     hashpos>>3,
     hashtable[hashpos >> 3], 1 << (hashpos & 7));
  }
  fclose(data);
  fflush(stdout);
  fprintf(stderr, "Dictionary size was %d bytes\n", dictsize);
  fprintf(stderr, "Table size was %d bytes\n", hashsize+1);
  exit(0);
  return(1);
}