/* The #define is to repair mangling by BITNET mailers */
#define NOT ~
              /* This should be Tilde (twiddle) -- C unary Not   */
/*

    File:    dyntrie.c
    Author:  Graham Toal
    Purpose: Check a word using DAWG or TRIE.
    Functions exported:  trie_new, trie_add, trie_dump
    Internal functions:  insert_simple recurse_add

Description:

   Builds DAWG-compatible trie in store, incrementally; words do not
   have to be presented in order.  (Note it is NOT a 'packed-trie';
   in our scheme it is closer to a dawg - but without the tail-compression
   precomputed dawgs have).

   Any tries built using this should be used by check_dawg, print_dawg etc.

   Has rather nice property of only claiming enough memory for the
   job; dynamically moves the data structure when it fills!

*/

  /* To avoid global state, I'm putting these useful bits of info
     in the two words before the dawg itself; I *HOPE* that all systems
     allow negative indexing off arrays; I'm not at all sure they do
     though.  However since I moved the base of the dict up by adding
     2 to it, I *should* be able to get the old address by by subtracting
     2 from it, so I'm using the first pair of macros below, not the
     more intuitive second pair.
       I should really do this for ordinary dawgs & tries too.  Indeed
     I should *really* implement the dictlib.h interface, but thats a
     month or two off.  Also I'ld like some feedback first.
   */

#define EXTRAS 2
#define MAX_ENTRIES(dawg) (*(dawg-2))          /* Safe way of aliasing */
#define NEXT_FREE(dawg) (*(dawg-1))


  /* #define MAX_ENTRIES(dawg) dawg[-2] */     /* 'Obvious' alias      */
  /* #define NEXT_FREE(dawg) dawg[-1] */       /* may be buggy         */

#define EMPTY 0
#define INIT_MAX_ENTRIES 512

/* Slop is so that we don't need to be continually checking nextfree
   as being close to max_entries */
#define SLOP 12

/* A debugging macro which rangechecks arrays -- happens when
   utils.c is included with RCHECK defined. Otherwise no overhead. */
#define DICT(idx) dict[RANGECHECK(idx, MAX_ENTRIES(dict))]


  /*
      This quick hack job has special-case code in each function
      for the root node-set.  Its rather tacky; I could clean it
      up and make the root node like all other nodes, but why
      bother; this is only test code anyway...
   */



static INDEX
#ifdef PROTOTYPES
insert_simple(NODE PCCRAP **dictp, char *word)
#else
insert_simple(dictp, word)
NODE PCCRAP **dictp;
char *word;
#endif
#define dict (*dictp)
{
NODE bugfix;
INDEX i; NODE n; int c;

  if (NEXT_FREE(dict) >= MAX_ENTRIES(dict)-SLOP) {
    NODE PCCRAP *moved;
    moved = MALLOC((SIZE_T)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE));
    if (moved != NULL) {
      moved += EXTRAS;
      MAX_ENTRIES(moved) = MAX_ENTRIES(dict);
      NEXT_FREE(moved) = NEXT_FREE(dict);
      /* Should use realloc but appears to be buggy :-( */
      for (i = MAX_ENTRIES(dict); i < MAX_ENTRIES(dict)*2; i++) {
        moved[i] = EMPTY;
      }
      for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i);
      dict -= EXTRAS;
      FREE(dict);
      dict = moved; MAX_ENTRIES(dict) *= 2;
    } else {
      fprintf(stderr, "Can\'t add any more words again\n");
    }
  }

  c = (*word++)&255;
  if (c == '\0') return(TERMINAL_NODE);
  i = NEXT_FREE(dict)++;
  bugfix = insert_simple(&dict, word);
  n = ((NODE)c << (NODE)V_LETTER) | M_END_OF_NODE | bugfix;
  DICT(i) = n; if (*word == '\0') DICT(i) |= M_END_OF_WORD;
  return(i);
#undef dict
}

static void
#ifdef PROTOTYPES
recurse_add(NODE PCCRAP **dictp, INDEX base, char *word)
#else
recurse_add(dictp, base, word)
NODE PCCRAP **dictp;
INDEX base;
char *word;
#endif
#define dict (*dictp)
{
NODE bugfix;
INDEX i = base, newbase;
NODE unpicked[MAX_CHARS], n;
int c, ch;
int gap, nextslot = 0, j;
  /* First see if first char is already in trie */
  ch = (*word++)&255;
  for (;;) {
    c = (int)(((dict[i] >> V_LETTER) & M_LETTER)&255);
    if (ch == c) {
      newbase = dict[i]&M_NODE_POINTER;
      if (newbase == 0) {
        /* have abc (this is c), adding (abc)defg */
        dict[i] &= (NOT M_NODE_POINTER);
        bugfix = insert_simple(&dict, word);
        dict[i] |= bugfix;
      } else {
        if (*word == '\0') {
          dict[i] |= M_END_OF_WORD;
        } else {
          recurse_add(dictp, newbase, word);
        }
      }
      return;
    }
    if ((dict[i]&M_END_OF_NODE) != 0) break;
    i++;
  }

  /* unpick base */
  for (nextslot = 0; nextslot < MAX_CHARS; nextslot++) {
    unpicked[nextslot] = EMPTY;
  }
  nextslot = 0;

  i = base; j = 0;
  for (;;) {
    if ((j == 0) && (((dict[i] >> V_LETTER) & M_LETTER) > ch)) {
      j = 1; newbase = nextslot++;
    }
    n = dict[i]; dict[i] = EMPTY; i++;
    unpicked[nextslot++] = n & (NOT M_END_OF_NODE);
    if ((n & M_END_OF_NODE) != 0) break;
  }
  if (j == 0) newbase = nextslot++; /* Was it last alphabetically? */
  /* add this char to the node */
  if (*word == '\0') {
    unpicked[newbase] =
      ((NODE)ch << (NODE)V_LETTER) | TERMINAL_NODE | M_END_OF_WORD;
  } else {
    /* and insert the rest of the
       string with insert_simple */
    bugfix = insert_simple(&dict, word);
    unpicked[newbase] =
      ((NODE)ch << (NODE)V_LETTER) | bugfix;
      /* The insert_simple call has to come after above loop, because
         of freeing slots... */
  }
  unpicked[nextslot-1] |= M_END_OF_NODE;

  /* scan for suitably-sized gap */
  /* MAX_CHARS+1 should be first 'real' node base (0, 1..MAX_CHARS reserved) */

  /*
     The particular application this is wanted for doesn't have a large
     number of words to be added dynamically, so the BFI code below which
     finds free holes in the trie space works fine.  However if some bright
     spark cares to keep a freelist *in the holes* then this program
     effectively implements a linear-time sorting algorithm.

     (I know it's not *really*, but think about it before jumping
     down my throat; the N^2 case is very statistically unlikely)
   */

  for (i = (INDEX)MAX_CHARS+1; i < MAX_ENTRIES(dict)-nextslot-SLOP; i++) {
    /*
        Even without the freelist, the sneaky hack in pack.c for
        noting earliest bunch of free holes might well make a
        significant difference... (It was a lowest_search_start
        variable and used a bit of hysteresis)
     */
    gap = TRUE;
    for (j = 0; j < nextslot; j++) {
      if (dict[i+j] != EMPTY) {
        gap = FALSE;
        break;
      }
    }
    if (gap) break;
  }
  if (!gap) {
    NODE PCCRAP *moved;
    moved = MALLOC((SIZE_T)MAX_ENTRIES(dict)*2+EXTRAS, sizeof(NODE));
    if (moved != NULL) {
      moved += EXTRAS;
      MAX_ENTRIES(moved) = MAX_ENTRIES(dict);
      NEXT_FREE(moved) = NEXT_FREE(dict);
      /* Should use realloc but appears to be buggy :-( */
      for (i = MAX_ENTRIES(dict);
           i < MAX_ENTRIES(dict)*2; i++) moved[i] = EMPTY;
      for (i = 0; i < MAX_ENTRIES(dict); i++) moved[i] = DICT(i);
      dict -= EXTRAS;
      FREE(dict);
      dict = moved; /* If your compiler has aliasing bugs they may hit here. */
      MAX_ENTRIES(dict) *= 2;
      /* scan for suitably-sized gap */
      for (i = ((MAX_ENTRIES(dict)/2)-(MAX_CHARS+1));
           i < MAX_ENTRIES(dict)-nextslot; i++) {
        gap = TRUE;
        for (j = 0; j < nextslot; j++) {
          if (dict[i+j] != EMPTY) {
            gap = FALSE;
            break;
          }
        }
        if (gap) break;
      }
    }
    if (!gap) {
      fprintf(stderr, "Can\'t add any more words\n");
      return;
    }
  }
  newbase = i;
  /* reinsert */
  for (j = 0; j < nextslot; j++) {
    dict[newbase+j] = unpicked[j];
  }
  if (newbase+nextslot-1 >= NEXT_FREE(dict)) NEXT_FREE(dict) = newbase+nextslot;
  /* update pointer to the base of this node */
  for (i = 0; i < MAX_ENTRIES(dict); i++) {
    if ((dict[i] & M_NODE_POINTER) == base) {
      dict[i] &= (NOT M_NODE_POINTER);
      dict[i] |= newbase;
      break;            /* (should only be one since trie, not dawg) */
    }
  }
#undef dict
}

void
#ifdef PROTOTYPES
trie_add(NODE PCCRAP **dictp, char *word)
#else
trie_add(dictp, word)
NODE PCCRAP **dictp;
char *word;
#endif
#define dict (*dictp)
{
INDEX next;
INDEX bugfix;
int c;
  /* Root node is pre-reserved MAX_CHARS entries */
  c = (*word++)&255; /* bugfix - remember chars can be signed :-( */
  if (DICT((INDEX)ROOT_NODE+c) == EMPTY) {
    /* I'm allowing the root node to be sparse for speed; it *should*
       be dense like any other node.  May change this later */
    if (*word == '\0') {
      DICT((INDEX)ROOT_NODE+c) =
        ((NODE)c << (NODE)V_LETTER) | M_END_OF_WORD;
    } else {
      bugfix = insert_simple(&dict, word);
      DICT((INDEX)ROOT_NODE+c) =
        ((NODE)c << (NODE)V_LETTER) | bugfix;
    }
  } else {
    if (*word == '\0') {
      DICT((INDEX)ROOT_NODE+c) |= M_END_OF_WORD;
    } else {
      next = DICT((INDEX)ROOT_NODE+c) & M_NODE_POINTER;
      if (next == 0) {
        /* have 'a', adding 'abcd' */
        /* (Should really check the letter for safety...) */
        bugfix = insert_simple(&dict, word);
        DICT((INDEX)ROOT_NODE+c) = ((NODE)c << (NODE)V_LETTER)
          | bugfix | M_END_OF_WORD;
      } else {
        recurse_add(dictp, next, word);
      }
    }
  }
#undef dict
}

int
#ifdef PROTOTYPES
trie_new(NODE PCCRAP **dawgp)
#else
trie_new(dawgp)
NODE PCCRAP **dawgp;
#endif
#define dawg (*dawgp)
{
INDEX i;

  dawg = MALLOC(INIT_MAX_ENTRIES+EXTRAS, sizeof(NODE));
  dawg += EXTRAS;

  MAX_ENTRIES(dawg) = INIT_MAX_ENTRIES; NEXT_FREE(dawg) = MAX_CHARS+1;

  dawg[0] = 0;
  for (i = 1; i <= MAX_CHARS; i++) dawg[i] = 0;
  dawg[MAX_CHARS] |= M_END_OF_NODE;
  /* 0 base, MAX_CHARS chars */

  return(TRUE);
#undef dawg
}

int
#ifdef PROTOTYPES
trie_dump(NODE PCCRAP *dawg, char *filename)
#else
trie_dump(dawg, filename)
NODE PCCRAP *dawg;
char *filename;
#endif
{
INDEX cnt, max;
FILE *dumper;
  max = NEXT_FREE(dawg)*sizeof(NODE);
    /* I *knew* the next_free variable would come in handy :-) */
  dumper = fopen(filename, WRITE_BIN);
  if (dumper == NULL) {
    fprintf(stderr, "Failed to dump trie on file \'%s\'\n", filename);
    return(FALSE);
  }
  putword(max, dumper);
  if ((cnt = putwords(dawg, max, dumper)) != max) {
    fprintf(stderr, "Failed to dump: %ld bytes written instead of %ld\n",
      cnt, max);
    return(FALSE);
  }
  fclose(dumper);
  return(TRUE);
}

#undef MAX_ENTRIES
#undef NEXT_FREE
#undef DICT
#undef SLOP

