/* This header file does not describe any of the code in the
   accompanying files; it is a rough sketch of the NEXT iteration
   of the spelling/word utilities.  Comments are invited.  Missing
   functionality or bits that won't work should be pointed out
   please!   Send mail to gtoal@ed.ac.uk  (via nsfnet-relay.ac.uk
   from US sites)
   many thanks.

      Graham Toal 23/06/90
 */


typedef long INDEX;
typedef long NODE;
#define ROOTNODE 1L

typedef struct DICT {

  /* Although primitive procedures will be provided for handling the
     basic dawg array, by using this interface applications can
     remain independent of the implementation details of the
     dictionary. */

  /* So far there are three forms of dict; 1: a simple dawg with edges
     as 4-byte integers; each node (a set of branches) is stored
     sequentially.  2: An *indexed* form of the above -- much faster
     to lookup in, but slower to walk through.  Although indexing
     would normally be heavy on space, this uses Liang's packed
     overlapping tries, thus using almost exactly the same space as
     method 1.  3: A form of 1, but with all the stops pulled out
     to get as much bit-level compaction as possible; edges can
     be two bytes instead of 4. (by Keith Bostic) */

  /* This struct contains both access procedures and information;
     most fields will be filled in by our code when the dictionary
     is loaded by load_dict().  The fields may be added to in
     later releases, but they will always be upwards compatible;
     none of this data is saved to disk in this format, so changing
     the struct layout will not cause problems as long as the names
     remain the same. */

  /* Note that some procedures take a user-supplied 'apply' parameter;
     This 'apply' procedure has a 'context' parameter which is merely
     a handle to allow the user to pass data in to his code without
     having to use global variables.  If the user's apply procedure
     returns anything other than 0, it will cause an early exit, with
     the users return code being returned from the calling function. */

  /* It is intended that these are used in an object-oriented way;
     although we will supply an external dict_check(word, dict) procedure, for
     instance, all it will do is call dict->check(word, dict->dawg) */

  NODE *dawg;
                                   /* Root of the dictionary tree */
  char *dict_name,                 /* "smiths" - used in command lines etc. */
       *dict_info,                 /* "Johns Smiths Rhyming Dictionary,\n
                                       (c) Smith & Jones, 1892\n
                                       Whitechapel, London.\n" */
       *dict_fname,                /* "/usr/dict/smiths" */

       *lang_charset;              /* "ISO 8859/1 Latin 1" */

  /* These are filled in by us to make life easier for the applications
     programmer.  We'll supply a default table for simple 7-bit ascii
     dawgs; but this mechanism allows us the option of including all
     the salient points about the character set used in a dictionary
     in the dictionary itself.  Because they are char*'s, we'll probably
     end up sharing the same table between multiple dictionaries --
     so don't write to them. (Although you may replace the _pointer_
     with one to a table of your own. */

  /* Later note: I've just found out about LOCALE.H etc.  This stuff
     may therefore change... */

  int *chartype;
                                    /*  1             whitespace           */
                                    /*  2             punctuation          */
                                    /*  4             blank                */
                                    /*  8             lower case letter    */
                                    /*  16            upper case letter    */
                                    /*  32            (decimal) digit      */
                                    /*  64            underscore           */
                                    /*  128           A-F and a-f          */
                                    /*  256           Ignore words starting
                                                      with this char       */
                                    /*  512           Include in definition
                                                      of a word -- mainly
                                                      alpha but also hyphen
                                                      and apostrophe       */

char   *lower,                     /* personal implementation of tolower() */
       *unaccented,                /* map accented chars to non-accenmted */
       *lexorder;                  /* an arbitrary ordering for sorting:
                                      e-acute would come after e and before f
                                      */

  int  (* check)  (NODE *base, INDEX state, struct DICT *d, char *word);
                     /* Check a word - exact match; return true/false */

  int  (* fuzzy)  (NODE *base, INDEX state, struct DICT *d, char *word,
                   char *value);
     /* Check a word - fuzzy case, accents, hyphens etc; return true/false */
     /* value is filled in with the dictionary's idea of what the
        word should look like. */

  /* Fancier correction procedures may be synthesised out of user code and
     'walk' */
  int  (* correct) (
                char *word,
                NODE *base, INDEX state,
                                /* Normally dict's root, but may be subtree */
                struct DICT *d,   /* lowercase/accent information from here */
                int   maxwords,
                int   order,                /* 0: by alpha
                                               1: by highest score first
                                               2: by lexical ordering
                                            (ignores case, accents, hyphens)
                                             */
                int   (* apply) (
                      char *word,     /* Corrected word */
                            int   score,    /* Normalised to range 0..255 */
                            void *context),
                void  *context);
                                   /* Offer spelling corrections for the
                                      given word.  If maxwords > 0, return
                                      the best <maxwords> words in order
                                      of most-likely first; otherwise
                                      many words may be returned, in
                                      alphabetical order, coupled with
                                      a score: 255 means exact match; 0
                                      means totally different */

                                    /* returns 0 if user returned 0 for
                                       all applications */
  int  (* walk) (
               NODE *base, INDEX state,
               int   (* apply) (char *word, void *context),
               void  *context);
                                   /* Apply the procedure given to every
                                      word in the dictionary tree or subtree,
                                      passing in the user-supplied context */

                                    /* returns 0 if user returned 0 for
                                       all applications */

  int (* lookup) (
              NODE *base, INDEX state,
              char  *key,
              /* Out */
              INDEX values);
                                   /* Return the index of the subtree of the
                                      dictionary which has 'key' as a prefix;
                                      for instance, in a reverse telephone
                                      book, you might find entries:
                                         0874=Anytown
                                         0875=Cockenzie
                                         0875=Port Seton
                                         0875=Prestonpans
                                         0876=Toytown
                                       In this case, if searching for "0875=",
                                       a subtree would be returned containing:
                                         Cockenzie
                                         Port Seton
                                         Prestonpans
                                       This subtree would then be walked
                                       using the 'walk' function and a user-
                                       written 'apply' procedure. */

                                    /* returns 0 if successful */

  /* The following two are performance hints ONLY.  They should not
     affect the correct functioning of a program. */

  int  (* uncache) (struct DICT *d);/* The space a dictionary uses may be
                                       reclaimed by calling this.  If the
                                       dictionary is subsequently accessed,
                                       there *may* be a performance penalty --
                                       for instance the dictionary may be
                                       accessed off disk.  However on a machine
                                       with sufficient memory, the system
                                       may choose to leave the data in ram
                                       until, say, malloc runs out of space.
                                         If there is no convenient mechanism
                                       for organising the demand-recacheing,
                                       this may be a no-op. */

                                    /* returns 0 if successful */

  int  (* recache) (struct DICT *d);/* The data of the dictionary is reloaded
                                       into active memory where it will stay */

                                    /* returns 0 if successful */
  /* May be extended in the future */
} DICT;


/*
                            dict_load

   This is provided as a primitive so that dictionaries can be loaded
in the most efficient way the machine supports.  The space for the dict
is supplied *by* this procedure - not *to* it.  This allows a Mach (or Emas)
implementation to mmap (or Connect) the file in memory - the connection
address being chosen by the OS and outside our control.  It also allows
systems like RISC OS on the Acorn Archimedes to use an optimised whole-
file load call to pull the file off disk into real ram in a single disk
operation.

   Since the data parts of the type 1 & 2 dicts are just a set of 4-byte
integers, it is easy to correct a faulty byte-sex by running down this
array *once at start-up* to reverse the order.  This is easy if the
data is loaded into physical ram.  If it is connected as a file by a VM system
however, the VM system must support copy-on-write; if it only has read-only
mapping there must be two files available -- one of each sex.

   A *possible* scheme on VM systems is to byte-sex-reverse a whole
page the first time that page is touched.  This may be a lot of unneccesary
work - I'd recommend keeping a correct-sex file on-line.

   This code will create the DICT struct, and fill in the various fields,
either from the dictionary file itself or from private knowledge if not available.

*/

int dict_load_file(char *fname, DICT **dict_struct);

                                   /* fname is the literal file name */

                                   /* dict_struct is allocated by us,
                                      not by the user. */

                                   /* returns 0 if successfully loaded */

int dict_load_dict(char *dname, DICT **dict_struct);

                                   /* dname is the dictionary name,
                                      e.g. "words", or "web2" etc.
                                      The corresponding file will be searched
                                      for via a system-dependent path, logical
                                      variable, or fixed place. */

                                   /* dict_struct is allocated by us,
                                      not by the user. */

                                   /* returns 0 if successfully loaded */

 int dict_unload(DICT *d);         /*  Use when totally finished with data */

 /* These are user-level veneers for the internal procedures used above.
    Use them in preference to the above.  Use the above only if
    you are a speed-freak! (but disguise them in macros such as:

  #define  dict_check(dawg, dict, word) dict->check(dict->dawg, dict, word)

   These macros will of course go 'Bang!' if dict ptr is unassigned or NULL.
 */

 /* In the worst case, if a machine has a poor quality compiler which
    doesn't support procedure variables well, these routines could be
    the *acual* implementation rather than just a pointer to it. */

 int  check  (NODE *base, INDEX state, DICT *d, char *word);
 int  fuzzy  (NODE *base, INDEX state, DICT *d, char *word, char *value);
 int  correct (char *word,
               NODE *base, INDEX state,
                           /* Normally dictionary root, but may be subtree */
               DICT *d,          /* lowercase/accent information from here */
               int   maxwords,
               int   order,                /* 0: by alpha
                                              1: by highest score first
                                              2: by lexical ordering
                                            */
               int   apply(char *word,     /* Corrected word */
                           int   score,    /* Normalised to range 0..255 */
                           void *context),
               void  *context);
 int  walk   (NODE *base, INDEX state,     /* Yoyo fanatics will be pleased
                                 to learn that they can 'walk the dawg'... :-) */
              int    apply(char *word, void *context),
              void  *context);
 int lookup (NODE *base, INDEX state,
             char  *key,
             /* Out */
             INDEX values);


/* Now here come the fast internal procedures which the above eventually call */
int dawg_check(NODE *base, INDEX state, char *word);
  /* Standard dawg */
int pack_check(NODE *base, INDEX state, char *word);
  /* Overlapped indexed dawg */
int bsd_check(NODE *base, INDEX state, char *word);
  /* Space-optimised dawg (called bsd because Keith Bostic is working
                   on this format for the next bsd4.4 spelling checker) */
/* Must make the internl procs agree with these... save the macros for later... 
#define dict_check(word,dict) \
          (dict != NULL ? (dict->check(dict->dawg, ROOTNODE, word)) : -1)
*/
int dawg_walk(NODE *base, INDEX state,
              int    apply(char *word, void *context),
              void  *context);
int pack_walk(NODE *base, INDEX state,
              int    apply(char *word, void *context),
              void  *context);
int bsd_walk(NODE *base, INDEX state,
              int    apply(char *word, void *context),
              void  *context);
/*
#define dict_walk(dict, apply, context) \
          (dict != NULL ? (dict->walk(dict->dawg, ROOTNODE, apply, context)) : -1)
*/

int dawg_lookup(NODE *base, INDEX state, char  *key, /* Out */ INDEX values);
int pack_lookup(NODE *base, INDEX state, char  *key, /* Out */ INDEX values);
int bsd_lookup(NODE *base, INDEX state, char  *key, /* Out */ INDEX values);
/* etc... */

/* Get list of possible next characters from this point in the tree,
   put it into user-supplied array branch[256].  Filled with
   NULL if no branch, or an INDEX if it points to more of the tree.
   Words which may terminate on a particular letter have terminate[let] != 0

   The user-supplied terminate array is deliberately a long array to allow
   for expansion; it may be used one day to return arbitrary codes such as
  'Noun' etc...

 */

void dawg_branch(NODE *base, INDEX state, NODE *branch, long *terminate);
void pack_branch(NODE *base, INDEX state, NODE *branch, long *terminate);
void bsd_branch(NODE *base, INDEX state, NODE *branch, long *terminate);
/* No internal proc yet... add it in the morning. */

/* In fact there will be even simpler procedures which can be used by
   people who prefer to include the whole source of this package rather
   than just the library interface parts.  In applications where utmost
   speed is a neccessity (eg Scrabble(tm)) you could use these, or write
   your own.  But most of us can use this clean interface with unnoticable
   overhead. */