/* compdict - a program to construct the load file dict.o for scrabble

   Use:
	compdict wordlist
   creates the file dict.o, the load module for scrabble containing
   the dictionary database.  Two global symbols "dawg" and "root" are
   defined in dict.o.  dawg is of type unsigned long [] (in the text
   segment since it's read only), and root is unsigned long.
   Statistics about nodes and edges in the dawg are reported on
   standard output.

   See the file dawg.h for the format of the dawg data structure.

   The wordlist MUST be in the form ([a-z]*\n)* and in STRICTLY
   increasing lexicographic order, or compdict will weird out.
*/

#include <stdio.h>
#include "dawg.h"

#define SCALE	0.25	/* Ratio of file size to hash table size */

char stringbuf[80];	/* Space for current string */
char *string;		/* Marks END of current string */

unsigned long nodesused, nodessaved,	/* Used for accounting */
	      edgesused, edgessaved;

char *input, *endofinput, *snarf();	/* input stuff */

FILE *outfile;				/* output stuff */

struct node {
    struct node *next;		/* Next node with same hash key */
    unsigned long index;	/* Index of this node */
    int numedges;		/* Number of edges */
} **table, *hash();
/* The edges of a node structure are stored in the numedges following
    words of memory */

unsigned long hashsize;		/* Number of hash buckets allocated */

main(ac,av) int ac; char *av[];
{
    char  *calloc ();
    unsigned long   size,	/* ... of input file */
                    makenode ();

    size = flength (av[1]);	/* Get size of dictionary */
    input = snarf (av[1], size);
    endofinput = input + size;	/* Remember end of input */

 /* Allocate the hash table */
    hashsize = size * SCALE;
    table = (struct node  **)
               calloc (hashsize, sizeof (struct node *));

    outfile = fopen ("dict.o","w");
    skipheader();		/* Reserve space for .o header */
    putw (0, outfile);		/* Special node zero */

 /* Initialize accounting */
    nodesused = edgesused = 1;
    nodessaved = edgessaved = 0;

 /* Call makenode with null (relative to stringbuf) prefix;
    Initialize string to null; Put index of start node on output */
    putstab(makenode (string = stringbuf));	/* Put out symbol table */
    fixheader();		/* Fix up .o header */
    
    fclose (outfile);
    printf ("%d+%d nodes;  %d+%d edges.\n",
	    nodesused, nodessaved, edgesused, edgessaved);
    exit (0);
}

/* Makenode takes a prefix (as postion relative to stringbuf) and
   returns an index of the start node of a dawg that recognizes all
   words beginning with that prefix.  String is a pointer (relative
   to stringbuf) indicating how much of prefix is matched in the
   input.
*/
  
unsigned long makenode(prefix) char *prefix;
{
    unsigned long   edges[26];
    register unsigned long *edge = edges;
    register struct node  *n;
    register int    numedges;

    while (prefix == string) {			/* More edges out of node */
	*edge++ = (*string++ = *input++) & CHAR;
	if (*input == '\n') {			/* End of a word */
	    edge[-1] |= WORD;			/* Mark edge */
	    *string++ = *input++;		/* Skip \n */
	    if (input == endofinput)		/* At end of input? */
		break;
	    for (string = stringbuf; *string++ == *input++;);
	    --string;				/* Reset string */
	    --input;
	}
	edge[-1] |= makenode (prefix + 1);	/* make dawg pointed to
						   by this edge */
    }

    numedges = edge - edges;
    if (numedges == 0)
	return 0;		/* Special node zero - no edges */

    edge[-1] |= LAST;		/* Mark the last edge */

    n = hash (edges, numedges);	/* Look up node in hash table */
    if (n -> index) {		/* same as an existing node */
	edgessaved += numedges;
	nodessaved++;
    }
    else {			/* a new node */
	n -> index = edgesused;	/* enter node's index into table */
	edgesused += numedges;
	nodesused++;

     /* Output the edges of this node */
	fwrite (edges, sizeof (edge), numedges, outfile);
    }

    return (n -> index << INDEX);
}

/* Hash takes an array of edges (with a count) and maps it to a pointer
   to a node, filling in the edge info if needed.  It uses simple
   bucket chaining to keep a list of all nodes which hash to the same key.
*/

struct node *hash(e,n) unsigned *e; int n;
{
    register unsigned long  key = 0;
    register unsigned long  i;
    register unsigned long *slot_edges;
    register struct node   *slot;
    char   *small_alloc ();

 /* Cheesy signature method */
    for (i = 0; i < n; i++)
	key ^= (key << 3) ^ (key >> 1) ^ e[i];
    key %= hashsize;

 /* Look for identical node in hash table */
    for (slot = table[key]; slot; slot = slot -> next)
	if (n == slot -> numedges) {
	 /* Same number of edges ... */
	    slot_edges = (unsigned long *) (slot + 1);
	    for (i = 0; i < n; i++)
		if (slot_edges[i] != e[i])
	     /* Some edge is different. */
		    break;
	    if (i == n)		/* All edges were the same ... */
		return slot;	/* Just return a pointer */
	}

 /* Found an empty position; allocate and copy in edge info */
    slot = (struct node *) small_alloc (sizeof (struct node)
	                                 + n * sizeof (unsigned long));
    slot -> numedges = n;
    slot_edges = (unsigned long *) (slot + 1);
    for (i = 0; i < n; i++)
	slot_edges[i] = e[i];

 /* Link this node into the bucket */
    slot -> next = table[key];
    table[key] = slot;
    return slot;
}

/* Small_alloc doles out zeroed, aligned memory in small pieces, calling
   calloc to get a new CHUNK.  I do this because malloc is a pig when
   allocating small blocks.
*/

/* ALIGN rounds up to a multiple of 4 (machine dependant!) */
#define ALIGN(char_offset) (char_offset = (char_offset + 3) & ~3)

/* number of bytes calloc'ed in a CHUNK */
#define CHUNK 10236

char *small_alloc(nbytes) int nbytes;
{
    static char *buf;
    static int  used = CHUNK + 1;
 /* CHUNK + 1 to force allocation on first call */
    int     i;

    if (used + nbytes > CHUNK) {
     /* No room, start a new CHUNK */
	used = 0;
	buf = calloc (1, CHUNK);
    }
    i = used;
    used += nbytes;
    ALIGN (used);		/* Align the offset */
    return buf + i;
}

/* Routines to build .o format stuff */

#include <a.out.h>
#include <stab.h>

/* Reserve room for .o header in outfile */
skipheader ()
{
    struct exec junk;

    fwrite (&junk, sizeof junk, 1, outfile);
}


/* Fill in .o header in outfile */
fixheader()
{
    struct exec header;

#ifdef SUN4
    header.a_dynamic = 0;
    header.a_toolversion = TV_SUN4;
    header.a_machtype = M_SPARC;
#endif
 /* OMAGIC - magic # for loader .o files */
/**    header.a_magic = OMAGIC; **/
 /* Put dawg in text segment to make it shared and r/o */
    header.a_text = edgesused * sizeof (unsigned long);
    header.a_data = sizeof (unsigned long);
    header.a_bss = 0;
 /* Defining two symbols */
    header.a_syms = 2 * sizeof (struct nlist);
    header.a_entry = 0;
    header.a_trsize = 0;
    header.a_drsize = 0;

 /* Write out header info at beginning of outfile */
    rewind (outfile);
    fwrite (&header, sizeof header, 1, outfile);
}


/* Two global symbols to be defined in dict.o: */
char s_dawg[] = "_dawg", s_root[] = "_root";

/*
   Putstab puts out the single word in the data segment giving the
   value of the root edge, then writes symbol table and string table
   info into outfile. e is an edge pointing to the root of the dawg.
*/

putstab(e) unsigned long e;
{
    struct nlist    sym;	/* Symbol table entry */

 /* Output data segment ... */
    putw (e, outfile);		/* Value of edge into dawg root */

 /* Symbol table entry for "unsigned long dawg[]" */
    sym.n_un.n_strx = sizeof (long);
				/* First offset into string table just
				   past longword holding string table size
				   */
    sym.n_type = N_TEXT | N_EXT;/* Externally visible in text segment */
    sym.n_desc = N_GSYM;	/* Global symbol */
    sym.n_value = 0;		/* offset 0 in text segment */
    fwrite (&sym, sizeof sym, 1, outfile);/* output symbol (_dawg) */

 /* Symbol table entry for "unsigned long root" */
    sym.n_un.n_strx += sizeof s_dawg;/* Next offset into string table */
    sym.n_type = N_DATA | N_EXT;/* Externally visible in data segment */
    sym.n_desc = N_GSYM;	/* Global symbol */
    sym.n_value += edgesused * sizeof (unsigned long);
    fwrite (&sym, sizeof sym, 1, outfile);/* output symbol (_root) */

    putw (sizeof (long) + sizeof s_dawg + sizeof s_root, outfile);
				/* output size of string table */

    fwrite (s_dawg, sizeof s_dawg, 1, outfile);/* _dawg string */
    fwrite (s_root, sizeof s_root, 1, outfile);/* _root string */
}
