/*
 * Title:   token.c
 *
 * Authors: Richard Hooker & Graham Toal
 *
 * Purpose: Convert a sorted dictionary into a word-dag
 *
 * $Log:        c.token
 *
 * $ Revision 1.4  91/07/15  18:47:31  gtoal Changed name to token.
 *
 * Revision 1.2  91/03/23  17:56:35  gtoal About to be released
 *
 * Revision 1.1  91/03/23  16:15:35  gtoal Initial revision
 *
 */

char   *RCS_ID = "$Header: rcs.c.token 1.4 91/07/15 18:47:31 gtoal Exp Locker: gtoal $";

#include "copyr.h"
#include "splib.h"

#include <string.h>
#include <ctype.h>
#include <unistd.h>

/* Use system getopt.  Is there a header file for this? */
/**/
/*extern int getopt(int argc, char **argv, char *opstring);*/
extern char *optarg;
extern int optind, opterr;
/**/
#ifdef MEMDEBUG
#include "../mnemosyne/mnemosyn.h"
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define ABS(x) ((x) < 0 ? -(x) : (x))

/* pick one about 20% larger than needed */
long    hash_table_size = 0;
#define HASH_TABLE_SIZE hash_table_size
#define MIN_HASH_SIZE 60013L
#define MAX_EDGES (HASH_TABLE_SIZE-1)
#define MAX_FUDGE 4L		/* Factor by which our estimate might be out
				 * for atypical files */

char   *infile = NULL, *outfile = NULL;
static FILE *fpin, *fpout;	/* Input/output files */
static unsigned char current_word[(MAX_LINE + 1)];	/* The last word read from
	            					 * fpin */

static int first_diff;
/* The position of the first letter at which */
/* current_word differs from the previous    */
/* word read                                 */

static NODE *hash_table;
static int first_time = TRUE;
static int this_char = '?', lastch;
static NODE *dawg;

static int FILE_ENDED = FALSE;	/* I'm having real problems getting merged
				 * dawgs to work on the PC; this is yet
				 * another filthy hack. Sorry. */

static char *array_name = "_token";
static char *progname = "token", *realprogname = NULL;
static int token_verbose = FALSE;
static int cfile = FALSE;	/* Write output as a C file instead */
static int short_circuit = FALSE;	/* Skip dawg building - use
					 * precomputed one */
static INDEX nwords, nnodes, total_edges;

/*
 * Macros for array accesses with checking.  If RCHECK isn't defined, these
 * don't contribute any overhead. I suggest leaving RCHECK on except for
 * production code.
 */

#define EDGE_LIST(idx) \
  edge_list[RANGECHECK(idx, MAX_CHARS)]
#define CURRENT_WORD(idx) \
  current_word[RANGECHECK(idx, MAX_LINE+1)]
#define DAWG(idx) \
  dawg[RANGECHECK(idx, MAX_EDGES)]
#define HASH_TABLE(idx) \
  hash_table[RANGECHECK(idx, HASH_TABLE_SIZE)]

/*
 * Forward references
 */

static INDEX build_node(int depth);
static INDEX add_node(NODE * edge_list, INDEX nedges);
static void read_next_word(void);
static INDEX compute_hashcode(NODE * edge_list, INDEX nedges);
static void report_size(void);

void
usage(void)
{
	fprintf(stderr,
	    "usage: %s [-c] [-l | -H hashsize] [-d tokenfile | -f wordlist] [-o outfile]\n",
	    progname);
	fprintf(stderr,
	    "       %s -H 100000 -f /usr/share/dict/web2 -o web2.dwg\n", progname);
	fprintf(stderr,
	    "       %s -l -f anagdict.txt -o anags.dwg\n", progname);
	fprintf(stderr,
	    "       %s -c -f connectives.txt -o connectives.h\n", progname);
	fprintf(stderr,
	    "       %s -c -d anags.dwg -o anags.h\n", progname);
}


static char *
hex(long l)
{
	static char tmp[11];
	char    hexdig[17] = "0123456789abcdef";
	int     p;
	strcpy(tmp, "         L");
	p = 9;
	for (;;) {
		tmp[p] = hexdig[l & 15];
		l = (l >> 4L) & 0xfffffffL;
		if (l == 0) {
			p -= 1;
			tmp[p] = 'x';
			p -= 1;
			tmp[p] = '0';
			return (tmp);
		}
		p -= 1;
	}
}

static void
dump_c_array(FILE * f, NODE * data, INDEX length)
{
	INDEX   i, j;
	for (i = 0; i < length; i += 6) {
		for (j = i; j < i + 6 && j < length; j++) {
			fprintf(f, "%s", hex(data[j]));
			if (j + 1 != length)
				fprintf(f, ", ");
		}
		fprintf(f, "\n");
	}
}


/*
 * Take a list of alpha sorted and uniq words and convert into token file
 */

int
main(int argc, char **argv)
{
	INDEX   i;
	long    insize = 0L;
	long    fudge_factor = 1L;
	int     c;

	realprogname = progname = argv[0];
	short_circuit = FALSE;
	token_verbose = FALSE;
	while ((c = getopt(argc, argv, "cd:f:H:lo:vh?")) != -1) {
		switch (c) {
		case 'c':
			/* Generate given C output file rather than .dwg file */
			cfile = TRUE;
			if (token_verbose)
				fprintf(stderr, "%s: output file will be C source\n", progname);
			break;
		case 'd':
			/* This option lets you give a precomputed token file
			 * as input, to be converted to C. */
			if (infile != NULL) {
				fprintf(stderr, "%s: cannot have both -f and -d\n", progname);
				exit(EXIT_FAILURE);
			}
			infile = optarg;
			if (token_verbose)
				fprintf(stderr, "%s: reading directly from token file %s\n",
				    progname, optarg);
			short_circuit = TRUE;
			break;
		case 'f':
			if (infile != NULL) {
				fprintf(stderr, "%s: cannot have both -d and -f\n", progname);
				exit(EXIT_FAILURE);
			}
			infile = optarg;
			if (token_verbose)
				fprintf(stderr, "%s: reading from word list %s\n", progname, infile);
			break;
		case 'H':
			if (!isdigit(*optarg)) {
				fprintf(stderr, "%s: argument to -h must be an integer\n", progname);
				usage();
			}
			hash_table_size = atol(optarg);
			if (token_verbose)
				fprintf(stderr, "%s: setting hash_table_size to %ld\n",
				    progname, hash_table_size);
			break;
		case 'l':
			fudge_factor = MAX_FUDGE;
			if (token_verbose)
				fprintf(stderr, "%s: making some elbow-room!\n", progname);
			break;
		case 'n':
			array_name = optarg;
			break;
		case 'o':
			if (outfile != NULL) {
				fprintf(stderr, "%s: cannot have both more than one -o\n", progname);
				exit(EXIT_FAILURE);
			}
			outfile = optarg;
			if (token_verbose)
				fprintf(stderr, "%s: creating token file %s\n", progname, outfile);
			break;
		case 'v':
			token_verbose = TRUE;
			break;
		case 'h':
		case '?':
			usage();
			exit(EXIT_SUCCESS);
		default:
			usage();
			exit(EXIT_FAILURE);
		}
	}

	/* Spelling from stdin? */
	if (optind != argc) {
		fprintf(stderr, "%s: spurious extra parameter %s\n", progname, argv[optind]);
		exit(EXIT_FAILURE);
	}
	/* Find out how large the input file is; guess how many nodes will be
	 * needed. */

	/* This code should be reworked so that if infile is not given, you
	 * read from stdin instead.  Currently it does not do so, but that was
	 * only because of the difficulty of estimating hash table sizes.  If
	 * you allow the hash table size to be a parameter, it would make
	 * sense to accept input from a pipe instead. */
	if ((infile == NULL) && (outfile == NULL)) {
		usage();
		exit(EXIT_FAILURE);
	}
	if (infile == NULL) {
		fprintf(stderr, "%s: requires a -f infile parameter\n", progname);
		exit(EXIT_FAILURE);
	}
	if (outfile == NULL) {
		fprintf(stderr, "%s: requires a -o outfile parameter\n", progname);
		exit(EXIT_FAILURE);
	}
	if (short_circuit && !cfile) {
		/* short-circuit implies cfile */
		/* do this check before anything gets overwritten! */
		fprintf(stderr, "%s: giving \"-d %s\" usually implies -c too;\n", progname, infile);
		fprintf(stderr, "%s: didn\'t you really intend to generate a C file perhaps?\n", progname);
		exit(EXIT_FAILURE);
	}
	fpout = fopen(outfile, WRITE_BIN);
	if (fpout == NULL) {
		fprintf(stderr, "%s: can\'t open output file \"%s\"\n", progname, outfile);
		exit(EXIT_FAILURE);
	}
	if (short_circuit) {
		/* Load dawg directly */
		if (!dawg_init(infile, &dawg, &total_edges)) {
			fprintf(stderr, "%s: cannot load pre-computed dictionary file %s\n",
			    progname, infile);
			exit(EXIT_FAILURE);
		} else {
			if (token_verbose)
				fprintf(stderr, "%s: Loaded dict %s of %ld edges\n",
				    progname, infile, total_edges);
		}
	} else {
		if (hash_table_size <= 0) {	/* I.e. not given explicitly */
			fpin = fopen(infile, READ_BIN);	/* Has to be binary for
							 * ftell to work */
			if (fpin == NULL) {
				fprintf(stderr, "%s: can\'t open text file \"%s\"\n", progname, infile);
				/* Could print out error string but it's not
				 * portable... */
				exit(EXIT_FAILURE);
			}
			fseek(fpin, 0L, SEEK_END);
			insize = ftell(fpin);
			fclose(fpin);

			hash_table_size = insize / 11L;	/* Pretty close estimate
							 * of nodes */
			hash_table_size = hash_table_size * 133L;
			hash_table_size = hash_table_size / 100L;	/* Plus 33% */
			hash_table_size = hash_table_size * fudge_factor;

		}
		hash_table_size = prime(hash_table_size);	/* Up to next prime
								 * number */
		hash_table_size = MAX(hash_table_size, MIN_HASH_SIZE);

		if (token_verbose)
			fprintf(stderr, "%s: using a hash table of %ld %d-byte edges\n",
			    progname, hash_table_size, sizeof(NODE *));

		fpin = fopen(infile, READ_TEXT);	/* Had better work! */
		/**
	        *  Allocate the memory for the dawg
	        **/

		if ((dawg = malloc((size_t) MAX_EDGES * sizeof(NODE *))) == NULL) {
			fprintf(stderr, "%s: can\'t allocate dictionary memory\n", progname);
			exit(EXIT_FAILURE);
		}
		for (i = 0; i < (INDEX) MAX_EDGES; i++)
			dawg[i] = 0L;
		/**
	        *  Allocate the hash table.
	        *  Fill it with zeroes later just before use.  Don't trust calloc etc.
	        *  - not portable enough.  Anyway, in the multi-pass version we don't
	        *  want to continually free/claim...
	        **/

		if ((hash_table =
			malloc((size_t) (HASH_TABLE_SIZE + 1L) * sizeof(NODE))) == NULL) {
			fprintf(stderr, "%s: can\'t allocate memory for hash table\n", progname);
			exit(EXIT_FAILURE);
		}
		/**
	        *  Read the first word from the dictionary
	        **/

		first_time = TRUE;
		nwords = 0;
		current_word[0] = 0;
		read_next_word();
		lastch = *current_word;
		/**
	        *  Initialise the counters, taking account of the root node (which is
	        *  a special case)
	        **/

		nnodes = 1;
		total_edges = MAX_CHARS;

		/**
	        *  Build the dawg and report the outcome
	        **/

		/* Explicitly initialise hash table to all zeros */
		{
			INDEX   a;
			for (a = 0; a <= HASH_TABLE_SIZE; a++)
				hash_table[a] = 0;
		}

		(void) build_node(0);
		fclose(fpin);
	}

	/**
        *  Save the dawg to file
        **/

	if (cfile) {
		/* Generate C output instead */
		fprintf(fpout, "long %s[%ld] = {\n", array_name, total_edges + 1);
		/* name of 'long _token[]' to be a parameter */
		dump_c_array(fpout, dawg, total_edges + 1);	/* Loses DAWG_MAGIC and
								 * length.  Retains
								 * initial 0L. */
		fprintf(fpout, "};\n");
	} else {
		/* Prepend a magic-number, then write raw data to token file.  */
		{
			NODE    dawg_magic = DAWG_MAGIC;
			fwrite(&dawg_magic, 1, sizeof(dawg_magic), fpout);
		}
		{
			NODE    nedges = total_edges + 1;
			fwrite(&nedges, 1, sizeof(nedges), fpout);
		}
		*dawg = 0L;
		total_edges = sizeof(NODE) * (total_edges + 1);	/* Convert to byte count */

		{
			long    cnt;
			if ((cnt = fwrite((char *) dawg, 1, (size_t) total_edges, fpout)) != total_edges) {
				fprintf(stderr, "%s: %ld bytes written instead of %ld\n.",
				    progname, cnt, total_edges);
				exit(EXIT_FAILURE);
			}
		}
	}
	fflush(fpout);
	fclose(fpout);

	if (token_verbose)
		fprintf(stderr, "%s: %sfile %s generated\n", progname, (cfile ? "C " : ""), outfile);
	exit(EXIT_SUCCESS);
}
/**
*       BUILD_NODE
*
*       Recursively build the next node and all its sub-nodes
**/

static  INDEX
build_node(int depth)
{
	INDEX   nedges = 0;
	INDEX   i;
	NODE   *edge_list;

	edge_list = NULL;
	if (CURRENT_WORD(depth) == 0) {
		/**
	        *  End of word reached. If the next word isn't a continuation of the
	        *  current one, then we've reached the bottom of the recursion tree.
	        **/

		read_next_word();
		if (first_diff < depth)
			return (0);
	}
	/* This malloc below costs like hell!  May someday rewrite the store
	 * management for this bit... */
	edge_list = (NODE *) malloc(MAX_CHARS * sizeof(NODE));
	if (edge_list == NULL) {
		fprintf(stderr, "%s: stack full (depth %d)\n", progname, depth);
		exit(EXIT_FAILURE);
	}
	for (i = 0; i < MAX_CHARS; i++)
		EDGE_LIST(i) = 0L;

	/**
        *  Loop through all the sub-nodes until a word is read which can't
        *  be reached via this node
        **/

	do {
		/* Construct the edge. Letter.... */
		EDGE_LIST(nedges) = (NODE) (((NODE) CURRENT_WORD(depth)))
		    << (NODE) V_LETTER;
		/* ....end-of-word flag.... */
		if (CURRENT_WORD(depth + 1L) == 0)
			EDGE_LIST(nedges) |= M_END_OF_WORD;
		/* ....and node pointer. */
		EDGE_LIST(nedges) |= build_node(depth + 1);
		nedges++;
		/* (don't ++ in a macro) */
	} while (first_diff == depth);

	if (first_diff > depth) {
		fprintf(stderr, "%s: internal error -- first_diff = %d, depth = %d\n",
		    progname, first_diff, depth);
		exit(EXIT_FAILURE);
	}
	EDGE_LIST(nedges - 1) |= M_END_OF_NODE;
	/* Flag the last edge in the node */

	/**
        *  Add the node to the dawg
        **/

	if (depth) {
		NODE    result = add_node(edge_list, nedges);
		free(edge_list);
		return (result);
	}
	/**
        *  depth is zero, so the root node (as a special case) goes at the start
        **/

	edge_list[MAX_CHARS - 1] |= M_END_OF_NODE;	/* For backward searches */
	for (i = 0; i < MAX_CHARS; i++) {
		dawg[i + 1] = edge_list[i];
	}
	free(edge_list);
	return (0);
}
/**
*       ADD_NODE
*
*       Add a node to the dawg if it isn't already there. A hash table is
*       used to speed up the search for an identical node.
**/

static  INDEX
add_node(NODE * edge_list, INDEX nedges)
{
	NODE    hash_entry;
	INDEX   inc;
	INDEX   a, first_a;
	INDEX   i;

	/**
        *  Look for an identical node. A quadratic probing algorithm is used
        *  to traverse the hash table.
        **/

	first_a = compute_hashcode(edge_list, nedges);
	first_a = ABS(first_a) % HASH_TABLE_SIZE;
	a = first_a;
	inc = 9;

	for (;;) {
		hash_entry = HASH_TABLE(a) & M_NODE_POINTER;

		if (hash_entry == 0)
			break;	/* Node not found, so add it to the dawg */

		for (i = 0; i < nedges; i++)
			if (DAWG((hash_entry + i) % HASH_TABLE_SIZE) != EDGE_LIST(i))
				break;

		/* On the 1.6M dictionary, this gave a rangecheck with < 0.
		 * Now fixed I think - it was a problem with MOD giving
		 * unexpected results. */

		if (i == nedges) {
			return (hash_entry);	/* Node found */
		}
		/**
	        *  Node not found here. Look in the next spot
	        **/

		a += inc;
		inc += 8;
		if (a >= HASH_TABLE_SIZE)
			a -= HASH_TABLE_SIZE;
		if (inc >= HASH_TABLE_SIZE)
			inc -= HASH_TABLE_SIZE;
		if (a == first_a) {
			fprintf(stderr, "%s: hash table full\n", progname);
			exit(EXIT_FAILURE);
		}
	}

	/**
        *  Add the node to the dawg
        **/

	if (total_edges + nedges >= MAX_EDGES) {
		fprintf(stderr,
		    "%s: error -- dictionary full - total edges = %ld\n",
		    progname, total_edges);
		fprintf(stderr,
		    "%s: try running again with \"token -l infile outfile\"\n", progname);
		exit(EXIT_FAILURE);
	}
	HASH_TABLE(a) |= total_edges + 1;
	nnodes++;
	for (i = 0; i < nedges; i++) {
		DAWG((total_edges + 1 + i) % HASH_TABLE_SIZE) = EDGE_LIST(i);
	}
	total_edges += nedges;
	return (total_edges - nedges + 1);
}
/**
*       READ_NEXT_WORD
*
*       Read the next word from the input file, setting first_diff accordingly
**/

static void
read_next_word(void)
{
	/* This stuff imposes the limitation of not allowing '\0' in a word;
	 * not yet a problem but the dawg structure itself could probably cope
	 * if the feature were wanted. (Maybe with a little teweaking) */
	unsigned char	linebuff[(MAX_LINE + 1)];
	int     length;
	for (;;) {
		int     next = 0, c;
		for (;;) {
			c = fgetc(fpin);
			if (FILE_ENDED || (c == EOF) || ferror(fpin) || feof(fpin)) {
				/* for some reason, we always get a blank line
				 * at the end of files */
				current_word[0] = 0;
				first_diff = -1;
				linebuff[next] = '\0';
				FILE_ENDED = TRUE;
				return;
			}
			c &= 255;
			if (next == 0 && c == '\n')
				continue;	/* skip blank lines... */
			linebuff[next++] = c;
			if (c == '\n')
				break;
		}
		linebuff[next] = '\0';

		length = strlen(linebuff);

		if (linebuff[length - 1] == '\n')
			linebuff[length - 1] = '\0';
		if (linebuff[length] == '\n')
			linebuff[length] = '\0';

		if ((length < 2) || (length > MAX_LINE - 1)) {
			fprintf(stderr, "\n%s: %s - invalid length\n", progname, linebuff);
			continue;	/* Previously exit()ed, but now ignore
					 * so that test sites without my
					 * pddict can use /usr/dict/words */
		}
		break;
	}
	for (length = 0; current_word[length] == linebuff[length]; length++) {
		/* Get common part of string to check order */
	}
	if ((((unsigned int)current_word[length])&255) > (((unsigned int)linebuff[length])&255)) {
		fprintf(stderr,
		    "%s: word out of sequence (%s > %s) -- \"sort -u\" the dictionary please.\n(Be wary of 8-bit charsets and unix sort)\n",
		    progname, linebuff, current_word);
		exit(EXIT_FAILURE);
	}
	first_diff = length;

	nwords++;
	strcpy(current_word, linebuff);

	if ((nwords > 1) && (!(nwords & 0x3FF)))
		report_size();
	this_char = current_word[0];	/* for diagnostics... */
}
/**
*       COMPUTE_HASHCODE
*
*       Compute the hash code for a node
**/

static  INDEX
compute_hashcode(NODE * edge_list, INDEX nedges)
{
	INDEX   i;
	INDEX   res = 0L;

	for (i = 0; i < nedges; i++)
		res = ((res << 1) | (res >> 31)) ^ EDGE_LIST(i);
	return (res);
}
/**
*       REPORT_SIZE
*
*       Report the current size etc
**/

static void
report_size(void)
{

	if (first_time) {
		if (token_verbose)
			fprintf(stderr, "      Words    Nodes    Edges    Bytes    BitsPW\n");
		if (token_verbose)
			fprintf(stderr, "      -----    -----    -----    -----    ------\n");
		first_time = FALSE;
	}
	if (*current_word) {
		if (token_verbose)
			fprintf(stderr, "%c:", *current_word);
	} else {
		if (token_verbose)
			fprintf(stderr, "Total:");
	}

	if (token_verbose)
		fprintf(stderr, "  %7ld  %7ld  %7ld  %7ld  %8d\n",
		    nwords, nnodes, total_edges,
		    (long) sizeof(NODE *) * (total_edges + 1),
		    (int) ((sizeof(NODE *) * (total_edges + 1) * 8) / nwords)
		    );
}