#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BITS 10
/* Trivial hash with end-around carry. No thought went into this.
You can probably do better. This worked well enough for me on
the first try I didn't see a need to improve it.
Return a hash of the string between 0 and max-1. A small degree
of care was exerted so that a, aa, aaa and aaaa etc didn't all hash to 0.
*/
int hashfun(char *s, int max)
{
int c;
int rem;
int h = 0;
for (;;) {
c = *s++;
if (c == '\0') break;
if (isalpha(c)) c = c - (isupper(c) ? 'A' : 'a');
h = (h + (c&15) + 1) * 10;
for (;;) {
/* I suspect this can only be executed twice at most but
I'm lazy and paranoid, hence a loop */
rem = h / max;
h = (h % max) + rem;
if (h < max) break;
}
}
return(h);
}
int main(int argc, char **argv)
{
FILE *dict;
FILE *data;
int i;
char *s;
char line[128];
int N; /* Determine size of text (no of words) */
int dictsize;
int hashsize, hashpos; /* Allocate hash table of 10 * N bits */
unsigned char *hashtable;
/* Files are just words, one per line. Nothing fancy. Standard newlines. */
if (argc != 3) {
fprintf(stderr, "syntax: %s dictfile testwords\n", argv[0]);
exit(1);
}
/* First arg is the large dictionary file */
dict = fopen(argv[1], "r");
if (dict == NULL) {
fprintf(stderr, "Cannot open %s (master dictionary)\n", argv[1]);
exit(1);
}
/* Second arg is a short file of words to check - real words or bogus */
data = fopen(argv[2], "r");
if (data == NULL) {
fprintf(stderr, "Cannot open %s (list of words to check)\n", argv[2]);
exit(1);
}
/* First pass - count dict size. Saves so much hassle. */
N = 0; dictsize = 0;
for (;;) {
s = fgets(line, 127, dict);
if (s == NULL) break;
N += 1;
dictsize += strlen(line);
}
fclose(dict);
dict = fopen(argv[1], "r");
if (dict == NULL) {
fprintf(stderr, "Cannot reopen %s (master dictionary)\n", argv[1]);
exit(1);
}
/* Allocate hash table of 10 * N bits */
hashsize = (N * BITS) >> 3;
hashtable = malloc(hashsize+1);
if (hashtable == NULL) {
fprintf(stderr, "Not enough ram for a %d byte hash table\?\?\?\n",
hashsize+1);
exit(1);
}
for (i = 0; i < hashsize+1; i++) hashtable[i] = 0;
/* Add words to table. */
for (;;) {
s = fgets(line, 127, dict);
if (s == NULL) break;
s = strchr(s, '\n'); *s = '\0';
hashpos = hashfun(line, N * BITS);
hashtable[hashpos >> 3] |= (1 << (hashpos & 7));
}
fclose(dict);
#ifdef TEST
/* Take test file, confirm: all real words pass, 90% of bogus words fail */
for (;;) {
s = fgets(line, 127, data);
if (s == NULL) break;
s = strchr(s, '\n'); *s = '\0';
hashpos = hashfun(line, N * BITS);
fprintf(stdout, "%s %s at %d [%02x/%02x]\n",
line,
(((hashtable[hashpos >> 3] & (1 << (hashpos & 7))) != 0)
? "OK" : "not found"),
hashpos>>3,
hashtable[hashpos >> 3], 1 << (hashpos & 7));
}
fclose(data);
#else
fprintf(stdout, "#define N %d\n", N);
fprintf(stdout, "#define HASHSIZE %d\n", hashsize);
fprintf(stdout, "unsigned char hashtable[HASHSIZE+1] = {\n");
for (i = 0; i < hashsize+1; i++) {
fprintf(stdout, "%4d, ", hashtable[i]);
if ((i&7) == 7) fprintf(stdout, "\n");
}
fprintf(stdout, "\n};\n");
#endif
fflush(stdout);
fprintf(stderr, "Dictionary size was %d bytes\n", dictsize);
fprintf(stderr, "Table size was %d bytes\n", hashsize+1);
exit(0);
return(1);
}