From demon!bnr.co.uk!pipex!uunet!cs.utexas.edu!utnut!utcsri!eecg.toronto.edu!lewis Fri Apr 16 20:25:45 GMT 1993 Article: 1015 of alt.sources Path: demon!bnr.co.uk!pipex!uunet!cs.utexas.edu!utnut!utcsri!eecg.toronto.edu!lewis Newsgroups: alt.sources From: lewis@eecg.toronto.edu (david lewis) Subject: Re: Less trivial anagram program Message-ID: <1993Apr13.100533.14270@jarvis.csri.toronto.edu> Organization: CSRI, University of Toronto References: <9304120308.AA06939@pizzabox.demon.co.uk> Date: 13 Apr 93 14:05:33 GMT Lines: 311 This program is less trivial, because it can generate words not in its dictionaory. It uses the dictionary only to generate probability of letter tuples. Each possible word has its probability of being an actual word generated by using the dictionary database. It can handle partly solved anagrams. Eg. to run it anagram /usr/dict/words 3 word, mask: tst .oa.. toast 14.19 toast 14.19 soatt 13.83 soatt 13.83 toats 12.04 toats 12.04 word, mask: toast ..... ostat 17.00 ostat 17.00 stato 16.94 stato 16.94 The command tells the program to use triples. It grinds for a while (20 sec on Sparc-IPX), then prompts with "word,mask". Enter the letters, and the pattern, with "." for unknown letters. It prints out the possible solutions with the log of probability. As you can see, it often generates spurious words, but the actual word is usually in the first 20 or so. Also, it doesn't eliminate duplicates if one or more letters are duplicated. This is a quick hack, so beware. Some code may be hazardous to your health (eg. while statments with printf and scanf in the condition). Guaranteed 100% comment free. Also, it uses "", some systems supply "", you may need to change appropriate things. Compile it -lm. Email me if you actually use this thing. **********cut here******** #include #include #include #include #define debug2 int pleasestop = 0; intr () { if (! pleasestop) pleasestop = 1; else exit (2); } #define maxfrag 8 #define hashtablen 17973 #define chunksize 1000 typedef struct hashentry *hashptr; struct hashentry { hashptr next; char frag [maxfrag]; int count; }; hashptr *hashtable [maxfrag]; hashptr hashfree; int nhashentryfree = 0; #define maxpermutelen 8 #define maxpermwordlen 16 #define permutesize 40320 struct permuterec { char word [maxpermwordlen]; double cost; }; struct permuterec *permuterecs; char permword [maxpermwordlen]; int npermutes; hashptr hashenter (word, start, len, addit) char *word; int start, len; { register int hashval; register int i; register hashptr hp; hashval = 0; for (i = 0; i < len; i++) hashval = ((hashval * 99) & 0xffffff) + (word [i + start] & 0x3ff); hashval %= hashtablen; for (hp = hashtable [len - 1] [hashval]; hp != NULL && strncmp (word + start, hp->frag, len); hp = hp->next) ; if (addit) { if (hp == NULL) { if (nhashentryfree == 0) { hashfree = (hashptr)malloc (chunksize * sizeof(struct hashentry)); if (hashfree == 0) { fprintf (stderr, "no core for table\n"); exit (2); } nhashentryfree = chunksize; } hp = hashfree++; nhashentryfree--; hp->count = 1; hp->next = hashtable [len - 1] [hashval]; hashtable [len - 1] [hashval] = hp; strncpy (hp->frag, word + start, len); } else hp->count++; } #ifdef debug1 printf ("hash %s %d %d", word, start, len); if (hp != NULL) printf (" count = %d\n", hp->count); else printf ("\n"); #endif return (hp); } adddotword (mask, n, wordlen) int *mask; int n, wordlen; { register int i; register int k; register int n1; n1 = n - 1; k = mask [n1]; for (i = 0; i < 26; i++) { permword [k] = 'a' + i; if (n1) adddotword (mask, n1, wordlen); else { strncpy (permuterecs [npermutes].word, permword, wordlen); npermutes++; } } } addpermute (word, mask, len, depth, permwordlen) int *mask; char *word; int depth; { register char c; register int i; int j; for (i = 0; i < len; i++) { if (word [i] != '\0') { c = word [i]; permword [mask [depth - 1]] = c; word [i] = '\0'; if (depth == 1 || pleasestop) { strncpy (permuterecs [npermutes].word, permword, permwordlen); npermutes++; } else addpermute (word, mask, len, depth - 1, permwordlen); word [i] = c; } } } main (argc, argv) int argc; char *argv[]; { register int i; register int j; register int k; register int l; register hashptr hp; register FILE *f; register int wordcnt; double t; int maxlen; int wordlen; int masklen; int finddots; int mask [maxpermwordlen]; char tempword [maxpermutelen]; char wordbuf [100]; char dotwordbuf [100]; char maskstr [100]; finddots = 0; if (argc < 3) { fprintf (stderr, "usage: anagram wordfile maxlen\n"); exit (); } if (argc > 3) finddots = 1; if ((f = fopen (argv [1], "r")) == NULL) { fprintf (stderr, "can't open %s\n", argv [1]); exit (2); } signal (SIGINT, intr); if (sscanf (argv [2], "%d", &maxlen) != 1) { fprintf (stderr, "bad maxlen\n"); exit (2); } if (maxlen > maxfrag) { fprintf (stderr, "max len is %d\n", maxfrag); maxlen = maxfrag; } for (i = 0; i < maxlen; i++) { if ((hashtable [i] = (hashptr *)malloc (hashtablen * sizeof (hashptr))) == NULL) { fprintf (stderr, "no core\n"); exit (2); } } for (i = 0; i < maxlen; i++) for (j = 0; j < hashtablen; j++) hashtable [i] [j] = NULL; for (wordcnt = 0; fscanf (f, " %100s", wordbuf) == 1; wordcnt++) { wordlen = strlen (wordbuf); for (i = 1; i <= maxlen; i++) { for (j = 0; j <= wordlen - i; j++) { if (!finddots) hashenter (wordbuf, j, i, 1); else { for (k = 0; k < (1 << i); k++) { strncpy (dotwordbuf, wordbuf, wordlen); for (l = 0; l < i; l++) { if ((1 << l) & k) dotwordbuf [j + l] = '.'; } hashenter (dotwordbuf, j, i, 1); } } } } } fclose (f); permuterecs = (struct permuterec*)malloc (permutesize * sizeof (struct permuterec)); while (printf ("word, mask: "), scanf (" %100s %100s", wordbuf, maskstr) == 2) { pleasestop = 0; wordlen = strlen (wordbuf); masklen = strlen (maskstr); if (wordlen > maxpermutelen || masklen > maxpermwordlen) { printf ("too long\n"); continue; } npermutes = 0; j = 0; for (i = 0; i < masklen; i++) { if (maskstr [i] == '.') mask [j++] = i; else permword [i] = maskstr [i]; } if (wordbuf [0] != '?') { if (j != wordlen) { printf ("bad mask\n"); continue; } addpermute (wordbuf, mask, wordlen, wordlen, masklen); } else { if (!finddots) printf ("didn't do dot hashing\n"); else if (j > 0) adddotword (mask, j, masklen); } for (i = 0; i < npermutes && !pleasestop; i++) { permuterecs [i].cost = 1.0; for (j = 0; j < masklen; j++) { k = masklen - j; if (k > maxlen) k = maxlen; hp = hashenter (permuterecs [i].word, j, k, 0); if (hp == NULL) permuterecs [i].cost *= .5; else permuterecs [i].cost *= hp->count; } } for (i = 0; i < npermutes && !pleasestop; i++) { for (j = i + 1; j < npermutes; j++) { if (permuterecs [i].cost < permuterecs [j].cost) { strncpy (tempword, permuterecs [i].word, masklen); strncpy (permuterecs [i].word, permuterecs [j].word, masklen); strncpy (permuterecs [j].word, tempword, masklen); t = permuterecs [i].cost; permuterecs [i].cost = permuterecs [j].cost; permuterecs [j].cost = t; } } for (j = 0; j < masklen; j++) putchar (permuterecs [i].word [j]); printf (" %4.2f\n", log10 (permuterecs [i].cost)); } } } **********end**********