traditional-stable.c

/* stable release traditional-1.0.4      */
/* (c) Graham Toal, 06June1999 */

/* See the file README in this directory for a description of the
   problem.
 */

/* This version is orders of magnitudes faster than ws-stable.c
   because it generates only 'traditional' squares, ie it cannot
   generate double squares. */

/*

This is a very short description of my wordsquare program.
To understand the implementation details as opposed to the
big picture you will also need to read the document on
DAWGs (Directed Acyclic Word Graphs) which I'll be writing next.

The high-level part of the algorithm is actually very obvious,
basic and simple:  take an empty square then put down a word
in the top row.  Place a word underneath it, and if the downward
columns look like they can also form valid words (checked by
looking for the starts of the word in the dictionary), then
continue to place words on subsequent rows until the square is
full.  If at any time some partially formed word in a downward
column is nonsense, remove the last word you placed and try another
one.  Sometimes you may backtrack more than one row before you
find another word that can be placed which allows sensible partially
formed vertical words.

That's the basic algorithm, but in practice it is refined considerably
because as it stands it would take forever to finish.

To make the description easier, let's assume we've placed 5 full
rows successfully, and are looking at placing the 6th row.  We could
simply try each word in the dictionary sequentially, and as we place
each word, check the 6-letter downward stems, to confirm that we have
not blocked placement of a 7th row by some nonsensical downward
stem, such as "peacez...."  But... by placing whole words at a time,
we might find ourselves repeating the first 7 or 8 vertical checks
every time, eg if we place "superhuman" on row 6 and the vertical
check fails on the 9th column, then we place 'supersonic" on row 6,
we end up checking the same 5 vertical words again, that intersect
with the prefix 'super'.

So, rather than place whole words at a time, and backtracking whole
words when one doesn't allow further placements, we actually place
the horizontal words one letter at a time, and when a letter fails,
we backtrack just one letter.  So, after placing "supersoni" and
realising that the "i" blocked a downward placement in column 9, we
would then look for something else beginning with "superson..".  Not
finding any, we backtrach to "superso..." etc until we eventually
get to a position "super....." which allows another letter to be
placed, in this case "superh...." which is the start of "superhuman".

Next, the big optimisation comes from never actually checking
the vertical prefixes twice.  We use the 'dawg' structure to perform
an incremental spelling check down the vertical columns, BUT as
we build the square up row by row, we store the intermediate
stages of each vertical cross-check in another square array the
same size as the wordsquare, and so when we add one more row to the
square, for each column we only have to check that one more letter
can be added to the partial word at that stage.  Let's say we had
already formed the partial word 'peace' on one of the downward
columns, then if the horizontal letter being placed in the 6th
row at that column is say 'm', we know from our partially checked
word 'peace' that it can be followed at that point by 'maker'
and so 'm' is valid.

Finally, having this mechanism in place means that we know that
immediately below any partially formed vertical word, only a small
number of letters are valid - eg with 'peace.....' we may have only
"peacemaker" or "peacetimes" so only 'm' or 't' are valid at that
point.  This means we no longer have to try every word in the
dictionary in turn - because we can limit the letters in words we
are placing horizontally to only those that allow vertical words
to be formed.  This cuts down the iterations tremendously.  The only
extra trick needed here is simply to mark the first row as allowing
any letter from 'a' to 'z' so that all words are valid in the
first row, so that after the first row is placed, only horizontally
placed letters that allow valid downward words to be formed can be
placed.

This whole strategy of trimming what is placed horizontally by
the partially-formed vertical words works only because the data
structure that we use to represent the word list is a tree, and
for any position in any word, we have instant access to the
suffixes that can follow that prefix, for the cost of a single
memory operation rather than any complex database lookup.

The next iteration of this document will include some examples
and partially formed squares, and a link to the description of the
dawg data structure.

Graham Toal
Texas, 15Aug99

 */

#include <stdio.h>
#include <stdlib.h>

#include "splib.h"

#ifndef TRUE
#define TRUE (0==0)
#define FALSE (!TRUE)
#endif

/* The dictionary itself is global, as these variables are seldom
   referenced, and I don't want to pass them in as parameters to every
   procedure just so I can access them occassionaly */

NODE *rootdawg;
INDEX nedges;

#define MAX 11 /* Biggest we're willing to tackle will eventually be 10 */

static int sq[/MAX* row */][/MAX* col */];		/* letters */

static NODE *fork_point[MAX+/1* row */][/MAX* col */];
             /* downward word branch choices from this row */

static char check[MAX+/1* row */][/MAX* col */][256+/1* alphabet */];
                  /* string of letters which
                     are valid in each position in the row below */

/* This one is *NOT* row, col */
static char vertical_word[MAX /* col */][MAX+1 /* row/wordlen */];

/* This is probably redundant - it's just a printable horizontal word 
   also used for communication.  Can probably use sq[][] instead. */
char word[MAX /* row */][MAX+1 /* len */];


/* Doubles as producing answers and debug routine */
void print_square(int row, int col, int max, char *s)
{
  int i, j;

  printf(s);
  for (j = 0; j < row; j++) {
    for (i = 0; i < max; i++) putchar(sq[j][i]);
    putchar('\n');
  }
  for (j = row; j < max; j++) {
    for (i = 0; i < max; i++) putchar('.');
    putchar('\n');
  }
  putchar('\n');
  fflush(stdout);
}

/* parameter col is the length is of the horizontal word and should
   always be invoked externally with len=0.  hnode should be dawg+1 for
   the start of every word in the full wordlist. */

/* Iterate a letter at a time across each row; recurse at the end of a
   row to go to the next row.  Backtrack when the square is blocked. */

void next_letter(NODE *hnode, int row, int max, int col)
{
  NODE *nextnode;
  int ch;
  int i;
  NODE *savednode;
  char savedcheck[256];

  if (row == max || col == max) return;
  /* safety - I'd prefer this were done at the point of the recursive call  */
  /* It's because of things like that that some of the arrays have to be +1 */

  for (;;) { /* loop across horizontal letter placement */
    ch = (char)(((*hnode) >> V_LETTER) & M_LETTER);
    word[row][col] = ch;
    word[row][col+1] = '\0';
    sq[row][col] = ch;
    vertical_word[col][row] = ch;
    vertical_word[col][row+1] = '\0';

    nextnode = rootdawg+((*hnode) & M_NODE_POINTER);

    /* Test to see if this edge letter is allowed in the vertical
       crosscheck list */

    /* ***BUT*** if generating single squares, throw away the generated
                 cross-check, and force the downward word to match the
                 relevant across word:

      col 012345
          FREDDY   row 0
          ROGERS   row <n>
          EGBERT   row (just placed)
          DEE      row+1 (next to look at)
          DRR
          YST

     */

    if (
        (    (col < row)
          && (ch == sq[col][row])
          && (strchr(check[row][col], ch) != NULL)
        )
        ||
        (    (col >= row)
          && (strchr(check[row][col], ch) != NULL)
        )
       ) {
      int nextfree;
      int found;
      NODE *vnode;

      /* letter is placed.  Before we go any further, did this
         placement generate a solution? */

      /* letter is placed in the square, now see if it is valid.
         We place next horizontal letter recursively if this one was
           accepted OK.  It may be rejected due to vertical crosscheck
           and it may be rejected because it blocks off the vertical
           word from being expanded further downwards.  Of course
           we shouldn't care about the latter if this is the final row. */

      /* INLINE: 
         fork_point[row+1][col] = follow_edge(fork_point[row][col], ch); */

      vnode = fork_point[row][col];

      found = FALSE;
      for (;;) { /* placed a letter, look for next set of options */
        if (ch == (((*vnode) >> V_LETTER) & M_LETTER)) {
          vnode = rootdawg+(*vnode & M_NODE_POINTER);
          found = TRUE; break;
	}
        if ((*vnode & M_END_OF_NODE) != 0) break;
        vnode++;
      }
      if (found) {
        savednode = fork_point[row+1][col];
        strcpy(savedcheck, check[row+1][col]);
        fork_point[row+1][col] = vnode;
        nextfree = 0;
        for (;;) {
          if (vnode == NULL) {
            fprintf(stdout, "Don't know why vnode is NULL\n");
            exit(1);
          }
          check[row+1][col][nextfree++] = ((*vnode) >> V_LETTER) & M_LETTER;
          if ((*vnode & M_END_OF_NODE) != 0) break;
          vnode++;
        }
        check[row+1][col][nextfree++] = '\0';

        if (col+1 == max) {
          /* good placement on this row.
             recurse to next row.
             reset the dictionary to start of line */

          if (row+1 == max) {
            /* As the word would not have been placed unless it formed a
               vertical word also, we must have a solution.  Print it and return */
            print_square(max,max,max,"Solution:\n");
          }

          /* good placement.  recurse to next row.  reset the dictionary */
          next_letter(rootdawg+1, row+1, max, 0);
        } else {
          /* place next letter horizontally */
          next_letter(
            nextnode, row, max, col+1);
	}
        /* what I worry about here is when we backtrack, do all
           the things like check[] and fork_point[] work properly?
           They *should* do, because ones we backtrack over should
           never be looked at, but I have this niggling doubt, at
           least as long as the program isn't working...
         */

        fork_point[row+1][col] = savednode;
        strcpy(check[row+1][col], savedcheck);
      } else {
        char tmp[256];
        /* Cannot make this play because it blocks of progress for
           a vertical word.  Go back up a row. */;
        /* (or maybe backtrack a letter and try another word on this row?) */
      }
    }

    /* now see if there is another letter at this position. */

    if ((*hnode & M_END_OF_NODE) != 0) break; /* end of node */
    hnode++;
  } /* end loop across horizontal letter placement */
}

int main(int argc, char **argv) {
  char fname[256];
  int row, col;
  int max;

  if (argc == 1) {
    max = 4;
  } else if (argc == 2) {
    max = *argv[1];
    if (isdigit(max)) max = max - '0'; else max = 4;
  } else {
    fprintf(stderr, "syntax: ws 4\n        (or any number 2-9)\n");
    exit(1);
  }

  sprintf(fname, "dawg.%0d", max);
  if (!dawg_init(fname, &rootdawg, &nedges)) {
    fprintf(stdout, "Cannot open %s\n", fname);
    exit(1);
  }

  /* Have to initialise first row specially.
     *all* words are valid at this point. */
  row = 0;
  for (row = 0; row < max; row++) {
    /* wrong for all but 0th row, but we don't care as it'll be filled in
       in time anyway. */
    for (col = 0; col < max; col++) {
      sq[row][col] = '_';
      /* This would be better done by extraction from the dawg
         if I weren't so lazy, but the 'strchr()' overhead is low */
      strcpy(check[row][col],
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
      fork_point[row][col] = rootdawg+1;
      strcpy(vertical_word[col], "");
    }
  }
  next_letter(rootdawg+1, 0, max, /0**/len);

  print_square(max,max,max,"Final state (non solution):\n");

  exit(0);
}