%{
/*  Known bugs and things still to do:
    - foot\-notes slips through.
    - Doesn't translate \"u etc.  It should.
      (Reuse stuff from the cardbox->sgml project.)
    - \input filename is not handled specially.
    - \font ... ditto.
    - More kern-3in units are to be recognised
      as soon as the 'NUL in data' error is corrected.
 */
#include <stdio.h> 
#include <string.h> 
#include <stdlib.h> 

/*extern YY_CHAR *yytext;*/

/* Token codes handed back by yylex().  main() stops scanning when it
   receives EOFTOK; the remaining codes are numbered consecutively
   from 256. */
enum {
  EOFTOK = 0,
  WHITESPACE = 256,
  ESCAPE,
  STYLE,
  NEWLINE,
  WORD,
  COMMENT
};

/* The scanner built by flex from the rules section of this file. */
int yylex(void);

/* Non-zero while the rest of the current line is a comment: main() sets
   it on a COMMENT token, clears it on NEWLINE, and suppresses WORD
   output while it is set. */
static int ignore;

/* Position in the input, both counted from 1 ('real-world' conventions). */
static int cur_line = 1;
static int cur_col = 1;

/* Print one word on its own line on stdout and flush immediately.
 *
 * Trailing apostrophes and hyphens are removed from `word` IN PLACE
 * before printing; the first character is never removed, so a word made
 * only of such characters is reduced to its first character.  An empty
 * string is printed as an empty line (the previous pointer-based loop
 * read one byte before the buffer in that case).
 *
 * word    - NUL-terminated, writable buffer holding the word.
 * line    - line of the word in the input (currently unused).
 * col     - column of the word in the input (currently unused).
 * fileinf - file-name prefix for diagnostics (currently unused).
 *
 * The three position arguments belong to a fuller
 * "<fileinf>Line <line>, Col <col>: <word>" report format that is
 * disabled at present.
 */
void output_word(char *word, int line, int col, char *fileinf) {
  size_t len = strlen(word);

  (void)line;
  (void)col;
  (void)fileinf;

  /* len > 1 keeps at least the first character, and makes "" safe. */
  while (len > 1 && (word[len - 1] == '\'' || word[len - 1] == '-')) {
    word[--len] = '\0';
  }

  printf("%s\n", word);
  fflush(stdout);
}

int main(int argc, char **argv)
{
  char fname[256];
  int token;

  if (argc == 2) {
    if (freopen(argv[1], "r", stdin) == NULL) {
      fprintf(stderr, "wordtex: cannot open input file %s\n", argv[1]);
      exit(0);
    }
    sprintf(fname, "\"%s\", ", argv[1]);
  } else {
    *fname = '\0';
  }
  ignore = (0!=0);
  while ((token = yylex()) != 0) {
    if (token == COMMENT) ignore = (0==0);
    else if (token == NEWLINE) {
      ignore = (0!=0);
      cur_line += 1; cur_col = 0;
    } else if (token == WORD) {
      if (!ignore) output_word(yytext, cur_line, cur_col, fname);
    }
    cur_col += strlen(yytext);
  }
}
%}
%%

\\[a-zA-Z]*[\ ]*(\[[A-Za-z,\ \t]+\])*("{"[A-Za-z,\ \t]+"}")* {
  /* Control sequence: a backslash, optional letters, optional spaces,
     then any number of [...] groups followed by any number of {...}
     groups.  A group is consumed only when it holds nothing but
     letters, commas, spaces and tabs (so a group containing a digit,
     e.g. [11pt], is left for the later rules).  Because the letters
     are optional, a lone backslash matches too.  The whole match is
     returned as WHITESPACE, so text inside the consumed groups is
     never reported as a word.

     this was tested on the rather buggy v2.00 of flex which
     gives spurious 'NUL in data' errors.  because of this
     bug, some of the grammar is rather contorted :-(
        in this instance I'd like to be able to recognise
     \documentstyle[]{} explicitly to pull in document-style-
     specific spelling dictionaries. */
  return(WHITESPACE);
}

[\\@][a-zA-Z]+ {
  /* A backslash or '@' immediately followed by letters (e.g. the
     internal name @startsection).  The backslash case is normally
     claimed by the control-sequence rule above, which matches at
     least as much and comes first, so in practice this rule handles
     the '@' names.

     The class used to be written ["\\"|@].  Quotes and '|' are
     ordinary characters inside a character class, so that class also
     contained '"' and '|', and a word directly after a double quote
     or a bar was swallowed here instead of being reported. */
  return(ESCAPE);
}


[a-zA-Z](\-[a-zA-Z]|[a-zA-Z']|\\\-)+ {
  /* A word: one letter followed by one or more of
       - a hyphen that is itself followed by a letter,
       - a letter or an apostrophe,
       - the two characters \- .
     '+' rather than '*' above means single-letter words are ignored!
     It's pretty good---it handles dashes like these! */
  return(WORD);
}

1st |
2nd |
3rd {
  /* The ordinals 1st, 2nd and 3rd are not words to be checked; hand
     them back as WHITESPACE.  A lone '|' as an action means "use the
     action of the next rule", so all three patterns share this one. */
  return WHITESPACE;
}

[0-9]th {
  /* Ordinal ending: any digit followed by "th".  Every digit is
     accepted, not just 4-9, so that 10th, 11th, 12th, 13th, 20th ...
     are covered as well; with the narrower class their "th" was left
     over and reported as a word.  Leading digits of a longer number
     are consumed one at a time by the catch-all rule. */
  return(WHITESPACE);
}

[0-9]+("em"|"ex"|"pt"|"pc"|"in"|"bp"|"cm"|"mm"|"dd"|"cc"|"sp") {
  /* A TeX dimension: digits followed directly by a unit of length.
     Besides the original em, ex, pt and mm this accepts the other TeX
     length units (pc, in, bp, cm, dd, cc, sp), so that e.g. the "cm"
     of 2cm is no longer reported as a word.  The match is discarded
     as WHITESPACE. */
  return(WHITESPACE);
}

"\\%" {
  /* An escaped percent sign is ordinary text, not a comment start.
     Being two characters long it beats the one-character match the
     control-sequence rule would make on the backslash alone.
     Known limitation: in "\\%" (a line break followed by a comment)
     the second backslash and the '%' are still read as an escaped
     percent. */
  return(WHITESPACE);
}

"%" {
  /* Any other percent sign starts a comment that lasts until the next
     NEWLINE token.  Matching the '%' by itself, instead of "some
     non-backslash character plus '%'", means a comment is recognised
     directly after a word and as the very first byte of the input,
     and the newline before a '%' at the start of a line is no longer
     swallowed, so it is still returned as NEWLINE and counted. */
  return(COMMENT);
}

"\n" {
  /* End of line: ends any comment and advances the line count in
     main(). */
  return(NEWLINE);
}

[\001-\377] {
  /* Catch-all: any single non-NUL byte that no earlier rule claimed
     is skipped as WHITESPACE.  As the last rule it loses every tie
     with an earlier rule matching the same single character. */
  return WHITESPACE;
}
