%{
/*
 * wordtex: a flex scanner that extracts the prose words from TeX/LaTeX
 * input and prints them on stdout, one word per line.  Input is read from
 * the file named by the single command-line argument, or from stdin when
 * no (or more than one) argument is given.
 *
 * Known limitations:
 *   - Discretionary hyphens are kept: foot\-notes is emitted verbatim.
 *   - Accent sequences such as \"u are not translated.
 *   - \input filename and \font ... are not handled specially.
 *   - Only the em, ex, pt and mm dimension units are recognised, so
 *     something like kern-3in still leaks "in" as a word.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*extern YY_CHAR *yytext;*/

/* Token codes returned by yylex(); 0 is what yylex() returns at end of input. */
enum {
    EOFTOK     = 0,
    WHITESPACE = 256,
    ESCAPE     = 257,
    STYLE      = 258,
    NEWLINE    = 259,
    WORD       = 260,
    COMMENT    = 261
};

int yylex(void);

/* Non-zero while scanning the remainder of a line that follows a '%'. */
static int ignore;

/* Position of the next token; 'real-world' conventions (both start at 1). */
static int cur_line = 1, cur_col = 1;

/*
 * Print one word on its own line and flush stdout.
 *
 * Trailing apostrophes and hyphens are removed from `word` IN PLACE before
 * printing; the first character is never removed, and an empty string is
 * left untouched.  `line`, `col` and `fileinf` are only used by the
 * disabled diagnostic format below.
 */
void output_word(char *word, int line, int col, char *fileinf)
{
    size_t len = strlen(word);

    /* Length-based loop: never forms a pointer before the start of word. */
    while (len > 1 && (word[len - 1] == '\'' || word[len - 1] == '-')) {
        word[--len] = '\0';
    }

    (void)line;
    (void)col;
    (void)fileinf;
    /*printf("%sLine %d, Col %d: %s\n", fileinf, line, col, word);*/
    printf("%s\n", word);
    fflush(stdout);
}

/*
 * Driver: optionally redirect stdin to argv[1], then pull tokens from the
 * scanner, printing every WORD that is not inside a comment.
 * Exits with EXIT_FAILURE if the input file cannot be opened.
 */
int main(int argc, char **argv)
{
    char fname[256];
    int token;

    if (argc == 2) {
        if (freopen(argv[1], "r", stdin) == NULL) {
            fprintf(stderr, "wordtex: cannot open input file %s\n", argv[1]);
            exit(EXIT_FAILURE);
        }
        /* Bounded: a very long path is truncated instead of overflowing. */
        snprintf(fname, sizeof fname, "\"%s\", ", argv[1]);
    } else {
        fname[0] = '\0';
    }

    ignore = 0;
    while ((token = yylex()) != EOFTOK) {
        /* Measure the token before output_word() can shorten yytext. */
        size_t token_len = strlen(yytext);

        if (token == COMMENT) {
            ignore = 1;
        } else if (token == NEWLINE) {
            /* A comment ends at the end of its line. */
            ignore = 0;
            cur_line += 1;
            cur_col = 0;
        } else if (token == WORD) {
            if (!ignore) {
                output_word(yytext, cur_line, cur_col, fname);
            }
        }
        cur_col += (int)token_len;
    }
    return 0;
}
%}

%%

\\[a-zA-Z]*[\ ]*(\[[A-Za-z,\ \t]+\])*("{"[A-Za-z,\ \t]+"}")* {
        /* A control sequence together with any simple [..] and {..}
           arguments made only of letters, commas, spaces and tabs; none
           of it is prose.  The pattern was originally written for the
           rather buggy v2.00 of flex, which gave spurious 'NUL in data'
           errors, hence its contorted shape.  Ideally \documentstyle[]{}
           would be recognised explicitly here to pull in
           document-style-specific spelling dictionaries. */
        return(WHITESPACE);
    }

\\\\ {
        /* A double backslash (line break).  Matching both characters
           here keeps the second backslash from pairing with a following
           '%' in the escaped-percent rule below. */
        return(WHITESPACE);
    }

\\"%" {
        /* An escaped percent sign is literal text, not a comment. */
        return(WHITESPACE);
    }

["\\"|@][a-zA-Z]+ {
        /* NOTE(review): inside a character class the double quotes are
           literal, so this class is { " \ | @ }.  A backslash followed by
           letters is already taken by the first rule, so in practice this
           fires for letters preceded by ", | or @ -- confirm that this is
           what was intended. */
        return(ESCAPE);
    }

[a-zA-Z](\-[a-zA-Z]|[a-zA-Z']|\\\-)+ {
        /* '+' rather than '*' above means single-letter words are ignored!
           Embedded hyphens, apostrophes and \- discretionary hyphens are
           part of the word, so it handles dashes---like these---well:
           a run of two or more hyphens ends the word, and the stray
           trailing hyphen or apostrophe is removed by output_word(). */
        return(WORD);
    }

[0-9]+("st"|"nd"|"rd"|"th") {
        /* Ordinals such as 1st, 2nd, 3rd, 4th, 10th, 21st: not words. */
        return(WHITESPACE);
    }

[0-9]+("em"|"ex"|"pt"|"mm") {
        /* A dimension: digits followed by a unit. */
        return(WHITESPACE);
    }

"%" {
        /* Any percent sign that reaches this rule is unescaped (the
           escaped form is consumed above), so it starts a comment no
           matter what precedes it -- a word, a control sequence, a
           newline, or the very start of the input. */
        return(COMMENT);
    }

"\n" {
        return(NEWLINE);
    }

[\001-\377] {
        /* Anything else is a separator. */
        return(WHITESPACE);
    }