/* parse.c */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// #include <kernel.h> -- ARM only ...

/*      The KLATT phoneme codes:
**
**              IY      bEEt            IH      bIt
**              EY      gAte            EH      gEt
**              AE      fAt             AA      fAther
**              AO      lAWn            OW      lOne
**              UH      fUll            UW      fOOl
**              ER      mURdER          AX      About
**              AH      bUt             AY      hIde
**              AW      hOW             OY      tOY
**      
**              p       Pack            b       Back
**              t       Time            d       Dime
**              k       Coat            g       Goat
**              f       Fault           v       Vault
**              TH      eTHer           DH      eiTHer
**              s       Sue             z       Zoo
**              SH      leaSH           ZH      leiSure
**              HH      How             m       suM
**              n       suN             NG      suNG
**              l       Laugh           w       Wear
**              y       Young           r       Rate
**              CH      CHar            j       Jar
**              WH      WHere
*/


/*
** Select the output phoneme encoding.
**
** The code tables in main() are chosen by defining KLATT, SUPERIOR or TOAL
** on the compiler command line.  NONE_FOUND survives only when none of the
** three was requested, in which case KLATT becomes the default.
**
** The original tested KLATT twice and never tested SUPERIOR, so a
** SUPERIOR-only build also switched on the KLATT default.
*/
#define NONE_FOUND 1
#ifdef KLATT
#undef NONE_FOUND
#endif
#ifdef TOAL
#undef NONE_FOUND
#endif
#ifdef SUPERIOR
#undef NONE_FOUND
#endif
#ifdef NONE_FOUND
#define KLATT
#endif

/* Size in bytes of the word buffer declared by have_letter(). */
#define MAX_LENGTH 128

/* Streams used for the whole run; main() points them at the named files
   or at stdin/stdout. */
static FILE *In_file;
static FILE *Out_file;

/* Four-character look-ahead window maintained by new_char(): Char is the
   current input character and Char1..Char3 are the ones that follow it. */
static int Char, Char1, Char2, Char3;

/* Replacement strings consulted by outchar(), initialised at start-up.
   Rep_code2[first][second] is used for an upper-case letter followed by a
   second character; Rep_code[c] is used for any other single character. */
static char *Rep_code2[128][128];
static char *Rep_code[128];

/*
** usage_exit(error)
**
**      Print the optional error message followed by the usage line on
**      stderr, then terminate the program with exit status 1.
*/
static void usage_exit(const char *error)
        {
        if (error != NULL)
                fputs(error, stderr);
        fputs("Usage: PHONEME [infile [outfile]]\n", stderr);
        exit(1);
        }

/*
** init_rep_codes()
**
**      Fill the Rep_code and Rep_code2 tables read by outchar().  Every
**      slot starts as "<UNDEF>"; letters of either case map to the upper
**      case letter followed by '4'; the two-letter phoneme codes are then
**      set according to the encoding chosen at compile time (KLATT,
**      SUPERIOR and/or TOAL - later sections overwrite earlier ones).
*/
static void init_rep_codes(void)
        {
        static const char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
        static const char lower[] = "abcdefghijklmnopqrstuvwxyz";
        static char letter_code[26][3];         /* "A4" .. "Z4" */
        int i, j;

        for (i = 0; i < 128; i++)
                {
                Rep_code[i] = "<UNDEF>";
                for (j = 0; j < 128; j++)
                        Rep_code2[i][j] = "<UNDEF>";
                }

        /* Both cases of a letter share the same code string. */
        for (i = 0; i < 26; i++)
                {
                letter_code[i][0] = upper[i];
                letter_code[i][1] = '4';
                letter_code[i][2] = '\0';
                Rep_code[(unsigned char) lower[i]] = letter_code[i];
                Rep_code[(unsigned char) upper[i]] = letter_code[i];
                }

#ifdef KLATT
        Rep_code2['I']['Y'] = "IY";
        Rep_code2['E']['Y'] = "EY";
        Rep_code2['A']['E'] = "AE";
        Rep_code2['A']['O'] = "AO";
        Rep_code2['U']['H'] = "UH";
        Rep_code2['E']['R'] = "ER";
        Rep_code2['A']['H'] = "AH";
        Rep_code2['A']['W'] = "AW";
        Rep_code2['I']['H'] = "IH";
        Rep_code2['E']['H'] = "EH";
        Rep_code2['A']['A'] = "AA";
        Rep_code2['O']['W'] = "OW";
        Rep_code2['U']['W'] = "UW";
        Rep_code2['A']['X'] = "AX";
        Rep_code2['A']['Y'] = "AY";
        Rep_code2['O']['Y'] = "OY";
        Rep_code2['T']['H'] = "TH";
        Rep_code2['S']['H'] = "SH";
        Rep_code2['H']['H'] = "HH";
        Rep_code2['C']['H'] = "CH";
        Rep_code2['W']['H'] = "WH";
        Rep_code2['D']['H'] = "DH";
        Rep_code2['Z']['H'] = "ZH";
        Rep_code2['N']['G'] = "NG";
#endif
#ifdef SUPERIOR                     /* first iteration - guessing */
        Rep_code2['I']['Y'] = "EE4";
        Rep_code2['E']['Y'] = "AY4";
        Rep_code2['A']['E'] = "AE4";
        Rep_code2['A']['O'] = "OH4";
        Rep_code2['U']['H'] = "UH4"; /* full ??? */
        Rep_code2['E']['R'] = "AXR4";
        Rep_code2['A']['H'] = "AH4";
        Rep_code2['A']['W'] = "AW4";
        Rep_code2['I']['H'] = "IH4";
        Rep_code2['E']['H'] = "EH4";
        Rep_code2['A']['A'] = "O4"; /* zombie -> ZAAMBIH */
        Rep_code2['O']['W'] = "OW4";
        Rep_code2['U']['W'] = "UW4";/* fool ??? Can't see any difference! */
        Rep_code2['A']['X'] = "AX4"; /*  *a*bout ???  */
        Rep_code2['A']['Y'] = "IY4";
        Rep_code2['O']['Y'] = "OY4";
        Rep_code2['T']['H'] = "TH4"; /* The??? */
        Rep_code2['S']['H'] = "SH4";
        Rep_code2['H']['H'] = "H4";
        Rep_code2['C']['H'] = "CH4";
        Rep_code2['W']['H'] = "HW4";
        Rep_code2['D']['H'] = "DH4"; /* This */
        Rep_code2['Z']['H'] = "ZH4"; /* I didn't have this one! - added! */
        Rep_code2['N']['G'] = "NX4";
#endif
#ifdef TOAL
        /* NOTE(review): these literals contain non-ASCII characters, so the
           bytes written depend on the encoding of this source file -
           confirm it matches the character set the consumer expects. */
        Rep_code2['I']['Y'] = "Ê";
        Rep_code2['E']['Y'] = "Â";
        Rep_code2['A']['E'] = "a";
        Rep_code2['A']['O'] = "o";
        Rep_code2['U']['H'] = "û"; /* full ??? */
        Rep_code2['E']['R'] = "ør";
        Rep_code2['A']['H'] = "u";
        Rep_code2['A']['W'] = "ö";
        Rep_code2['I']['H'] = "i";
        Rep_code2['E']['H'] = "e";
        Rep_code2['A']['A'] = "ø"; /* zombie -> ZAAMBIH */
        Rep_code2['O']['W'] = "ô";
        Rep_code2['U']['W'] = "û"; /* fool ??? Can't see any difference! */
        Rep_code2['A']['X'] = "i"; /*  *a*bout ???  */
        Rep_code2['A']['Y'] = "ı";
        Rep_code2['O']['Y'] = "ó";
        Rep_code2['T']['H'] = "Ğ"; /* The */
        Rep_code2['S']['H'] = "§";
        Rep_code2['H']['H'] = "h";
        Rep_code2['C']['H'] = "Ç";
        Rep_code2['W']['H'] = "µ";
        Rep_code2['D']['H'] = "Ğ"; /* This */
        Rep_code2['Z']['H'] = "J"; /* I didn't have this one! - added! */
        Rep_code2['N']['G'] = "Ñ";
#endif
        }

/*
** main(argc, argv)
**
**      Program entry point.  Accepts up to two optional file names:
**      argv[1] is the input text (default stdin) and argv[2] is the
**      output file (default stdout).  After opening the streams it
**      initialises the code tables and calls xlate_file() to translate
**      the whole input.
**
**      Returns 0 on success, or 1 if the output stream reports a write
**      error.  Bad usage or an unopenable file terminates via exit(1).
*/
int main(int argc, char *argv[])
        {
        int status = 0;

        if (argc > 3)
                usage_exit(NULL);

        if (argc == 1)
                fputs("Enter english text:\n", stderr);

        if (argc > 1)
                {
                In_file = fopen(argv[1], "r");
                if (In_file == NULL)
                        usage_exit("Error: Cannot open input file.\n");
                }
        else
                In_file = stdin;

        if (argc > 2)
                {
                Out_file = fopen(argv[2], "w");
                if (Out_file == NULL)
                        usage_exit("Error: Cannot create output file.\n");
                }
        else
                Out_file = stdout;

#ifndef ORIGINAL
        Init_Rules();
#endif

        init_rep_codes();

        xlate_file();

        /* Report a failed write instead of silently losing output. */
        if (fflush(Out_file) != 0 || ferror(Out_file))
                {
                fputs("Error: Cannot write output file.\n", stderr);
                status = 1;
                }

        if (In_file != stdin)
                fclose(In_file);
        if (Out_file != stdout)
                fclose(Out_file);

        return status;
        }

/*
** outstring(string)
**
**      Pass every character of the NUL-terminated string to outchar(),
**      in order.  Always returns 0; the int return type is kept so that
**      callers relying on an implicit declaration stay compatible.
*/
int outstring(const char *string)
        {
        for ( ; *string != '\0'; string++)
                outchar(*string);
        return 0;
        }

/*
** outchar_append(pos, limit, text)
**
**      Copy as much of text as fits at pos without writing at or beyond
**      limit, always leaving the buffer NUL-terminated.  Returns the new
**      end-of-string position.  Requires pos < limit.
*/
static char *outchar_append(char *pos, const char *limit, const char *text)
        {
        size_t room = (size_t) (limit - pos);   /* includes the NUL */
        size_t len = strlen(text);

        if (len >= room)
                len = room - 1;                 /* truncate, never overflow */
        memcpy(pos, text, len);
        pos[len] = '\0';
        return pos + len;
        }

/*
** outchar(chr)
**
**      Emit one character of phoneme output.
**
**      An upper-case letter is held back and combined with the next
**      character: the pair selects a string from Rep_code2, which is
**      written to Out_file and appended to the current word.  Any other
**      character is written to Out_file unchanged (a blank is not
**      written); unless it is a blank or newline, its Rep_code string is
**      appended to the current word.  A blank or newline ends the word:
**      the accumulated word is reported on stderr and then cleared.
**
**      Table indices are masked to 7 bits and the word buffer is
**      length-checked, so over-long words are truncated rather than
**      overrunning the buffer.  Always returns 0.
*/
int outchar(int chr)
        {
        static int first;                       /* held-back upper-case letter */
        static int uc_pending = 0;              /* non-zero while first is held */
        static char line[256] = { '\0' };       /* codes of the current word */
        static char *linep = line;              /* end of string in line */
        char action[sizeof line + 16];
        const char *code;
        int index = chr & 127;

        if (uc_pending)
                {
                code = Rep_code2[first][index];
                linep = outchar_append(linep, line + sizeof line, code);
                fprintf(Out_file, "%s", code);
                uc_pending = 0;
                return 0;
                }

        if (isupper(index))
                {
                first = index;
                uc_pending = 1;
                return 0;
                }

        if (chr == '\n' || chr == ' ')
                {
                /* Command string for the SUPERIOR speech call, which is
                   currently disabled; only the stderr trace is active. */
                snprintf(action, sizeof action, "speakw %s\r", line);
                fprintf(stderr, " (%s) ", line);
                linep = line;
                *linep = '\0';
                }
        else
                linep = outchar_append(linep, line + sizeof line,
                                       Rep_code[index]);

        if (chr != ' ')
                fputc(chr, Out_file);
        return 0;
        }


/*
** makeupper(character)
**
**      Return the upper-case form of a lower-case letter; every other
**      value (including EOF) is returned unchanged.
*/
int makeupper(int character)
        {
        return islower(character) ? toupper(character) : character;
        }

/*
** new_char()
**
**      Advance the four-character look-ahead window (Char, Char1, Char2,
**      Char3) by one character and return the new value of Char.
**
**      When all four slots hold newline the window is refilled from
**      In_file, slot by slot.  Reading stops at the first newline (the
**      remaining slots keep their newline) or at EOF (the remaining slots
**      are set to EOF).  Otherwise the window is shifted left by one, and
**      a fresh character is read into Char3 unless Char3 already holds a
**      newline or EOF, which is then propagated instead.
*/
int new_char(void)
        {
        int *slot[4];
        int i, j;

        if (Char == '\n' && Char1 == '\n' && Char2 == '\n' && Char3 == '\n')
                {       /* prime the pump again */
                slot[0] = &Char;
                slot[1] = &Char1;
                slot[2] = &Char2;
                slot[3] = &Char3;

                for (i = 0; i < 4; i++)
                        {
                        *slot[i] = getc(In_file);
                        if (*slot[i] == EOF)
                                {
                                /* Nothing more to read: pad with EOF. */
                                for (j = i + 1; j < 4; j++)
                                        *slot[j] = EOF;
                                break;
                                }
                        if (*slot[i] == '\n')
                                break;  /* do not read past the line end */
                        }
                }
        else
                {
                /* Shift; a newline or EOF in Char3 is repeated, not re-read. */
                Char = Char1;
                Char1 = Char2;
                Char2 = Char3;
                if (Char3 != '\n' && Char3 != EOF)
                        Char3 = getc(In_file);
                }
        return Char;
        }

/*
** xlate_file()
**
**      Translate the whole of In_file.  The look-ahead window is primed,
**      then the current character decides which handler consumes the next
**      item: a digit starts a number, a letter or apostrophe starts a
**      word, '$' followed by a digit starts a dollar amount, and anything
**      else is treated as a special character.  Each handler advances the
**      window itself.  The loop ends when Char becomes EOF.
**
**      Always returns 0.
*/
int xlate_file(void)
        {
        /* An all-newline window makes new_char() refill it from the file. */
        Char = '\n';
        Char1 = '\n';
        Char2 = '\n';
        Char3 = '\n';
        new_char();     /* Fill Char, Char1, Char2 and Char3 */

        while (Char != EOF)     /* All of the words in the file */
                {
                if (isdigit(Char))
                        have_number();
                else if (isalpha(Char) || Char == '\'')
                        have_letter();
                else if (Char == '$' && isdigit(Char1))
                        have_dollars();
                else
                        have_special();
                }
        return 0;
        }

/*
** have_dollars()
**
**      Handle a dollar amount.  On entry Char is '$' and Char1 is a
**      digit.  The whole-dollar digits (commas are skipped) are spoken
**      with say_cardinal(), followed by one of:
**
**        - no '.' plus digit:    the dollar word only;
**        - exactly two decimals: the dollar word, then "and n cent(s)",
**                                with ".00" spoken as nothing;
**        - any other decimals:   "point", each digit via say_ascii(),
**                                then the plural dollar word.
**
**      Always returns 0.
*/
int have_dollars(void)
        {
        long int value;
        const char *dollar_word;

        value = 0L;
        for (new_char() ; isdigit(Char) || Char == ',' ; new_char())
                {
                if (Char != ',')
                        value = 10 * value + (Char-'0');
                }

        say_cardinal(value);    /* Say number of whole dollars */

        dollar_word = (value == 1L) ? "dAAlER " : "dAAlAArz ";

        /* Found a character that is a non-digit and non-comma */

        /* Check for no decimal or no cents digits */
        if (Char != '.' || !isdigit(Char1))
                {
                outstring(dollar_word);
                return 0;
                }

        /* We have '.' followed by a digit */

        new_char();     /* Skip the period */

        /* If it is ".dd " say as " DOLLARS AND n CENTS " */
        if (isdigit(Char1) && !isdigit(Char2))
                {
                outstring(dollar_word);
                if (Char == '0' && Char1 == '0')
                        {
                        new_char();     /* Skip tens digit */
                        new_char();     /* Skip units digit */
                        return 0;
                        }

                outstring("AAnd ");
                value = (Char-'0')*10 + Char1-'0';
                say_cardinal(value);

                if (value == 1L)
                        outstring("sEHnt ");
                else
                        outstring("sEHnts ");
                new_char();     /* Used Char (tens digit) */
                new_char();     /* Used Char1 (units digit) */
                return 0;
                }

        /* Otherwise say as "n POINT ddd DOLLARS " */

        outstring("pOYnt ");
        for ( ; isdigit(Char) ; new_char())
                say_ascii(Char);

        outstring("dAAlAArz ");

        return 0;
        }

/*
** have_special()
**
**      Handle a character that starts neither a number, a word nor a
**      dollar amount.  A newline is passed straight to outchar(); other
**      white space is dropped; anything else is spoken with say_ascii().
**      The character is then consumed.  Always returns 0.
*/
int have_special(void)
        {
        if (Char == '\n')
                outchar('\n');
        else if (!isspace(Char))
                say_ascii(Char);

        new_char();
        return 0;
        }


/*
** have_number()
**
**      Handle a run of digits starting at Char.
**
**      If the digits are followed by the matching two-letter ordinal
**      suffix, and that suffix is not followed by a letter or digit, the
**      value is spoken with say_ordinal() and the suffix is consumed.
**      The expected suffix is ST, ND or RD for a last digit of 1, 2 or 3,
**      and TH otherwise.  When the tens digit is 1 the suffix is always
**      TH, so 11TH, 12TH and 13TH are ordinals; the original looked only
**      at the last digit and so expected 11ST.
**
**      Otherwise the value is spoken with say_cardinal(), followed by an
**      optional decimal part ("point" plus each digit) and then any
**      trailing letters, spelled out one at a time.
**
**      Always returns 0.
*/
int have_number(void)
        {
        long int value;
        int lastdigit;
        int prevdigit;          /* digit before lastdigit, '0' if none */
        const char *suffix;

        value = Char - '0';
        lastdigit = Char;
        prevdigit = '0';

        for (new_char() ; isdigit(Char) ; new_char())
                {
                value = 10 * value + (Char-'0');
                prevdigit = lastdigit;
                lastdigit = Char;
                }

        /* Pick the ordinal suffix this number would take. */
        if (prevdigit == '1')
                suffix = "TH";          /* 10..19, 110..119, ... */
        else
                {
                switch (lastdigit)
                        {
                case '1':
                        suffix = "ST";
                        break;
                case '2':
                        suffix = "ND";
                        break;
                case '3':
                        suffix = "RD";
                        break;
                default:
                        suffix = "TH";
                        break;
                        }
                }

        if (makeupper(Char) == suffix[0] && makeupper(Char1) == suffix[1] &&
            !isalpha(Char2) && !isdigit(Char2))
                {
                say_ordinal(value);
                new_char();     /* Used Char */
                new_char();     /* Used Char1 */
                return 0;
                }

        say_cardinal(value);

        /* Recognize decimal points */
        if (Char == '.' && isdigit(Char1))
                {
                outstring("pOYnt ");
                for (new_char() ; isdigit(Char) ; new_char())
                        say_ascii(Char);
                }

        /* Spell out trailing abbreviations */
        while (isalpha(Char))
                {
                say_ascii(Char);
                new_char();
                }

        return 0;
        }


/*
** have_letter()
**
**      Handle a word starting at Char.  Letters and apostrophes are
**      collected in upper case into a buffer of the form " WORD ", with
**      one leading and one trailing blank.  A word too long for the
**      buffer is passed to xlate_word() in pieces.
**
**      The finished buffer is then dispatched:
**        - followed by a digit:  spelled out with spell_word();
**        - a single character:   spoken with say_ascii();
**        - followed by '.':      passed to abbrev();
**        - otherwise:            translated with xlate_word(), after
**                                which '=' and the lower-cased word are
**                                written to Out_file.
**
**      A hyphen directly followed by a letter is skipped afterwards,
**      except in the followed-by-a-digit case, which returns early.
**
**      Always returns 0.
*/
int have_letter(void)
        {
        char buff[MAX_LENGTH];
        int count;

        count = 0;
        buff[count++] = ' ';    /* Required initial blank */

        buff[count++] = (char) makeupper(Char);

        for (new_char() ; isalpha(Char) || Char == '\'' ; new_char())
                {
                buff[count++] = (char) makeupper(Char);
                /*
                Flush while the trailing blank and NUL still fit.  The
                original tested count > MAX_LENGTH-2 and so wrote the NUL
                one byte past the end of buff.
                */
                if (count >= MAX_LENGTH-2)
                        {
                        buff[count++] = ' ';
                        buff[count++] = '\0';
                        xlate_word(buff);
                        count = 1;      /* keep the initial blank */
                        }
                }

        buff[count++] = ' ';    /* Required terminating blank */
        buff[count++] = '\0';

        /* Check for AAANNN type abbreviations */
        if (isdigit(Char))
                {
                spell_word(buff);
                return 0;
                }
        else if (strlen(buff) == 3)     /* one character, two spaces */
                say_ascii(buff[1]);
        else if (Char == '.')           /* Possible abbreviation */
                abbrev(buff);
        else
                {
                char *s;

                xlate_word(buff);

                /* Follow the phonemes with "=word" in lower case. */
                for (s = buff; *s != '\0'; s++)
                        *s = (char) tolower((unsigned char) *s);

                if (*buff == ' ')
                        fprintf(Out_file, "=%s", buff+1); /* was >= */
                else
                        fprintf(Out_file, "=%s", buff);   /* was >= */
                }

        if (Char == '-' && isalpha(Char1))
                new_char();     /* Skip hyphens */

        return 0;
        }

/*
** abbrev(buff)
**
**      Handle a word that was followed by '.'.  buff holds the word in
**      upper case with one leading and one trailing blank.  DR, MR and
**      MRS are translated as DOCTOR, MISTER and MISSUS; PHD is spelled
**      out letter by letter; in each of those cases the period is
**      consumed.  Any other word is passed to xlate_word() as it stands
**      and the period is left in the input.
**
**      Always returns 0.
*/
int abbrev(char buff[])
        {
        if (strcmp(buff, " DR ") == 0)
                {
                xlate_word(" DOCTOR ");
                new_char();     /* Skip the period */
                }
        else if (strcmp(buff, " MR ") == 0)
                {
                xlate_word(" MISTER ");
                new_char();     /* Skip the period */
                }
        else if (strcmp(buff, " MRS ") == 0)
                {
                xlate_word(" MISSUS ");
                new_char();     /* Skip the period */
                }
        else if (strcmp(buff, " PHD ") == 0)
                {
                spell_word(" PHD ");
                new_char();     /* Skip the period */
                }
        else
                xlate_word(buff);

        return 0;
        }