hamish.c

// TO DO:
// http://www.ef.edu/english-resources/english-grammar/numbers-english/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#define ishex(ch) (int)(strchr("0123456789ABCDEFabcdef", (char)((ch)&255)) != NULL)
int hextobin(char x)
{
  // Too crude, just a quick hack.  Seldom called.
  if (('0' <= x) && (x <= '9')) {        return x-'0';
  } else if (('A' <= x) && (x <= 'F')) { return x-'A'+10;
  } else if (('a' <= x) && (x <= 'f')) { return x-'a'+10;
  } else {                               return 0;
    // ASSERT: INTERNAL ERROR!
  }
}
void indent(int depth, FILE *f)
{
  int i;
  for (i = 0; i < depth; i++) fprintf(f, "  ");
}

#define DEBUG_PARSER
#include "hamish.h" // GENERATED GRAMMAR FROM takeon.c and hamish.g

#ifndef FALSE
#define FALSE (0!=0)
#define TRUE (0==0)
#endif

char *progname;

int debug_parser = FALSE;

// Built-in phrase codes.  Must match with grammar file.
#define TYPE_EOF 0
#define TYPE_TAG 1
#define TYPE_INT 2
#define TYPE_KEYWORD 3

typedef struct sourceinfo { // ATOMS for processed input stream
  char *s; // string contents
  int l;   // lineno
  int col; // column
  int t;   // type - tag, int, or keyword
  char *f; // source or includefile  name
} sourceinfo;

int bestparse = -1; // for error reporting.
char *looking_for = NULL;

// C is the source character token stream
static sourceinfo *c = NULL;
int nextfree = 0, arraysize = 0;
char *onecharstr = NULL;

// A is the Analysis record.  Contents point to a C[] structure when the item is a BIP,
// otherwise the format is [phraseno] [altno] [no-of-subphrases] [subphrases and/or BIPs...]
// eg A[25] = 10 means that A[25] is C[10].
static int *A = NULL;
int next_free_a = 0, a_size = 0;


static void *makespace_(void *c, int nextfree, int *arraysize, int objsize) {
  if (nextfree >= *arraysize) {
    *arraysize = (*arraysize * 2) + 1;
    c = (void *)realloc(c, (*arraysize+1) * objsize); // 0:arraysize, inclusive. eg 0:15, 0:127 etc
    if (c == NULL) {fprintf(stderr, "tacctwo: %s\n", strerror(errno)); exit(errno);}
  }
  return c;
}
#define makespace(c, nextfree, arraysize) c = (typeof(c))makespace_(c, nextfree, &arraysize, sizeof(c[0]))

void stores(char *s, int lineno, int col, int type, char *fname) {
  makespace(c, nextfree, arraysize);
  c[nextfree].s = s; c[nextfree].l = lineno; c[nextfree].col = col;
  c[nextfree].f = fname; c[nextfree].t = type;
  nextfree++;
}

void storec(int ch, int lineno, int col, int type, char *fname) {
  onecharstr[ch*2] = ch; onecharstr[ch*2+1] = '\0';  // convert char to 1-char string before saving.
  stores(&onecharstr[ch*2], lineno, col, type, fname);
}

int iskeyword(char *s) {
  int i;
  for (i = 0; i < MAX_KEYWORD; i++) if (strcmp(s, keyword[i]) == 0) return TRUE;
  return FALSE;
}



//--------------------------------------------------------------------------

FILE *sourcefile;
char *curfile;
int startline = TRUE, whitespace = TRUE, lineno = 1, col = 0, ch;

void line_reconstruction(void)  // PRE-PROCESS INPUT READY FOR PARSING
{
  for (;;) {
    ch = fgetc(sourcefile); if (ch == EOF || ch == '\n') break;
    ch &= 255; // int, positive.

    if (isalpha(ch)) {
        int xxnextfree = 0, strsize = 0, startcol = col;
        char *token = NULL;

        whitespace = FALSE;
        for (;;) {
          makespace(token, xxnextfree, strsize);
          if (isalpha(ch) || isdigit(ch) || (ch == '_')) {
            // digits and '_' allowed after 1st char.
            col++;
            token[xxnextfree++] = ch;
          } else {
            token[xxnextfree] = '\0'; ungetc(ch, sourcefile);
            break;
          }
          ch = fgetc(sourcefile);
        }
        stores(strdup(token), lineno, startcol, iskeyword(token) ? TYPE_KEYWORD : TYPE_TAG, curfile);
        free(token);
    } else if (isdigit(ch)) {
        int xxnextfree = 0, numsize = 0;
        char *number = NULL;

        // Store as a string...
        whitespace = FALSE;
        for (;;) {
          makespace(number, xxnextfree, numsize);
          if (isdigit(ch)) {
            col++;
            number[xxnextfree++] = ch;
          } else {
            number[xxnextfree] = '\0'; ungetc(ch, sourcefile);
            break;
          }
          ch = fgetc(sourcefile);
        }
        stores(strdup(number), lineno, col, TYPE_INT, curfile);
        free(number);
    } else switch (ch) {

    // WHITESPACE
    case '\n':      lineno++;
    case '\r':      startline = TRUE; col = 0; whitespace = TRUE;
      break;

    case '\t':
    case ' ':       col++; // Does not affect whitespace
      break;

    default:
      whitespace = FALSE;
    }
  }
  // set up a dummy at the end because we sometimes look ahead by 1
  // in the parsing code and don't want to hit uninitialised data.
  makespace(c, nextfree, arraysize);
  c[nextfree].t = TYPE_EOF;
  c[nextfree].s = "<EOF>";
  c[nextfree].l = lineno;
  c[nextfree].col = col;
}

//--------------------------------------------------------------------------

int cp = 0;  // code pointer.  Has to be global state.
int ap = 0;  // Analysis record pointer.

int parse(int pp, int depth) // depth is only for indentation in diags
{
  int saved_cp, saved_ap, i, gp, alts, match;

  gp = phrase_start[pp-512-MAX_BIP];
  alts = gram[gp];

  if (debug_parser) {
    fprintf(stderr, "\n");
    indent(depth, stderr);
    fprintf(stderr, "Phrase %s/%d (%d alternatives) = ", phrasename[pp-512], pp, alts);
    fflush(stderr);
  }
  gp++; // gp now points to first element (length) of first alt

  saved_cp = cp;
  saved_ap = ap;

  for (i = 0; i < alts; i++) {
    int each, phrases = gram[gp++], phrase_count, gap = 0;

    cp = saved_cp;
    ap = saved_ap;

    if (ap+3 > next_free_a) next_free_a = ap+3;
    makespace(A, next_free_a, a_size);
    A[ap++] = pp;   // record which phrase (could be done outside loop)
    A[ap++] = i;    // and which alt.


    // Count slots needed.  *Could* be precalculated and stored
    // in the grammar, either embedded (after the ALT) or as a
    // separate table

    for (each = 0; each < phrases; each++) if (gram[gp+each] >= 512) gap++;
    A[ap++] = gap;    // Count of alts (gap)
    // ap+gap now points to the slot after the space required, which
    // is where the first subphrase will be stored.
    ap = ap+gap; // recursive subphrases are stored after this phrase.
                 // ap is left updated if successful.


    // successfully parsed phrases are stored in A[saved_ap+3+n]

    if (saved_ap+3+gap > next_free_a) next_free_a = saved_ap+3+gap;
    makespace(A, next_free_a, a_size);

    // this loop is only for diagnostics
    if (debug_parser) {
      fprintf(stderr, "\n");
      indent(depth, stderr);
      fprintf(stderr, "Alternative %d: (%d phrases) ", i+1, phrases);
      for (each = 0; each < phrases; each++) {
        int phrase = gram[gp+each];
        if (phrase < 256) {
          fprintf(stderr, " '%c'", phrase);
        } else if (phrase < 512) {
          fprintf(stderr, " \"%s\"/%d", keyword[phrase-256], phrase-256);
        } else if (phrase < 512+MAX_BIP) {
          fprintf(stderr, " {%s/BIP%d}", phrasename[phrase-512], BIP[phrase-512]);
        } else {
          fprintf(stderr, " <%s/%d>", phrasename[phrase-512], phrase);
        }
      }
      fprintf(stderr, "\n");
      fflush(stderr);
    }

    match = TRUE; // stays true if all subphrases match
    phrase_count = 0; // only phrases which make it into the A record,
                      // i.e. excluding literals and keywords
    for (each = 0; each < phrases; each++) {
      int phrase = gram[gp+each];

      if (debug_parser) {
        indent(depth, stderr);
        fprintf(stderr, "Input token stream = '%s' '%s' '%s' ...\n",
          (cp < nextfree ? c[cp].s : "EOF"),
          (cp+1 < nextfree ? c[cp+1].s : "EOF"),
          (cp+2 < nextfree ? c[cp+2].s : "EOF"));
      }
      if (cp > bestparse) {
        static char s[128];
        if (phrase < 256) {
          sprintf(s, "'%c'", phrase);
        } else if (phrase < 512) {
          sprintf(s, "\"%s\"", keyword[phrase-256]);
        } else if (phrase < 512+MAX_BIP) {
          sprintf(s, "{%s}", phrasename[phrase-512]);
        } else {
          sprintf(s, "<%s>", phrasename[phrase-512]);
        }
        looking_for = s;
        bestparse = cp;
      }

      if (debug_parser) indent(depth, stderr);
      if (phrase < 256) {
        if (debug_parser) fprintf(stderr, "'%c'", phrase);
        if (c[cp].s[0] != phrase) match = FALSE; else cp++;
        // Don't record literals
      } else if (phrase < 512) {
        if (debug_parser) fprintf(stderr, "\"%s\"/%d", keyword[phrase-256], phrase-256);
        if (strcmp(keyword[phrase-256], c[cp].s) != 0) match = FALSE; else cp++;
        // Don't record keywords
      } else if (phrase < 512+MAX_BIP) {
        int where = ap; // next phrase to be parsed will be stored at current 'ap'.
        if (debug_parser) fprintf(stderr, "{%s/BIP%d}", phrasename[phrase-512], BIP[phrase-512]);
        if (c[cp].t != BIP[phrase-512]) match = FALSE; else {
          A[ap++] = phrase;
          A[ap++] = 1;
          A[ap++] = 1;
          A[ap++] = cp++;
          A[saved_ap+3+phrase_count++] = where; // Record BIP
        }
      } else {
        int where = ap; // next phrase to be parsed will be stored at current 'ap'.
        if (debug_parser) fprintf(stderr, "<%s/%d>", phrasename[phrase-512], phrase);
        if (!parse(phrase, depth+1)) match = FALSE; else {
          A[saved_ap+3+phrase_count++] = where;
        }
      }
      if (debug_parser) {
        fprintf(stderr, "\n");
        indent(depth, stderr);
        fprintf(stderr, "Tried alternative %d: result was %s\n", each+1, (match ? "TRUE" : "FALSE"));
        fflush(stderr);
      }
      if (!match) break;
    }
    gp += phrases; // move over all phrases, to next alt
    if (match) break; else if (debug_parser) {
      indent(depth, stderr);
      fprintf(stderr, "** Alternative %d FAILED.\n", i+1);
    }
    // gp now points to first element (length) of next alt, or start of next phrase
  }
  return(match);
}

//--------------------------------------------------------------------------

typedef int TRIP;

//------------------------------------------------------------------------------------

/*
      In previous compilers, I had to write custom code for every tree-based
    optimisation, in order to walk down the tree to the right place to find
    the leaves to be optimised.  In this one, I have a generic tree-walking
    procedure which can walk the entire program, but it can be customised 
    so that it takes action only on specific phrases.  This is possible in
    this design only because each set of subphrases stores the count of
    subphrases befoe it - thus allowing a generic tree-walking procedure
    that doesn't have to know what each node consists of until it happens
    across a node of the type it is looking for.
 */

void walk(int ap, int depth, int wanted(int phraseno), void perform(int ap, int before, int depth))
{
  int i;

  if (wanted(A[ap])) perform(ap, TRUE, depth);
  for (i = 3; i < A[ap+2]+3; i++) {
    if (A[A[ap+i]] >= 512+MAX_BIP) walk(A[ap+i], depth+1, wanted, perform);
  }
  if (wanted(A[ap])) perform(ap, FALSE, depth);

}

int want_all(int phraseno) {
  return TRUE;
}

void print_all(int ap, int before, int depth) {
  int saved_ap = ap;
  int phrase = A[ap++];
  int alt = A[ap++];
  int phrases = A[ap++]; // defined subphrases
  int i;

  indent(depth, stderr);
  fprintf(stderr, "<%s%s/%d>  ", (before ? "" : "/"), phrasename[phrase-512], alt);
  for (i = 0; i < (3+phrases); i++) {
    fprintf(stderr, "A[%d] = %d, ", saved_ap+i, A[saved_ap+i]);
  }
  fprintf(stderr, "\n");

}

TRIP compile(int ap, int depth)
{
  int saved_ap = ap;
  int phrase = A[ap++];  // A[ap] is the phrase number. A[ap+1] is the alt.
  int alt = A[ap++]; // For consistency, in BIPs, the Alt should always be 1
                     // although that may not be the case at the moment :-(
  int phrases = A[ap++]; // defined subphrases
  int i;

  // The following ecce command executed on this file will generate hamish.g:
  // ecce -c "(v.//\\.s..(v/ /e)?m,k)0;%c" hamish.c hamish.g
  // May later tweak takeon.c to read from hamish.c rather than hamish.g
  // thus simplifying build process and becoming more like yacc.

  switch (phrase) {

//\\ # BIPS (Built-in Phrases)are linked to the type-code returned
//\\ # by the line-reconstruction code (aka lexer)
//\\
//\\ # These *must* come first.

// See TYPE_* in first page for the values to use.

//\\ B<EOF>=0;
  case P_EOF:
    fprintf(stderr, "%s", c[A[ap]].s);
    break;

//\\
//\\ B<IDENT>=1;
  case P_IDENT:
    fprintf(stderr, "%s", c[A[ap]].s);
    break;

//\\ B<NUM>=2;
  case P_NUM:
    fprintf(stderr, "%s", c[A[ap]].s);
    break;

//\\
//\\ # Phrase definitions.  PROGRAM is the main entry point.
//\\
//\\ # This is the first draft of a grammar, based on the one
//\\ # collectively determined at the wiki site.  At the moment
//\\ # this has not been debugged extensively.


  case P_PROGRAM:
//\\ P<PROGRAM> = <PREFIX_PLEASE> <SIMPLE> <EOF> | <SIMPLE> <POSTFIX_PLEASE> <EOF> ;
    compile(A[ap], depth+1);
    break;

  case P_PREFIX_PLEASE:
//\\ P<PREFIX_PLEASE> = "please" | "would" "you" | ;
    break;

  case P_POSTFIX_PLEASE:
//\\ P<POSTFIX_PLEASE> = "please" | ;
    break;

  case P_SIMPLE:
//\\ P<SIMPLE> = <TIME> | <DATE> | <DAY> | <LIGHTS> | <ALARM> | <TIMER>;
    compile(A[ap], depth+1);
    break;

  case P_TIME:
//\\ P<TIME> = "what" "time" "is" "it" | "whats" "the" "time" | "time" ;
    break;

  case P_DATE:
//\\ P<DATE> = "what" "the" "date" | "whats" "todays" "date" | "date" ;
    break;

  case P_DAY:
//\\ P<DAY> = "what" "day" "is" <THISDAY> | "what" "days" <THISDAY> | "day" "of" "the" "week" | "day" ;
    break;

  case P_THISDAY:
//\\ P<THISDAY> = "it" | "this" | "today" ;
    break;

  case P_ALARM:
//\\ P<ALARM> = "set" "alarm" "for" <ABSTIME>;
    break;

  case P_TIMER:
//\\ P<TIMER> = <DELTATIME> "timer";
    break;

  case P_DELTATIME:
//\\ P<DELTATIME> = <NUM> "minute";
    break;

  case P_ABSTIME:
//\\ P<ABSTIME> = <NUM> "oclock" | <NUM> "a" "m" <TODAYTOMORROW>
//\\            | <NUM> "p" "m" <TODAYTOMORROW> | <NUM> "in" "the" <TIMEPERIOD> | <NUM> "tomorrow" <TIMEPERIOD> | <NUM> ;
    break;

  case P_TODAYTOMORROW:
//\\ P<TODAYTOMORROW> = "today" | "tomorrow" | ;
    break;

  case P_TIMEPERIOD:
//\\ P<TIMEPERIOD> = "morning" | "afternoon" | "evening" ;
    break;

  case P_LIGHTS:
//\\ P<LIGHTS> = <IDENT> <OFF_OR_ON> | "turn" "the" <IDENT> <OFF_OR_ON> | "turn" <OFF_OR_ON> "the" <IDENT> | <DIM>;
    break;

  case P_DIM:
//\\ P<DIM> = <IDENT> <UP_OR_DOWN> | "dim" "the" <IDENT> | "dim" <IDENT> ;
    break;

  case P_OFF_OR_ON:
//\\ P<OFF_OR_ON> = "off" | "on" ;
    break;

  case P_UP_OR_DOWN:
//\\ P<UP_OR_DOWN> = "up" | "down" | "dim" | "low" | "bright" ;
    break;

//\\
//\\ E
//\\ # 'E' is end of grammar.  Everything after this is ignored.
  default:
    fprintf(stderr, "*** Internal error at line %d.  ap=%d  phrase=%d\n", __LINE__, ap, phrase); exit(2);
  }

  return(-1); // DUMMY TRIP, NOTHING TO RETURN

}

//--------------------------------------------------------------------------



int main(int argc, char **argv) {
char *s;

  // Get clean version of executable name.  Should work on most existing systems (2006)
  progname = argv[0];
  if ((s = strrchr(progname, '/')) != NULL) progname = s+1;  // Unix
  if ((s = strrchr(progname, '\\')) != NULL) progname = s+1; // M$
  if ((s = strrchr(progname, ']')) != NULL) progname = s+1;  // Dec
  if ((s = strrchr(progname, ';')) != NULL) *s = '\0';       // Version no's
  if (((s = strrchr(progname, '.')) != NULL) && (strcasecmp(s, ".exe") == 0)) *s = '\0';
  if (((s = strrchr(progname, '.')) != NULL) && (strcasecmp(s, ".com") == 0)) *s = '\0';

  if ((argc > 1) && strcmp(argv[1], "-d") == 0) {
    argv++; argc--; debug_parser = TRUE;
  }

  if (argc == 1) {
    sourcefile = stdin; curfile = "/dev/stdin"; 
  } else if (argc == 2) {

    sourcefile = fopen(argv[1], "r");
    if (sourcefile == NULL) {
      fprintf(stderr, "%s: %s - %s\n", progname, strerror(errno), argv[1]);
      exit(errno);
    }

    curfile = argv[1];
  } else {
    fprintf(stderr, "syntax: %s [-d] filename\n", progname);
    exit(1);
  }

for (;;) {

  bestparse = -1; // for error reporting.
  looking_for = "<UNKNOWN>";    // 'while looking for <PHRASENAME>' (or literal) ... (Never freed)
  nextfree = 0, arraysize = 0;
  if (c != NULL) free(c); c = NULL;
  if (A != NULL) free(A); A = NULL;
  next_free_a = 0, a_size = 0;
  cp = ap = 0;
  startline = TRUE; whitespace = TRUE;
  if (onecharstr == NULL) onecharstr = (char *)malloc(512);
  line_reconstruction();

  if (debug_parser) {
    int i; // DEBUG ONLY
    for (i = 0; i < nextfree; i++) {
      fprintf(stderr, "%s, line %d, col %d: [%0d] %s\n",
                       c[i].f, c[i].l, c[i].col, c[i].t, c[i].s);
    }
  }

  if (c[0].t == TYPE_EOF) break;

  if (!parse(PHRASE_BASE, 0)) {
    if (bestparse == nextfree) {
      fprintf(stderr, "\"%s\", Line %d, Col %d: Premature end of file while looking for %s\n",
                       curfile, c[bestparse].l, c[bestparse].col+1, looking_for);
    } else {
      int i;
      fprintf(stderr, "\"%s\", Line %d, Col %d: Syntax error while looking for %s near ",
                       curfile, c[bestparse].l, c[bestparse].col+1, looking_for);
      for (i = bestparse; i < bestparse+3; i++) {
        if (i == nextfree) {
          fprintf(stderr, "<End of file>");
          break;
        }
        switch (c[i].t) {
        case TYPE_TAG:
        case TYPE_INT:
        case TYPE_KEYWORD:
          fprintf(stderr, "%s", c[i].s);
          break;
        }
        fprintf(stderr, (i == (bestparse+2) ? " ..." : " "));
      }
      fprintf(stderr, "\n");
    }
    continue;
  }

  fprintf(stderr, "Parsed OK, now compiling.\n");
  fflush(stderr);

  if (debug_parser) walk(0, 0, want_all, print_all); // Diags: print final parse tree

  // The structure at A[] is a concrete syntax tree.  We could
  // generate code directly from this tree, however for the
  // sake of cleaner design, we will first convert the concrete
  // syntax tree into an Abstract Syntax Tree (AST)

  {int program = compile(0, 0);
    // Now generate the output code from the AST.
    // codegen(program, -1, -1, -1)
  }

  fflush(stderr);
}

  exit(0); return(1);
}