// TO DO: // http://www.ef.edu/english-resources/english-grammar/numbers-english/ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #define ishex(ch) (int)(strchr("0123456789ABCDEFabcdef", (char)((ch)&255)) != NULL) int hextobin(char x) { // Too crude, just a quick hack. Seldom called. if (('0' <= x) && (x <= '9')) { return x-'0'; } else if (('A' <= x) && (x <= 'F')) { return x-'A'+10; } else if (('a' <= x) && (x <= 'f')) { return x-'a'+10; } else { return 0; // ASSERT: INTERNAL ERROR! } } void indent(int depth, FILE *f) { int i; for (i = 0; i < depth; i++) fprintf(f, " "); } #define DEBUG_PARSER #include "hamish.h" // GENERATED GRAMMAR FROM takeon.c and hamish.g #ifndef FALSE #define FALSE (0!=0) #define TRUE (0==0) #endif char *progname; int debug_parser = FALSE; // Built-in phrase codes. Must match with grammar file. #define TYPE_EOF 0 #define TYPE_TAG 1 #define TYPE_INT 2 #define TYPE_KEYWORD 3 typedef struct sourceinfo { // ATOMS for processed input stream char *s; // string contents int l; // lineno int col; // column int t; // type - tag, int, or keyword char *f; // source or includefile name } sourceinfo; int bestparse = -1; // for error reporting. char *looking_for = NULL; // C is the source character token stream static sourceinfo *c = NULL; int nextfree = 0, arraysize = 0; char *onecharstr = NULL; // A is the Analysis record. Contents point to a C[] structure when the item is a BIP, // otherwise the format is [phraseno] [altno] [no-of-subphrases] [subphrases and/or BIPs...] // eg A[25] = 10 means that A[25] is C[10]. static int *A = NULL; int next_free_a = 0, a_size = 0; static void *makespace_(void *c, int nextfree, int *arraysize, int objsize) { if (nextfree >= *arraysize) { *arraysize = (*arraysize * 2) + 1; c = (void *)realloc(c, (*arraysize+1) * objsize); // 0:arraysize, inclusive. eg 0:15, 0:127 etc if (c == NULL) {fprintf(stderr, "tacctwo: %s\n", strerror(errno)); exit(errno);} } return c; } #define makespace(c, nextfree, arraysize) c = (typeof(c))makespace_(c, nextfree, &arraysize, sizeof(c[0])) void stores(char *s, int lineno, int col, int type, char *fname) { makespace(c, nextfree, arraysize); c[nextfree].s = s; c[nextfree].l = lineno; c[nextfree].col = col; c[nextfree].f = fname; c[nextfree].t = type; nextfree++; } void storec(int ch, int lineno, int col, int type, char *fname) { onecharstr[ch*2] = ch; onecharstr[ch*2+1] = '\0'; // convert char to 1-char string before saving. stores(&onecharstr[ch*2], lineno, col, type, fname); } int iskeyword(char *s) { int i; for (i = 0; i < MAX_KEYWORD; i++) if (strcmp(s, keyword[i]) == 0) return TRUE; return FALSE; } //-------------------------------------------------------------------------- FILE *sourcefile; char *curfile; int startline = TRUE, whitespace = TRUE, lineno = 1, col = 0, ch; void line_reconstruction(void) // PRE-PROCESS INPUT READY FOR PARSING { for (;;) { ch = fgetc(sourcefile); if (ch == EOF || ch == '\n') break; ch &= 255; // int, positive. if (isalpha(ch)) { int xxnextfree = 0, strsize = 0, startcol = col; char *token = NULL; whitespace = FALSE; for (;;) { makespace(token, xxnextfree, strsize); if (isalpha(ch) || isdigit(ch) || (ch == '_')) { // digits and '_' allowed after 1st char. col++; token[xxnextfree++] = ch; } else { token[xxnextfree] = '\0'; ungetc(ch, sourcefile); break; } ch = fgetc(sourcefile); } stores(strdup(token), lineno, startcol, iskeyword(token) ? TYPE_KEYWORD : TYPE_TAG, curfile); free(token); } else if (isdigit(ch)) { int xxnextfree = 0, numsize = 0; char *number = NULL; // Store as a string... whitespace = FALSE; for (;;) { makespace(number, xxnextfree, numsize); if (isdigit(ch)) { col++; number[xxnextfree++] = ch; } else { number[xxnextfree] = '\0'; ungetc(ch, sourcefile); break; } ch = fgetc(sourcefile); } stores(strdup(number), lineno, col, TYPE_INT, curfile); free(number); } else switch (ch) { // WHITESPACE case '\n': lineno++; case '\r': startline = TRUE; col = 0; whitespace = TRUE; break; case '\t': case ' ': col++; // Does not affect whitespace break; default: whitespace = FALSE; } } // set up a dummy at the end because we sometimes look ahead by 1 // in the parsing code and don't want to hit uninitialised data. makespace(c, nextfree, arraysize); c[nextfree].t = TYPE_EOF; c[nextfree].s = "<EOF>"; c[nextfree].l = lineno; c[nextfree].col = col; } //-------------------------------------------------------------------------- int cp = 0; // code pointer. Has to be global state. int ap = 0; // Analysis record pointer. int parse(int pp, int depth) // depth is only for indentation in diags { int saved_cp, saved_ap, i, gp, alts, match; gp = phrase_start[pp-512-MAX_BIP]; alts = gram[gp]; if (debug_parser) { fprintf(stderr, "\n"); indent(depth, stderr); fprintf(stderr, "Phrase %s/%d (%d alternatives) = ", phrasename[pp-512], pp, alts); fflush(stderr); } gp++; // gp now points to first element (length) of first alt saved_cp = cp; saved_ap = ap; for (i = 0; i < alts; i++) { int each, phrases = gram[gp++], phrase_count, gap = 0; cp = saved_cp; ap = saved_ap; if (ap+3 > next_free_a) next_free_a = ap+3; makespace(A, next_free_a, a_size); A[ap++] = pp; // record which phrase (could be done outside loop) A[ap++] = i; // and which alt. // Count slots needed. *Could* be precalculated and stored // in the grammar, either embedded (after the ALT) or as a // separate table for (each = 0; each < phrases; each++) if (gram[gp+each] >= 512) gap++; A[ap++] = gap; // Count of alts (gap) // ap+gap now points to the slot after the space required, which // is where the first subphrase will be stored. ap = ap+gap; // recursive subphrases are stored after this phrase. // ap is left updated if successful. // successfully parsed phrases are stored in A[saved_ap+3+n] if (saved_ap+3+gap > next_free_a) next_free_a = saved_ap+3+gap; makespace(A, next_free_a, a_size); // this loop is only for diagnostics if (debug_parser) { fprintf(stderr, "\n"); indent(depth, stderr); fprintf(stderr, "Alternative %d: (%d phrases) ", i+1, phrases); for (each = 0; each < phrases; each++) { int phrase = gram[gp+each]; if (phrase < 256) { fprintf(stderr, " '%c'", phrase); } else if (phrase < 512) { fprintf(stderr, " \"%s\"/%d", keyword[phrase-256], phrase-256); } else if (phrase < 512+MAX_BIP) { fprintf(stderr, " {%s/BIP%d}", phrasename[phrase-512], BIP[phrase-512]); } else { fprintf(stderr, " <%s/%d>", phrasename[phrase-512], phrase); } } fprintf(stderr, "\n"); fflush(stderr); } match = TRUE; // stays true if all subphrases match phrase_count = 0; // only phrases which make it into the A record, // i.e. excluding literals and keywords for (each = 0; each < phrases; each++) { int phrase = gram[gp+each]; if (debug_parser) { indent(depth, stderr); fprintf(stderr, "Input token stream = '%s' '%s' '%s' ...\n", (cp < nextfree ? c[cp].s : "EOF"), (cp+1 < nextfree ? c[cp+1].s : "EOF"), (cp+2 < nextfree ? c[cp+2].s : "EOF")); } if (cp > bestparse) { static char s[128]; if (phrase < 256) { sprintf(s, "'%c'", phrase); } else if (phrase < 512) { sprintf(s, "\"%s\"", keyword[phrase-256]); } else if (phrase < 512+MAX_BIP) { sprintf(s, "{%s}", phrasename[phrase-512]); } else { sprintf(s, "<%s>", phrasename[phrase-512]); } looking_for = s; bestparse = cp; } if (debug_parser) indent(depth, stderr); if (phrase < 256) { if (debug_parser) fprintf(stderr, "'%c'", phrase); if (c[cp].s[0] != phrase) match = FALSE; else cp++; // Don't record literals } else if (phrase < 512) { if (debug_parser) fprintf(stderr, "\"%s\"/%d", keyword[phrase-256], phrase-256); if (strcmp(keyword[phrase-256], c[cp].s) != 0) match = FALSE; else cp++; // Don't record keywords } else if (phrase < 512+MAX_BIP) { int where = ap; // next phrase to be parsed will be stored at current 'ap'. if (debug_parser) fprintf(stderr, "{%s/BIP%d}", phrasename[phrase-512], BIP[phrase-512]); if (c[cp].t != BIP[phrase-512]) match = FALSE; else { A[ap++] = phrase; A[ap++] = 1; A[ap++] = 1; A[ap++] = cp++; A[saved_ap+3+phrase_count++] = where; // Record BIP } } else { int where = ap; // next phrase to be parsed will be stored at current 'ap'. if (debug_parser) fprintf(stderr, "<%s/%d>", phrasename[phrase-512], phrase); if (!parse(phrase, depth+1)) match = FALSE; else { A[saved_ap+3+phrase_count++] = where; } } if (debug_parser) { fprintf(stderr, "\n"); indent(depth, stderr); fprintf(stderr, "Tried alternative %d: result was %s\n", each+1, (match ? "TRUE" : "FALSE")); fflush(stderr); } if (!match) break; } gp += phrases; // move over all phrases, to next alt if (match) break; else if (debug_parser) { indent(depth, stderr); fprintf(stderr, "** Alternative %d FAILED.\n", i+1); } // gp now points to first element (length) of next alt, or start of next phrase } return(match); } //-------------------------------------------------------------------------- typedef int TRIP; //------------------------------------------------------------------------------------ /* In previous compilers, I had to write custom code for every tree-based optimisation, in order to walk down the tree to the right place to find the leaves to be optimised. In this one, I have a generic tree-walking procedure which can walk the entire program, but it can be customised so that it takes action only on specific phrases. This is possible in this design only because each set of subphrases stores the count of subphrases befoe it - thus allowing a generic tree-walking procedure that doesn't have to know what each node consists of until it happens across a node of the type it is looking for. */ void walk(int ap, int depth, int wanted(int phraseno), void perform(int ap, int before, int depth)) { int i; if (wanted(A[ap])) perform(ap, TRUE, depth); for (i = 3; i < A[ap+2]+3; i++) { if (A[A[ap+i]] >= 512+MAX_BIP) walk(A[ap+i], depth+1, wanted, perform); } if (wanted(A[ap])) perform(ap, FALSE, depth); } int want_all(int phraseno) { return TRUE; } void print_all(int ap, int before, int depth) { int saved_ap = ap; int phrase = A[ap++]; int alt = A[ap++]; int phrases = A[ap++]; // defined subphrases int i; indent(depth, stderr); fprintf(stderr, "<%s%s/%d> ", (before ? "" : "/"), phrasename[phrase-512], alt); for (i = 0; i < (3+phrases); i++) { fprintf(stderr, "A[%d] = %d, ", saved_ap+i, A[saved_ap+i]); } fprintf(stderr, "\n"); } TRIP compile(int ap, int depth) { int saved_ap = ap; int phrase = A[ap++]; // A[ap] is the phrase number. A[ap+1] is the alt. int alt = A[ap++]; // For consistency, in BIPs, the Alt should always be 1 // although that may not be the case at the moment :-( int phrases = A[ap++]; // defined subphrases int i; // The following ecce command executed on this file will generate hamish.g: // ecce -c "(v.//\\.s..(v/ /e)?m,k)0;%c" hamish.c hamish.g // May later tweak takeon.c to read from hamish.c rather than hamish.g // thus simplifying build process and becoming more like yacc. switch (phrase) { //\\ # BIPS (Built-in Phrases)are linked to the type-code returned //\\ # by the line-reconstruction code (aka lexer) //\\ //\\ # These *must* come first. // See TYPE_* in first page for the values to use. //\\ B<EOF>=0; case P_EOF: fprintf(stderr, "%s", c[A[ap]].s); break; //\\ //\\ B<IDENT>=1; case P_IDENT: fprintf(stderr, "%s", c[A[ap]].s); break; //\\ B<NUM>=2; case P_NUM: fprintf(stderr, "%s", c[A[ap]].s); break; //\\ //\\ # Phrase definitions. PROGRAM is the main entry point. //\\ //\\ # This is the first draft of a grammar, based on the one //\\ # collectively determined at the wiki site. At the moment //\\ # this has not been debugged extensively. case P_PROGRAM: //\\ P<PROGRAM> = <PREFIX_PLEASE> <SIMPLE> <EOF> | <SIMPLE> <POSTFIX_PLEASE> <EOF> ; compile(A[ap], depth+1); break; case P_PREFIX_PLEASE: //\\ P<PREFIX_PLEASE> = "please" | "would" "you" | ; break; case P_POSTFIX_PLEASE: //\\ P<POSTFIX_PLEASE> = "please" | ; break; case P_SIMPLE: //\\ P<SIMPLE> = <TIME> | <DATE> | <DAY> | <LIGHTS> | <ALARM> | <TIMER>; compile(A[ap], depth+1); break; case P_TIME: //\\ P<TIME> = "what" "time" "is" "it" | "whats" "the" "time" | "time" ; break; case P_DATE: //\\ P<DATE> = "what" "the" "date" | "whats" "todays" "date" | "date" ; break; case P_DAY: //\\ P<DAY> = "what" "day" "is" <THISDAY> | "what" "days" <THISDAY> | "day" "of" "the" "week" | "day" ; break; case P_THISDAY: //\\ P<THISDAY> = "it" | "this" | "today" ; break; case P_ALARM: //\\ P<ALARM> = "set" "alarm" "for" <ABSTIME>; break; case P_TIMER: //\\ P<TIMER> = <DELTATIME> "timer"; break; case P_DELTATIME: //\\ P<DELTATIME> = <NUM> "minute"; break; case P_ABSTIME: //\\ P<ABSTIME> = <NUM> "oclock" | <NUM> "a" "m" <TODAYTOMORROW> //\\ | <NUM> "p" "m" <TODAYTOMORROW> | <NUM> "in" "the" <TIMEPERIOD> | <NUM> "tomorrow" <TIMEPERIOD> | <NUM> ; break; case P_TODAYTOMORROW: //\\ P<TODAYTOMORROW> = "today" | "tomorrow" | ; break; case P_TIMEPERIOD: //\\ P<TIMEPERIOD> = "morning" | "afternoon" | "evening" ; break; case P_LIGHTS: //\\ P<LIGHTS> = <IDENT> <OFF_OR_ON> | "turn" "the" <IDENT> <OFF_OR_ON> | "turn" <OFF_OR_ON> "the" <IDENT> | <DIM>; break; case P_DIM: //\\ P<DIM> = <IDENT> <UP_OR_DOWN> | "dim" "the" <IDENT> | "dim" <IDENT> ; break; case P_OFF_OR_ON: //\\ P<OFF_OR_ON> = "off" | "on" ; break; case P_UP_OR_DOWN: //\\ P<UP_OR_DOWN> = "up" | "down" | "dim" | "low" | "bright" ; break; //\\ //\\ E //\\ # 'E' is end of grammar. Everything after this is ignored. default: fprintf(stderr, "*** Internal error at line %d. ap=%d phrase=%d\n", __LINE__, ap, phrase); exit(2); } return(-1); // DUMMY TRIP, NOTHING TO RETURN } //-------------------------------------------------------------------------- int main(int argc, char **argv) { char *s; // Get clean version of executable name. Should work on most existing systems (2006) progname = argv[0]; if ((s = strrchr(progname, '/')) != NULL) progname = s+1; // Unix if ((s = strrchr(progname, '\\')) != NULL) progname = s+1; // M$ if ((s = strrchr(progname, ']')) != NULL) progname = s+1; // Dec if ((s = strrchr(progname, ';')) != NULL) *s = '\0'; // Version no's if (((s = strrchr(progname, '.')) != NULL) && (strcasecmp(s, ".exe") == 0)) *s = '\0'; if (((s = strrchr(progname, '.')) != NULL) && (strcasecmp(s, ".com") == 0)) *s = '\0'; if ((argc > 1) && strcmp(argv[1], "-d") == 0) { argv++; argc--; debug_parser = TRUE; } if (argc == 1) { sourcefile = stdin; curfile = "/dev/stdin"; } else if (argc == 2) { sourcefile = fopen(argv[1], "r"); if (sourcefile == NULL) { fprintf(stderr, "%s: %s - %s\n", progname, strerror(errno), argv[1]); exit(errno); } curfile = argv[1]; } else { fprintf(stderr, "syntax: %s [-d] filename\n", progname); exit(1); } for (;;) { bestparse = -1; // for error reporting. looking_for = "<UNKNOWN>"; // 'while looking for <PHRASENAME>' (or literal) ... (Never freed) nextfree = 0, arraysize = 0; if (c != NULL) free(c); c = NULL; if (A != NULL) free(A); A = NULL; next_free_a = 0, a_size = 0; cp = ap = 0; startline = TRUE; whitespace = TRUE; if (onecharstr == NULL) onecharstr = (char *)malloc(512); line_reconstruction(); if (debug_parser) { int i; // DEBUG ONLY for (i = 0; i < nextfree; i++) { fprintf(stderr, "%s, line %d, col %d: [%0d] %s\n", c[i].f, c[i].l, c[i].col, c[i].t, c[i].s); } } if (c[0].t == TYPE_EOF) break; if (!parse(PHRASE_BASE, 0)) { if (bestparse == nextfree) { fprintf(stderr, "\"%s\", Line %d, Col %d: Premature end of file while looking for %s\n", curfile, c[bestparse].l, c[bestparse].col+1, looking_for); } else { int i; fprintf(stderr, "\"%s\", Line %d, Col %d: Syntax error while looking for %s near ", curfile, c[bestparse].l, c[bestparse].col+1, looking_for); for (i = bestparse; i < bestparse+3; i++) { if (i == nextfree) { fprintf(stderr, "<End of file>"); break; } switch (c[i].t) { case TYPE_TAG: case TYPE_INT: case TYPE_KEYWORD: fprintf(stderr, "%s", c[i].s); break; } fprintf(stderr, (i == (bestparse+2) ? " ..." : " ")); } fprintf(stderr, "\n"); } continue; } fprintf(stderr, "Parsed OK, now compiling.\n"); fflush(stderr); if (debug_parser) walk(0, 0, want_all, print_all); // Diags: print final parse tree // The structure at A[] is a concrete syntax tree. We could // generate code directly from this tree, however for the // sake of cleaner design, we will first convert the concrete // syntax tree into an Abstract Syntax Tree (AST) {int program = compile(0, 0); // Now generate the output code from the AST. // codegen(program, -1, -1, -1) } fflush(stderr); } exit(0); return(1); }