This is G o o g l e's cache of http://www.gtoal.com/compilers101/tinc/02_Graham_Toal/backup/teeny-200604090835.c as retrieved on 6 May 2006 21:45:41 GMT.
G o o g l e's cache is the snapshot that we took of the page as we crawled the web.
The page may have changed since that time. Click here for the current page without highlighting.
This cached page may reference images which are no longer available. Click here for the cached text only.
To link to or bookmark this page, use the following url: http://www.google.com/search?q=cache:wdXiDMp5Vb4J:www.gtoal.com


Google is neither affiliated with the authors of this page nor responsible for its content.

#include #include #include #include #define ishex(ch) (int)strchr("0123456789ABCDEFabcdef", ch) int hextobin(char x) { // Too crude, just a quick hack. Seldom called. if (('0' <= x) && (x <= '9')) { return x-'0'; } else if (('A' <= x) && (x <= 'F')) { return x-'A'+10; } else if (('a' <= x) && (x <= 'f')) { return x-'a'+10; } else { return 0; // ASSERT: INTERNAL ERROR! } } void indent(int depth, FILE *f) { int i; for (i = 0; i < depth; i++) fprintf(f, " "); } #define DEBUG_PARSER #include "teeny.h" // GENERATED GRAMMAR FROM takeon.c and teeny.g #ifndef FALSE #define FALSE (0!=0) #define TRUE (0==0) #endif char *progname; int debug_parser = FALSE; // Built-in phrase codes. Must match with grammar file. #define TYPE_EOF 0 #define TYPE_TAG 1 #define TYPE_STRING 2 #define TYPE_CHARCONST 3 #define TYPE_CHAR 4 #define TYPE_INT 5 #define TYPE_HEXINT 6 #define TYPE_KEYWORD 7 typedef struct sourceinfo { // ATOMS for processed input stream char *s; // string contents int l; // lineno int col; // column int t; // type - tag, "string", 'charconst', or char, so far char *f; // source or includefile name } sourceinfo; int bestparse = -1; // for error reporting. char *looking_for = ""; // 'while looking for ' (or literal) ... // C is the source character token stream static sourceinfo *c = NULL; int nextfree = 0, arraysize = 0; char *onecharstr; // A is the Analysis record. Contents point to a C[] structure when the item is a BIP, // otherwise the format is [phraseno] [altno] [no-of-subphrases] [subphrases and/or BIPs...] // eg A[25] = 10 means that A[25] is C[10]. static int *A = NULL; int next_free_a = 0, a_size = 0; static void *makespace_(void *c, int nextfree, int *arraysize, int objsize) { if (nextfree >= *arraysize) { *arraysize = (*arraysize * 2) + 1; c = (void *)realloc(c, (*arraysize+1) * objsize); // 0:arraysize, inclusive. eg 0:15, 0:127 etc if (c == NULL) {fprintf(stderr, "tacctwo: %s\n", strerror(errno)); exit(errno);} } return c; } #define makespace(c, nextfree, arraysize) c = (typeof(c))makespace_(c, nextfree, &arraysize, sizeof(c[0])) void stores(char *s, int lineno, int col, int type, char *fname) { makespace(c, nextfree, arraysize); c[nextfree].s = s; c[nextfree].l = lineno; c[nextfree].col = col; c[nextfree].f = fname; c[nextfree].t = type; nextfree++; } void storec(int ch, int lineno, int col, int type, char *fname) { onecharstr[ch*2] = ch; onecharstr[ch*2+1] = '\0'; // convert char to 1-char string before saving. stores(&onecharstr[ch*2], lineno, col, type, fname); } int iskeyword(char *s) { int i; for (i = 0; i < MAX_KEYWORD; i++) if (strcmp(s, keyword[i]) == 0) return TRUE; return FALSE; } //-------------------------------------------------------------------------- FILE *sourcefile; char *curfile; int startline = TRUE, whitespace = TRUE, lineno = 1, col = 0, ch, peek; void line_reconstruction(void) // PRE-PROCESS INPUT READY FOR PARSING { for (;;) { ch = fgetc(sourcefile); if (ch == EOF) break; ch &= 255; // int, positive. peek = fgetc(sourcefile); ungetc(peek, sourcefile); if (isalpha(ch)) { int nextfree = 0, strsize = 0, startcol = col; char *token = NULL; whitespace = FALSE; for (;;) { makespace(token, nextfree, strsize); if (isalpha(ch) || isdigit(ch) || (ch == '_')) { // digits and '_' allowed after 1st char. col++; token[nextfree++] = ch; } else { token[nextfree] = '\0'; ungetc(ch, sourcefile); break; } ch = fgetc(sourcefile); } stores(strdup(token), lineno, startcol, iskeyword(token) ? TYPE_KEYWORD : TYPE_TAG, curfile); free(token); } else if (isdigit(ch)) { int nextfree = 0, numsize = 0; char *number = NULL; // Store as a string... whitespace = FALSE; for (;;) { makespace(number, nextfree, numsize); if (isdigit(ch)) { col++; number[nextfree++] = ch; } else { number[nextfree] = '\0'; ungetc(ch, sourcefile); break; } ch = fgetc(sourcefile); } stores(strdup(number), lineno, col, TYPE_INT, curfile); free(number); } else switch (ch) { case '$': // Hex constant \$[0-9a-fA-F]+ // Q: store the '$' in the string or not? whitespace = FALSE; col++; if (ishex(peek)) { int nextfree = 0, numsize = 0; char *number = NULL; for (;;) { makespace(number, nextfree, numsize); ch = fgetc(sourcefile); if (ishex(ch)) { col++; number[nextfree++] = ch; } else { number[nextfree] = '\0'; ungetc(ch, sourcefile); break; } } stores(strdup(number), lineno, col, TYPE_HEXINT, curfile); free(number); } else { // Warn: probably an error... should not be any naked '$' symbols. // If the error to be prined would have been a generic syntax // error at the same location, then maybe give a more informative // error message such as "Unexpected character '$' near: ..." // On the other hand the generic mechanism probably reports this // almsost as accurately. storec(ch, lineno, col++, TYPE_CHAR, curfile); } break; case '\'': // Handle 'c' character constants case '"': // Handle "string" { int nextfree = 0, strsize = 0, quotech = ch; char *string = NULL; whitespace = FALSE; col++; for (;;) { ch = fgetc(sourcefile); // Newlines are allowed col++; makespace(string, nextfree, strsize); if (ch == '\\') { ch = fgetc(sourcefile); col++; if (ch == '\\') { string[nextfree++] = ch; } else if (ch == '\'') { string[nextfree++] = '\''; } else if (ch == '"') { string[nextfree++] = '"'; } else if (ch == 'n') { string[nextfree++] = '\n'; } else if (ch == 'r') { string[nextfree++] = '\r'; } else if (ch == 't') { string[nextfree++] = '\t'; } else if (ch == 'x') { int x, x1, x2; x1 = fgetc(sourcefile); col++; if (!ishex(x1)) { // WARN: Bad format continue; } x2 = fgetc(sourcefile); col++; if (!ishex(x2)) { // WARN: Bad format continue; } x = (hextobin(x1)<<4) | hextobin(x2); if (x == 0) { // WARN: embedded NUL in a string is asking for trouble... } string[nextfree++] = x; } else { // Warn of unknown (to me) \x escape. Probably an error. string[nextfree++] = '\\'; string[nextfree++] = ch; } } else if (ch != quotech) { string[nextfree++] = ch; } else { string[nextfree] = '\0'; break; } } if (quotech == '\'') { if (strlen(string) == 1) { } else if (strlen(string) <= 4) { // Warn that 'xx' as a 32-bit int is a non-standard extension } else { // Warn that this is probably a string with the wrong type of quote. } } stores(strdup(string), lineno, col, (quotech == '\'' ? TYPE_CHARCONST : TYPE_STRING), curfile); free(string); } break; case '/': // COMMENTS (or just a divide symbol) col++; whitespace = FALSE; if (peek == '/') { // Handle line comment do {ch = fgetc(sourcefile);} while (ch != '\n'); lineno++; col = 0; whitespace = TRUE; } else if (peek == '*') { /* Handle potential multi-line comment */ ch = fgetc(sourcefile); // Now we have read '/*' for (;;) { col++; ch = fgetc(sourcefile); peek = fgetc(sourcefile); if ((ch == '*') && (peek == '/')) break; ungetc(peek, sourcefile); } col += 2; (void)fgetc(sourcefile); // Remove '/' // QUESTION: How does this affect # directives? } else { storec(ch, lineno, col, TYPE_CHAR, curfile); } break; // WHITESPACE case '\n': lineno++; case '\r': startline = TRUE; col = 0; whitespace = TRUE; break; case '\t': case ' ': col++; // Does not affect whitespace break; // DIRECTIVES case '#': // If we interpret any #-directives while lexing, we don't want to // do an expensive test on every token, so what we can do is set // a countdown timer on the introductory token (either this '#' // or the actual keyword such as 'ifdef') and then test that the // *previous* tokens match when the timer hits 0, eg // C[cp-3] == '#' && C[cp-2] == 'include' ... etc if (!whitespace) { // WARN: probably an error... should not be any '#' symbols in the // middle of a line. (This language uses "!=" or "<>" for not-equal) } // Drop through default: whitespace = FALSE; storec(ch, lineno, col++, TYPE_CHAR, curfile); } } // set up a dummy at the end because we sometimes look ahead by 1 // in the parsing code and don't want to hit uninitialised data. c[nextfree].t = TYPE_EOF; c[nextfree].s = ""; c[nextfree].l = lineno; c[nextfree].col = col; } //-------------------------------------------------------------------------- int cp = 0; // code pointer. Has to be global state. int ap = 0; // Analysis record pointer. int parse(int pp, int depth) // depth is only for indentation in diags { int saved_cp, saved_ap, i, gp, alts, match; gp = phrase_start[pp-512-MAX_BIP]; alts = gram[gp]; if (debug_parser) { fprintf(stdout, "\n"); indent(depth, stdout); fprintf(stdout, "Phrase %s/%d (%d alternatives) = ", phrasename[pp-512], pp, alts); fflush(stdout); } gp++; // gp now points to first element (length) of first alt saved_cp = cp; saved_ap = ap; for (i = 0; i < alts; i++) { int each, phrases = gram[gp++], phrase_count, gap = 0; cp = saved_cp; ap = saved_ap; if (ap+3 > next_free_a) next_free_a = ap+3; makespace(A, next_free_a, a_size); A[ap++] = pp; // record which phrase (could be done outside loop) A[ap++] = i; // and which alt. // Count slots needed. *Could* be precalculated and stored // in the grammar, either embedded (after the ALT) or as a // separate table for (each = 0; each < phrases; each++) if (gram[gp+each] >= 512) gap++; A[ap++] = gap; // Count of alts (gap) // ap+gap now points to the slot after the space required, which // is where the first subphrase will be stored. ap = ap+gap; // recursive subphrases are stored after this phrase. // ap is left updated if successful. // successfully parsed phrases are stored in A[saved_ap+3+n] if (saved_ap+3+gap > next_free_a) next_free_a = saved_ap+3+gap; makespace(A, next_free_a, a_size); // this loop is only for diagnostics if (debug_parser) { fprintf(stdout, "\n"); indent(depth, stdout); fprintf(stdout, "Alternative %d: (%d phrases) ", i+1, phrases); for (each = 0; each < phrases; each++) { int phrase = gram[gp+each]; if (phrase < 256) { fprintf(stdout, " '%c'", phrase); } else if (phrase < 512) { fprintf(stdout, " \"%s\"/%d", keyword[phrase-256], phrase-256); } else if (phrase < 512+MAX_BIP) { fprintf(stdout, " {%s/BIP%d}", phrasename[phrase-512], BIP[phrase-512]); } else { fprintf(stdout, " <%s/%d>", phrasename[phrase-512], phrase); } } fprintf(stdout, "\n"); fflush(stdout); } match = TRUE; // stays true if all subphrases match phrase_count = 0; // only phrases which make it into the A record, // i.e. excluding literals and keywords for (each = 0; each < phrases; each++) { int phrase = gram[gp+each]; if (debug_parser) { indent(depth, stdout); fprintf(stdout, "Input token stream = '%s' '%s' '%s' ...\n", (cp < nextfree ? c[cp].s : "EOF"), (cp+1 < nextfree ? c[cp+1].s : "EOF"), (cp+2 < nextfree ? c[cp+2].s : "EOF")); } if (cp > bestparse) { static char s[128]; if (phrase < 256) { sprintf(s, "'%c'", phrase); } else if (phrase < 512) { sprintf(s, "\"%s\"", keyword[phrase-256]); } else if (phrase < 512+MAX_BIP) { sprintf(s, "{%s}", phrasename[phrase-512]); } else { sprintf(s, "<%s>", phrasename[phrase-512]); } looking_for = s; bestparse = cp; } if (debug_parser) indent(depth, stdout); if (phrase < 256) { if (debug_parser) fprintf(stdout, "'%c'", phrase); if ((c[cp].t != TYPE_CHAR) || (c[cp].s[0] != phrase)) match = FALSE; else cp++; // Don't record literals } else if (phrase < 512) { if (debug_parser) fprintf(stdout, "\"%s\"/%d", keyword[phrase-256], phrase-256); if (strcmp(keyword[phrase-256], c[cp].s) != 0) match = FALSE; else cp++; // Don't record keywords } else if (phrase < 512+MAX_BIP) { int where = ap; // next phrase to be parsed will be stored at current 'ap'. if (debug_parser) fprintf(stdout, "{%s/BIP%d}", phrasename[phrase-512], BIP[phrase-512]); if (c[cp].t != BIP[phrase-512]) match = FALSE; else { A[ap++] = phrase; A[ap++] = 1; A[ap++] = 1; A[ap++] = cp++; A[saved_ap+3+phrase_count++] = where; // Record BIP } } else { int where = ap; // next phrase to be parsed will be stored at current 'ap'. if (debug_parser) fprintf(stdout, "<%s/%d>", phrasename[phrase-512], phrase); if (!parse(phrase, depth+1)) match = FALSE; else { A[saved_ap+3+phrase_count++] = where; } } if (debug_parser) { fprintf(stdout, "\n"); indent(depth, stdout); fprintf(stdout, "Tried alternative %d: result was %s\n", each+1, (match ? "TRUE" : "FALSE")); fflush(stdout); } if (!match) break; } gp += phrases; // move over all phrases, to next alt if (match) break; else if (debug_parser) { indent(depth, stdout); fprintf(stdout, "** Alternative %d FAILED.\n", i+1); } // gp now points to first element (length) of next alt, or start of next phrase } return(match); } //-------------------------------------------------------------------------- #ifdef CODEGEN // We use a stringpool, and strings are indexes into this pool. This // is useful for the same reasons that the AST is an array indexed // by integers rather than a struct with pointers. It may also // save space by reusing common strings. And we get a free tag // to describe strings. (Also we can compare strings just with // an integer tag comparison, if we ever want to) char stringpool[1024]; int nextstring = 0; typedef enum opcode { ERROR, SEQUENCE, CONST, TAG, ASSIGN, DECLARE, VAR, IFTHEN, IFTHENELSE, INPUT, PRINT, AND, OR, ADD, SUB, MUL, DIV, NEG, NOT, EQ, NE, LT, GT, LE, GE, INT, REAL, LOOP, ENDLOOP, BREAK, CONTINUE, REDIRECT, LINE } OPCODE; char *name[] = { "ERROR", "SEQUENCE", "CONST", "TAG", "ASSIGN", "DECLARE", "VAR", "IFTHEN", "IFTHENELSE", "INPUT", "PRINT", "AND", "OR", "ADD", "SUB", "MUL", "DIV", "NEG", "NOT", "EQ", "NE", "LT", "GT", "LE", "GE", "INT", "REAL", "LOOP", "ENDLOOP", "BREAK", "CONTINUE", "REDIRECT", "LINE" }; char *shortname[] = { "ERROR", ";", "CONST", "TAG", "=", "DECLARE", "VAR", "if (...) then", "if (...) then ... else ...", "INPUT", "PRINT", "&&", "||", "'+'", "'-'", "'*'", "'/'", "'-'", "'~'", "==", "!=", "<", ">", "<=", ">=", "INT", "REAL", "LOOP", "ENDLOOP", "BREAK", "CONTINUE", "REDIRECT", "LINE" }; char *c_infix_op[] = { "ERROR", ";", "CONST", "TAG", "=", "DECLARE", "VAR", "if (...) then", "if (...) then ... else ...", "INPUT", "PRINT", "&&", "||", "+", "-", "*", "/", "-", "~", "==", "!=", "<", ">", "<=", ">=", "INT", "REAL", "LOOP", "ENDLOOP", "BREAK", "CONTINUE", "REDIRECT", "LINE" }; int arity[] = { 3, 3, 3, 3, 3, 2 /* DECLARE */, 3, 3, 4, 2, 2, 3, 3, 3, 3, 3, 3, /* NEG */ 2, /* NOT */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 }; int display_children[] = { 0, 2, 0, 0, 2, 1, 0, 2, 3, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 0, 0/*LOOP*/, 0, 0, 0, 0, 0 }; int display_leftchild[] = { FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE }; int display_rightchild[] = { FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE }; #define opsym(root) AST[root] #define leftchild(root) AST[root+1] #define rightchild(root) AST[root+2] #define nthchild(root, n) AST[root+n] #define MAXTRIPS 1024 int AST[1024]; int nexttrip = 0; /* trip is not a pointer, just an index into the array of triples */ /* - this makes debugging the array *MUCH* easier */ typedef int TRIP; /* Globals */ TRIP prevtrip = -1; TRIP curtrip = -1; TRIP progstart = -1; char *nameof(TRIP t) { // I realise that the malloc here is heap lossage, but for now // its not worth fixing. Choices are either use a stack to allocate // temporary space (same as in tacc) or keep a cyclic buffer, since // this is only called in the context of printing so we really // only need a few active at once. char *result = malloc(40 /* constants are also bad. I'm lazy. */); int op = opsym(t); if (op == CONST) { sprintf(result, "%d", rightchild(t)); } else if (op == VAR) { // for simpler diagrams, skip a level sprintf(result, "%s", stringpool+rightchild(leftchild(t))); // punt to tag } else { sprintf(result, "%s", shortname[op]); } return(result); } void declare(char *name) { fprintf(stdout, " int %s;\n", name); } void loadconst(char *format, TRIP dest, int value) { fprintf(stdout, " "); fprintf(stdout, format, dest, value); } void loadvar(char *format, TRIP dest, char *varname) { fprintf(stdout, " "); fprintf(stdout, format, dest, varname); } void store(char *format, char *varname, TRIP source) { fprintf(stdout, " "); fprintf(stdout, format, varname, source); } void operate(char *format, TRIP dest, TRIP leftop, char *op, TRIP rightop) { fprintf(stdout, " "); fprintf(stdout, format, dest, leftop, op, rightop); } void monoperate(char *format, TRIP dest, char *op, TRIP leftop) { fprintf(stdout, " "); fprintf(stdout, format, dest, op, leftop); } void put_goto(int lab) { fprintf(stdout, " "); fprintf(stdout, "goto L%02d;\n", lab); } void put_ifgoto(int cond, int lab, int sense) { fprintf(stdout, " "); fprintf(stdout, "if (%s_t%d) goto L%02d;\n", (sense ? "" : "!"), cond, lab); } void input(TRIP i) { /* TACC BUG! CANNOT ESCAPE double quotes properly */ fprintf(stdout, " "); fprintf(stdout, "fprintf(stderr, %c%s: %c); fflush(stderr); fscanf(stdin, %c%%d%c, &%s);\n", '"', nameof(leftchild(i)), '"', '"', '"', nameof(leftchild(i))); } void print(TRIP i) { fprintf(stdout, " "); fprintf(stdout, "fprintf(stdout, %c%%d\\n%c, _t%d); fflush(stdout);\n", '"', '"', leftchild(i)); } // this macro was used in debugging, not really needed now. #define put_label(n) xput_label(n, __LINE__) void xput_label(int lab, int line) { fprintf(stdout, "L%02d:\n", lab); } static TRIP entrypoint; // temporary hack until code is restructured // nested blocks/control structures... // not much used yet in the demo language // purpose is to find enclosing loops for break, // make sure something that starts as a for loop doesn't finish // as an until, etc :-) // This is most needed when compiling statement at a time rather // than recursing in the parser to compile a whole block. #define MAX_NEST 40 static int control_block[MAX_NEST]; static int type_check[MAX_NEST]; static int next_nest = 0; void push_block(int typecheck, int lab) { // when we actually have more types of nested structure, we'll // go to the bother of checking that starts and finished match up... control_block[next_nest++] = lab; } void pop_block(int typecheck, int *lab) { *lab = control_block[--next_nest]; } static int nextlab = 0; // Codegen is the guts of the compiler, which effectively serialises // the AST into sequentially executable statements. A further phase // is required to actually generate executable code. // (However... if you just want to interpret, you can just about // get away with doing that directly in the AST without this stage) void codegen(TRIP root, int thenlab, int elselab, int donelab) { // the labels are only used in short-circuit conditionals - it's // easier to handle that here than have a complex structure of // mutually recursive routines. When not needed they will be -1. TRIP cond, thenpart, elsepart; switch (opsym(root)) { case CONST: loadconst("_t%d = %d;\n", root, rightchild(root)); break; case DECLARE: root = leftchild(root); declare(stringpool+rightchild(leftchild(root))); break; case VAR: loadvar("_t%d = %s;\n", root, nameof(root)); break; case IFTHENELSE: /* Simple if ... then ... */ /* Use two trips to simulate a three-way branch: if ... then ... else ... */ // now that we have n-ary tuples, this code needs to change to make // a new IFTHENELSE opcode use a quad instead. cond = leftchild(root); thenpart = rightchild(root); elsepart = nthchild(root, 3); { int thenlab = ++nextlab; int donelab = ++nextlab; int elselab = ++nextlab; codegen(cond, thenlab, elselab, thenlab); put_label(thenlab); codegen(thenpart, -1, -1, -1); put_goto(donelab); put_label(elselab); codegen(elsepart, -1, -1, -1); put_label(donelab); } break; case IFTHEN: /* Simple if ... then ... */ cond = leftchild(root); thenpart = rightchild(root); { int thenlab = ++nextlab; int donelab = ++nextlab; codegen(cond, thenlab, donelab, thenlab); put_label(thenlab); codegen(thenpart, -1, -1, -1); put_label(donelab); } break; case ASSIGN: codegen(rightchild(root), -1, -1, -1); store("%s = _t%d;\n", nameof(leftchild(root)), rightchild(root)); break; case LOOP: put_label(leftchild(root)); break; case ENDLOOP: put_goto(leftchild(root)); put_label(rightchild(root)); break; case BREAK: put_goto(rightchild(root)); break; case SEQUENCE: codegen(leftchild(root), -1, -1, -1); codegen(rightchild(root), -1, -1, -1); break; // handling short-circuit evaluation of booleans is a bit tricky // and I don't guarantee that this code is 100% the right way // to do it - actually it depends a lot on the language being // compiled. Algol differs from Pascal differs from C, for example. case AND: case OR: { int truelab = thenlab, falselab = elselab, dropthrough = donelab; int thenlab, elselab, donelab; int nexttestlab = ++nextlab; if (opsym(root) == AND) { thenlab = nexttestlab; elselab = falselab; } else { thenlab = truelab; elselab = nexttestlab; } donelab = nexttestlab; codegen(leftchild(root), thenlab, elselab, donelab); /* if the first one was true, drop through to next part of && and check it too */ put_label(nexttestlab); codegen(rightchild(root), truelab, falselab, dropthrough); /* drop through may be to truelab; if not (because we're nested), jump to truelab */ } break; case INPUT: input(root); break; case PRINT: codegen(leftchild(root), -1, -1, -1); print(root); break; case NEG: case NOT: codegen(leftchild(root), thenlab, elselab, donelab); monoperate("_t%d = %s_t%d;\n", root, c_infix_op[opsym(root)], leftchild(root)); break; default: /* Be careful not to default anything other than binary operators! */ if (arity[opsym(root)] != 3) { fprintf(stdout, "*** Not Implemented: codegen(%s)\n", name[opsym(root)]); break; } codegen(leftchild(root), thenlab, elselab, donelab); codegen(rightchild(root), thenlab, elselab, donelab); operate("_t%d = (_t%d %s _t%d);\n", root, leftchild(root), c_infix_op[opsym(root)], rightchild(root)); {int op; // maybe a case statement would be more efficient here, or a range test if (((op = opsym(root)) == EQ) || (op == NE) || (op == LT) || (op == GT) || (op == LE) || (op == GE)) { // need to change this to call a procedure rather than print if (thenlab != donelab) put_ifgoto(root, thenlab, TRUE); // fprintf(stdout, " if (_t%d) goto L%02d;\n", root, thenlab); if (elselab != donelab) put_ifgoto(root, elselab, FALSE); // fprintf(stdout, " if (!t_%d) goto L%02d;\n", root, elselab); } } break; } } #define tripsize(i) arity[opsym(i)] TRIP make_unary_tuple(OPCODE op, TRIP parm1) { int trip = nexttrip; opsym(trip) = op; nexttrip += tripsize(trip); leftchild(trip) = parm1; #ifdef DEBUG_TRIPS printtrip(trip); #endif return trip; } TRIP make_binary_tuple(OPCODE op, TRIP parm1, TRIP parm2) { int trip = nexttrip; opsym(trip) = op; nexttrip += tripsize(trip); leftchild(trip) = parm1; rightchild(trip) = parm2; #ifdef DEBUG_TRIPS printtrip(trip); #endif return trip; } TRIP make_nary_tuple(OPCODE op, TRIP parm1, ...) { // to do - use varagrs or stdargs // we use this for if/then/else, and procedure calls with // parameter lists. int trip; int parm; va_list ptr; trip = nexttrip++; opsym(trip) = op; leftchild(trip) = parm1; va_start(ptr, parm1); for (parm = 2; parm <= arity[op]; parm++) { nthchild(trip, parm) = va_arg(ptr, TRIP); nexttrip += 1; } va_end(ptr); #ifdef DEBUG_TRIPS printtrip(trip); #endif return trip; } TRIP make_int_const(int datatype, char *value) { return make_binary_tuple(CONST, datatype, (int)atol(value)); } TRIP make_real_const(int datatype, char *value) { return make_binary_tuple(CONST, datatype, (int)atof(value) /* Hacky. Should fix this. */); } /* getvar needs to be more complex. Currently it just maps from a string to the triple for a pre-declared var, adding any unrecognised string as an int. Should really add it as an error type, and handle scope rules */ TRIP getvar(char *s) { /* tag must exist */ int i, trip, tag; strcpy(stringpool+nextstring, s); /* Create a backstop in case not found */ for (tag = 0; tag <= nextstring; tag++) { if (strcmp(stringpool+tag, s) == 0) { break; /* found, one way or another */ } } if (tag == nextstring) { nextstring += strlen(s)+1; /* Not found, add it */ trip = make_binary_tuple(TAG, 0, tag); fprintf(stderr, "ERROR: variable '%s' not declared. Creating as int.\n", s); return make_binary_tuple(VAR, trip, INT); } /* having located the stringpool entry, now find the appropriate declaration whose tag is using it */ for (i = 0; i < nexttrip; i++) { if ((opsym(i) == VAR) && (rightchild(leftchild(i)) == tag) && (rightchild(i) == INT)) { return i; } } fprintf(stderr, "Oops: cannot find declaration for this name!\n"); exit(1); } TRIP newtag(char *s) { /* tag must *not* exist */ int i, trip, tag; strcpy(stringpool+nextstring, s); /* Create a backstop for when not found */ for (tag = 0; tag <= nextstring; tag++) { if (strcmp(stringpool+tag, s) == 0) { break; /* found, one way or another */ } } if (tag != nextstring) { fprintf(stderr, "ERROR: name '%s' already exists.\n", s); /* forget about scope rules for now */ } else nextstring += strlen(s)+1; /* Not found, add it */ return make_binary_tuple(TAG, 0, tag); } #else typedef int TRIP; #endif //------------------------------------------------------------------------------------ /* In previous compilers, I had to write custom code for every tree-based optimisation, in order to walk down the tree to the right place to find the leaves to be optimised. In this one, I have a generic tree-walking procedure which can walk the entire program, but it can be customised so that it takes action only on specific phrases. This is possible in this design only because each set of subphrases stores the count of subphrases befoe it - thus allowing a generic tree-walking procedure that doesn't have to know what each node consists of until it happens across a node of the type it is looking for. */ void walk(int ap, int depth, int wanted(int phraseno), void perform(int ap, int before, int depth)) { int i; if (wanted(A[ap])) perform(ap, TRUE, depth); for (i = 3; i < A[ap+2]+3; i++) { if (A[A[ap+i]] >= 512+MAX_BIP) walk(A[ap+i], depth+1, wanted, perform); } if (wanted(A[ap])) perform(ap, FALSE, depth); } int want_all(int phraseno) { return TRUE; } void print_all(int ap, int before, int depth) { int saved_ap = ap; int phrase = A[ap++]; int alt = A[ap++]; int phrases = A[ap++]; // defined subphrases int i; indent(depth, stderr); fprintf(stderr, "<%s%s/%d> ", (before ? "" : "/"), phrasename[phrase-512], alt); for (i = 0; i < (3+phrases); i++) { fprintf(stderr, "A[%d] = %d, ", saved_ap+i, A[saved_ap+i]); } fprintf(stderr, "\n"); } TRIP compile(int ap, int depth) { int saved_ap = ap; int phrase = A[ap++]; // A[ap] is the phrase number. A[ap+1] is the alt. int alt = A[ap++]; // For consistency, in BIPs, the Alt should always be 1 // although that may not be the case at the moment :-( int phrases = A[ap++]; // defined subphrases int i; // The following ecce command executed on this file will generate teeny.g: // ecce -c "(v.//\\.s..(v/ /e)?m,k)0;%c" teeny.c teeny.g // May later tweak takeon.c to read from teeny.c rather than teeny.g // thus simplifying build process and becoming more like yacc. switch (phrase) { //\\ # BIPS (Built-in Phrases)are linked to the type-code returned //\\ # by the line-reconstruction code (aka lexer) //\\ //\\ # These *must* come first. // See TYPE_* in first page for the values to use. //\\ //\\ B=1; case P_IDENT: fprintf(stdout, "%s", c[A[ap]].s); break; //\\ B=5; case P_NUM: fprintf(stdout, "%s", c[A[ap]].s); break; //\\ B=6; case P_HEXNUM: fprintf(stdout, "0x%s", c[A[ap]].s); break; //\\ B=3; case P_CHARLIT: fprintf(stdout, "'%s'", c[A[ap]].s); // ADD ESCAPES break; //\\ B=2; case P_STRING: fprintf(stdout, "\"%s\"", c[A[ap]].s); // ADD ESCAPES break; //\\ //\\ # Phrase definitions. PROGRAM is the main entry point. //\\ //\\ # This is the first draft of a grammar, based on the one //\\ # collectively determined at the wiki site. At the moment //\\ # this has not been debugged extensively. case P_PROGRAM: //\\ P = "program" ; // A[ap] A[ap+1] A[ap+2] Note that literals are discarded. fprintf(stdout, "#include \n"); fprintf(stdout, "void put(int c) {\n"); fprintf(stdout, " fputc(c, stdout);\n"); fprintf(stdout, "}\n"); compile(A[ap], depth+1); compile(A[ap+1], depth+1); fprintf(stdout, "int main(int argc, char **argv)\n"); compile(A[ap+2], depth+1); // Would be nice if we could force an 'exit(0)' at the end of // the mail program block. break; case P_OPTGLOBAL: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_OPTPROC: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_MAINBLOCK: case P_BLOCK: //\\ //\\ P = "begin" "end"; //\\ P = "begin" "end"; fprintf(stdout, "{\n"); compile(A[ap], depth+1); compile(A[ap+1], depth+1); if (phrase == P_MAINBLOCK) fprintf(stdout, " exit(0); return(0);\n"); fprintf(stdout, "}\n"); break; case P_SSLIST: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_OPTLOCAL: //\\ //\\ P = , //\\ ; if (alt == 0) compile(A[ap], depth+1); break; case P_RESTOFDECL: //\\ //\\ P = ',' , //\\ ; if (alt == 1) break; fprintf(stdout, ", "); compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_DIRECTIVE: //\\ //\\ P = '#' ; compile(A[ap], depth+1); break; case P_COMMAND: //\\ //\\ P = "define" , //\\ "ifdef" , //\\ "endif"; // See comment next to '#' in the line reconstruction for one way // to handle these directives. fprintf(stdout, "\n#%s", (alt == 0 ? "define" : (alt == 1 ? "ifdef" : "endif"))); if (alt < 2) { fprintf(stdout, " "); compile(A[ap], depth+1); } if (alt == 0) { fprintf(stdout, " "); compile(A[ap+1], depth+1); } fprintf(stdout, "\n"); break; case P_DEFN: //\\ //\\ P = , , , , ; compile(A[ap], depth+1); break; case P_GLOBAL: //\\ //\\ P = '#' "include" , //\\ ; if (alt == 0) { fprintf(stdout, "\n#%s"); compile(A[ap], depth+1); fprintf(stdout, "\n"); } else { compile(A[ap], depth+1); } break; case P_LOCAL: //\\ //\\ P = , //\\ ; if (alt == 0) { compile(A[ap], depth+1); } else { fprintf(stdout, " "); compile(A[ap], depth+1); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); fprintf(stdout, ";\n"); } break; case P_TYPE: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_OPTUNSIGNED: //\\ //\\ P = "unsigned", //\\ ; if (alt == 0) fprintf(stdout, "unsigned "); break; case P_SIZE: //\\ //\\ P = "char", //\\ "int"; fprintf(stdout, (alt == 0 ? "char " : "int ")); break; case P_DECL: //\\ //\\ P = '*' , //\\ , //\\ ; if (alt==0) fprintf(stdout, "*"); compile(A[ap], depth+1); if (alt==1) compile(A[ap+1], depth+1); break; case P_OPTINDEX: //\\ //\\ P = '[' ']', //\\ ; if (alt == 1) break; fprintf(stdout, "["); compile(A[ap], depth+1); fprintf(stdout, "]"); break; case P_PROC: //\\ //\\ P = "procedure" '(' ')' , //\\ "procedure" '(' ')' ; if (alt == 0) { fprintf(stdout, "void ");compile(A[ap],depth+1);fprintf(stdout, "(void)\n"); compile(A[ap+1],depth+1); } else { fprintf(stdout, "void ");compile(A[ap],depth+1);fprintf(stdout, "(");compile(A[ap+1],depth+1);fprintf(stdout, ")\n"); compile(A[ap+2],depth+1); } break; case P_OPTBLOCK: //\\ //\\ P = , //\\ ; if (alt == 0) { compile(A[ap], depth+1); } else fprintf(stdout, ";"); break; case P_FORMALLIST: //\\ //\\ P = , //\\ ; if (alt == 1) { fprintf(stdout, "void"); break; } compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFFORMAL: //\\ //\\ P = ',' , //\\ ; if (alt == 1) break; fprintf(stdout, ", "); compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_FORMAL: // \\ // \\ P = '*' , // \\ '*', // \\ , // \\ ; // on its own is prsumably for declarations like: // unsigned int M, N - in which case the type is that of // the preceding item. But what if the IDENT comes first, // i.e. procedure fred(jim); Would that default to an int // or is it simply an error? // compile(A[ap], depth+1); // if (alt < 2) fprintf(stdout, " *"); // if ((alt&1) == 0) compile(A[ap+1], depth+1); // break; // Trying a different grammar for FORMAL: //\\ //\\ P = '*' , //\\ '*', //\\ , //\\ ; // *** I think we forgot naked type in the otiginal grammar if ((alt&1) == 1) { compile(A[ap], depth+1); if (alt == 1) fprintf(stdout, " *"); } else { int namelist = A[ap+1]; // Need to look down into structure for namelist in order to // explicitly type each variable, i.e. "int i, int j" rather // than "int i, j" // namelist points to a IDENTLIST for (;;) { int name = A[namelist+3]; compile(A[ap], depth+1); if (alt == 0) fprintf(stdout, " *"); compile(name, depth+1); namelist = A[namelist+4]; // namelist now points to a RESTOFIDENTLIST if (A[namelist+1] == 0) { fprintf(stdout, ", "); } else { break; } } // // Can we do this elegantly with the 'walk' procedure? } break; //\\ P = ; //\\ P = ',' , //\\ ; case P_OPTFORMAL: //\\ //\\ P = , //\\ ; if (alt == 1) return; compile(A[ap], depth+1); break; case P_STATEMENT: //\\ //\\ P = ; compile(A[ap], depth+1); break; case P_SS: switch (alt) { //\\ case 0: //\\ P = , compile(A[ap], depth+1); break; case 1: //\\ , compile(A[ap], depth+1); break; case 2: //\\ , compile(A[ap], depth+1); break; case 3: //\\ '#' "inline" , break; case 4: //\\ , break; case 5: //\\ ; compile(A[ap], depth+1); break; } break; case P_IF: //\\ //\\ P = "if" '(' ')' "endif"; fprintf(stdout, " if ("); compile(A[ap], depth+1); fprintf(stdout, ") {\n"); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); fprintf(stdout, " }\n"); break; case P_OPTELSE: //\\ //\\ P = "else" , //\\ ; if (alt == 1) break; fprintf(stdout, " } else {\n"); compile(A[ap], depth+1); break; case P_WHILE: //\\ //\\ P = "while" '(' ')' "endwhile"; fprintf(stdout, " while ("); compile(A[ap], depth+1); fprintf(stdout, ") {\n"); compile(A[ap+1], depth+1); fprintf(stdout, " }\n"); break; case P_FOR: //\\ //\\ P = "for" '(' ')' "endfor"; fprintf(stdout, " for ("); compile(A[ap], depth+1); fprintf(stdout, "; "); compile(A[ap+2], depth+1); fprintf(stdout, "; "); compile(A[ap+4], depth+1); fprintf(stdout, ") {\n"); compile(A[ap+6], depth+1); fprintf(stdout, " }\n"); break; case P_SIMPLE: //\\ //\\ P = '(' ')', //\\ '(' ')', //\\ '=' ; fprintf(stdout, " "); compile(A[ap], depth+1); if (alt < 2) { fprintf(stdout, "("); if (alt == 0) compile(A[ap+1], depth+1); fprintf(stdout, ")"); } else { fprintf(stdout, " = "); compile(A[ap+1], depth+1); } fprintf(stdout, ";\n"); break; case P_PARAMLIST: //\\ //\\ P = //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFPARAMLIST: //\\ P = , //\\ ; if (alt == 1) break; fprintf(stdout, ", "); compile(A[ap+1], depth+1); break; case P_OPTCOMMA: //\\ //\\ P = ',', //\\ ; break; case P_VARIABLE: //\\ //\\ P = '*' , //\\ '[' ']', //\\ ; if (alt == 0) fprintf(stdout, "*"); compile(A[ap], depth+1); if (alt == 1) { fprintf(stdout, "["); compile(A[ap+1], depth+1); fprintf(stdout, "]"); } break; case P_ADDR: //\\ //\\ P = , //\\ ; compile(A[ap], depth+1); break; case P_PARAM: //\\ //\\ P = , //\\ ; compile(A[ap], depth+1); break; case P_BOOLEXPR: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFBOOLTERM: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); break; case P_BOOLTERM: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFBOOLFACTOR: //\\ //\\ P = '&' , //\\ ; if (alt == 1) break; fprintf(stdout, " && "); compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_BOOLFACTOR: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_OPTNOT: //\\ //\\ P = '!', //\\ ; if (alt == 0) fprintf(stdout, "!"); break; case P_OPTSEMI: //\\ //\\ P = ';', //\\ ; break; case P_RELATION: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFRELATION: //\\ //\\ P = , //\\ ; // implicitly " != 0", if no relop given. if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_EXPR: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFEXPR: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); break; case P_SUM: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); break; case P_OPTADDOP: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); break; case P_RESTOFSUM: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); break; case P_TERM: //\\ //\\ P = ; compile(A[ap], depth+1); compile(A[ap+1], depth+1); break; case P_RESTOFTERM: //\\ //\\ P = , //\\ ; if (alt == 1) break; compile(A[ap], depth+1); compile(A[ap+1], depth+1); compile(A[ap+2], depth+1); break; case P_OROP: //\\ //\\ P = '|', '~'; fprintf(stdout, (alt == 0 ? " | " : " ^ ")); break; case P_RELOP: //\\ //\\ P = '<>', '<=', '<', '>=', '>', '!=', '='; switch (alt) { case 0: case 5: fprintf(stdout, " != "); break; case 1: fprintf(stdout, " <= "); break; case 2: fprintf(stdout, " < "); break; case 3: fprintf(stdout, " >= "); break; case 4: fprintf(stdout, " > "); break; case 6: fprintf(stdout, " == "); break; } break; case P_SHIFTOP: //\\ //\\ P = '<<', '>>'; fprintf(stdout, (alt == 0 ? " << " : " >> ")); break; case P_ADDOP: //\\ //\\ P = '+', '-'; fprintf(stdout, (alt == 0 ? " + " : " - ")); break; case P_MULOP: //\\ //\\ P = '*', '/'; fprintf(stdout, (alt == 0 ? " * " : " / ")); break; case P_FACTOR: //\\ //\\ P = '(' ')', if (alt == 0) { // when this is in the switch, it is not executed. Compiler bug? fprintf(stdout, " ("); compile(A[ap], depth+1); fprintf(stdout, ") "); break; } else switch (alt) { case 1: //\\ '&' '[' ']', case 2: //\\ '&' , fprintf(stdout, " &"); compile(A[ap], depth+1); if (alt == 1) { fprintf(stdout, "["); compile(A[ap+1], depth+1); fprintf(stdout, "]"); } break; case 3: //\\ , case 4: //\\ , case 5: //\\ ; compile(A[ap], depth+1); } break; //\\ //\\ # Shouldn't HEXNUM be here with NUM and CHARLIT? //\\ # The apparent restriction is that a HEXNUM cannot //\\ # be used as the size of an array declaration - but //\\ # a character constant can??? case P_NUMBER: //\\ //\\ P = , //\\ ; compile(A[ap], depth+1); break; //\\ //\\ E //\\ # 'E' is end of grammar. Everything after this is ignored. default: fprintf(stderr, "*** Internal error at line %d. ap=%d phrase=%d\n", __LINE__, ap, phrase); exit(2); } return(-1); // DUMMY TRIP, NOTHING TO RETURN } //-------------------------------------------------------------------------- int main(int argc, char **argv) { char *s; // Get clean version of executable name. Should work on most existing systems (2006) progname = argv[0]; if ((s = strrchr(progname, '/')) != NULL) progname = s+1; // Unix if ((s = strrchr(progname, '\\')) != NULL) progname = s+1; // M$ if ((s = strrchr(progname, ']')) != NULL) progname = s+1; // Dec if ((s = strrchr(progname, ';')) != NULL) *s = '\0'; // Version no's if (((s = strrchr(progname, '.')) != NULL) && (strcasecmp(s, ".exe") == 0)) *s = '\0'; if (((s = strrchr(progname, '.')) != NULL) && (strcasecmp(s, ".com") == 0)) *s = '\0'; if ((argc == 3) && strcmp(argv[1], "-d") == 0) { argv++; argc--; debug_parser = TRUE; } if (argc != 2) { fprintf(stderr, "syntax: %s [-d] filename\n", progname); exit(1); } sourcefile = fopen(argv[1], "r"); if (sourcefile == NULL) { fprintf(stderr, "%s: %s - %s\n", progname, strerror(errno), argv[1]); exit(errno); } curfile = argv[1]; startline = TRUE; whitespace = TRUE; onecharstr = (char *)malloc(512); line_reconstruction(); if (debug_parser) { int i; // DEBUG ONLY for (i = 0; i < nextfree; i++) { fprintf(stdout, "%s, line %d, col %d: [%0d] %s\n", c[i].f, c[i].l, c[i].col, c[i].t, c[i].s); } } if (!parse(PHRASE_BASE, 0)) { if (bestparse == nextfree) { fprintf(stderr, "\"%s\", Line %d, Col %d: Premature end of file while looking for %s\n", argv[1], c[bestparse].l, c[bestparse].col+1, looking_for); } else { int i; fprintf(stderr, "\"%s\", Line %d, Col %d: Syntax error while looking for %s near ", argv[1], c[bestparse].l, c[bestparse].col+1, looking_for); for (i = bestparse; i < bestparse+3; i++) { if (i == nextfree) { fprintf(stderr, ""); break; } switch (c[i].t) { case TYPE_HEXINT: fprintf(stderr, "$"); // *OR* ... We could put the '$' back in front of the string /* drop through */ // and probably save much code whenever printing. Use str+1 case TYPE_TAG: case TYPE_CHAR: case TYPE_INT: case TYPE_KEYWORD: fprintf(stderr, "%s", c[i].s); break; case TYPE_STRING: fprintf(stderr, "\"%s\"", c[i].s); break; case TYPE_CHARCONST: fprintf(stderr, "'%s'", c[i].s); break; } fprintf(stderr, (i == (bestparse+2) ? " ..." : " ")); } fprintf(stderr, "\n"); } exit(1); } fprintf(stderr, "Parsed OK, now compiling.\n"); if (debug_parser) walk(0, 0, want_all, print_all); // Diags: print final parse tree // The structure at A[] is a concrete syntax tree. We could // generate code directly from this tree, however for the // sake of cleaner design, we will first convert the concrete // syntax tree into an Abstract Syntax Tree (AST) {int program = compile(0, 0); // Now generate the output code from the AST. // codegen(program, -1, -1, -1) } exit(0); return(1); }