/* * cse.y * * Example of a simple expression parser with common sub-expression * elimination (CSE) which builds DAGs with n-ary trees and generates simple * code. * * The primary problem with building n-ary trees lies with compound * statements, where the number of children cannot be determined until all of * them have been processed. * * For nodes which have a fixed number of children (such as arithmetic operat- * ors with 2 children), there is no problem, but for compound statements, the * solution is to have a tree built for each statement and then appended to * a list. * * Each list element is a struct which contains 2 fields: the tree, and a * "next" pointer. When a node is to be created, it counts the number of * elements in the CHILD_LIST, allocates that many DAG * pointers, and sets * each pointer to a member of the list. */%{ #include <ctype.h> #include <stdio.h> #include <stdlib.h>/***************************************************************************** Data Structures Definitions for data structures related to the DAGs. *****************************************************************************//* * DAG * * A dag node. */typedefstructdag_struct DAG;structdag_struct {intop;// '+', '-', '*', '/', '=', 's', 'n', or 'v'/* * Data: * * This union serves two purposes: * * 1. For 'n' (number) and 'v' (variable) nodes, the appropriate * field is set to the data that the lexer provided. * 2. During code generation, the "num" member is set to a scratch * variable name (0 means T0, 1 is T1, etc.) if the node's op is * NOT 'n' or 'v', otherwise, "num" is a constant number if op * == 'n' and "var" is a variable if op == 'v'. * * For nodes other than 'n' and 'v', data.num is set to -1 by default in * order to signify to the code generator that no scratch register has * yet been assigned (-1 is invalid for scratch registers.) */union{intnum;// number (if op == 'n')charvar;// variable (if op == 'v')} data;/* * Children: * * Nodes can have an arbitrary number of children. The child[] array is an * array of pointers to child DAG nodes and only has as many elements as * the num_children member specifies. */intnum_children;// number of childrenDAG **child;// up to 3 children/* * Global list: * * A linear list of all the nodes is maintained by chaining them all * together with this pointer. This helps with CSE. */DAG *next;// next node in the global list of nodes};/* * CHILD_LIST * * Sometimes, the number of children for a given DAG node cannot be * determined until after all of the children have been created. In this * grammar, this situation only occurs with lists of statements. * * The problem is resolved by creating a list of CHILD_LIST structures, each * of which contains a DAG and a pointer to the next structure. As the parser * parses a series of statements in a statement list, a new DAG is added to * the list for each statement encountered. * * When all is done, the parent node of all of these children is created, the * list is counted, and enough pointers are allocated to hold all of the * children. Then, each DAG in the list is plucked one at a time and attached * to the parent node's child[] array. */typedefstructchild_list_struct CHILD_LIST;structchild_list_struct { DAG *node;// the child node itselfCHILD_LIST *next; };/***************************************************************************** Global Data The DAG and global list of nodes are defined here. *****************************************************************************//* * DAG */staticDAG *dag;// DAG (NULL if no non-null statements in program)staticDAG *node_list;// global list of nodes (newest first)/***************************************************************************** Function Prototypes These functions are used by the parser. *****************************************************************************/staticDAG *MakeArbitraryNode(int, CHILD_LIST *);staticDAG *MakeNode(int,int, DAG *, DAG *);staticDAG *MakeDataNode(int,int);staticintyylex();staticvoidyyerror(constchar*); %}/***************************************************************************** Token Definitions YYSTYPE and the tokens are defined here. Tokens are only defined for numbers and variable names. The lexer will return ASCII characters for everything else which helps make the grammar a little more readable. *****************************************************************************/%union{intnum;// numbercharvar;// variableDAG *node;/* * A list of child DAGs. The 2 pointer system is for fixed-time insertion * of new elements to the end of the list (each time one is inserted, the * "last" pointer is updated, but "first" remains unchanged.) */struct{ CHILD_LIST *first, *last;// pointers to first and last children} children; } %token <num> NUMBER %token <var> VARIABLE %type <children> stmt_list %type <node> stmt %type <node> expr %type <node> assgn_expr %type <node> factor %type <node> element %%/***************************************************************************** Grammar Ye olde grammar. The language is really quite simple. It's just a series of statements terminated by semicolons. Multiple statements can appear between brackets, but there are no scope rules, so these brackets do nothing more than serve as a demonstration for how to implement compound statements using n-ary tree structures. There are 52 possible variable names: a-z and A-Z. They are always available and cannot be declared or undeclared. A single statement may be an assignment statement: a = 3; b = a; c = d = e = 4 * 3; Or an arithmetic expression using +, -, *, and / operators: 1 + 2 * 3 / a; The above example clearly does nothing since there is no assignment, but it's perfectly valid. *****************************************************************************//* * file * * A file is simply a list of statements. stmt_list is set to a list of * DAGs. These are attached to a statement list node. */file: stmt_list {if($1.first != NULL)// if there were any non-null statementsdag = MakeArbitraryNode('s', $1.first);elsedag = NULL; } |// empty;/* * stmt_list * * Takes advantage of left recursion in bottom-up parsing to allow an * arbitrary number of statements to be repeated in a list. Consult Holub's * book to learn about how this sort of left recursion works. * * When a stmt is on the stack, the first production is reduced to stmt_list. * Then, another stmt will make "stmt_list stmt" which again reduces to * stmt_list, etc. So, for each stmt_list, the "stmt_list->stmt" * production gets reduced once (and it happens first.) * * A CHILD_LIST is created and each time the second production is reduced, the * action executed adds the DAG associated with stmt to stmt_list and * attaches it to $$. */stmt_list: stmt// this is the first kind of stmt_list production to be// reduced because of how left recursion works{ $$.first = $$.last = NULL;if($1 != NULL)// don't corrupt child list w/ NULLs{ $$.first = malloc(sizeof(CHILD_LIST)); $$.last = $$.first; $$.first->node = $1; $$.first->next = NULL; }else$$.first = $$.last = NULL; } | stmt_list stmt { $$ = $1;// keep the list alive by passing it to $$if($2 != NULL)// don't corrupt child list with NULLs{if($$.last == NULL)// nothing has been allocated!{ $$.first = malloc(sizeof(CHILD_LIST)); $$.last = $$.first; $$.first->node = $2; $$.first->next = NULL; }else// allocate to end of list{ ($$.last)->next = malloc(sizeof(CHILD_LIST)); ($$.last)->next->node = $2; ($$.last)->next->next = NULL; $$.last = ($$.last)->next; } } } ;/* * stmt * * A single statement, which can be an expression, an assignment, or a * compound statement (just another statement list.) C-style null-statements * are perfectly acceptable and don't generate any DAG nodes. */stmt: expr ';' { $$ = $1; } | ';' { $$ = NULL; } | '{' stmt_list '}' {if($2.first == NULL)// no actual statements$$ = NULL;else$$ = MakeArbitraryNode('s', $2.first);// make a 's' (statement list) node and attach all the children} | '{' '}' { $$ = NULL; } | assgn_expr ';' { $$ = $1; } ;/* * assgn_expr * * Assignment expression. The entire expression itself takes the value of the * newly-modified variable, thus chaining is possible: * * a = b = c = 3 * 4; * * Note that the variable node is created first, and then the operator node * is created and overwrites $$. */assgn_expr: VARIABLE '=' assgn_expr { $$ = MakeDataNode('v', $1); $$ = MakeNode(2, '=', $$, $3); } | VARIABLE '=' expr { $$ = MakeDataNode('v', $1); $$ = MakeNode(2, '=', $$, $3); } ;/* * expr * * The lowest precedence expression rules (+ and -) are handled here. */expr: expr '+' factor { $$ = MakeNode(2, '+', $1, $3); } | expr '-' factor { $$ = MakeNode(2, '-', $1, $3); } | factor { $$ = $1; } ;/* * factor * * Multiplication and division are higher precedence than +/- (expr.) */factor: factor '*' element { $$ = MakeNode(2, '*', $1, $3); } | factor '/' element { $$ = MakeNode(2, '/', $1, $3); } | element { $$ = $1; } ;/* * element * * A basic element -- a number, variable, or parenthetical expression. */element: '(' expr ')' { $$ = $2; } | '(' assgn_expr ')' { $$ = $2; } | NUMBER { $$ = MakeDataNode('n', $1); } | VARIABLE { $$ = MakeDataNode('v', $1); } ; %%/***************************************************************************** DAG Functions Functions for making DAG nodes and attaching children to them. Nodes other than 'n' and 'v' have data.num set to -1 to indicate that no scratch register has been assigned to the node by the code generator yet (see the code generator code and the DAG structure definition for more info.) *****************************************************************************//* * MakeArbitraryNode(): * * Creates a node which can have an arbitrary amount of children. * * Attaches all of the DAGs in the CHILD_LIST as children to the node (first * argument.) This is done by counting the number of DAGs in the list, * allocating that many child pointers, and then taking each element in the * list and putting it in the newly-allocated child[] array. * * Next, the result is checked against all of the other DAG nodes to see if * there is already one like this, and if so, this one is discarded and the * existant one is returned. * * NOTE: When searching for a matching node in the global node list, the * "data" member is not checked because it is assumed that it is unused in * this sort of node. */staticDAG *MakeArbitraryNode(intop, CHILD_LIST *cl) { CHILD_LIST *l; DAG *node, *n;intnum_children, i, all_equal;/* * Count number of children by iterating through list */for(l = cl, num_children = 0; l != NULL; l = l->next, num_children++) ;/* * Make the node */if((node = calloc(1,sizeof(DAG))) == NULL)returnNULL; node->op = op; node->data.num = -1;/* * Allocate space for their pointers and then copy them in */node->num_children = num_children; node->child = malloc(sizeof(DAG *) * num_children);for(i = 0, l = cl; i < num_children; i++, l = l->next) node->child[i] = l->node;/* * Try to see if a matching node already exists */for(n = node_list; n != NULL; n = n->next) {if(n->op == node->op && n->num_children == node->num_children) { all_equal = 1;// will be cleared if not all children equalfor(i = 0; i < n->num_children; i++) {if(n->child[i] != node->child[i]) { all_equal = 0;break; } }if(all_equal) { free(node);// destroy current nodereturnn; } } }/* * No match found, insert this node in the global list and return it */node->next = node_list; node_list = node;returnnode; }/* * MakeNode(): * * Makes a DAG node with the specified number of children (0 is acceptable.) * The number of children determines how big child[] will be. If the number * of children is 1 or 2, they are passed as parameters a and b. * * NOTE: When searching for a matching node in the global node list, the * "data" member is not checked because it is assumed that it is unused in * this sort of node. * * MakeNode() is not intended for number ('n') and variable ('v') nodes, use * MakeDataNode() for that purpose. */staticDAG *MakeNode(intnum_children,intop, DAG *a, DAG *b) { DAG *node;/* * Try to see if a matching node exists */for(node = node_list; node != NULL; node = node->next) {if(node->op != op)// can't be the same node!continue;if(node->num_children != num_children)continue;/* * Ugly and not optimal... I know, but it's easy to understand :) */if(num_children == 0)returnnode;// must be a match!elseif(num_children == 1) {if(node->child[0] == a)returnnode; }elseif(num_children == 2) {if(node->child[0] == a && node->child[1] == b)returnnode; } }/* * Otherwise, allocate a new one */if((node = calloc(1,sizeof(DAG))) == NULL)returnNULL; node->next = node_list; node_list = node; node->op = op; node->data.num = -1; node->num_children = num_children;if(num_children) { node->child = malloc(sizeof(DAG *) * num_children); node->child[0] = a;if(num_children == 2) node->child[1] = b; }returnnode; }/* * MakeDataNode(): * * Virtually the same as the above MakeNode() function, except it is intended * for use with 'n' and 'v' nodes only. */staticDAG *MakeDataNode(intop,intdata) { DAG *node;for(node = node_list; node != NULL; node = node->next) {/* * Only check op and fields... Children are irrelevant for 'n' and 'v' */if(node->op == op) {if(op == 'n') {if(node->data.num == data)returnnode; }else// 'v'{if(node->data.var == (char) data)returnnode; } } }if((node = calloc(1,sizeof(DAG))) == NULL)returnNULL; node->next = node_list; node_list = node; node->op = op;if(op == 'n') node->data.num = data;elseif(op == 'v') node->data.var = data;elsenode->data.num = -1;// to indicate this isn't being used (codegen)node->num_children = 0;returnnode; }/***************************************************************************** Code Generator Code is output in a simplistic quadruple format (op, dest, src, src) with a scratch variable assigned for each operation. This means that a lot of scratch variables are generated that could be eliminated, but that would be fairly trivial as a post-step. CSE (common sub-expression elimination) is performed during code generation. It's really quite simple and most of the work was done in constructing the DAGs themselves. When a scratch variable is found to be assigned to a given node (which happens once the code generator touches it the first time), it is re-used. If a sub-expression is no longer valid (when another statement changes the contents of a variable used as an operand, for instance), the scratch variable is disassociated with the node (data.num is set to -1.) The code generator assumes (especially in InvalidateDependencies()) that all nodes other than 'n', 'v', and '=' can only be represented by a temporary variable (whose number is in data.num), Tx. *****************************************************************************/staticintcurrent_scratch = 0;/* * NewScratch(): * * Returns a new scratch variable. */staticintNewScratch() {returncurrent_scratch++; }/* * PrintOperand(): * * Prints the current node as an operand. It assumes that it has already been * consumed by GenerateCode(). The rules for operand printing are: * * - If op == 'n', print data.num. * - If op == 'v', print data.var. * - If op == '=', print data.var. * - Otherwise, data.num contains the number of a scratch variable (Tx) * that must be printed. */staticvoidPrintOperand(DAG *node) {switch(node->op) {case'n': printf("%d", node->data.num);break;case'v':case'=': putchar(node->data.var);break;default: printf("T%d", node->data.num);break; } }/* * InvalidateDependencies(): * * Invalidates all dependencies on the variable which holds this node. Scans * the global node list to find nodes that have a child which is the current * node. Recursively eliminates dependencies on dependencies, etc. */staticvoidInvalidateDependencies(DAG *node) { DAG *n;inti;for(n = node_list; n != NULL; n = n->next) {for(i = 0; i < n->num_children; i++) {if(n->child[i] == node) { InvalidateDependencies(n);/* * '=' and 'v' both use data.var to store an actual (a-z,A-Z) * user-defined variable which must never be invalidated. */if(n->op != '=' && n->op != 'v')// these 2 will always have an actual (a-z,A-Z) variablen->data.num = -1;break; } } } }/* * GenerateCode(): * * Generates code from a DAG and prints it to stdout. */staticvoidGenerateCode(DAG *node) {inti;if(node == NULL)return;switch(node->op) {case's':/* * List of statements */for(i = 0; i < node->num_children; i++) GenerateCode(node->child[i]);break;case'=':/* * Assignment */GenerateCode(node->child[1]);// child[0] guaranteed to be a varnode->data.var = node->child[0]->data.var;// this node becomes// equivalent to the varprintf("%c = ", node->child[0]->data.var); PrintOperand(node->child[1]); printf("\n"); InvalidateDependencies(node->child[0]);break;case'+':case'-':case'*':case'/':/* * Binary arithmetic operators */if(node->data.num != -1)// scratch register already assigned?break;// do nothing (parent node will treat this// as an operand by re-using the scratch// register)GenerateCode(node->child[0]); GenerateCode(node->child[1]); node->data.num = NewScratch(); printf("T%d = ", node->data.num); PrintOperand(node->child[0]); printf(" %c ", node->op); PrintOperand(node->child[1]); printf("\n");break;case'n':case'v':break;// do nothing} }/***************************************************************************** Lexical Analyzer A simple lexical analyzer which recognizes tokens from file input. *****************************************************************************/staticFILE *fp;staticintyylex() {inti, num;if(feof(fp))return0; i = fgetc(fp);while(i == ' ' || i == '\t' || i == '\r' || i == '\n') i = fgetc(fp);if(isdigit(i)) { num = i - '0';while(1) { i = fgetc(fp);if(isdigit(i)) { num *= 10; num += (i - '0'); }else{ ungetc(i, fp); yylval.num = num;returnNUMBER; } } }elseif(isalpha(i)) { yylval.var = i;returnVARIABLE; }returni; }/***************************************************************************** Main main(), yyerror(), and friends. *****************************************************************************/staticvoidPrintDAG(DAG *node,intindent) {inti;if(node == NULL)return;for(i = 0; i < indent * 2; i++) printf(" ");if(node->op == 'n') printf("%d\n", node->data.num);elseif(node->op == 'v') printf("%c\n", node->data.var);else{ printf("%c\n", node->op);for(i = 0; i < node->num_children; i++) PrintDAG(node->child[i], indent + 1); } }staticvoidyyerror(constchar*msg) { fprintf(stderr, "%s\n", msg); }intmain(intargc,char**argv) {if(argc <= 1) { puts("usage: cse <file>"); exit(0); }else{if((fp = fopen(argv[1], "r")) == NULL) { fprintf(stderr, "error: unable to open %s\n", argv[1]); exit(1); } } node_list = NULL;if(yyparse()) exit(0); puts("DAG:\n----\n"); PrintDAG(dag, 0); puts("\nCode:\n-----\n"); GenerateCode(dag);return0; }