/*
* cse.y
*
* Example of a simple expression parser with common subexpression
* elimination (CSE) which builds DAGs with nary trees and generates simple
* code.
*
* The primary problem with building nary trees lies with compound
* statements, where the number of children cannot be determined until all of
* them have been processed.
*
* For nodes which have a fixed number of children (such as arithmetic operat
* ors with 2 children), there is no problem, but for compound statements, the
* solution is to have a tree built for each statement and then appended to
* a list.
*
* Each list element is a struct which contains 2 fields: the tree, and a
* "next" pointer. When a node is to be created, it counts the number of
* elements in the CHILD_LIST, allocates that many DAG * pointers, and sets
* each pointer to a member of the list.
*/
%{
#include
#include
#include
/*****************************************************************************
Data Structures
Definitions for data structures related to the DAGs.
*****************************************************************************/
/*
* DAG
*
* A dag node.
*/
typedef struct dag_struct DAG;
struct dag_struct
{
int op; // '+', '', '*', '/', '=', 's', 'n', or 'v'
/*
* Data:
*
* This union serves two purposes:
*
* 1. For 'n' (number) and 'v' (variable) nodes, the appropriate
* field is set to the data that the lexer provided.
* 2. During code generation, the "num" member is set to a scratch
* variable name (0 means T0, 1 is T1, etc.) if the node's op is
* NOT 'n' or 'v', otherwise, "num" is a constant number if op
* == 'n' and "var" is a variable if op == 'v'.
*
* For nodes other than 'n' and 'v', data.num is set to 1 by default in
* order to signify to the code generator that no scratch register has
* yet been assigned (1 is invalid for scratch registers.)
*/
union
{
int num; // number (if op == 'n')
char var; // variable (if op == 'v')
} data;
/*
* Children:
*
* Nodes can have an arbitrary number of children. The child[] array is an
* array of pointers to child DAG nodes and only has as many elements as
* the num_children member specifies.
*/
int num_children; // number of children
DAG **child; // up to 3 children
/*
* Global list:
*
* A linear list of all the nodes is maintained by chaining them all
* together with this pointer. This helps with CSE.
*/
DAG *next; // next node in the global list of nodes
};
/*
* CHILD_LIST
*
* Sometimes, the number of children for a given DAG node cannot be
* determined until after all of the children have been created. In this
* grammar, this situation only occurs with lists of statements.
*
* The problem is resolved by creating a list of CHILD_LIST structures, each
* of which contains a DAG and a pointer to the next structure. As the parser
* parses a series of statements in a statement list, a new DAG is added to
* the list for each statement encountered.
*
* When all is done, the parent node of all of these children is created, the
* list is counted, and enough pointers are allocated to hold all of the
* children. Then, each DAG in the list is plucked one at a time and attached
* to the parent node's child[] array.
*/
typedef struct child_list_struct CHILD_LIST;
struct child_list_struct
{
DAG *node; // the child node itself
CHILD_LIST *next;
};
/*****************************************************************************
Global Data
The DAG and global list of nodes are defined here.
*****************************************************************************/
/*
* DAG
*/
static DAG *dag; // DAG (NULL if no nonnull statements in program)
static DAG *node_list; // global list of nodes (newest first)
/*****************************************************************************
Function Prototypes
These functions are used by the parser.
*****************************************************************************/
static DAG *MakeArbitraryNode(int, CHILD_LIST *);
static DAG *MakeNode(int, int, DAG *, DAG *);
static DAG *MakeDataNode(int, int);
static int yylex();
static void yyerror(const char *);
%}
/*****************************************************************************
Token Definitions
YYSTYPE and the tokens are defined here. Tokens are only defined for numbers
and variable names. The lexer will return ASCII characters for everything
else which helps make the grammar a little more readable.
*****************************************************************************/
%union
{
int num; // number
char var; // variable
DAG *node;
/*
* A list of child DAGs. The 2 pointer system is for fixedtime insertion
* of new elements to the end of the list (each time one is inserted, the
* "last" pointer is updated, but "first" remains unchanged.)
*/
struct
{
CHILD_LIST *first, *last; // pointers to first and last children
} children;
}
%token NUMBER
%token VARIABLE
%type stmt_list
%type stmt
%type expr
%type assgn_expr
%type factor
%type element
%%
/*****************************************************************************
Grammar
Ye olde grammar.
The language is really quite simple. It's just a series of statements
terminated by semicolons. Multiple statements can appear between brackets,
but there are no scope rules, so these brackets do nothing more than serve
as a demonstration for how to implement compound statements using nary
tree structures.
There are 52 possible variable names: az and AZ. They are always available
and cannot be declared or undeclared.
A single statement may be an assignment statement:
a = 3;
b = a;
c = d = e = 4 * 3;
Or an arithmetic expression using +, , *, and / operators:
1 + 2 * 3 / a;
The above example clearly does nothing since there is no assignment, but it's
perfectly valid.
*****************************************************************************/
/*
* file
*
* A file is simply a list of statements. stmt_list is set to a list of
* DAGs. These are attached to a statement list node.
*/
file: stmt_list
{
if ($1.first != NULL) // if there were any nonnull statements
dag = MakeArbitraryNode('s', $1.first);
else
dag = NULL;
}
 // empty
;
/*
* stmt_list
*
* Takes advantage of left recursion in bottomup parsing to allow an
* arbitrary number of statements to be repeated in a list. Consult Holub's
* book to learn about how this sort of left recursion works.
*
* When a stmt is on the stack, the first production is reduced to stmt_list.
* Then, another stmt will make "stmt_list stmt" which again reduces to
* stmt_list, etc. So, for each stmt_list, the "stmt_list>stmt"
* production gets reduced once (and it happens first.)
*
* A CHILD_LIST is created and each time the second production is reduced, the
* action executed adds the DAG associated with stmt to stmt_list and
* attaches it to $$.
*/
stmt_list:
stmt // this is the first kind of stmt_list production to be
// reduced because of how left recursion works
{
$$.first = $$.last = NULL;
if ($1 != NULL) // don't corrupt child list w/ NULLs
{
$$.first = malloc(sizeof(CHILD_LIST));
$$.last = $$.first;
$$.first>node = $1;
$$.first>next = NULL;
}
else
$$.first = $$.last = NULL;
}

stmt_list stmt
{
$$ = $1; // keep the list alive by passing it to $$
if ($2 != NULL) // don't corrupt child list with NULLs
{
if ($$.last == NULL) // nothing has been allocated!
{
$$.first = malloc(sizeof(CHILD_LIST));
$$.last = $$.first;
$$.first>node = $2;
$$.first>next = NULL;
}
else // allocate to end of list
{
($$.last)>next = malloc(sizeof(CHILD_LIST));
($$.last)>next>node = $2;
($$.last)>next>next = NULL;
$$.last = ($$.last)>next;
}
}
}
;
/*
* stmt
*
* A single statement, which can be an expression, an assignment, or a
* compound statement (just another statement list.) Cstyle nullstatements
* are perfectly acceptable and don't generate any DAG nodes.
*/
stmt: expr ';' { $$ = $1; }
 ';' { $$ = NULL; }
 '{' stmt_list '}'
{
if ($2.first == NULL) // no actual statements
$$ = NULL;
else
$$ = MakeArbitraryNode('s', $2.first); // make a 's' (statement list) node and attach all the children
}
 '{' '}' { $$ = NULL; }
 assgn_expr ';' { $$ = $1; }
;
/*
* assgn_expr
*
* Assignment expression. The entire expression itself takes the value of the
* newlymodified variable, thus chaining is possible:
*
* a = b = c = 3 * 4;
*
* Note that the variable node is created first, and then the operator node
* is created and overwrites $$.
*/
assgn_expr: VARIABLE '=' assgn_expr
{
$$ = MakeDataNode('v', $1);
$$ = MakeNode(2, '=', $$, $3);
}
 VARIABLE '=' expr
{
$$ = MakeDataNode('v', $1);
$$ = MakeNode(2, '=', $$, $3);
}
;
/*
* expr
*
* The lowest precedence expression rules (+ and ) are handled here.
*/
expr: expr '+' factor { $$ = MakeNode(2, '+', $1, $3); }
 expr '' factor { $$ = MakeNode(2, '', $1, $3); }
 factor { $$ = $1; }
;
/*
* factor
*
* Multiplication and division are higher precedence than +/ (expr.)
*/
factor: factor '*' element { $$ = MakeNode(2, '*', $1, $3); }
 factor '/' element { $$ = MakeNode(2, '/', $1, $3); }
 element { $$ = $1; }
;
/*
* element
*
* A basic element  a number, variable, or parenthetical expression.
*/
element: '(' expr ')' { $$ = $2; }
 '(' assgn_expr ')' { $$ = $2; }
 NUMBER { $$ = MakeDataNode('n', $1); }
 VARIABLE { $$ = MakeDataNode('v', $1); }
;
%%
/*****************************************************************************
DAG Functions
Functions for making DAG nodes and attaching children to them.
Nodes other than 'n' and 'v' have data.num set to 1 to indicate that no
scratch register has been assigned to the node by the code generator yet
(see the code generator code and the DAG structure definition for more info.)
*****************************************************************************/
/*
* MakeArbitraryNode():
*
* Creates a node which can have an arbitrary amount of children.
*
* Attaches all of the DAGs in the CHILD_LIST as children to the node (first
* argument.) This is done by counting the number of DAGs in the list,
* allocating that many child pointers, and then taking each element in the
* list and putting it in the newlyallocated child[] array.
*
* Next, the result is checked against all of the other DAG nodes to see if
* there is already one like this, and if so, this one is discarded and the
* existant one is returned.
*
* NOTE: When searching for a matching node in the global node list, the
* "data" member is not checked because it is assumed that it is unused in
* this sort of node.
*/
static DAG *MakeArbitraryNode(int op, CHILD_LIST *cl)
{
CHILD_LIST *l;
DAG *node, *n;
int num_children, i, all_equal;
/*
* Count number of children by iterating through list
*/
for (l = cl, num_children = 0; l != NULL; l = l>next, num_children++)
;
/*
* Make the node
*/
if ((node = calloc(1, sizeof(DAG))) == NULL)
return NULL;
node>op = op;
node>data.num = 1;
/*
* Allocate space for their pointers and then copy them in
*/
node>num_children = num_children;
node>child = malloc(sizeof(DAG *) * num_children);
for (i = 0, l = cl; i < num_children; i++, l = l>next)
node>child[i] = l>node;
/*
* Try to see if a matching node already exists
*/
for (n = node_list; n != NULL; n = n>next)
{
if (n>op == node>op && n>num_children == node>num_children)
{
all_equal = 1; // will be cleared if not all children equal
for (i = 0; i < n>num_children; i++)
{
if (n>child[i] != node>child[i])
{
all_equal = 0;
break;
}
}
if (all_equal)
{
free(node); // destroy current node
return n;
}
}
}
/*
* No match found, insert this node in the global list and return it
*/
node>next = node_list;
node_list = node;
return node;
}
/*
* MakeNode():
*
* Makes a DAG node with the specified number of children (0 is acceptable.)
* The number of children determines how big child[] will be. If the number
* of children is 1 or 2, they are passed as parameters a and b.
*
* NOTE: When searching for a matching node in the global node list, the
* "data" member is not checked because it is assumed that it is unused in
* this sort of node.
*
* MakeNode() is not intended for number ('n') and variable ('v') nodes, use
* MakeDataNode() for that purpose.
*/
static DAG *MakeNode(int num_children, int op, DAG *a, DAG *b)
{
DAG *node;
/*
* Try to see if a matching node exists
*/
for (node = node_list; node != NULL; node = node>next)
{
if (node>op != op) // can't be the same node!
continue;
if (node>num_children != num_children)
continue;
/*
* Ugly and not optimal... I know, but it's easy to understand :)
*/
if (num_children == 0)
return node; // must be a match!
else if (num_children == 1)
{
if (node>child[0] == a)
return node;
}
else if (num_children == 2)
{
if (node>child[0] == a && node>child[1] == b)
return node;
}
}
/*
* Otherwise, allocate a new one
*/
if ((node = calloc(1, sizeof(DAG))) == NULL)
return NULL;
node>next = node_list;
node_list = node;
node>op = op;
node>data.num = 1;
node>num_children = num_children;
if (num_children)
{
node>child = malloc(sizeof(DAG *) * num_children);
node>child[0] = a;
if (num_children == 2)
node>child[1] = b;
}
return node;
}
/*
* MakeDataNode():
*
* Virtually the same as the above MakeNode() function, except it is intended
* for use with 'n' and 'v' nodes only.
*/
static DAG *MakeDataNode(int op, int data)
{
DAG *node;
for (node = node_list; node != NULL; node = node>next)
{
/*
* Only check op and fields... Children are irrelevant for 'n' and 'v'
*/
if (node>op == op)
{
if (op == 'n')
{
if (node>data.num == data)
return node;
}
else // 'v'
{
if (node>data.var == (char) data)
return node;
}
}
}
if ((node = calloc(1, sizeof(DAG))) == NULL)
return NULL;
node>next = node_list;
node_list = node;
node>op = op;
if (op == 'n')
node>data.num = data;
else if (op == 'v')
node>data.var = data;
else
node>data.num = 1; // to indicate this isn't being used (codegen)
node>num_children = 0;
return node;
}
/*****************************************************************************
Code Generator
Code is output in a simplistic quadruple format (op, dest, src, src) with a
scratch variable assigned for each operation. This means that a lot of
scratch variables are generated that could be eliminated, but that would be
fairly trivial as a poststep.
CSE (common subexpression elimination) is performed during code generation.
It's really quite simple and most of the work was done in constructing the
DAGs themselves. When a scratch variable is found to be assigned to a given
node (which happens once the code generator touches it the first time), it is
reused.
If a subexpression is no longer valid (when another statement changes the
contents of a variable used as an operand, for instance), the scratch
variable is disassociated with the node (data.num is set to 1.)
The code generator assumes (especially in InvalidateDependencies()) that
all nodes other than 'n', 'v', and '=' can only be represented by a temporary
variable (whose number is in data.num), Tx.
*****************************************************************************/
static int current_scratch = 0;
/*
* NewScratch():
*
* Returns a new scratch variable.
*/
static int NewScratch()
{
return current_scratch++;
}
/*
* PrintOperand():
*
* Prints the current node as an operand. It assumes that it has already been
* consumed by GenerateCode(). The rules for operand printing are:
*
*  If op == 'n', print data.num.
*  If op == 'v', print data.var.
*  If op == '=', print data.var.
*  Otherwise, data.num contains the number of a scratch variable (Tx)
* that must be printed.
*/
static void PrintOperand(DAG *node)
{
switch (node>op)
{
case 'n':
printf("%d", node>data.num);
break;
case 'v':
case '=':
putchar(node>data.var);
break;
default:
printf("T%d", node>data.num);
break;
}
}
/*
* InvalidateDependencies():
*
* Invalidates all dependencies on the variable which holds this node. Scans
* the global node list to find nodes that have a child which is the current
* node. Recursively eliminates dependencies on dependencies, etc.
*/
static void InvalidateDependencies(DAG *node)
{
DAG *n;
int i;
for (n = node_list; n != NULL; n = n>next)
{
for (i = 0; i < n>num_children; i++)
{
if (n>child[i] == node)
{
InvalidateDependencies(n);
/*
* '=' and 'v' both use data.var to store an actual (az,AZ)
* userdefined variable which must never be invalidated.
*/
if (n>op != '=' && n>op != 'v') // these 2 will always have an actual (az,AZ) variable
n>data.num = 1;
break;
}
}
}
}
/*
* GenerateCode():
*
* Generates code from a DAG and prints it to stdout.
*/
static void GenerateCode(DAG *node)
{
int i;
if (node == NULL)
return;
switch (node>op)
{
case 's':
/*
* List of statements
*/
for (i = 0; i < node>num_children; i++)
GenerateCode(node>child[i]);
break;
case '=':
/*
* Assignment
*/
GenerateCode(node>child[1]); // child[0] guaranteed to be a var
node>data.var = node>child[0]>data.var; // this node becomes
// equivalent to the var
printf("%c = ", node>child[0]>data.var);
PrintOperand(node>child[1]);
printf("\n");
InvalidateDependencies(node>child[0]);
break;
case '+':
case '':
case '*':
case '/':
/*
* Binary arithmetic operators
*/
if (node>data.num != 1) // scratch register already assigned?
break; // do nothing (parent node will treat this
// as an operand by reusing the scratch
// register)
GenerateCode(node>child[0]);
GenerateCode(node>child[1]);
node>data.num = NewScratch();
printf("T%d = ", node>data.num);
PrintOperand(node>child[0]);
printf(" %c ", node>op);
PrintOperand(node>child[1]);
printf("\n");
break;
case 'n':
case 'v':
break; // do nothing
}
}
/*****************************************************************************
Lexical Analyzer
A simple lexical analyzer which recognizes tokens from file input.
*****************************************************************************/
static FILE *fp;
static int yylex()
{
int i, num;
if (feof(fp))
return 0;
i = fgetc(fp);
while (i == ' '  i == '\t'  i == '\r'  i == '\n')
i = fgetc(fp);
if (isdigit(i))
{
num = i  '0';
while (1)
{
i = fgetc(fp);
if (isdigit(i))
{
num *= 10;
num += (i  '0');
}
else
{
ungetc(i, fp);
yylval.num = num;
return NUMBER;
}
}
}
else if (isalpha(i))
{
yylval.var = i;
return VARIABLE;
}
return i;
}
/*****************************************************************************
Main
main(), yyerror(), and friends.
*****************************************************************************/
static void PrintDAG(DAG *node, int indent)
{
int i;
if (node == NULL)
return;
for (i = 0; i < indent * 2; i++)
printf(" ");
if (node>op == 'n')
printf("%d\n", node>data.num);
else if (node>op == 'v')
printf("%c\n", node>data.var);
else
{
printf("%c\n", node>op);
for (i = 0; i < node>num_children; i++)
PrintDAG(node>child[i], indent + 1);
}
}
static void yyerror(const char *msg)
{
fprintf(stderr, "%s\n", msg);
}
int main(int argc, char **argv)
{
if (argc <= 1)
{
puts("usage: cse ");
exit(0);
}
else
{
if ((fp = fopen(argv[1], "r")) == NULL)
{
fprintf(stderr, "error: unable to open %s\n", argv[1]);
exit(1);
}
}
node_list = NULL;
if (yyparse())
exit(0);
puts("DAG:\n\n");
PrintDAG(dag, 0);
puts("\nCode:\n\n");
GenerateCode(dag);
return 0;
}