
CSCI468: Compilers Portfolio

Spring 2016

Drew Antonich & Anthony Schwartz


Section 1:

# ------------------------------------------------------------
# Converter.py
# Authors: Drew Antonich & Anthony Schwartz
#
# Converts IR code to Tiny assembly code.
#
# We implemented Lab 4 first to see if we were on the right
# track, then continued from the Lab 4 code to fully implement
# this step for the project.
# ------------------------------------------------------------

import sys
import Stack

"""
Main function for Converter.py

Parameter all_variables: List of all variables declared within Tiny


source file
Parameter IRNode_list: List of all IRNodes that were generated during
parsing

"""
def main(all_variables, IRNode_list):

#TO-DO: Handle all_variables


for var in all_variables:
if(var.type == "INT" or var.type == "FLOAT"):
print("var " + var.name)
elif(var.type == "STRING"):
print("str " + var.name + " " + var.value)

little_node_list = conversion(IRNode_list)
for node in little_node_list:
print(node.op_code + " " + node.op1 + " " + node.op2)

def conversion(ir_node_list):
    node_list = []
    # Map IR opcodes to their Tiny equivalents.
    Labels = {"STOREI": "move", "STOREF": "move", "ADDI": "addi",
              "ADDF": "addr", "SUBI": "subi", "SUBF": "subr",
              "MULTI": "muli", "MULTF": "mulr", "DIVI": "divi",
              "DIVF": "divr", "GT": "jgt", "GE": "jge", "LT": "jlt",
              "LE": "jle", "NE": "jne", "EQ": "jeq", "JUMP": "jmp",
              "LABEL": "label", "READI": "sys readi", "READF": "sys readr",
              "WRITEI": "sys writei", "WRITEF": "sys writer",
              "WRITES": "sys writes", "RET": "sys halt"}
    var_stack = Stack.Stack()
    for ir_node in ir_node_list:
        if ir_node.opcode in ["STOREI", "STOREF"]:
            new_op1, new_op2, _ = new_op(ir_node.op1, ir_node.result)
            node_list.append(LittleNode("move", new_op1, new_op2))
        elif ir_node.opcode == "LABEL":
            node_list.append(LittleNode("label", ir_node.result))
        elif ir_node.opcode == "JUMP":
            node_list.append(LittleNode("jmp", ir_node.result))
        elif ir_node.opcode in ["GT", "GE", "LT", "LE", "NE", "EQ"]:
            new_op1, new_op2, new_result = new_op(ir_node.op1, ir_node.op2,
                                                  ir_node.result)
            # Compare, then take the conditional jump on failure.
            node_list.append(LittleNode("cmpi", new_op1, new_op2))
            node_list.append(LittleNode(Labels[ir_node.opcode], new_result))
        elif ir_node.opcode in ["READI", "READF"]:
            node_list.append(LittleNode(Labels[ir_node.opcode],
                                        ir_node.result))
        elif ir_node.opcode in ["ADDI", "ADDF", "SUBI", "SUBF", "DIVI",
                                "DIVF", "MULTI", "MULTF"]:
            new_op1, new_op2, new_result = new_op(ir_node.op1, ir_node.op2,
                                                  ir_node.result)
            # Tiny arithmetic is two-address: move op1 into the result
            # register, then apply the operator with op2.
            node_list.append(LittleNode("move", new_op1, new_result))
            node_list.append(LittleNode(Labels[ir_node.opcode], new_op2,
                                        new_result))
        elif ir_node.opcode in ["WRITEI", "WRITEF", "WRITES", "RET"]:
            node_list.append(LittleNode(Labels[ir_node.opcode],
                                        ir_node.result))
        else:
            print("ERROR - Unhandled opcode:", ir_node.opcode)

    # Put any variable declarations at the beginning of node_list
    # in the correct order.
    for _ in range(len(var_stack.items)):
        node_list.insert(0, var_stack.pop())

    return node_list

def new_op(op1, op2="", op3=""):
    # Replace the "$T" prefix on IR temporaries with "r" so they
    # become Tiny register names (e.g. $T1 -> r0).
    new_op1, new_op2, new_op3 = op1, op2, op3
    if "$T" in op1:
        new_op1 = "r" + str(int(op1[2:]) - 1)
    if "$T" in op2:
        new_op2 = "r" + str(int(op2[2:]) - 1)
    if "$T" in op3:
        new_op3 = "r" + str(int(op3[2:]) - 1)
    return new_op1, new_op2, new_op3

class Symbol:
    def __init__(self, name, value, type):
        self.name = name
        self.value = value
        self.type = type

class SymbolTable(object):
    # Each symbol table holds the symbols declared in its scope.
    def __init__(self, name):
        self.name = name
        self.symbols = Stack.Stack()

    def add_symbol(self, symbol):
        output = "name " + symbol.name + " type " + str(symbol.type)
        if symbol.value is not None:
            output = output + " value " + str(symbol.value)

        duplicate_exists = self.check_duplicate(symbol)
        if duplicate_exists:
            quit()
        else:
            print(output)
            self.symbols.push(symbol)

    def check_duplicate(self, symbol):
        for sym in self.symbols.items:
            if symbol.name == sym.name:
                print("DECLARATION ERROR", symbol.name)
                return True
        return False

class LittleNode:
    def __init__(self, op_code, op1="", op2=""):
        self.op_code = op_code
        self.op1 = op1
        self.op2 = op2

# ------------------------------------------------------------
# Driver.py
# Authors: Drew Antonich & Anthony Schwartz
#
# Driver that converts LITTLE source code to Tiny assembly code.
#
# To run the program, supply the file name of the file to be
# parsed as an argument.
# ------------------------------------------------------------

import sys
import scanner_parser
import Converter

def main(filename):
    file_contents = open_file(filename)
    result = scanner_parser.main(file_contents)
    all_variables = result[0]
    IRNode_list = result[1]
    Converter.main(all_variables, IRNode_list)

def open_file(filename):
    with open(filename, 'r') as content_file:
        content = content_file.read()
    return content

"""
Get name of file to parse from command line
"""
if len(sys.argv) >= 2: # If file name was passed as an argument
main(sys.argv[1])
else:
print("Please supply a filename to parse")

# ------------------------------------------------------------
# scanner_parser.py
# Authors: Drew Antonich & Anthony Schwartz
#
# tokenizer & parser for the LITTLE grammar
#
# To run the program, supply the file name of the file to be
# parsed as an argument.
# ------------------------------------------------------------
import ply.lex as lex
import ply.yacc as yacc
import sys
import operator

######################
# GLOBAL VARS
######################

filename = None              # File name to be used
block_counter = 1            # Counter to keep track of current block
position_counter = 0         # Tracks position of symbol table in print out
symbol_table_stack = []      # Stack used during parsing
post_symbol_table_stack = [] # Tables popped from symbol_table_stack are
                             # placed here for printing
temp_counter = 1             # Counter holding current temporary variable count
label_counter = 1            # Counter holding current label count
irnode_list = []             # IRNodes generated during parsing

def main(file_data):
    global symbol_table_stack
    global irnode_list
    lex.lex()
    yacc.yacc()
    yacc.parse(file_data)
    all_variables = []
    current_symbol_table = post_symbol_table_stack[-1]
    for var in current_symbol_table.members:
        all_variables.append(var)
    irnode_list = post_process_irnodes(irnode_list)
    for node in irnode_list:
        node.print_info()
    # Return a list of declared variables and the IR code generated.
    return (all_variables, irnode_list)

def print_sym_table(sym_table_stack):
    sym_table_stack.sort(key=operator.attrgetter('position'))  # Sort by position
    for i in range(len(post_symbol_table_stack)):  # For all symbol tables
        print("Symbol table " + post_symbol_table_stack[i].name)
        for variable in post_symbol_table_stack[i].members:
            if(variable.value is not None):
                print("name " + variable.name + " type STRING value " +
                      variable.value)
            else:
                print("name " + variable.name + " type " + variable.type)

        if(i != len(post_symbol_table_stack) - 1):
            print("")

def post_process_irnodes(irnode_list):
    # Convert every field of every IRNode to a string so the nodes can
    # be printed and converted uniformly.
    for irnode in irnode_list:
        # Checking opcode
        if(isinstance(irnode.opcode, (int, float))):
            irnode.opcode = str(irnode.opcode)
        elif(irnode.opcode is None):
            irnode.opcode = ""

        # Checking op1
        if(isinstance(irnode.op1, (int, float))):
            irnode.op1 = str(irnode.op1)
        elif(irnode.op1 is None):
            irnode.op1 = ""

        # Checking op2
        if(isinstance(irnode.op2, (int, float))):
            irnode.op2 = str(irnode.op2)
        elif(irnode.op2 is None):
            irnode.op2 = ""

        # Checking result
        if(isinstance(irnode.result, (int, float))):
            irnode.result = str(irnode.result)
        elif(irnode.result is None):
            irnode.result = ""

    # Every program ends with a RET node (sys halt in Tiny).
    return_node = IRNode("RET", "", "", "")
    irnode_list.append(return_node)
    return irnode_list

##########
# CLASSES
##########

class SymbolTable:
    def __init__(self, members=None):
        self.name = None
        self.position = None
        if members is None:
            self.members = []
        else:
            self.members = members

class Variable:
    def __init__(self, name, type):
        self.name = name
        self.type = type
        self.value = None

class IRNode:
    def __init__(self, opcode, op1, op2, result):
        self.opcode = opcode
        self.op1 = op1
        self.op2 = op2
        self.result = result

    def print_info(self):
        print('{}{} {} {} {}'.format(";", self.opcode, self.op1, self.op2,
                                     self.result))

class if_stmt:
    out_label = None
    next_else_label = None

class while_stmt:
    top_label = None
    out_label = None

########################
# SYMBOL TABLE METHODS
########################

"""
Add argument Variable to current symbol table
"""
def add_to_curr_table(Variable):
curr_table = symbol_table_stack[-1] # Get last symbol table
for member in curr_table.members:
if(member.name == Variable.name): # If Variable has already been
defined in scope
print("DECLARATION ERROR " + str(Variable.name))
sys.exit()
curr_table.members.append(Variable) # Add Variable passed in to the
current table
"""
Finds a variable name in the current symbol table

Used to get the type of a variable once the assignment has been done
"""
def find_in_curr_table(var_name):
curr_table = symbol_table_stack[-1] # Get last symbol table
for member in curr_table.members:
if(member.name == var_name): # Found a match (which is expected)
return member
return None

"""
Pop the current symbol table; rename it; push onto stack for printing
later
"""
def pop_curr_table(name):
curr_table = symbol_table_stack.pop()
if(name is not None):
curr_table.name = name
post_symbol_table_stack.append(curr_table)

"""
Add all vars declared in a list or by themselves
"""
def process_id_list(type, id_list):
split_list = id_list.split(',')
for var in split_list:
var_entry = Variable(var, type)
add_to_curr_table(var_entry)

"""
Add a string to current symbol table
"""
def process_string(id, value):
string_variable = Variable(id, "STRING")
string_variable.value = value
add_to_curr_table(string_variable)

##########################
# CODE GENERATION METHODS
##########################

def get_curr_temp():
global temp_counter
result = "$T" + str(temp_counter)
temp_counter += 1
return result

def get_new_label():
global label_counter
result = "LABEL" + str(label_counter)
label_counter += 1
return result

def generate_var_code(Variable):
curr_temp = get_curr_temp()
if(Variable.type == "INT"):
irnode1 = IRNode("STOREI", Variable.value, None, curr_temp)
add_node_to_list(irnode1)
irnode2 = IRNode("STOREI", curr_temp, None, Variable.name)
add_node_to_list(irnode2)
elif(Variable.type == "FLOAT"):
irnode1 = IRNode("STOREF", Variable.value, None, curr_temp)
add_node_to_list(irnode1)
irnode2 = IRNode("STOREF", curr_temp, None, Variable.name)
add_node_to_list(irnode2)

def generate_string_code(Variable):
print("generate_string_code Not yet implemented")

"""
Given a Variable object, generate the appropriate read IR code
"""
def generate_read_code(Variable):
if(Variable.type == "INT"):
irnode = IRNode("READI", None, None, Variable.name)
add_node_to_list(irnode)
elif(Variable.type == "FLOAT"):
irnode = IRNode("READF", None, None, Variable.name)
add_node_to_list(irnode)
    elif(Variable.type == "STRING"):
        irnode = IRNode("READS", None, None, Variable.name)
        add_node_to_list(irnode)

"""
Given a Variable object, generate the appropriate write IR code
"""
def generate_write_code(Variable):
if(Variable.type == "INT"):
irnode = IRNode("WRITEI", None, None, Variable.name)
add_node_to_list(irnode)
elif(Variable.type == "FLOAT"):
irnode = IRNode("WRITEF", None, None, Variable.name)
add_node_to_list(irnode)
elif(Variable.type == "STRING"):
irnode = IRNode("WRITES", None, None, Variable.name)
add_node_to_list(irnode)

"""
Because we need to jump to a certain label if a boolean condition
fails,
this method is used to generate the logical opposite of what the user
enters into their condition.

Once the logical opposite has been found, the appropriate IRNode is
generated.
"""
def generate_boolean_expr_code(first_expr, compop, second_expr,
first_expr_temp, second_expr_temp):
irnode = None
if(compop == "="):
if(second_expr_temp is not None):
irnode = IRNode("NE", first_expr, second_expr_temp, None)
else:
irnode = IRNode("NE", first_expr, second_expr, None)
elif(compop == "!="):
if(second_expr_temp is not None):
irnode = IRNode("EQ", first_expr, second_expr_temp, None)
else:
irnode = IRNode("EQ", first_expr, second_expr, None)
elif(compop == ">"):
if(second_expr_temp is not None):
irnode = IRNode("LE", first_expr, second_expr_temp, None)
else:
irnode = IRNode("LE", first_expr, second_expr, None)
elif(compop == "<"):
if(second_expr_temp is not None):
irnode = IRNode("GE", first_expr, second_expr_temp, None)
else:
irnode = IRNode("GE", first_expr, second_expr, None)
elif(compop == ">="):
if(second_expr_temp is not None):
irnode = IRNode("LT", first_expr, second_expr_temp, None)
else:
irnode = IRNode("LT", first_expr, second_expr, None)
elif(compop == "<="):
if(second_expr_temp is not None):
irnode = IRNode("GT", first_expr, second_expr_temp, None)
else:
irnode = IRNode("GT", first_expr, second_expr, None)
return irnode

def add_node_to_list(irnode):
    global irnode_list
    irnode_list.append(irnode)

######################
# TOKENIZER
######################

# List of token names
tokens = [
'IDENTIFIER',
'INTLITERAL',
'FLOATLITERAL',
'STRINGLITERAL',
'COMMENT',
'ASSIGNMENT',
'PLUS',
'MINUS',
'MULTIPLY',
'DIVIDE',
'EQUAL',
'NOTEQUAL',
'LESSTHAN',
'GREATERTHAN',
'LPAREN',
'RPAREN',
'SEMICOLON',
'COMMA',
'LESSTHANEQUAL',
'GREATERTHANEQUAL'
]

# List of LITTLE keywords
keywords = [
'PROGRAM',
'BEGIN',
'END',
'FUNCTION',
'READ',
'WRITE',
'IF',
'ELSE',
'ENDIF',
'WHILE',
'ENDWHILE',
'CONTINUE',
'BREAK',
'RETURN',
'INT',
'VOID',
'STRING',
'FLOAT'
]

tokens += keywords

# Regular expression rules for simple tokens

t_STRINGLITERAL = r'\"[^"\n]+\"'
t_ASSIGNMENT = r'\:\='
t_PLUS = r'\+'
t_MINUS = r'\-'
t_MULTIPLY = r'\*'
t_DIVIDE = r'\/'
t_EQUAL = r'\='
t_NOTEQUAL = r'\!\='
t_LESSTHAN = r'\<'
t_GREATERTHAN = r'\>'
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_SEMICOLON = r'\;'
t_COMMA = r'\,'
t_LESSTHANEQUAL = r'\<\='
t_GREATERTHANEQUAL = r'\>\='

def t_FLOATLITERAL(t):
    r'\d*\.\d+'  # Match an optional whole part, then a decimal point and digits
    t.value = float(t.value)
    return t

def t_INTLITERAL(t):
r'\d+'
t.value = int(t.value)
return t

def t_IDENTIFIER(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    if t.value in keywords:
        t.type = str(t.value)  # Keywords are their own token type
    else:
        t.type = 'IDENTIFIER'
    return t

def t_COMMENT(t):
    r'\-\-.*'
    pass  # Discard the token

# Ignore spaces and tabs
t_ignore = ' \t'

# Error handling rule
def t_error(t):
    t.lexer.skip(1)

# The lexer itself is built in main() via lex.lex()

########################
# PARSER RULES
########################

def p_program_start(p):
'''program_start : PROGRAM add_symbol_table id BEGIN pgm_body END'''
pop_curr_table("GLOBAL")

def p_id(p):
'''id : IDENTIFIER'''
p[0] = p[1]

def p_pgm_body(p):
'''pgm_body : decl func_declarations'''

def p_decl(p):
'''decl : string_decl decl
| var_decl decl
| empty'''

def p_string_decl(p):
'''string_decl : STRING id ASSIGNMENT str SEMICOLON'''
process_string(p[2], p[4])

def p_str(p):
'''str : STRINGLITERAL'''
p[0] = p[1]

def p_var_decl(p):
'''var_decl : var_type id_list SEMICOLON'''
p[0] = p[1] + "" + p[2]
process_id_list(p[1], p[2])

def p_var_type(p):
'''var_type : FLOAT
| INT'''
p[0] = p[1]

def p_any_type(p):
'''any_type : var_type
| VOID'''
p[0] = p[1]

def p_id_list(p):
'''id_list : id id_tail'''
if(p[2] is None):
p[0] = p[1]
else:
p[0] = p[1] + "" + p[2]

def p_id_tail(p):
'''id_tail : COMMA id id_tail
| empty'''
if(len(p) != 2): # Processing a regular id_tail
if(p[3] is not None):
p[0] = p[1] + "" + p[2] + "" + p[3]
else:
p[0] = p[1] + "" + p[2]
else: # Processing an empty id_tail
p[0] = None

def p_param_decl_list(p):
'''param_decl_list : param_decl param_decl_tail
| empty'''

def p_param_decl(p):
'''param_decl : var_type id'''
process_id_list(p[1], p[2])

def p_paramdecl_tail(p):
'''param_decl_tail : COMMA param_decl param_decl_tail
| empty'''

def p_func_declarations(p):
'''func_declarations : func_decl func_declarations
| empty'''

def p_func_decl(p):
'''func_decl : FUNCTION any_type id LPAREN param_decl_list RPAREN
BEGIN func_body END'''

def p_func_body(p):
'''func_body : decl stmt_list'''

def p_stmt_list(p):
'''stmt_list : stmt stmt_list
| empty'''
def p_stmt(p):
'''stmt : base_stmt
| if_stmt
| while_stmt'''

def p_base_stmt(p):
'''base_stmt : assign_stmt
| read_stmt
| write_stmt
| return_stmt'''

def p_assign_stmt(p):
'''assign_stmt : assign_expr SEMICOLON'''

def p_assign_expr(p):
    '''assign_expr : id ASSIGNMENT expr'''
    if(p[3] is not None):
        # Check whether the id being assigned is in the symbol table
        variable = find_in_curr_table(p[1])
        if(variable is not None):  # Found a match
            variable.value = p[3]
            generate_var_code(variable)

def p_read_stmt(p):
'''read_stmt : READ LPAREN id_list RPAREN SEMICOLON'''
split_list = p[3].split(',')
for id in split_list:
variable = find_in_curr_table(id)
if(variable is not None): # If var is found
generate_read_code(variable)

def p_write_stmt(p):
'''write_stmt : WRITE LPAREN id_list RPAREN SEMICOLON'''
split_list = p[3].split(',')
for id in split_list:
variable = find_in_curr_table(id)
if(variable is not None): # If var is found
generate_write_code(variable)

def p_return_stmt(p):
'''return_stmt : RETURN expr SEMICOLON'''

def p_expr(p):
'''expr : expr_prefix factor'''
if(p[1] is None): # if expr_prefix goes to empty
p[0] = p[2] # expr = factor

def p_expr_prefix(p):
'''expr_prefix : expr_prefix factor addop
| empty'''

def p_factor(p):
'factor : factor_prefix postfix_expr'
if(p[1] is None): # if factor_prefix goes to empty
p[0] = p[2] # factor = postfix_expr

def p_factor_prefix(p):
'''factor_prefix : factor_prefix postfix_expr mulop
| empty'''

def p_postfix_expr(p):
'''postfix_expr : primary
| call_expr'''
p[0] = p[1] # postfix_expr = primary | call_expr

def p_call_expr(p):
'''call_expr : id LPAREN expr_list RPAREN'''

def p_expr_list(p):
'''expr_list : expr expr_list_tail
| empty'''

def p_expr_list_tail(p):
'''expr_list_tail : COMMA expr expr_list_tail
| empty'''

def p_primary(p):
'''primary : LPAREN expr RPAREN
| id
| INTLITERAL
| FLOATLITERAL'''
if(len(p) == 2): # assigning id, INTLITERAL OR FLOATLITERAL
if(p[1] is not None):
p[0] = p[1]

def p_addop(p):
'''addop : PLUS
| MINUS'''

def p_mulop(p):
'''mulop : MULTIPLY
| DIVIDE'''

def p_if_stmt(p):
'''if_stmt : IF start_if LPAREN cond RPAREN if_test decl stmt_list
else_part ENDIF gen_out_label'''

"""
Semantic action for starting an if statement
and generating the appropriate label.
"""
def p_start_if(p):
'''start_if : empty'''
if_stmt_object = if_stmt()
if_stmt_object.next_else_label = get_new_label()
p[0] = if_stmt_object

"""
With the returned results from the cond rule, generate the appropriate
IRNode that incorporates the condition and where to jump to if the
if-check fails.
"""
def p_if_test(p):
'''if_test : empty'''
if_stmt_object = p[-4] # Get original if_stmt_object
cond_statement = p[-2] # Get the if statement conditional part

if_stmt_object.out_label = get_new_label()
first_expr = cond_statement[0]
compopr = cond_statement[1] # Comparison operator
second_expr = cond_statement[2]
first_expr_temp = cond_statement[3] # Temp storing first expr
second_expr_temp = cond_statement[4] # Temp storing second expr

irnode = generate_boolean_expr_code(first_expr, compopr, second_expr,


first_expr_temp, second_expr_temp)

irnode.result = if_stmt_object.next_else_label
add_node_to_list(irnode)

p[0] = if_stmt_object

def p_gen_out_label(p):
    '''gen_out_label : empty'''
    if_stmt_object = p[-5]  # Get if statement object from previous rule
    irnode = IRNode("LABEL", None, None, if_stmt_object.out_label)
    add_node_to_list(irnode)

def p_else_part(p):
    '''else_part : ELSE gen_jump gen_else_label decl stmt_list
                 | empty gen_else_label'''

"""
Generates a JUMP to the out label of the enclosing if statement
"""
def p_gen_jump(p):
    '''gen_jump : empty'''
    if_stmt_object = p[-4]
    irnode = IRNode("JUMP", None, None, if_stmt_object.out_label)
    add_node_to_list(irnode)
    p[0] = "jump"

"""
Generates the else label to an if statement
"""
def p_gen_else_label(p):
'''gen_else_label : empty'''
if(p[-1] == "jump"): # Previous instruction generated a jump
if_stmt_object = p[-5]
irnode = IRNode("LABEL", None, None,
if_stmt_object.next_else_label)
add_node_to_list(irnode)
else:
if_stmt_object = p[-4]
irnode = IRNode("LABEL", None, None,
if_stmt_object.next_else_label)
add_node_to_list(irnode)
def p_cond(p):
    '''cond : expr compop expr'''
    # Get previous expressions
    first_expr = find_in_curr_table(p[1])
    second_expr = find_in_curr_table(p[3])

    if(first_expr is None):  # Couldn't find first expr in symbol table
        pass
    elif(second_expr is None):
        # Second operand is a literal: store it in a temp so it can be
        # compared against the first operand.
        curr_temp = get_curr_temp()
        # Tuple of the form:
        # (first_expr, compop, second_expr, first_expr_temp, second_expr_temp)
        p[0] = (p[1], p[2], p[3], None, curr_temp)
        if(isinstance(p[3], int)):  # If second expression is an int
            irnode = IRNode("STOREI", p[3], None, curr_temp)
            add_node_to_list(irnode)
        elif(isinstance(p[3], float)):  # If second expression is a float
            irnode = IRNode("STOREF", p[3], None, curr_temp)
            add_node_to_list(irnode)
    elif(second_expr is not None):
        p[0] = (p[1], p[2], p[3], None, None)

def p_compop(p):
'''compop : LESSTHAN
| GREATERTHAN
| EQUAL
| NOTEQUAL
| LESSTHANEQUAL
| GREATERTHANEQUAL'''
p[0] = p[1]

def p_while_stmt(p):
'''while_stmt : WHILE start_while LPAREN cond RPAREN while_test decl
stmt_list ENDWHILE finish_while'''

def p_start_while(p):
'''start_while : empty'''
label = get_new_label()
irnode = IRNode("LABEL", None, None, label)
add_node_to_list(irnode)
while_stmt_object = while_stmt()
while_stmt_object.top_label = label
p[0] = while_stmt_object

def p_while_test(p):
    '''while_test : empty'''
    while_stmt_object = p[-4]  # Get original while_stmt_object
    cond_statement = p[-2]     # Get the while statement's conditional part
    while_stmt_object.out_label = get_new_label()

    first_expr = cond_statement[0]
    compopr = cond_statement[1]           # Comparison operator
    second_expr = cond_statement[2]
    first_expr_temp = cond_statement[3]   # Temp storing first expr
    second_expr_temp = cond_statement[4]  # Temp storing second expr

    irnode = generate_boolean_expr_code(first_expr, compopr, second_expr,
                                        first_expr_temp, second_expr_temp)
    irnode.result = while_stmt_object.out_label
    add_node_to_list(irnode)

    p[0] = while_stmt_object

def p_finish_while(p):
'''finish_while : empty'''
while_stmt_object = p[-4]
irnode1 = IRNode("JUMP", None, None, while_stmt_object.top_label)
add_node_to_list(irnode1)
irnode2 = IRNode("LABEL", None, None, while_stmt_object.out_label)
add_node_to_list(irnode2)

def p_add_symbol_table(p):
    '''add_symbol_table : empty'''
    global position_counter
    sym_table = SymbolTable()  # Create new symbol table
    sym_table.position = position_counter
    position_counter = position_counter + 1
    # Append the new table to the global table stack
    symbol_table_stack.append(sym_table)

def p_empty(p):
'empty :'
pass

notAccepted = False

# Error rule for syntax errors
def p_error(p):
    global notAccepted
    notAccepted = True

class Stack:
    def __init__(self):
        self.items = []
        self.debug = True

    def is_empty(self):
        return self.items == []

    def push(self, item):
        self.items.append(item)

    def pop(self):
        return self.items.pop()

    def peek(self):
        return self.items[len(self.items) - 1]

    def size(self):
        return len(self.items)

    def printStackOp(self, op, sym_id, size):
        # Debugging hook; output is currently disabled.
        if self.debug is True:
            pass

Section 2:
Project step 1 (Scanner): Team member 1 wrote the Python code for the scanner
portion of the compiler and did a small amount of research on scanners using the PLY
library for Python. Team member 2 did a significant amount of research on scanners in
PLY and collaborated with team member 1 in writing the code for step 1 of the project.

Project step 2 (Parser): Team member 1 wrote the Python code for the parser
portion of the compiler and did a small amount of research on parser implementations in
PLY. Team member 2 did the remainder of the research required for parser
implementations in PLY. Both team members collaborated on the code that was to be
written for the parser section of the compiler.

Project step 3 (Symbol Table): Team member 1 wrote the Python code that
implemented a symbol table for the compiler. Team member 2 researched not only
implementations of a symbol table in PLY but also other variations, in order to decide
what was best for the group’s project.

Project step 4 (Code Generation/Conversion): Team member 1 wrote the Python
code that created intermediate representation code from the original source code. Team
member 2 wrote the Python code that converted the intermediate representation code
into Tiny assembly code that would run on the provided Tiny simulator. Both team
members researched code generation and conversion, not only to add to the final report
but also to help generate ideas about what code should be written to best accomplish
step 4 of the project.

Section 3: While the design patterns used in the final project aren’t very complex in
nature, the majority of code design effort was focused on writing code that was
adaptable to the many different cases that need to be considered when writing a
compiler. That being said, one particular design pattern that was used in the final project
was the Singleton pattern: from the Driver.py file, only one instance each of the
scanner_parser and Converter modules is created. This design pattern was used
because it allowed not only improved code maintenance, by separating what could be
one large Python file into many small ones, but also readability and logical separation
of the codebase as a whole.

Figure 1 - Driver.py containing the Singleton design pattern

Section 4: See attached report at the end.

Section 5: See attached UML diagram at the end.

Section 6: A particular design decision that was made during the project creation
process was to exclude any Tiny assembly code optimizations. For example, while it
may have been possible to include optimizations that would result in a smaller final
output, we as a team believed that the optimizations were not worth the effort and
would have created a messy code base. What our team wanted as a final product was
an easy-to-understand implementation of a compiler, and adding extra features such as
code optimization would have gone against that original goal. By excluding code
optimization, the amount of time and resources needed to run our compiler was also
lessened.

Section 7: Writing a compiler requires developers to write code that covers a very large,
if not infinite, number of possible inputs. With this in mind, our group knew that we
needed to split the many different aspects of each step of our program into many
different functions that could perform different tasks depending on a particular input.
These functions were also written to be adaptable, so that more aspects of the compiler
could be added in the future with minimal resistance. While following this model resulted
in better code by the end, it wasn’t particularly easy to implement. The easiest and
possibly “dirtiest” method of implementation would have been to cover just the base
cases provided and ignore all other possibilities. That implementation would surely have
been easy to create, but in the end it would have resulted in poor maintainability and
difficult extension.
Creating a Compiler

2 February 2015

Drew Antonich, Author
Anthony Schwartz, Author

Abstract
The following paper contains information regarding the construction of a compiler. The
research and development of the project took the efforts of both authors and the length
of an academic semester. After reading the following paper the reader will understand
the methods taken to construct a compiler for the Little language.
Table of Contents

Introduction
Background
Methods and Discussion
    Scanner
    Parser
    Symbol Table
    Semantic Routines
    Full-Fledged Compiler
Conclusion and Future Work

List of Figures

Figure 1: Regular Expression rule example
Figure 2: Parser rule example
Figure 3: Symbol Stack example
Figure 4: P value storage
Figure 5: P length example
Figure 6: IR generation example
Figure 7: Converter example
Figure 8: UML Diagram
Figure 9: Register allocation
Introduction
This report examines the construction of a compiler in five parts: a scanner to take in a
string and tokenize it; a parser that converts the set of tokens into a parse tree;
semantic routines that interpret the semantics of the syntactic constructs; several
symbol tables that provide a list of every declaration in the program; and lastly a
demonstration of the complete compiler. Compilers provide the programmer with a
means of communicating with the computer. These components are discussed in
further detail later on. The motivation behind this project was to better understand how
the communication happens between programmer and computer. The research behind
this project comes from several sources, including class lectures, laboratory exercises,
and online documentation.

Background
A compiler is a special program that processes statements written in a programming
language and turns them into a machine-recognisable language. Its purpose is to easily
and efficiently create executable programs from source languages. The components of
a compiler include a scanner, a parser, semantic routines, an optimizer, and several
symbol tables, along with the final code generation.

The scanner or lexical analyzer’s purpose within a compiler is to read the input
characters and produce a sequence of tokens for the syntax analyzer. A second
function of the scanner is to skip white space and comments so that they do not trigger
unnecessary errors. One last task may be to keep track of line numbers in order to
produce helpful error messages.

The parser’s purpose within the compiler is to receive input from the scanner and then
break that input into parts so that it can be assigned certain attributes. The parser also
checks for missing input, to make sure all information has been gathered.

The semantic routines within a compiler help to identify the meaning of a language.
These routines help define which variables and expressions are of which type, which
helps the compiler distinguish, for example, what exactly an if statement is within a
language. This step also helps with the construction of the symbol table.

A symbol table is the part of the compiler that lists every declaration in the program,
along with other information. This can include such things as variable and function
declarations, along with return types and the types of accepted arguments.

The optimizer and code generator can be described together, as both of their functions
are to produce efficient assembly code from an intermediate representation. The
intermediate representation is the form the source syntax takes as it passes through
the compiler.

Methods and Discussion


Scanner

To create the scanner portion of the compiler the team relied heavily on the Python
Lex-Yacc documentation. Python Lex-Yacc, or PLY, is an implementation of the Lex
and Yacc parsing tools for the Python programming language. The main purpose of a
scanner is to tokenize an input string from a user. For example, given a user string such
as

x = 3 * (a + t)

the scanner will tokenize this into ‘x’, ‘=’, ‘3’, ‘*’, ‘(’, ‘a’, ‘+’, ‘t’, ‘)’. Each of these tokens
is given a name and broken into a pair of type and value such as (‘ID’, ‘x’),
(‘EQUALS’, ‘=’), (‘NUMBER’, ‘3’), (‘TIMES’, ‘*’), (‘LPAREN’, ‘(’), (‘ID’, ’a’), (‘PLUS’, ‘+’),
(‘ID’, ‘t’), (‘RPAREN’, ‘)’).
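
As a rough, self-contained sketch of this tokenization (the token names above come
from the PLY documentation's example rather than the team's scanner, and this
snippet assumes those simplified names):

import ply.lex as lex

tokens = ('ID', 'EQUALS', 'NUMBER', 'TIMES', 'PLUS', 'LPAREN', 'RPAREN')

# Regular expression rules, one per token name
t_EQUALS = r'='
t_TIMES = r'\*'
t_PLUS = r'\+'
t_LPAREN = r'\('
t_RPAREN = r'\)'
t_ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
t_ignore = ' \t'

def t_NUMBER(t):
    r'\d+'
    t.value = int(t.value)
    return t

def t_error(t):
    t.lexer.skip(1)

lexer = lex.lex()
lexer.input("x = 3 * (a + t)")
for tok in lexer:
    print(tok.type, tok.value)  # ID x, EQUALS =, NUMBER 3, ...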

These tokens are identified using regular expression rules that the team has written.
The method for writing the rules is typically a list of the token names followed by the
rules set by the creators of the scanner. A shortened example from the team’s scanner
can be seen in figure 1.
Figure 1: Tokenizer

The PLY documentation proved to be the most effective resource for implementing this
step of the compiler process. Other methods were considered before the decision to
use PLY. PLY was chosen because it provided the means for an easy-to-set-up
scanner that produced few if any errors along the way; in fact, the team encountered
zero difficulties while implementing the scanner portion of the compiler. Were the team
ever to collaborate on a larger, more robust compiler, it would definitely revisit the PLY
tool.

Parser
Returning to the PLY-Yacc tool, the team was able to create and implement the parser
section of the compiler. The parser is the step of the compiler where the team
constructed the rules that the compiler would follow. The format of these rules can be
seen in the following identifier example:

def p_id(p):
    '''id : IDENTIFIER'''

Yacc uses a parsing technique known as LR parsing, or shift-reduce parsing. LR
parsing uses a bottom-up technique that recognizes the right-hand side of a grammar
rule. In the previous example, when the IDENTIFIER symbol is recognized, the parser
knows to replace it with the grammar symbol on the left, in this case id. A sample
section of the team’s parser can be seen in figure 2.

Figure 2: Parser snippet
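
As a minimal sketch of how such a rule can also carry a semantic action (this one is
the var_type rule from Section 1), the action runs when the parser reduces the
right-hand side to the left-hand symbol:

def p_var_type(p):
    '''var_type : FLOAT
                | INT'''
    # On reduction, p[1] holds the matched token's value; assigning it
    # to p[0] passes the value up to whatever rule uses var_type.
    p[0] = p[1]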


Challenges the team faced while implementing step 2 of the compiler project included
covering all cases, correctly routing rules, and translating PLY documentation into an
applicable implementation. When writing the rules of the parser, the main goal was to
create any rule necessary to handle any user input. This became a challenge given the
sheer number of rules needed to cover all cases. Once the parser was believed to have
all the rules needed to handle most if not all user cases, routing those rules correctly
became our next priority. This proved difficult at first, as the flow of rules sometimes
became ambiguous.

Perhaps the greatest challenge in this step was translating the teachings of PLY’s
documentation into the compiler's implementation. Working separately on the same
step showed that not all developers translate documentation into an implementation the
same way. Halfway through writing the rules the team realized that they were
implementing them differently from one another and had to settle on a single design.

Though this step presented challenges along the way, all in all the PLY-Yacc tool was
still very effective.

Symbol Table

Step 3 of the compiler-building process involved extending the parser slightly with the
addition of the symbol table. The symbol table provides a means of storing the user's
input in a symbol stack that is filled via the grammar rules provided in the compiler. An
example of this can be seen in figure 3, taken from the PLY-Yacc documentation.

Figure 3: Stack example implemented on 3 + 5 * (10 - 20) (PLY)

The actual implementation of this step was fairly straightforward: in addition to the
parser, the team needed a way to make the grammar rules have values associated with
the symbols they would generate. In figure 4, the p values were added in order to give
each grammar symbol a value that could be placed into a symbol table.

Figure 4: Showing what p values would be stored with

The previous example shows that the p[1] value can apply to either var_type or VOID,
as its value is determined first in order to be stored as the value of p[0].
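
For reference, the rule in question (the any_type rule from Section 1) reads as follows;
in PLY, p acts like a list indexed by position in the grammar rule, with p[0] standing for
the left-hand symbol:

def p_any_type(p):
    '''any_type : var_type
                | VOID'''
    # p[1] is the value of whichever alternative matched; storing it in
    # p[0] makes it the value of any_type itself.
    p[0] = p[1]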

At the point of step 3’s implementation, the team faced its greatest challenge so far:
understanding how PLY-Yacc implemented and used the p-value system for the
grammar rules. In the id_tail rule seen in figure 5, the process of identifying more than
one identifier on a single line proved difficult, as our first understanding was that the
length of p had to be greater than 0. It was not until reading the PLY-Yacc
documentation further that we understood what the length of p actually represents. By
checking for a p length not equal to 2, we are essentially saying that any p longer than
2 is acceptable. The previous implementation checked for a p length not equal to 1,
which did not work: it treats anything over 1 as acceptable, which caused the team's
rule to be ineffective at distinguishing multiple identifiers from the case where there is
only one identifier to capture.

Figure 5: The id_tail rule (p length example)
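
For reference, the rule from Section 1 is reproduced below. When the parser reduces
the empty production, len(p) is 2 (p[0] plus one right-hand symbol); when it reduces
COMMA id id_tail, len(p) is 4, so testing len(p) != 2 separates the two cases:

def p_id_tail(p):
    '''id_tail : COMMA id id_tail
               | empty'''
    if(len(p) != 2):  # Processing a regular id_tail
        if(p[3] is not None):
            p[0] = p[1] + "" + p[2] + "" + p[3]
        else:
            p[0] = p[1] + "" + p[2]
    else:  # Processing an empty id_tail
        p[0] = None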

Though the team faced confusion while translating PLY’s implementation into the
team's own in this step, PLY still proved to be the best option for the team to pursue
further.

Semantic Routines

This step consists of generating an intermediate representation (IR code) internally in
the compiler and then feeding the IR code to another generator that creates the Tiny
instructions needed for execution. First, the team needed to generate the intermediate
representation. This was done using a list of IR nodes the team constructed through
the semantic routines. Figure 6 shows an example of the write IR code the team used
in the project.

Figure 6: Generation of write IR code example

After the IR code generation came the conversion step of generating Tiny instructions
from the IR code. The team used a lookup table to find the labels given by the IR code
generator. Then, after a label had been switched from an IR code label to its equivalent
Tiny instruction, the instructions were pushed onto a stack and printed off in reverse
order so as to get them in the correct order. An example of the conversion method
taken can be seen in figure 7.

Figure 7: IR to Tiny Conversion Example
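
A condensed sketch of the mapping idea (adapted from the team's Converter in
Section 1, whose Labels table is much larger):

# Map a few IR opcodes to their Tiny equivalents (see Section 1 for
# the full table used in the project).
labels = {"STOREI": "move", "ADDI": "addi",
          "WRITEI": "sys writei", "RET": "sys halt"}

def to_tiny(ir_lines):
    # Each IR line is an (opcode, op1, op2) triple; swap the opcode
    # for its Tiny instruction and keep the operands.
    return [(labels[opcode], op1, op2) for (opcode, op1, op2) in ir_lines]

print(to_tiny([("STOREI", "5", "x"), ("WRITEI", "x", ""), ("RET", "", "")]))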

This step was by far the most complicated to finish. The first problem came with trying
to understand how to generate the IR nodes for the IR code. The team found that
generating labels, variables, boolean expressions, write code, and read code was fairly
intuitive after consulting Fischer and LeBlanc's Crafting a Compiler with C (8). However,
more complicated expressions such as a := (a+b)*(d+c) were difficult to generate code
for. Ultimately the team had to abandon implementation of these because of time
constraints. Unfortunately, at the time of this step the team had no workable solution
for generating this IR code, so they decided that for the project they would set this part
aside and focus on implementing the rest of the compiler; a sketch of the textbook
approach appears below.
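
For completeness, the standard approach described in Fischer and LeBlanc (8), which
the team did not implement, is to emit each subexpression into a fresh temporary,
bottom-up. A hypothetical sketch for a := (a+b)*(d+c), using the project's
(opcode, op1, op2, result) IR shape:

temp_counter = 0

def new_temp():
    global temp_counter
    temp_counter += 1
    return "$T" + str(temp_counter)

def gen_expr(node, code):
    # node is either a variable name or a tuple (op, left, right);
    # returns the name of the location holding the node's value.
    if isinstance(node, str):
        return node
    op, left, right = node
    left_loc = gen_expr(left, code)
    right_loc = gen_expr(right, code)
    temp = new_temp()
    code.append(({"+": "ADDI", "*": "MULTI"}[op], left_loc, right_loc, temp))
    return temp

code = []
result = gen_expr(("*", ("+", "a", "b"), ("+", "d", "c")), code)
code.append(("STOREI", result, "", "a"))
# code now holds: ADDI a b $T1; ADDI d c $T2; MULTI $T1 $T2 $T3; STOREI $T3 a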

Full-Fledged Compiler

With steps 1-4 mostly implemented, it was time to view the results. A high-level
description of how the compiler was finally formed can be seen in the UML diagram
provided in figure 8 below.

Figure 8: High Level UML Diagram

The team’s strategy for the design and implementation of the compiler was mainly
focused on the functionality of the program itself. In the beginning the team found that
having everything in one file made for easier debugging; however, as time went on this
became ever more tedious as the project grew. During step 4 the team took an
architectural approach to the project and began to split the program into various parts,
such as making the converter a separate entity from the scanner-parser. Throughout
the project, documentation was a high priority. The team made sure to identify and
clearly label parts of the project to avoid confusion, which was a key element in the
project's success.

The team decided to use Bitbucket in cooperation with SourceTree to handle version
control of the project over time. The decision to use Bitbucket and SourceTree over
other version control systems such as Github was made to protect the project's
integrity: Bitbucket allowed the team to create a private repository, so the team knew
the work being done would stay within the group, a characteristic not available through
Github with the funding available.

Overall the project ended with our compiler being able to complete several steps.
Included in the final product is the ability to tokenize and parse a given file, along with
the ability to create and populate the symbol tables needed for the compiler's
interpretation. The compiler is able to generate most of the IR code necessary for a
functional compiler. The IR code can be fully translated and converted to Tiny
instructions as needed for the Little language. The Tiny instructions can then be put
through a Tiny simulator that provides the user with the results their program should
produce, assuming their program is without error. If the user’s program contains errors,
there are several error handlers throughout the compiler that will address them. As
stated before, the only non-functioning part of the compiler is the ability to generate IR
code for complicated expressions. The team believes that with more time and resources
this is an easily achievable fix.

Conclusion and Future Work


Throughout the team's process, the team found that the area most in need of
improvement would be design, aside from implementing the final part of IR code
generation. While the team's design is easily readable and functional, elements such as
optimization and efficiency were never truly implemented. With the project being mostly
implemented, optimization will definitely be the team's next focus. The team will first
look at peephole optimization, a technique used to remove redundant instructions. For
instance, the team's project will generate instruction pairs such as LDI R1 R2;
ADD R1 4 R1, which could be replaced with a single instruction such as LDINC R1 R2 4.
Other future work will include creating a system to remove redundant computations. As
of now the project will calculate many of the same expressions over and over again,
e.g. A = B * C and E = B * C. The team will use common subexpression elimination
(CSE) to eliminate these unnecessary computations. Creating a CSE system in
partnership with a peephole optimization system will optimize the project beyond what
it currently is.
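
To illustrate, a peephole pass over that pattern might look like the following sketch
(the instruction tuples and the LDI/ADD/LDINC pattern come from the example above,
not from the project's code):

def peephole(instrs):
    # instrs: list of tuples such as ("LDI", "R1", "R2") or
    # ("ADD", "R1", "4", "R1"). Fuse a load followed by an
    # in-place add of a constant into a single LDINC.
    out = []
    i = 0
    while i < len(instrs):
        cur = instrs[i]
        nxt = instrs[i + 1] if i + 1 < len(instrs) else None
        if (nxt is not None and cur[0] == "LDI" and nxt[0] == "ADD"
                and nxt[1] == cur[1] and nxt[3] == cur[1]):
            # LDI R1 R2 ; ADD R1 4 R1  ->  LDINC R1 R2 4
            out.append(("LDINC", cur[1], cur[2], nxt[2]))
            i += 2
        else:
            out.append(cur)
            i += 1
    return out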

Other future work may include handling registers within our system more efficiently.
Because of the project's scope, the team's focus was not on register allocation. The
team was given up to a thousand registers to use, allowing many more registers than
were actually needed for the project. A smarter approach than assigning every register
available is to reuse registers once they become free, allowing machines that may not
have as many registers to still run the project. This would be done using a technique
such as bottom-up register allocation, which frees registers once the data in them is no
longer used. Bottom-up register allocation works by calculating the liveness of registers
and freeing registers that are no longer live within blocks of the code. An example of a
pseudo algorithm used can be seen in figure 9.

Figure 9: Adaptation of bottom-up register allocation (Indika)
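
In pseudo-Python, the figure's idea might be adapted as follows (a sketch only,
assuming liveness information has already been computed; the names and data shapes
here are illustrative, not the project's):

def allocate_bottom_up(ops, num_regs, last_use):
    # ops: for each operation, the virtual registers it touches.
    # last_use[i]: virtual registers whose final use is operation i.
    free = list(range(num_regs))  # physical registers currently free
    where = {}                    # virtual register -> physical register
    out = []
    for i, operands in enumerate(ops):
        physical = []
        for v in operands:
            if v not in where:    # value not yet in a register
                if not free:
                    raise RuntimeError("spill needed; not handled here")
                where[v] = free.pop()
            physical.append(where[v])
        out.append(physical)
        for v in last_use[i]:     # value dies here: free its register
            free.append(where.pop(v))
    return out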


All in all, this project was a unique experience for the team. While the team has had
group projects before, nothing has been quite like this project. The team hopes to apply
the solutions and design methods of this project not only to future work on the compiler,
but to other projects as well.

Works Cited
1. Beazley, David. Python Lex-Yacc (PLY). 2 Oct. 2015. Web. 21 Jan. 2016.
2. Kahanda, Indika. "Scanners." Compilers Class. MSU, Bozeman. 24 Jan. 2015. Lecture.
3. Slides modified from the Louden book and Dr. Scherge. "Scanning." Jan. 2010. Web.
24 Jan. 2016.
4. Kahanda, Indika. "Parser." Compilers Class. MSU, Bozeman. 24 Jan. 2015. Lecture.
5. Kahanda, Indika. "Semantic Actions." Compilers Class. MSU, Bozeman. 24 Jan. 2015.
Lecture.
6. Kahanda, Indika. "Global Register Allocation." Compilers Class. MSU, Bozeman. 24 Jan.
2015. Lecture.
7. Kahanda, Indika. "Instruction Scheduling." Compilers Class. MSU, Bozeman. 24 Jan.
2015. Lecture.
8. Fischer, Charles N., and Richard J. LeBlanc. Crafting a Compiler with C. Redwood City,
CA: Benjamin/Cummings Pub., 1991. Print.
