The data segment contains the values associated with the variables. Each variable is assigned to a location which holds the associated value. Thus, part of the activity of code generation is to associate an address with each variable. The code segment consists of a sequence of operations. Program constants are incorporated in the code segment since their values do not change. The expression stack is a stack which is used to hold intermediate values in the evaluation of expressions. The presence of the expression stack indicates that the virtual machine for Simple is a ``stack machine''.
integer x,y,z. DATA 2
If the code is placed in an array, then the label addresses must be {\em back-patched} into the code when they become available.x := expr code for expr STORE X if cond then code for cond S1 BR_FALSE L1 else code for S1 S2 BR L2 end L1: code for S2 L2: while cond do L1: code for cond S BR_FALSE L2 end code for S BR L1 L2: read X IN_INT X write expr code for expr OUT_INT
constant LD_INT constant variable LD variable e1 op e2 code for e1 code for e2 code for op
The code segment begins with an offset of zero. Space is reserved, in the code segment, by calling the function {\tt reserve\_loc} which returns the address of the reserved location. The function {\tt gen\_label} returns the value of the code offset.int data_offset = 0; int data_location() { return data_offset++; }}
The functions {\tt reserve\_loc} and {\tt gen\_label} are used for backpatching code.int code_offset = 0; int reserve_loc() { return code_offset++; } int gen_label() { return code_offset; }
The functions {\tt gen\_code} and {\tt back\_patch} are used to generate code. {\tt gen\_code} generates code at the current offset while {\tt back\_patch} is used to generate code at some previously reserved address.
void gen_code( enum code_ops operation, int arg ) { code[code_offset].op = operation; code[code_offset++].arg = arg; } void back_patch( int addr, enum code_ops operation, int arg ) { code[addr].op = operation; code[addr].arg = arg; }}
struct symrec { char *name; /* name of symbol */ int offset; /* data offset */ struct symrec *next; /* link field */ }; ... symrec * putsym (char *sym_name) { symrec *ptr; ptr = (symrec *) malloc (sizeof(symrec)); ptr->name = (char *) malloc (strlen(sym_name)+1); strcpy (ptr->name,sym_name); ptr->offset = data_location(); ptr->next = (struct symrec *)sym_table; sym_table = ptr; return ptr; } ...}
The parser is extended to generate and assembly code. The code implementing the if and while commands must contain the correct jump addresses. In this example, the jump destinations are labels. Since the destinations are not known until the entire command is processed, {\em back-patching} of the destination information is required. In this example, the label identifier is generated when it is known that an address is required. The label is placed into the code when its position is known. An alternative solution is to store the code in an array and back-patch actual addresses.%{#include}/* For I/O */ #include /* For malloc here and in symbol table */ #include /* For strcmp in symbol table */ #include "ST.h" /* Symbol Table */ #include "SM.h" /* Stack Machine */ #include "CG.h" /* Code Generator */ #define YYDEBUG 1 /* For Debugging */ int errors; /* Error Count-incremented in CG, ckd here */ struct lbs /* For labels: if and while */ { int for_goto; int for_jmp_false; }; struct lbs * newlblrec() /* Allocate space for the labels */ { return (struct lbs *) malloc(sizeof(struct lbs)); } install ( char *sym_name ) { symrec *s; s = getsym (sym_name); if (s == 0) s = putsym (sym_name); else { errors++; printf( "%s is already defined\n", sym_name ); } } context_check( enum code_ops operation, char *sym_name ) { symrec *identifier; identifier = getsym( sym_name ); if ( identifier == 0 ) { errors++; printf( "%s", sym_name ); printf( "%s\n", " is an undeclared identifier" ); } else gen_code( operation, identifier->offset ); } %} %union semrec /* The Semantic Records */ { int intval; /* Integer values */ char *id; /* Identifiers */ struct lbs *lbls /* For backpatching */ } %start program %token NUMBER /* Simple integer */ %token IDENTIFIER /* Simple identifier */ %token IF WHILE /* For backpatching labels */ %token SKIP THEN ELSE FI DO END %token INTEGER READ WRITE LET IN %token ASSGNOP %left '-' '+' %left '*' '/' %right '^' %% /* Grammar Rules and Actions */ %% /* C subroutines */
The actions associated with code generation for a stack-machine based architecture are added to the grammar section. The code generated for the declaration section must reserve space for the variables.
The {\sf IF} and {\sf WHILE} commands require backpatching./* C and Parser declarations */ %% program : LET declarations IN { gen_code( DATA, sym_table->offset ); } commands END { gen_code( HALT, 0 ); YYACCEPT; } ; declarations : /* empty */ | INTEGER id_seq IDENTIFIER '.' { install( $3 ); } ; id_seq : /* empty */ | id_seq IDENTIFIER ',' { install( $2 ); } ;}
The code generated for expressions is straight forward.commands : /* empty */ | commands command ';' ; command : SKIP | READ IDENTIFIER { context_check( READ_INT, $2 ); } | WRITE exp { gen_code( WRITE_INT, 0 ); } | IDENTIFIER ASSGNOP exp { context_check( STORE, $1 ); } | IF exp { $1 = (struct lbs *) newlblrec(); $1->for_jmp_false = reserve_loc(); } THEN commands { $1->for_goto = reserve_loc(); } ELSE { back_patch( $1->for_jmp_false, JMP_FALSE, gen_label() ); } commands FI { back_patch( $1->for_goto, GOTO, gen_label() ); } | WHILE { $1 = (struct lbs *) newlblrec(); $1->for_goto = gen_label(); } exp { $1->for_jmp_false = reserve_loc(); } DO commands END { gen_code( GOTO, $1->for_goto ); back_patch( $1->for_jmp_false, JMP_FALSE, gen_label() ); } ;}
exp : NUMBER { gen_code( LD_INT, $1 ); } | IDENTIFIER { context_check( LD_VAR, $1 ); } | exp '<' exp { gen_code( LT, 0 ); } | exp '=' exp { gen_code( EQ, 0 ); } | exp '>' exp { gen_code( GT, 0 ); } | exp '+' exp { gen_code( ADD, 0 ); } | exp '-' exp { gen_code( SUB, 0 ); } | exp '*' exp { gen_code( MULT, 0 ); } | exp '/' exp { gen_code( DIV, 0 ); } | exp '^' exp { gen_code( PWR, 0 ); } | '(' exp ')' ; %% /* C subroutines */}
%{ #include}/* for strdup */ #include "simple.tab.h" /* for token definitions and yylval */ %} DIGIT [0-9] ID [a-z][a-z0-9]* %% {DIGIT}+ { yylval.intval = atoi( yytext ); return(INT); } ... {ID} { yylval.id = (char *) strdup(yytext); return(IDENT); } [ \t\n]+ /* eat up whitespace */ . { return(yytext[0]);} %%
let integer n,x,n. in read n; if n < 10 then x := 1; else skip; fi; while n < 10 do x := 5*x; n := n+1; end; skip; write n; write x; end\caption{A Simple program \label{simp:program}} \end{lfig} \begin{lfig}
0: data 1 1: in_int 0 2: ld_var 0 3: ld_int 10 4: lt 0 5: jmp_false 9 6: ld_int 1 7: store 1 8: goto 9 9: ld_var 0 10: ld_int 10 11: lt 0 12: jmp_false 22 13: ld_int 5 14: ld_var 1 15: mult 0 16: store 1 17: ld_var 0 18: ld_int 1 19: add 0 20: store 0 21: goto 9 22: ld_var 0 23: out_int 0 24: ld_var 1 25: out_int 0 26: halt 0\caption{Stack code \label{stackcode}} \end{lfig}
As an example of code generation, we extend our Lex and Yacc files for Simp to generate code for a stack machine. First, we must extend the Yacc and Lex files to pass the values of constants from the scanner to the parser. The definition of the semantic record in the Yacc file is modified that the constant may be returned as part of the semantic record.
Then the Lex file is extended to place the value of the constant into the semantic record.%union semrec /* The Semantic Records */ { int intval; /* Integer values */ char *id; /* Identifiers */ ...
The symbol table record is extended to contain the offset from the base address of the data segment (the storage area which is to contain the values associated with each variable) and the {\tt putsym} function is extended to place the offset into the record associated with the variable.%{ #include/* for strdup */ #include /* for atoi */ #include "simple.tab.h" /* for token definitions and yylval */ %} DIGIT [0-9] ID [a-z][a-z0-9]* %% {DIGIT}+ { yylval.intval = atoi( yytext ); return(INT); } ... {ID} { yylval.id = (char *) strdup(yytext); return(IDENT); } [ \t\n]+ /* eat up whitespace */ . { return(yytext[0]);} %%
The parser is extended to generate and assembly code. The code implementing the if and while commands must contain the correct jump addresses. In this example, the jump destinations are labels. Since the destinations are not known until the entire command is processed, back-patching of the destination information is required. In this example, the label identifier is generated when it is known that an address is required. The label is placed into the code when its position is known. An alternative solution is to store the code in an array and back-patch actual addresses.struct symrec { char *name; /* name of symbol */ int offset; /* data offset */ struct symrec *next; /* link field */ }; typedef struct symrec symrec; symrec *sym_table = (symrec *)0; /* Ptr to symbol table */ symrec *st_entry; /* Ptr to an entry */ symrec *putsym (); symrec *getsym (); symrec * putsym (sym_name) char *sym_name; { symrec *ptr; ptr = (symrec *) malloc (sizeof(symrec)); ptr->name = (char *) malloc (strlen(sym_name)+1); strcpy (ptr->name,sym_name); ptr->offset = data_offset++; ptr->next = (struct symrec *)sym_table; sym_table = ptr; return ptr; } ...
The semantic record is extended to hold two label identifiers since two labels will be required for the if and while commands. The remainder of the file contains the actions associated with code generation for a stack-machine based architecture.%{ #include/* For I/O */ #include /* for malloc here and in symbol table */ #include /* for strcmp in symbol table */ int data_offset = 0; /* for data area address offsets */ int label_count = 0; /* for label identifiers */ #include "ST.h" #define YYDEBUG 1 %} %union semrec /* The Semantic Records */ { int intval; /* Integer values */ char *id; /* Identifiers */ struct lbs /* Labels for if and while */ { int label0; int label1; } *lbls; } %token INT /* Simple integer */ %token IDENT /* Simple identifier */ %token IF WHILE /* For back-patching labels */
To illustrate the code generation capabilities of the compiler, Figure~\ref{simp:program} is a program in Simp and Figure~\ref{stackcode}. \begin{lfig}%token SKIP THEN ELSE FI DO END %left '-' '+' %left '*' '/' %right '^' /* Exponentiation */ %% program : command_sequence { /* print code */ } ; command_sequence : /* empty */ | command_sequence command ';' ; command : SKIP | IDENT ':' '=' exp { install( $1, IDENT ); printf("assign %ld\n", st_entry->offset ); } | IF exp { $1 = (struct lbs *) malloc(sizeof(struct lbs)); $1->label0 = label_count++; printf("jmp_false label %ld\n",$1->label0); } THEN command_sequence { $1->label1 = label_count++; printf("goto label %ld\n",$1->label1); } ELSE { printf("label %ld\n",$1->label0); } command_sequence FI { printf("label %ld\n",$1->label1); } | WHILE { $1 = (struct lbs *) malloc(sizeof(struct lbs)); $1->label0 = label_count++; printf("label %ld\n", $1->label0); } exp { $1->label1 = label_count++; printf("jmp_false label %ld\n",$1->label1); } DO command_sequence END { printf("goto label%ld\n",$1->label0); printf("label%ld\n",$1->label1); } ; exp : INT { printf("load_int %ld\n", $1); } | IDENT { if (getsym($1) == 0) printf("Undefined variable: %s\n", $1); else printf("load_var %ld\n", getsym($1)->offset); } | exp '<' exp { printf("less_than%\n"); } | exp '=' exp { printf("equal%\n"); } | exp '>' exp { printf("greater_than%\n"); } | exp '+' exp { printf("add%\n"); } | exp '-' exp { printf("sub%\n"); } | exp '*' exp { printf("mult%\n"); } | exp '/' exp { printf("div%\n"); } | exp '^' exp { printf("power%\n"); } | '(' exp ')' ; %% ...
\caption{A Simp program \label{simp:program}} \end{lfig} \begin{lfig}n := 1; if n < 10 then x := 1; else skip; fi; while n < 10 do x := 5; n := n+1; end; skip;
\caption{Stack code \label{stackcode}} \end{lfig}load_int 1 assign 0 load_var 0 load_int 10 less_than jmp_false label 0 load_int 1 assign 1 goto label 1 label 0 label 1 label 2 load_var 0 load_int 10 less_than jmp_false label 3 load_int 5 assign 1 load_var 0 load_int 1 add assign 0 goto label 2 label 3