Initial Lexer Regular Expressions – XM3 Compiler Design

A regular expression is a sequence of characters that describes a pattern of text. For example, the characters “i”, “n”, “t” in sequence match the word “int”. The Lexer uses regular expressions to build tokens from valid words as it analyzes the C program character by character.

The initial list of patterns that the Lexer must be able to recognize includes identifiers (e.g., variable names), numbers, keywords (e.g., int or while), operators, and some other basic symbols (e.g., ;).

For keywords and operators the regular expressions are simple as they are just strings. For example, the regular expression to recognize the else keyword is “else”:

"else" {printf("ELSE_TOKEN\n")};

Then, to recognize a number, the first character must be a digit (0, 1, 2, 3, 4, 5, 6, 7, 8, or 9). Any number of additional digits can follow the initial digit. Written as a regular expression, a number is:

[0-9]+ {printf("NUMBER_TOKEN\n");}

Finally, to recognize an identifier, the first character must be a letter or an underscore. Any number of letters, underscores, and digits can follow the initial character. Written as a regular expression, an identifier is:

[a-zA-Z_][a-zA-Z0-9_]* {printf("ID_TOKEN\n");}

Note that the C code contained in the braces after a regular expression is executed whenever that regular expression is matched. For now, print statements are used, but later these print statements will be replaced by code that passes tokens to the parser.

See the code below for the initial regular expressions of the Lexer:

"case" {printf("CASE\n");}
"default" {printf("DEFAULT\n");}
"else if" {printf("ELIF\n");}
"else" {printf("ELSE\n");}
"if" {printf("IF\n");}
"switch" {printf("SWITCH\n");}
"break" {printf("BRK\n");}
"continue" {printf("CONT\n");}
"for" {printf("FOR\n");}
"do" {printf("DO\n");}
"while" {printf("WHILE\n");}
"char" {printf("CHAR\n");}
"const" {printf("CONST\n");}
"double" {printf("DOUBLE\n");}
"enum" {printf("ENUM\n");}
"extern" {printf("EXTERN\n");}
"float" {printf("FLOAT\n");}
"int" {printf("INT\n");}
"long" {printf("LONG\n");}
"signed" {printf("SIGNED\n");}
"short" {printf("SHORT\n");}
"static" {printf("STATIC\n");}
"struct" {printf("STRUCT\n");}
"union" {printf("UNION\n");}
"unsigned" {printf("UNSIGNED\n");}
"void" {printf("VOID\n");}
"volatile" {printf("VOLATILE\n");}
"goto" {printf("GOTO\n");}
"return" {printf("RETURN\n");}

    /* The identifier rule must follow the keyword rules.  When two rules
       match the same number of characters the scanner picks the one listed
       first, so an earlier ID pattern would swallow every keyword.  Longer
       words such as "integer" still match as identifiers because the
       longest match wins before rule order is considered. */
[a-zA-Z_][a-zA-Z0-9_]* {printf("ID_TOKEN\n");}

    /* A number is one or more decimal digits. */
[0-9]+ {printf("NUMBER_TOKEN\n");}

"++" {printf("INC\n");}
"+" {printf("PLUS\n");}
"--" {printf("DEC\n");}
"-" {printf("MINUS\n");}
"/" {printf("DIV\n");}
"*" {printf("MULT\n");}
"==" {printf("EQUAL_CHK\n");}
"+=" {printf("PLUSEQ\n");}
"-=" {printf("MINUSEQ\n");}
"/=" {printf("DIVEQ\n");}
"*=" {printf("MULTEQ\n");}
"%=" {printf("MODEQ\n");}
"&=" {printf("ANDEQ_BIT\n");}
"|=" {printf("OREQ_BIT\n");}
"^=" {printf("XOREQ_BIT\n");}
"<<=" {printf("LSEQ_BIT\n");}
">>=" {printf("RSEQ_BIT\n");}
"!=" {printf("NOT_EQUAL_CHK\n");}
"<<" {printf("L_SHIFT\n");}
">>" {printf("R_SHIFT\n");}
">=" {printf("GTEQ_CHK\n");}
"<=" {printf("LTEQ_CHK\n");}
">" {printf("GT_CHK\n");}
"<" {printf("LT_CHK\n");}
"=" {printf("ASSIGN\n");}
"%" {printf("MOD\n");}
"&&" {printf("AND\n");}
"&" {printf("BITAND\n");}
"||" {printf("OR\n");}
"|" {printf("BITOR\n");}
"!" {printf("NOT\n");}
"^" {printf("BITXOR\n");}
"~" {printf("BITCOMP\n");}

";" {printf("SEMICOLON\n");}
"\n" {printf("NEWLINE\n");}
"}" {printf("OP_PAR\n");}
"}" {printf("CL_PAR\n");}
"}" {printf("OP_BRACE\n");}
"}" {printf("CL_BRACE\n");}
"[" {printf("OP_BRACKET\n");}
"]" {printf("CL_BRACKET\n");}