User:Ed Davis
Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).
- The Task
Create a lexical analyzer for the Tiny programming language. The program should read input from a file and/or stdin, and write output to a file and/or stdout.
- Specification
The various token types are denoted below.
- Operators
Characters | Common name | Name |
---|---|---|
* | multiply | Mul |
/ | divide | Div |
+ | plus | Add |
- | minus and unary minus | Sub and Uminus |
< | less than | Lss |
<= | less than or equal | Leq |
> | greater than | Gtr |
!= | not equal | Neq |
= | assign | Assign |
&& | and | And |
- Symbols
Characters | Common name | Name |
---|---|---|
( | left parenthesis | Lparen |
) | right parenthesis | Rparen |
{ | left brace | Lbrace |
} | right brace | Rbrace |
; | semi colon | Semi |
, | comma | Comma |
- Keywords
Characters | Name |
---|---|
if | If |
while | While |
putc | Putc |
- Other entities
Characters | Regular expression | Name |
---|---|---|
integers | [0-9]+ | Integer |
char literal | 'x' | Integer |
identifiers | [_a-zA-Z][_a-zA-Z0-9]+ | Ident |
string literal | ".*" | String |
Notes: For char literals, '\n' is supported as a new line character. To represent \, use: '\\'. \n may also be used in Strings, to print a newline. No other special sequences are supported.
Comments /* ... */ (multi-line)
- Complete list of token names
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
- Program output
Output of the program should be:
- the word line, followed by:
- the line number where the token starts, followed by:
- the abbreviation col, followed by:
- the column number where the token starts, followed by:
- the token name.
- If the token name is one of Integer, Ident or String, the
actual value of the same should follow.
- Test Cases
<lang c> /*
Hello world */
print("Hello, World!\n"); </lang>
- Output
line | 4 | col | 1 | ||
line | 4 | col | 6 | Lparen | |
line | 4 | col | 7 | String | "Hello, World!\n" |
line | 4 | col | 24 | Rparen | |
line | 4 | col | 25 | Semi | |
line | 5 | col | 1 | EOI |
<lang c> /*
Show Ident and Integers */
phoenix_number = 142857; print(phoenix_number, "\n"); </lang>
- Output
line | 1 | col | 1 | Ident | phoenix_number |
line | 1 | col | 16 | Assign | |
line | 1 | col | 18 | Integer | 142857 |
line | 1 | col | 24 | Semi | |
line | 2 | col | 1 | ||
line | 2 | col | 6 | Lparen | |
line | 2 | col | 7 | Ident | phoenix_number |
line | 2 | col | 21 | Comma | |
line | 2 | col | 23 | String | "\n" |
line | 2 | col | 27 | Rparen | |
line | 2 | col | 28 | Semi | |
line | 3 | col | 1 | EOI |
- Diagnostics
The following error conditions should be caught:
- Empty character constant. Example:
- Unknown escape sequence. Example: '\r'
- Multi-character constant. Example: 'xx'
- End-of-file in comment. Closing comment characters not found.
- End-of-file while scanning string literal. Closing string character not found.
- End-of-line while scanning string literal. Closing string character not found before end-of-line.
- Unrecognized character. Example: |
- Reference
The C and Python versions can be considered reference implementations.
- Implementations
C
<lang C>
- include <stdlib.h>
- include <stdio.h>
- include <stdarg.h>
- include <ctype.h>
- include <string.h>
- include <errno.h>
- include <stdbool.h>
- define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
- define da_dim(name, type) type *name = NULL; \
int _qy_ ## name ## _p = 0; \ int _qy_ ## name ## _max = 0
- define da_rewind(name) _qy_ ## name ## _p = 0
- define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
- define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
- define da_len(name) _qy_ ## name ## _p
// dependancy: atr table in parse.c ordering is based on these typedef enum {
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;
typedef struct {
int tok; int err_ln, err_col; union { int n; /* value for constants */ char *text; /* text for idents */ };
} tok_s;
static FILE *source_fp, *dest_fp; static int line, col, the_ch; da_dim(text, char);
tok_s gettok();
static void error(int err_line, int err_col, const char *fmt, ... ) {
char buf[1000]; va_list ap;
va_start(ap, fmt); vsprintf(buf, fmt, ap); va_end(ap); printf("(%d,%d) error: %s\n", err_line, err_col, buf); exit(1);
}
static void read_ch() { /* get next char from input */
the_ch = getc(source_fp); ++col; if (the_ch == '\n') { ++line; col = 0; }
}
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */
if (the_ch == '\) error(err_line, err_col, "gettok: empty character constant"); if (the_ch == '\\') { read_ch(); if (the_ch == 'n') n = 10; else if (the_ch == '\\') n = '\\'; else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); } read_ch(); if (the_ch != '\) error(err_line, err_col, "multi-character constant"); read_ch(); return (tok_s){Integerk, err_line, err_col, {n}};
}
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*') return (tok_s){Div, err_line, err_col, {0}};
/* comment found */ for (;;) { read_ch(); if (the_ch == '*' || the_ch == EOF) { read_ch(); if (the_ch == '/' || the_ch == EOF) { read_ch(); return gettok(); } } }
}
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */
da_rewind(text);
for (read_ch(); the_ch != start; read_ch()) { if (the_ch == '\n') error(err_line, err_col, "EOL in string"); if (the_ch == EOF) error(err_line, err_col, "EOF in string"); da_append(text, (char)the_ch); } da_append(text, '\0');
read_ch(); return (tok_s){Stringk, err_line, err_col, {.text=text}};
}
static int kwd_cmp(const void *p1, const void *p2) {
return stricmp(*(char **)p1, *(char **)p2);
}
static TokenType get_ident_type(const char *ident) {
static struct { char *s; TokenType sym; } kwds[] = { {"if", If}, {"print", Print}, {"putc", Putc}, {"while", While}, }, *kwp;
return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym;
}
static tok_s ident_or_int(int err_line, int err_col) {
int n, is_number = true;
da_rewind(text); while (isalnum(the_ch) || the_ch == '_') { da_append(text, (char)the_ch); if (!isdigit(the_ch)) is_number = false; read_ch(); } if (da_len(text) == 0) error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch); da_append(text, '\0'); if (isdigit(text[0])) { if (!is_number) error(err_line, err_col, "invalid number: %s\n", text); n = strtol(text, NULL, 0); if (n == LONG_MAX && errno == ERANGE) error(err_line, err_col, "Number exceeds maximum value"); return (tok_s){Integerk, err_line, err_col, {n}}; } return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
if (the_ch == expect) { read_ch(); return (tok_s){ifyes, err_line, err_col, {0}}; } if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch); return (tok_s){ifno, err_line, err_col, {0}};
}
tok_s gettok() { /* return the token type */
/* skip white space */ while (isspace(the_ch)) read_ch(); int err_line = line; int err_col = col; switch (the_ch) { case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}}; case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}}; case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}}; case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}}; case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}}; case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}}; case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}}; case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}}; case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}}; case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}}; case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}}; case '/': read_ch(); return div_or_cmt(err_line, err_col); case '\: read_ch(); return char_lit(the_ch, err_line, err_col); case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col); case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col); case '&': read_ch(); return follow('&', And, EOI, err_line, err_col); case '"' : return string_lit(the_ch, err_line, err_col); default: return ident_or_int(err_line, err_col); case EOF: return (tok_s){EOI, err_line, err_col, {0}}; }
}
void init_lex() { /* initialize the scanner */
line = 1; read_ch();
}
void run() { /* tokenize the given input */
tok_s tok; do { tok = gettok(); fprintf(dest_fp, "line %5d col %5d %.8s", tok.err_ln, tok.err_col, &"EOI Print Putc If While Lbrace Rbrace Lparen Rparen " "Uminus Mul Div Add Sub Lss Gtr Leq Neq " "And Semi Comma Assign Integer String Ident "[tok.tok * 9]); if (tok.tok == Integerk) fprintf(dest_fp, " %8d", tok.n); else if (tok.tok == Ident) fprintf(dest_fp, " %s", tok.text); else if (tok.tok == Stringk) fprintf(dest_fp, " \"%s\"", tok.text); fprintf(dest_fp, "\n"); } while (tok.tok != EOI); if (dest_fp != stdout) fclose(dest_fp);
}
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
if (fn[0] == '\0') *fp = std; else if ((*fp = fopen(fn, mode)) == NULL) error(0, 0, "Can't open %s\n", fn);
}
int main(int argc, char *argv[]) {
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : ""); init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : ""); init_lex(); run();
} </lang>
Python
<lang Python>
import sys
- following two must remain in the same order
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \ Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)
all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And', 'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]
- single character only symbols
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
'*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }
key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }
the_ch = " " # dummy first char - but it must be a space the_col = 0 the_line = 1 input_file = None
- show error and exit
def error(line, col, msg):
print line, col, msg exit(1)
- get the next character from the input
def getc():
global the_ch, the_col, the_line
the_ch = input_file.read(1) the_col += 1 if the_ch == '\n': the_line += 1 the_col = 0 return the_ch
- 'x' - character constants
def char_lit(err_line, err_col):
n = ord(getc()) # skip opening quote if the_ch == '\: error(err_line, err_col, "empty character constant") elif the_ch == '\\': getc() if the_ch == 'n': n = 10 elif the_ch == '\\': n = '\\' else: error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch)) if getc() != '\: error(err_line, err_col, "multi-character constant") getc() return Integerk, err_line, err_col, n
- process divide or comments
def div_or_cmt(err_line, err_col):
if getc() != '*': return Div, err_line, err_col
# comment found while True: if getc() == '*' and getc() == '/': getc() return gettok() elif len(the_ch) == 0: error(err_line, err_col, "EOF in comment")
- "string"
def string_lit(start, err_line, err_col):
text = ""
while getc() != start: if len(the_ch) == 0: error(err_line, err_col, "EOF while scanning string literal") if the_ch == '\n': error(err_line, err_col, "EOL while scanning string literal") text += the_ch
getc() return Stringk, err_line, err_col, text
- handle identifiers and integers
def ident_or_int(err_line, err_col):
is_number = True text = ""
while the_ch.isalnum() or the_ch == '_': text += the_ch if not the_ch.isdigit(): is_number = False getc()
if len(text) == 0: error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
if text[0].isdigit(): if not is_number: error(err_line, err_col, "invalid number: %s" % (text)) n = int(text) return Integerk, err_line, err_col, n
if text in key_words: return key_words[text], err_line, err_col
return Ident, err_line, err_col, text
- look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
if getc() == expect: getc() return ifyes, err_line, err_col
if ifno == EOI: error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
return ifno, err_line, err_col
- return the next token type
def gettok():
while the_ch.isspace(): getc()
err_line = the_line err_col = the_col
if len(the_ch) == 0: return EOI, err_line, err_col elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col elif the_ch == '/': return div_or_cmt(err_line, err_col) elif the_ch == '\: return char_lit(err_line, err_col) elif the_ch == '<': return follow('=', Leq, Lss, err_line, err_col) elif the_ch == '!': return follow('=', Neq, EOI, err_line, err_col) elif the_ch == '&': return follow('&', And, EOI, err_line, err_col) elif the_ch == '"': return string_lit(the_ch, err_line, err_col) else: return ident_or_int(err_line, err_col)
- main driver
input_file = sys.stdin if len(sys.argv) > 1:
try: input_file = open(sys.argv[1], "r", 4096) except IOError as e: error(0, 0, "Can't open %s" % sys.argv[1])
while True:
t = gettok() tok = t[0] line = t[1] col = t[2]
if tok == Integerk: print "line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3]) elif tok == Ident: print "line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3]) elif tok == Stringk: print 'line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3]) else: print "line %5d col %5d %-8s" % (line, col, all_syms[tok])
if tok == EOI: break
</lang>