User:Ed Davis: Difference between revisions
No edit summary |
No edit summary |
||
Line 189: | Line 189: | ||
=={{header|C}}== |
=={{header|C}}== |
||
<lang C> |
|||
#include <stdlib.h> |
#include <stdlib.h> |
||
Line 419: | Line 420: | ||
run(); |
run(); |
||
} |
} |
||
</lang> |
|||
=={{header|Python}}== |
=={{header|Python}}== |
||
<lang Python> |
|||
import sys |
import sys |
||
Line 584: | Line 586: | ||
if tok == EOI: |
if tok == EOI: |
||
break |
break |
||
</lang> |
Revision as of 22:05, 9 August 2016
You are encouraged to solve this task according to the task description, using any language you may know.
Lexical Analyzer
From Wikipedia
Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).
- The Task
Create a lexical analyzer for the Tiny programming language. The program should read input from a file and/or stdin, and write output to a file and/or stdout.
- Specification
The various token types are denoted below.
- Operators
Characters | Common name | Name |
---|---|---|
'*' | multiply | Mul |
'/' | divide | Div |
'+' | plus | Add |
'-' | minus and unary minus | Sub and Uminus |
'<' | less than | Lss |
'<=' | less than or equal | Leq |
'>' | greater than | Gtr |
'!=' | not equal | Neq |
'=' | assign | Assign |
'&&' | and | And |
- Symbols
Characters | Common name | Name |
---|---|---|
'(' | left parenthesis | Lparen |
')' | right parenthesis | Rparen |
'{' | left brace | Lbrace |
'}' | right brace | Rbrace |
';' | semi colon | Semi |
',' | comma | Comma |
- Keywords
Characters | Name |
---|---|
"if" | If |
"while" | While |
"print" | |
"putc" | Putc |
- Other entities
Characters | Regular expression | Name |
---|---|---|
integers | [0-9]+ | Integer |
char literal | 'x' | Integer |
identifiers | [_a-zA-Z][_a-zA-Z0-9]+ | Ident |
string literal | ".*" | String |
Notes: For char literals, '\n' is supported as a new line character. To represent \, use: '\\'. \n may also be used in Strings, to print a newline. No other special sequences are supported.
Comments /* ... */ (multi-line)
- Complete list of token names
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
- Program output
Output of the program should be the line and column where the found token starts, followed by the Token name. For tokens Integer, Ident and String, the Integer, identifier, or string should follow.
- Test Cases
<lang c> /*
Hello world */
print("Hello, World!\n"); </lang>
- Output
line | 4 | col | 1 | ||
line | 4 | col | 6 | Lparen | |
line | 4 | col | 7 | String | "Hello, World!\n" |
line | 4 | col | 24 | Rparen | |
line | 4 | col | 25 | Semi | |
line | 5 | col | 1 | EOI |
<lang c> /*
Show Ident and Integers */
phoenix_number = 142857; print(phoenix_number, "\n"); </lang>
- Output
line | 1 | col | 1 | Ident | phoenix_number |
line | 1 | col | 16 | Assign | |
line | 1 | col | 18 | Integer | 142857 |
line | 1 | col | 24 | Semi | |
line | 2 | col | 1 | ||
line | 2 | col | 6 | Lparen | |
line | 2 | col | 7 | Ident | phoenix_number |
line | 2 | col | 21 | Comma | |
line | 2 | col | 23 | String | "\n" |
line | 2 | col | 27 | Rparen | |
line | 2 | col | 28 | Semi | |
line | 3 | col | 1 | EOI |
- Diagnostics
The following error conditions should be caught:
- Empty character constant. Example:
- Unknown escape sequence. Example: '\r'
- Multi-character constant. Example: 'xx'
- End-of-file in comment. Closing comment characters not found.
- End-of-file while scanning string literal. Closing string character not found.
- End-of-line while scanning string literal. Closing string character not found before end-of-line.
- Unrecognized character. Example: |
Refer additional questions to the C and Python implementations.
C
<lang C>
- include <stdlib.h>
- include <stdio.h>
- include <stdarg.h>
- include <ctype.h>
- include <string.h>
- include <errno.h>
- include <stdbool.h>
- define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
- define da_dim(name, type) type *name = NULL; \
int _qy_ ## name ## _p = 0; \ int _qy_ ## name ## _max = 0
- define da_rewind(name) _qy_ ## name ## _p = 0
- define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
- define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
- define da_len(name) _qy_ ## name ## _p
// dependancy: atr table in parse.c ordering is based on these typedef enum {
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;
typedef struct {
int tok; int err_ln, err_col; union { int n; /* value for constants */ char *text; /* text for idents */ };
} tok_s;
static FILE *source_fp, *dest_fp; static int line, col, the_ch; da_dim(text, char);
tok_s gettok();
static void error(int err_line, int err_col, const char *fmt, ... ) {
char buf[1000]; va_list ap;
va_start(ap, fmt); vsprintf(buf, fmt, ap); va_end(ap); printf("(%d,%d) error: %s\n", err_line, err_col, buf); exit(1);
}
static void read_ch() { /* get next char from input */
the_ch = getc(source_fp); ++col; if (the_ch == '\n') { ++line; col = 0; }
}
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */
if (the_ch == '\) error(err_line, err_col, "gettok: empty character constant"); if (the_ch == '\\') { read_ch(); if (the_ch == 'n') n = 10; else if (the_ch == '\\') n = '\\'; else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); } read_ch(); if (the_ch != '\) error(err_line, err_col, "multi-character constant"); read_ch(); return (tok_s){Integerk, err_line, err_col, {n}};
}
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*') return (tok_s){Div, err_line, err_col, {0}};
/* comment found */ for (;;) { read_ch(); if (the_ch == '*' || the_ch == EOF) { read_ch(); if (the_ch == '/' || the_ch == EOF) { read_ch(); return gettok(); } } }
}
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */
da_rewind(text);
for (read_ch(); the_ch != start; read_ch()) { if (the_ch == '\n') error(err_line, err_col, "EOL in string"); if (the_ch == EOF) error(err_line, err_col, "EOF in string"); da_append(text, (char)the_ch); } da_append(text, '\0');
read_ch(); return (tok_s){Stringk, err_line, err_col, {.text=text}};
}
static int kwd_cmp(const void *p1, const void *p2) {
return stricmp(*(char **)p1, *(char **)p2);
}
static TokenType get_ident_type(const char *ident) {
static struct { char *s; TokenType sym; } kwds[] = { {"if", If}, {"print", Print}, {"putc", Putc}, {"while", While}, }, *kwp;
return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym;
}
static tok_s ident_or_int(int err_line, int err_col) {
int n, is_number = true;
da_rewind(text); while (isalnum(the_ch) || the_ch == '_') { da_append(text, (char)the_ch); if (!isdigit(the_ch)) is_number = false; read_ch(); } if (da_len(text) == 0) error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch); da_append(text, '\0'); if (isdigit(text[0])) { if (!is_number) error(err_line, err_col, "invalid number: %s\n", text); n = strtol(text, NULL, 0); if (n == LONG_MAX && errno == ERANGE) error(err_line, err_col, "Number exceeds maximum value"); return (tok_s){Integerk, err_line, err_col, {n}}; } return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
if (the_ch == expect) { read_ch(); return (tok_s){ifyes, err_line, err_col, {0}}; } if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch); return (tok_s){ifno, err_line, err_col, {0}};
}
tok_s gettok() { /* return the token type */
/* skip white space */ while (isspace(the_ch)) read_ch(); int err_line = line; int err_col = col; switch (the_ch) { case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}}; case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}}; case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}}; case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}}; case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}}; case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}}; case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}}; case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}}; case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}}; case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}}; case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}}; case '/': read_ch(); return div_or_cmt(err_line, err_col); case '\: read_ch(); return char_lit(the_ch, err_line, err_col); case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col); case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col); case '&': read_ch(); return follow('&', And, EOI, err_line, err_col); case '"' : return string_lit(the_ch, err_line, err_col); default: return ident_or_int(err_line, err_col); case EOF: return (tok_s){EOI, err_line, err_col, {0}}; }
}
void init_lex() { /* initialize the scanner */
line = 1; read_ch();
}
void run() { /* tokenize the given input */
tok_s tok; do { tok = gettok(); fprintf(dest_fp, "line %5d col %5d %.8s", tok.err_ln, tok.err_col, &"EOI Print Putc If While Lbrace Rbrace Lparen Rparen " "Uminus Mul Div Add Sub Lss Gtr Leq Neq " "And Semi Comma Assign Integer String Ident "[tok.tok * 9]); if (tok.tok == Integerk) fprintf(dest_fp, " %8d", tok.n); else if (tok.tok == Ident) fprintf(dest_fp, " %s", tok.text); else if (tok.tok == Stringk) fprintf(dest_fp, " \"%s\"", tok.text); fprintf(dest_fp, "\n"); } while (tok.tok != EOI); if (dest_fp != stdout) fclose(dest_fp);
}
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
if (fn[0] == '\0') *fp = std; else if ((*fp = fopen(fn, mode)) == NULL) error(0, 0, "Can't open %s\n", fn);
}
int main(int argc, char *argv[]) {
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : ""); init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : ""); init_lex(); run();
} </lang>
Python
<lang Python>
import sys
- following two must remain in the same order
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \ Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)
all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And', 'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]
- single character only symbols
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
'*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }
key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }
the_ch = " " # dummy first char - but it must be a space the_col = 0 the_line = 1 input_file = None
- show error and exit
def error(line, col, msg):
print line, col, msg exit(1)
- get the next character from the input
def getc():
global the_ch, the_col, the_line
the_ch = input_file.read(1) the_col += 1 if the_ch == '\n': the_line += 1 the_col = 0 return the_ch
- 'x' - character constants
def char_lit(err_line, err_col):
n = ord(getc()) # skip opening quote if the_ch == '\: error(err_line, err_col, "empty character constant") elif the_ch == '\\': getc() if the_ch == 'n': n = 10 elif the_ch == '\\': n = '\\' else: error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch)) if getc() != '\: error(err_line, err_col, "multi-character constant") getc() return Integerk, err_line, err_col, n
- process divide or comments
def div_or_cmt(err_line, err_col):
if getc() != '*': return Div, err_line, err_col
# comment found while True: if getc() == '*' and getc() == '/': getc() return gettok() elif len(the_ch) == 0: error(err_line, err_col, "EOF in comment")
- "string"
def string_lit(start, err_line, err_col):
text = ""
while getc() != start: if len(the_ch) == 0: error(err_line, err_col, "EOF while scanning string literal") if the_ch == '\n': error(err_line, err_col, "EOL while scanning string literal") text += the_ch
getc() return Stringk, err_line, err_col, text
- handle identifiers and integers
def ident_or_int(err_line, err_col):
is_number = True text = ""
while the_ch.isalnum() or the_ch == '_': text += the_ch if not the_ch.isdigit(): is_number = False getc()
if len(text) == 0: error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
if text[0].isdigit(): if not is_number: error(err_line, err_col, "invalid number: %s" % (text)) n = int(text) return Integerk, err_line, err_col, n
if text in key_words: return key_words[text], err_line, err_col
return Ident, err_line, err_col, text
- look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
if getc() == expect: getc() return ifyes, err_line, err_col
if ifno == EOI: error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
return ifno, err_line, err_col
- return the next token type
def gettok():
while the_ch.isspace(): getc()
err_line = the_line err_col = the_col
if len(the_ch) == 0: return EOI, err_line, err_col elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col elif the_ch == '/': return div_or_cmt(err_line, err_col) elif the_ch == '\: return char_lit(err_line, err_col) elif the_ch == '<': return follow('=', Leq, Lss, err_line, err_col) elif the_ch == '!': return follow('=', Neq, EOI, err_line, err_col) elif the_ch == '&': return follow('&', And, EOI, err_line, err_col) elif the_ch == '"': return string_lit(the_ch, err_line, err_col) else: return ident_or_int(err_line, err_col)
- main driver
input_file = sys.stdin if len(sys.argv) > 1:
try: input_file = open(sys.argv[1], "r", 4096) except IOError as e: error(0, 0, "Can't open %s" % sys.argv[1])
while True:
t = gettok() tok = t[0] line = t[1] col = t[2]
if tok == Integerk: print "line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3]) elif tok == Ident: print "line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3]) elif tok == Stringk: print 'line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3]) else: print "line %5d col %5d %-8s" % (line, col, all_syms[tok])
if tok == EOI: break
</lang>