User:Ed Davis: Difference between revisions
Content added Content deleted
No edit summary |
m (Replaced content with "Hello, World!") |
||
(3 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
Hello, World! |
|||
{| class="wikitable" |
|||
|- |
|||
| <pre>line 4 col 1 Print </pre> |
|||
| <pre>line 4 col 6 Lparen </pre> |
|||
| <pre>line 4 col 7 String </pre>"Hello, World!\n" |
|||
| <pre>line 4 col 24 Rparen </pre> |
|||
| <pre>line 4 col 25 Semi </pre> |
|||
| <pre>line 5 col 1 EOI </pre> |
|||
|} |
|||
Lexical analysis is the process of converting a sequence of characters (such as in a |
|||
computer program or web page) into a sequence of tokens (strings with an identified |
|||
"meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, |
|||
or scanner (though "scanner" is also used to refer to the first stage of a lexer). |
|||
;The Task |
|||
Create a lexical analyzer for the Tiny programming language. The |
|||
program should read input from a file and/or stdin, and write |
|||
output to a file and/or stdout. |
|||
;Specification |
|||
The various token types are denoted below. |
|||
;Operators |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Common name !! Name |
|||
|- |
|||
| * || multiply || Mul |
|||
|- |
|||
| / || divide || Div |
|||
|- |
|||
| + || plus || Add |
|||
|- |
|||
| - || minus and unary minus || Sub and Uminus |
|||
|- |
|||
| < || less than || Lss |
|||
|- |
|||
| <= || less than or equal || Leq |
|||
|- |
|||
| > || greater than || Gtr |
|||
|- |
|||
| != || not equal || Neq |
|||
|- |
|||
| = || assign || Assign |
|||
|- |
|||
| && || and || And |
|||
|} |
|||
;Symbols |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Common name !! Name |
|||
|- |
|||
| ( || left parenthesis || Lparen |
|||
|- |
|||
| ) || right parenthesis || Rparen |
|||
|- |
|||
| { || left brace || Lbrace |
|||
|- |
|||
| } || right brace || Rbrace |
|||
|- |
|||
| ; || semi colon || Semi |
|||
|- |
|||
| , || comma || Comma |
|||
|} |
|||
;Keywords |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Name |
|||
|- |
|||
| if || If |
|||
|- |
|||
| while || While |
|||
|- |
|||
| print || Print |
|||
|- |
|||
| putc || Putc |
|||
|} |
|||
;Other entities |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Regular expression !! Name |
|||
|- |
|||
| integers || [0-9]+ || Integer |
|||
|- |
|||
| char literal || 'x' || Integer |
|||
|- |
|||
| identifiers || [_a-zA-Z][_a-zA-Z0-9]+ || Ident |
|||
|- |
|||
| string literal || ".*" || String |
|||
|} |
|||
Notes: For char literals, '\n' is supported as a new line |
|||
character. To represent \, use: '\\'. \n may also be used in |
|||
Strings, to print a newline. No other special sequences are |
|||
supported. |
|||
'''Comments''' /* ... */ (multi-line) |
|||
;Complete list of token names |
|||
'''EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident''' |
|||
;Program output |
|||
Output of the program should be: |
|||
* the word line, followed by: |
|||
* the line number where the token starts, followed by: |
|||
* the abbreviation col, followed by: |
|||
* the column number where the token starts, followed by: |
|||
* the token name. |
|||
* If the token name is one of Integer, Ident or String, the actual value of the same should follow. |
|||
;Test Cases |
|||
<lang c> |
|||
/* |
|||
Hello world |
|||
*/ |
|||
print("Hello, World!\n"); |
|||
</lang> |
|||
;Output |
|||
{| class="wikitable" |
|||
|- |
|||
| line || 4 || col || 1 || Print || |
|||
|- |
|||
| line || 4 || col || 6 || Lparen || |
|||
|- |
|||
| line || 4 || col || 7 || String || "Hello, World!\n" |
|||
|- |
|||
| line || 4 || col || 24 || Rparen || |
|||
|- |
|||
| line || 4 || col || 25 || Semi || |
|||
|- |
|||
| line || 5 || col || 1 || EOI || |
|||
|} |
|||
<lang c> |
|||
/* |
|||
Show Ident and Integers |
|||
*/ |
|||
phoenix_number = 142857; |
|||
print(phoenix_number, "\n"); |
|||
</lang> |
|||
;Output |
|||
{| class="wikitable" |
|||
|- |
|||
| line || 4 || col || 1 || Ident || phoenix_number |
|||
|- |
|||
| line || 4 || col || 16 || Assign || |
|||
|- |
|||
| line || 4 || col || 18 || Integer || 142857 |
|||
|- |
|||
| line || 4 || col || 24 || Semi || |
|||
|- |
|||
| line || 5 || col || 1 || Print || |
|||
|- |
|||
| line || 5 || col || 6 || Lparen || |
|||
|- |
|||
| line || 5 || col || 7 || Ident || phoenix_number |
|||
|- |
|||
| line || 5 || col || 21 || Comma || |
|||
|- |
|||
| line || 5 || col || 23 || String || "\n" |
|||
|- |
|||
| line || 5 || col || 27 || Rparen || |
|||
|- |
|||
| line || 5 || col || 28 || Semi || |
|||
|- |
|||
| line || 6 || col || 1 || EOI || |
|||
|} |
|||
;Diagnostics |
|||
The following error conditions should be caught: |
|||
* Empty character constant. Example: '' |
|||
* Unknown escape sequence. Example: '\r' |
|||
* Multi-character constant. Example: 'xx' |
|||
* End-of-file in comment. Closing comment characters not found. |
|||
* End-of-file while scanning string literal. Closing string character not found. |
|||
* End-of-line while scanning string literal. Closing string character not found before end-of-line. |
|||
* Unrecognized character. Example: | |
|||
;Reference |
|||
The C and Python versions can be considered reference implementations. |
|||
;Implementations |
|||
__TOC__ |
|||
=={{header|C}}== |
|||
<lang C> |
|||
#include <stdlib.h> |
|||
#include <stdio.h> |
|||
#include <stdarg.h> |
|||
#include <ctype.h> |
|||
#include <string.h> |
|||
#include <errno.h> |
|||
#include <stdbool.h> |
|||
#include <limits.h> |
|||
#define NELEMS(arr) (sizeof(arr) / sizeof(arr[0])) |
|||
#define da_dim(name, type) type *name = NULL; \ |
|||
int _qy_ ## name ## _p = 0; \ |
|||
int _qy_ ## name ## _max = 0 |
|||
#define da_rewind(name) _qy_ ## name ## _p = 0 |
|||
#define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \ |
|||
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0])) |
|||
#define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0) |
|||
#define da_len(name) _qy_ ## name ## _p |
|||
// dependancy: atr table in parse.c ordering is based on these |
|||
typedef enum { |
|||
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, |
|||
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident |
|||
} TokenType; |
|||
typedef struct { |
|||
int tok; |
|||
int err_ln, err_col; |
|||
union { |
|||
int n; /* value for constants */ |
|||
char *text; /* text for idents */ |
|||
}; |
|||
} tok_s; |
|||
static FILE *source_fp, *dest_fp; |
|||
static int line, col, the_ch; |
|||
da_dim(text, char); |
|||
tok_s gettok(); |
|||
static void error(int err_line, int err_col, const char *fmt, ... ) { |
|||
char buf[1000]; |
|||
va_list ap; |
|||
va_start(ap, fmt); |
|||
vsprintf(buf, fmt, ap); |
|||
va_end(ap); |
|||
printf("(%d,%d) error: %s\n", err_line, err_col, buf); |
|||
exit(1); |
|||
} |
|||
static void read_ch() { /* get next char from input */ |
|||
the_ch = getc(source_fp); |
|||
++col; |
|||
if (the_ch == '\n') { |
|||
++line; |
|||
col = 0; |
|||
} |
|||
} |
|||
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */ |
|||
if (the_ch == '\'') |
|||
error(err_line, err_col, "gettok: empty character constant"); |
|||
if (the_ch == '\\') { |
|||
read_ch(); |
|||
if (the_ch == 'n') |
|||
n = 10; |
|||
else if (the_ch == '\\') |
|||
n = '\\'; |
|||
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); |
|||
} |
|||
read_ch(); |
|||
if (the_ch != '\'') error(err_line, err_col, "multi-character constant"); |
|||
read_ch(); |
|||
return (tok_s){Integerk, err_line, err_col, {n}}; |
|||
} |
|||
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */ |
|||
if (the_ch != '*') |
|||
return (tok_s){Div, err_line, err_col, {0}}; |
|||
/* comment found */ |
|||
for (;;) { |
|||
read_ch(); |
|||
if (the_ch == '*' || the_ch == EOF) { |
|||
read_ch(); |
|||
if (the_ch == '/' || the_ch == EOF) { |
|||
read_ch(); |
|||
return gettok(); |
|||
} |
|||
} |
|||
} |
|||
} |
|||
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */ |
|||
da_rewind(text); |
|||
for (read_ch(); the_ch != start; read_ch()) { |
|||
if (the_ch == '\n') |
|||
error(err_line, err_col, "EOL in string"); |
|||
if (the_ch == EOF) |
|||
error(err_line, err_col, "EOF in string"); |
|||
da_append(text, (char)the_ch); |
|||
} |
|||
da_append(text, '\0'); |
|||
read_ch(); |
|||
return (tok_s){Stringk, err_line, err_col, {.text=text}}; |
|||
} |
|||
static int kwd_cmp(const void *p1, const void *p2) { |
|||
return strcmp(*(char **)p1, *(char **)p2); |
|||
} |
|||
static TokenType get_ident_type(const char *ident) { |
|||
static struct { |
|||
char *s; |
|||
TokenType sym; |
|||
} kwds[] = { |
|||
{"if", If}, |
|||
{"print", Print}, |
|||
{"putc", Putc}, |
|||
{"while", While}, |
|||
}, *kwp; |
|||
return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym; |
|||
} |
|||
static tok_s ident_or_int(int err_line, int err_col) { |
|||
int n, is_number = true; |
|||
da_rewind(text); |
|||
while (isalnum(the_ch) || the_ch == '_') { |
|||
da_append(text, (char)the_ch); |
|||
if (!isdigit(the_ch)) |
|||
is_number = false; |
|||
read_ch(); |
|||
} |
|||
if (da_len(text) == 0) |
|||
error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch); |
|||
da_append(text, '\0'); |
|||
if (isdigit(text[0])) { |
|||
if (!is_number) |
|||
error(err_line, err_col, "invalid number: %s\n", text); |
|||
n = strtol(text, NULL, 0); |
|||
if (n == LONG_MAX && errno == ERANGE) |
|||
error(err_line, err_col, "Number exceeds maximum value"); |
|||
return (tok_s){Integerk, err_line, err_col, {n}}; |
|||
} |
|||
return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}}; |
|||
} |
|||
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */ |
|||
if (the_ch == expect) { |
|||
read_ch(); |
|||
return (tok_s){ifyes, err_line, err_col, {0}}; |
|||
} |
|||
if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch); |
|||
return (tok_s){ifno, err_line, err_col, {0}}; |
|||
} |
|||
tok_s gettok() { /* return the token type */ |
|||
/* skip white space */ |
|||
while (isspace(the_ch)) |
|||
read_ch(); |
|||
int err_line = line; |
|||
int err_col = col; |
|||
switch (the_ch) { |
|||
case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}}; |
|||
case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}}; |
|||
case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}}; |
|||
case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}}; |
|||
case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}}; |
|||
case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}}; |
|||
case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}}; |
|||
case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}}; |
|||
case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}}; |
|||
case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}}; |
|||
case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}}; |
|||
case '/': read_ch(); return div_or_cmt(err_line, err_col); |
|||
case '\'': read_ch(); return char_lit(the_ch, err_line, err_col); |
|||
case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col); |
|||
case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col); |
|||
case '&': read_ch(); return follow('&', And, EOI, err_line, err_col); |
|||
case '"' : return string_lit(the_ch, err_line, err_col); |
|||
default: return ident_or_int(err_line, err_col); |
|||
case EOF: return (tok_s){EOI, err_line, err_col, {0}}; |
|||
} |
|||
} |
|||
void init_lex() { /* initialize the scanner */ |
|||
line = 1; |
|||
read_ch(); |
|||
} |
|||
void run() { /* tokenize the given input */ |
|||
tok_s tok; |
|||
do { |
|||
tok = gettok(); |
|||
fprintf(dest_fp, "line %5d col %5d %.8s", |
|||
tok.err_ln, tok.err_col, |
|||
&"EOI Print Putc If While Lbrace Rbrace Lparen Rparen " |
|||
"Uminus Mul Div Add Sub Lss Gtr Leq Neq " |
|||
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]); |
|||
if (tok.tok == Integerk) |
|||
fprintf(dest_fp, " %8d", tok.n); |
|||
else if (tok.tok == Ident) |
|||
fprintf(dest_fp, " %s", tok.text); |
|||
else if (tok.tok == Stringk) |
|||
fprintf(dest_fp, " \"%s\"", tok.text); |
|||
fprintf(dest_fp, "\n"); |
|||
} while (tok.tok != EOI); |
|||
if (dest_fp != stdout) |
|||
fclose(dest_fp); |
|||
} |
|||
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) { |
|||
if (fn[0] == '\0') |
|||
*fp = std; |
|||
else if ((*fp = fopen(fn, mode)) == NULL) |
|||
error(0, 0, "Can't open %s\n", fn); |
|||
} |
|||
int main(int argc, char *argv[]) { |
|||
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : ""); |
|||
init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : ""); |
|||
init_lex(); |
|||
run(); |
|||
} |
|||
</lang> |
|||
=={{header|Python}}== |
|||
<lang Python> |
|||
import sys |
|||
# following two must remain in the same order |
|||
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \ |
|||
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25) |
|||
all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen', |
|||
'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And', |
|||
'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ] |
|||
# single character only symbols |
|||
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub, |
|||
'*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign } |
|||
key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While } |
|||
the_ch = " " # dummy first char - but it must be a space |
|||
the_col = 0 |
|||
the_line = 1 |
|||
input_file = None |
|||
#*** show error and exit |
|||
def error(line, col, msg): |
|||
print(line, col, msg) |
|||
exit(1) |
|||
#*** get the next character from the input |
|||
def getc(): |
|||
global the_ch, the_col, the_line |
|||
the_ch = input_file.read(1) |
|||
the_col += 1 |
|||
if the_ch == '\n': |
|||
the_line += 1 |
|||
the_col = 0 |
|||
return the_ch |
|||
#*** 'x' - character constants |
|||
def char_lit(err_line, err_col): |
|||
n = ord(getc()) # skip opening quote |
|||
if the_ch == '\'': |
|||
error(err_line, err_col, "empty character constant") |
|||
elif the_ch == '\\': |
|||
getc() |
|||
if the_ch == 'n': |
|||
n = 10 |
|||
elif the_ch == '\\': |
|||
n = '\\' |
|||
else: |
|||
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch)) |
|||
if getc() != '\'': |
|||
error(err_line, err_col, "multi-character constant") |
|||
getc() |
|||
return Integerk, err_line, err_col, n |
|||
#*** process divide or comments |
|||
def div_or_cmt(err_line, err_col): |
|||
if getc() != '*': |
|||
return Div, err_line, err_col |
|||
# comment found |
|||
while True: |
|||
if getc() == '*' and getc() == '/': |
|||
getc() |
|||
return gettok() |
|||
elif len(the_ch) == 0: |
|||
error(err_line, err_col, "EOF in comment") |
|||
#*** "string" |
|||
def string_lit(start, err_line, err_col): |
|||
text = "" |
|||
while getc() != start: |
|||
if len(the_ch) == 0: |
|||
error(err_line, err_col, "EOF while scanning string literal") |
|||
if the_ch == '\n': |
|||
error(err_line, err_col, "EOL while scanning string literal") |
|||
text += the_ch |
|||
getc() |
|||
return Stringk, err_line, err_col, text |
|||
#*** handle identifiers and integers |
|||
def ident_or_int(err_line, err_col): |
|||
is_number = True |
|||
text = "" |
|||
while the_ch.isalnum() or the_ch == '_': |
|||
text += the_ch |
|||
if not the_ch.isdigit(): |
|||
is_number = False |
|||
getc() |
|||
if len(text) == 0: |
|||
error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch)) |
|||
if text[0].isdigit(): |
|||
if not is_number: |
|||
error(err_line, err_col, "invalid number: %s" % (text)) |
|||
n = int(text) |
|||
return Integerk, err_line, err_col, n |
|||
if text in key_words: |
|||
return key_words[text], err_line, err_col |
|||
return Ident, err_line, err_col, text |
|||
#*** look ahead for '>=', etc. |
|||
def follow(expect, ifyes, ifno, err_line, err_col): |
|||
if getc() == expect: |
|||
getc() |
|||
return ifyes, err_line, err_col |
|||
if ifno == EOI: |
|||
error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch)) |
|||
return ifno, err_line, err_col |
|||
#*** return the next token type |
|||
def gettok(): |
|||
while the_ch.isspace(): |
|||
getc() |
|||
err_line = the_line |
|||
err_col = the_col |
|||
if len(the_ch) == 0: return EOI, err_line, err_col |
|||
elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col |
|||
elif the_ch == '/': return div_or_cmt(err_line, err_col) |
|||
elif the_ch == '\'': return char_lit(err_line, err_col) |
|||
elif the_ch == '<': return follow('=', Leq, Lss, err_line, err_col) |
|||
elif the_ch == '!': return follow('=', Neq, EOI, err_line, err_col) |
|||
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col) |
|||
elif the_ch == '"': return string_lit(the_ch, err_line, err_col) |
|||
else: return ident_or_int(err_line, err_col) |
|||
#*** main driver |
|||
input_file = sys.stdin |
|||
if len(sys.argv) > 1: |
|||
try: |
|||
input_file = open(sys.argv[1], "r", 4096) |
|||
except IOError as e: |
|||
error(0, 0, "Can't open %s" % sys.argv[1]) |
|||
while True: |
|||
t = gettok() |
|||
tok = t[0] |
|||
line = t[1] |
|||
col = t[2] |
|||
if tok == Integerk: |
|||
print("line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3])) |
|||
elif tok == Ident: |
|||
print("line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3])) |
|||
elif tok == Stringk: |
|||
print('line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3])) |
|||
else: |
|||
print("line %5d col %5d %-8s" % (line, col, all_syms[tok])) |
|||
if tok == EOI: |
|||
break |
|||
</lang> |
Latest revision as of 03:37, 14 August 2016
Hello, World!