Compiler/lexical analyzer: Difference between revisions
Content added Content deleted
Line 6,821: | Line 6,821: | ||
23 1 End_of_input |
23 1 End_of_input |
||
</pre> |
</pre> |
||
=={{header|Icon}}== |
|||
{{trans|ATS}} |
|||
{{works with|Icon|9.5.20i}} |
|||
This implementation was developed for Arizona Icon, but ought to work with the Unicon compiler, as well. |
|||
One interesting aspect is the use of co-expressions to handle "input with pushback". The main advantage of this approach is it hides the pushback buffer from the user, without making the buffer a global variable. |
|||
Global variables are avoided except for some constants that require initialization. |
|||
<lang Icon># |
|||
# The Rosetta Code lexical analyzer in Icon with co-expressions. Based |
|||
# upon the ATS implementation. |
|||
# |
|||
# Usage: lex [INPUTFILE [OUTPUTFILE]] |
|||
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input |
|||
# or standard output is used, respectively. *) |
|||
# |
|||
$define EOF -1 |
|||
$define TOKEN_ELSE 0 |
|||
$define TOKEN_IF 1 |
|||
$define TOKEN_PRINT 2 |
|||
$define TOKEN_PUTC 3 |
|||
$define TOKEN_WHILE 4 |
|||
$define TOKEN_MULTIPLY 5 |
|||
$define TOKEN_DIVIDE 6 |
|||
$define TOKEN_MOD 7 |
|||
$define TOKEN_ADD 8 |
|||
$define TOKEN_SUBTRACT 9 |
|||
$define TOKEN_NEGATE 10 |
|||
$define TOKEN_LESS 11 |
|||
$define TOKEN_LESSEQUAL 12 |
|||
$define TOKEN_GREATER 13 |
|||
$define TOKEN_GREATEREQUAL 14 |
|||
$define TOKEN_EQUAL 15 |
|||
$define TOKEN_NOTEQUAL 16 |
|||
$define TOKEN_NOT 17 |
|||
$define TOKEN_ASSIGN 18 |
|||
$define TOKEN_AND 19 |
|||
$define TOKEN_OR 20 |
|||
$define TOKEN_LEFTPAREN 21 |
|||
$define TOKEN_RIGHTPAREN 22 |
|||
$define TOKEN_LEFTBRACE 23 |
|||
$define TOKEN_RIGHTBRACE 24 |
|||
$define TOKEN_SEMICOLON 25 |
|||
$define TOKEN_COMMA 26 |
|||
$define TOKEN_IDENTIFIER 27 |
|||
$define TOKEN_INTEGER 28 |
|||
$define TOKEN_STRING 29 |
|||
$define TOKEN_END_OF_INPUT 30 |
|||
global whitespace |
|||
global ident_start |
|||
global ident_continuation |
|||
procedure main(args) |
|||
local inpf, outf |
|||
local pushback_buffer, inp, pushback |
|||
initial { |
|||
whitespace := ' \t\v\f\r\n' |
|||
ident_start := '_' ++ &letters |
|||
ident_continuation := ident_start ++ &digits |
|||
} |
|||
inpf := &input |
|||
outf := &output |
|||
if 1 <= *args & args[1] ~== "-" then inpf := open(args[1], "rt") |
|||
if 2 <= *args & args[2] ~== "-" then outf := open(args[2], "wt") |
|||
pushback_buffer := [] |
|||
inp := create inputter(inpf, pushback_buffer) |
|||
pushback := create repeat push(pushback_buffer, \@&source) |
|||
@pushback # The first invocation does nothing. |
|||
scan_text(outf, inp, pushback) |
|||
end |
|||
procedure scan_text(outf, inp, pushback) |
|||
local ch |
|||
while /ch | ch[1] ~=== EOF do { |
|||
skip_spaces_and_comments(inp, pushback) |
|||
ch := @inp |
|||
if ch[1] === EOF then { |
|||
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]]) |
|||
} else { |
|||
ch @pushback |
|||
print_token(outf, get_next_token(inp, pushback)) |
|||
} |
|||
} |
|||
end |
|||
procedure get_next_token(inp, pushback) |
|||
local ch, ch1 |
|||
local ln, cn |
|||
skip_spaces_and_comments(inp, pushback) |
|||
ch := @inp |
|||
ln := ch[2] # line number |
|||
cn := ch[3] # column number |
|||
case ch[1] of { |
|||
"," : return [TOKEN_COMMA, ",", ln, cn] |
|||
";" : return [TOKEN_SEMICOLON, ";", ln, cn] |
|||
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn] |
|||
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn] |
|||
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn] |
|||
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn] |
|||
"*" : return [TOKEN_MULTIPLY, "*", ln, cn] |
|||
"/" : return [TOKEN_DIVIDE, "/", ln, cn] |
|||
"%" : return [TOKEN_MOD, "%", ln, cn] |
|||
"+" : return [TOKEN_ADD, "+", ln, cn] |
|||
"-" : return [TOKEN_SUBTRACT, "-", ln, cn] |
|||
"<" : { |
|||
ch1 := @inp |
|||
if ch1[1] === "=" then { |
|||
return [TOKEN_LESSEQUAL, "<=", ln, cn] |
|||
} else { |
|||
ch1 @pushback |
|||
return [TOKEN_LESS, "<", ln, cn] |
|||
} |
|||
} |
|||
">" : { |
|||
ch1 := @inp |
|||
if ch1[1] === "=" then { |
|||
return [TOKEN_GREATEREQUAL, ">=", ln, cn] |
|||
} else { |
|||
ch1 @pushback |
|||
return [TOKEN_GREATER, ">", ln, cn] |
|||
} |
|||
} |
|||
"=" : { |
|||
ch1 := @inp |
|||
if ch1[1] === "=" then { |
|||
return [TOKEN_EQUAL, "==", ln, cn] |
|||
} else { |
|||
ch1 @pushback |
|||
return [TOKEN_ASSIGN, "=", ln, cn] |
|||
} |
|||
} |
|||
"!" : { |
|||
ch1 := @inp |
|||
if ch1[1] === "=" then { |
|||
return [TOKEN_NOTEQUAL, "!=", ln, cn] |
|||
} else { |
|||
ch1 @pushback |
|||
return [TOKEN_NOT, "!", ln, cn] |
|||
} |
|||
} |
|||
"&" : { |
|||
ch1 := @inp |
|||
if ch1[1] === "&" then { |
|||
return [TOKEN_AND, "&&", ln, cn] |
|||
} else { |
|||
unexpected_character(ln, cn, ch) |
|||
} |
|||
} |
|||
"|" : { |
|||
ch1 := @inp |
|||
if ch1[1] === "|" then { |
|||
return [TOKEN_OR, "||", ln, cn] |
|||
} else { |
|||
unexpected_character(ln, cn, ch) |
|||
} |
|||
} |
|||
"\"" : { |
|||
ch @pushback |
|||
return scan_string_literal(inp) |
|||
} |
|||
"'" : { |
|||
ch @pushback |
|||
return scan_character_literal(inp, pushback) |
|||
} |
|||
default : { |
|||
if (ch[1] ? any(&digits)) then { |
|||
ch @pushback |
|||
return scan_integer_literal(inp, pushback) |
|||
} else if (ch[1] ? any(ident_start)) then { |
|||
ch @pushback |
|||
return scan_identifier_or_reserved_word (inp, pushback) |
|||
} else { |
|||
unexpected_character(ln, cn, ch) |
|||
} |
|||
} |
|||
} |
|||
end |
|||
procedure scan_identifier_or_reserved_word(inp, pushback) |
|||
local ch |
|||
local s |
|||
local line_no, column_no |
|||
s := "" |
|||
ch := @inp |
|||
line_no := ch[2] |
|||
column_no := ch[3] |
|||
while EOF ~=== ch[1] & (ch[1] ? any(ident_continuation)) do { |
|||
s ||:= ch[1] |
|||
ch := @inp |
|||
} |
|||
ch @pushback |
|||
return reserved_word_lookup (s, line_no, column_no) |
|||
end |
|||
procedure scan_integer_literal(inp, pushback) |
|||
local ch |
|||
local s |
|||
local line_no, column_no |
|||
s := "" |
|||
ch := @inp |
|||
line_no := ch[2] |
|||
column_no := ch[3] |
|||
while EOF ~=== ch[1] & (ch[1] ? any(ident_continuation)) do { |
|||
s ||:= ch[1] |
|||
ch := @inp |
|||
} |
|||
ch @pushback |
|||
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s) |
|||
return [TOKEN_INTEGER, s, line_no, column_no] |
|||
end |
|||
procedure scan_character_literal(inp, pushback) |
|||
local ch, ch1, ch2 |
|||
local close_quote |
|||
local toktup |
|||
local line_no, column_no |
|||
ch := @inp # The opening quote. |
|||
close_quote := ch[1] # Same as the opening quote. |
|||
ch @pushback |
|||
line_no := ch[2] |
|||
column_no := ch[3] |
|||
toktup := scan_character_literal_without_checking_end(inp) |
|||
ch1 := @inp |
|||
while EOF ~=== ch1[1] & ch1[1] ~== close_quote do { |
|||
case ch1[1] of { |
|||
EOF : unterminated_character_literal(line_no, column_no) |
|||
close_quote : multicharacter_literal(line_no, column_no) |
|||
default : ch1 := @inp |
|||
} |
|||
} |
|||
return toktup |
|||
end |
|||
procedure scan_character_literal_without_checking_end(inp) |
|||
local ch, ch1, ch2 |
|||
ch := @inp # The opening quote. |
|||
ch1 := @inp |
|||
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3]) |
|||
if ch1[1] == "\\" then { |
|||
ch2 := @inp |
|||
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3]) |
|||
case ch2[1] of { |
|||
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]] |
|||
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]] |
|||
default : unsupported_escape(ch1[2], ch1[3], ch2) |
|||
} |
|||
} else { |
|||
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]] |
|||
} |
|||
end |
|||
procedure scan_string_literal(inp) |
|||
local ch, ch1, ch2 |
|||
local line_no, column_no |
|||
local close_quote |
|||
local s |
|||
local retval |
|||
ch := @inp # The opening quote |
|||
close_quote := ch[1] # Same as the opening quote. |
|||
line_no := ch[2] |
|||
column_no := ch[3] |
|||
s := ch[1] |
|||
until \retval do { |
|||
ch1 := @inp |
|||
ch1[1] ~=== EOF | |
|||
unterminated_string_literal (line_no, column_no, |
|||
"end of input") |
|||
ch1[1] ~== "\n" | |
|||
unterminated_string_literal (line_no, column_no, |
|||
"end of line") |
|||
if ch1[1] == close_quote then { |
|||
retval := [TOKEN_STRING, s || close_quote, line_no, column_no] |
|||
} else if ch1[1] ~== "\\" then { |
|||
s ||:= ch1[1] |
|||
} else { |
|||
ch2 := @inp |
|||
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2) |
|||
case ch2[1] of { |
|||
"n" : s ||:= "\\n" |
|||
"\\" : s ||:= "\\\\" |
|||
default : unsupported_escape(line_no, column_no, ch2) |
|||
} |
|||
} |
|||
} |
|||
return retval |
|||
end |
|||
procedure skip_spaces_and_comments(inp, pushback) |
|||
local ch, ch1 |
|||
repeat { |
|||
ch := @inp |
|||
(EOF === ch[1]) & { ch @pushback; return } |
|||
if not (ch[1] ? any(whitespace)) then { |
|||
(ch[1] == "/") | { ch @pushback; return } |
|||
(ch1 := @inp) | { ch @pushback; return } |
|||
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return } |
|||
scan_comment(inp, ch[2], ch[3]) |
|||
} |
|||
} |
|||
end |
|||
procedure scan_comment(inp, line_no, column_no) |
|||
local ch, ch1 |
|||
until (\ch)[1] == "*" & (\ch1)[1] == "/" do { |
|||
ch := @inp |
|||
(EOF === ch[1]) & unterminated_comment(line_no, column_no) |
|||
if ch[1] == "*" then { |
|||
ch1 := @inp |
|||
(EOF === ch[1]) & unterminated_comment(line_no, column_no) |
|||
} |
|||
} |
|||
return |
|||
end |
|||
procedure reserved_word_lookup(s, line_no, column_no) |
|||
# Lookup is by an extremely simple perfect hash. |
|||
static reserved_words |
|||
static reserved_word_tokens |
|||
local hashval, token, toktup |
|||
initial { |
|||
reserved_words := ["if", "print", "else", |
|||
"", "putc", "", |
|||
"", "while", ""] |
|||
reserved_word_tokens := |
|||
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE, |
|||
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER, |
|||
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER] |
|||
} |
|||
if *s < 2 then { |
|||
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no] |
|||
} else { |
|||
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1 |
|||
token := reserved_word_tokens[hashval] |
|||
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then { |
|||
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no] |
|||
} else { |
|||
toktup := [token, s, line_no, column_no] |
|||
} |
|||
} |
|||
return toktup |
|||
end |
|||
procedure print_token(outf, toktup) |
|||
static token_names |
|||
local s_line, s_column |
|||
initial { |
|||
token_names := ["Keyword_else", |
|||
"Keyword_if", |
|||
"Keyword_print", |
|||
"Keyword_putc", |
|||
"Keyword_while", |
|||
"Op_multiply", |
|||
"Op_divide", |
|||
"Op_mod", |
|||
"Op_add", |
|||
"Op_subtract", |
|||
"Op_negate", |
|||
"Op_less", |
|||
"Op_lessequal", |
|||
"Op_greater", |
|||
"Op_greaterequal", |
|||
"Op_equal", |
|||
"Op_notequal", |
|||
"Op_not", |
|||
"Op_assign", |
|||
"Op_and", |
|||
"Op_or", |
|||
"LeftParen", |
|||
"RightParen", |
|||
"LeftBrace", |
|||
"RightBrace", |
|||
"Semicolon", |
|||
"Comma", |
|||
"Identifier", |
|||
"Integer", |
|||
"String", |
|||
"End_of_input"] |
|||
} |
|||
/outf := &output |
|||
s_line := string(toktup[3]) |
|||
s_column := string(toktup[4]) |
|||
writes(outf, right (s_line, max(5, *s_line))) |
|||
writes(outf, " ") |
|||
writes(outf, right (s_column, max(5, *s_column))) |
|||
writes(outf, " ") |
|||
writes(outf, token_names[toktup[1] + 1]) |
|||
case toktup[1] of { |
|||
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2]) |
|||
TOKEN_INTEGER : writes(outf, " ", toktup[2]) |
|||
TOKEN_STRING : writes(outf, " ", toktup[2]) |
|||
} |
|||
write(outf) |
|||
return |
|||
end |
|||
procedure inputter(inpf, pushback_buffer) |
|||
local buffer |
|||
local line_no, column_no |
|||
local c |
|||
buffer := "" |
|||
line_no := 1 |
|||
column_no := 1 |
|||
repeat { |
|||
buffer? { |
|||
until *pushback_buffer = 0 & pos(0) do { |
|||
if *pushback_buffer ~= 0 then { |
|||
suspend pop(pushback_buffer) |
|||
} else { |
|||
c := move(1) |
|||
suspend [c, line_no, column_no] |
|||
if c == "\n" then { |
|||
line_no +:= 1 |
|||
column_no := 1 |
|||
} else { |
|||
column_no +:= 1 |
|||
} |
|||
} |
|||
} |
|||
} |
|||
(buffer := reads(inpf, 2048)) | |
|||
suspend [EOF, line_no, column_no] |
|||
} |
|||
end |
|||
procedure unterminated_comment(line_no, column_no) |
|||
error("unterminated comment starting at ", |
|||
line_no, ":", column_no) |
|||
end |
|||
procedure unexpected_character(line_no, column_no, ch) |
|||
error("unexpected character '", ch[1], "' starting at ", |
|||
line_no, ":", column_no) |
|||
end |
|||
procedure unterminated_string_literal (line_no, column_no, cause) |
|||
error("unterminated string literal (", cause, ") starting at ", |
|||
line_no, ":", column_no) |
|||
end |
|||
procedure unsupported_escape (line_no, column_no, ch) |
|||
if ch[1] === EOF then { |
|||
error("unexpected \\ at end of input", |
|||
" starting at ", line_no, ":", column_no) |
|||
} else { |
|||
error("unsupported escape \\", ch[1], |
|||
" starting at ", line_no, ":", column_no) |
|||
} |
|||
end |
|||
procedure invalid_integer_literal(line_no, column_no, s) |
|||
error("invalid integer literal ", s, |
|||
" starting at ", line_no, ":", column_no) |
|||
end |
|||
procedure unterminated_character_literal(line_no, column_no) |
|||
error("unterminated character literal starting at ", |
|||
line_no, ":", column_no) |
|||
end |
|||
procedure multicharacter_literal(line_no, column_no) |
|||
error("unsupported multicharacter literal starting at ", |
|||
line_no, ":", column_no) |
|||
end |
|||
procedure error(args[]) |
|||
write!([&errout] ||| args) |
|||
exit(1) |
|||
end |
|||
procedure max(x, y) |
|||
return (if x < y then y else x) |
|||
end</lang> |
|||
=={{header|J}}== |
=={{header|J}}== |