Compiler/lexical analyzer
22 30 End_of_input</pre>
</b>

=={{header|Julia}}==
<lang julia>struct Tokenized
    startline::Int
    startcol::Int
    name::String
    value::Union{Nothing, Int, String}
end
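
# Lookup tables mapping operator, keyword, and punctuation spellings to the
# token names required by the task.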
const optokens = Dict("*" => "Op_multiply", "/" => "Op_divide", "%" => "Op_mod", "+" => "Op_add",
                      "-" => "Op_subtract", "!" => "Op_not", "<" => "Op_less", "<=" => "Op_lessequal",
                      ">" => "Op_greater", ">=" => "Op_greaterequal", "==" => "Op_equal", "!=" => "Op_notequal",
                      "=" => "Op_assign", "&&" => "Op_and", "||" => "Op_or")

const keywordtokens = Dict("if" => "Keyword_if", "else" => "Keyword_else", "while" => "Keyword_while",
                           "print" => "Keyword_print", "putc" => "Keyword_putc")

const symboltokens = Dict("(" => "LeftParen", ")" => "RightParen", "{" => "LeftBrace",
                          "}" => "RightBrace", ";" => "Semicolon", "," => "Comma")

const errors = ["Empty character constant.", "Unknown escape sequence.", "Multi-character constant.",
                "End-of-file in comment. Closing comment characters not found.",
                "End-of-file while scanning string literal. Closing string character not found.",
                "End-of-line while scanning string literal. Closing string character not found before end-of-line.",
                "Unrecognized character.", "Invalid number. Starts like a number, but ends in non-numeric characters."]
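
# asws turns a matched comment into whitespace of the same length and newline count,
# so line numbers (and, for one-line comments, column positions) are unchanged;
# comment2ws applies it to every /* ... */ comment. The remaining one-liners detect
# the error conditions listed in `errors`.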
asws(s) = (nnl = length(findall(x -> x == '\n', s)); " " ^ (length(s) - nnl) * "\n" ^ nnl)
comment2ws(t) = (while occursin("/*", t) t = replace(t, r"\/\* .+? (?: \*\/)"xs => asws; count = 1) end; t)
hasinvalidescapes(t) = ((m = match(r"\\.", t)) != nothing && m.match != "\\\\" && m.match != "\\n")
hasemptycharconstant(t) = (match(r"\'\'", t) != nothing)
hasmulticharconstant(t) = ((m = match(r"\'[^\'][^\']+\'", t)) != nothing && m.match != "\'\\\\\'" && m.match != "\'\\n\'")
hasunbalancedquotes(t) = isodd(length(findall(x -> x == '\"', t)))
hasunrecognizedchar(t) = match(r"[^\w\s\d\*\/\%\+\-\<\>\=\!\&\|\(\)\{\}\;\,\"\'\\]", t) != nothing
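
# Check one (comment-stripped) line and throw the matching error message from `errors`.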
function throwiferror(line, n)
    if hasemptycharconstant(line)
        throw("Tokenizer error line $n: " * errors[1])
    end
    if hasinvalidescapes(line)
        throw("Tokenizer error line $n: " * errors[2])
    end
    if hasmulticharconstant(line)
        println("error at ", match(r"\'[^\'][^\']+\'", line).match)
        throw("Tokenizer error line $n: " * errors[3])
    end
    if occursin("/*", line)
        throw("Tokenizer error line $n: " * errors[4])
    end
    if hasunrecognizedchar(line)
        throw("Tokenizer error line $n: " * errors[7])
    end
end
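
# Tokenize the whole program: comments are blanked out first, then each line is
# scanned column by column. `withintoken` counts columns already consumed by a
# multi-character token so they are skipped on later iterations of the loop.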
function tokenize(txt)
    tokens = Vector{Tokenized}()
    txt = comment2ws(txt)
    lines = split(txt, "\n")
    if hasunbalancedquotes(txt)
        throw("Tokenizer error: $(errors[5])")
    end
    for (startline, line) in enumerate(lines)
        if strip(line) == ""
            continue
        end
        throwiferror(line, startline)
        lastc = Char(0)
        withintoken = 0
        for (startcol, c) in enumerate(line)
            if withintoken > 0
                withintoken -= 1
                continue
            elseif isspace(c)
                continue
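            # A '=' that follows '<', '>', '=' or '!' upgrades the single-character
            # token just pushed into the corresponding two-character operator.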
            elseif (c == '=') && (startcol > 1) && ((c2 = line[startcol - 1]) in ['<', '>', '=', '!'])
                tokens[end] = Tokenized(startline, startcol - 1, optokens[c2 * c], nothing)
            elseif (c == '&') || (c == '|')
                if length(line) > startcol && line[startcol + 1] == c
                    push!(tokens, Tokenized(startline, startcol, optokens[c * c], nothing))
                    withintoken = 1
                else
                    throw("Tokenizer error line $startline: $(errors[7])")
                end
            elseif haskey(optokens, string(c))
                push!(tokens, Tokenized(startline, startcol, optokens[string(c)], nothing))
            elseif haskey(symboltokens, string(c))
                push!(tokens, Tokenized(startline, startcol, symboltokens[string(c)], nothing))
            elseif isdigit(c)
                integerstring = match(r"^\d+", line[startcol:end]).match
                i = parse(Int, integerstring)
                push!(tokens, Tokenized(startline, startcol, "Integer", i))
                withintoken = length(integerstring) - 1
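            # Character constant: a single character or a \n or \\ escape between
            # single quotes, emitted as an Integer token holding the character code.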
            elseif c == Char(39) # single quote
                if (m = match(r"([^\\\'\n]|\\n|\\\\)\'", line[startcol+1:end])) != nothing
                    chs = m.captures[1]
                    i = (chs == "\\n") ? Int('\n') : (chs == "\\\\" ? Int('\\') : Int(chs[1]))
                    push!(tokens, Tokenized(startline, startcol, "Integer", i))
                    withintoken = length(chs) + 1
                else
                    println("line $startline: bad match with ", line[startcol+1:end])
                end
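            # String literal: everything up to the closing double quote on this line.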
            elseif c == Char(34) # double quote
                if (m = match(r"([^\"\n]+)\"", line[startcol+1:end])) == nothing
                    throw("Tokenizer error line $startline: " * errors[6])
                end
                litstring = m.captures[1]
                push!(tokens, Tokenized(startline, startcol, "String", "\"$litstring\""))
                withintoken = length(litstring) + 1
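            # Alphabetic run: a keyword if present in the keyword table, otherwise an
            # identifier (which may also contain digits and underscores).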
            elseif (cols = findfirst(r"[a-zA-Z]+", line[startcol:end])) != nothing
                litstring = line[cols .+ startcol .- 1]
                if haskey(keywordtokens, string(litstring))
                    push!(tokens, Tokenized(startline, startcol, keywordtokens[litstring], nothing))
                else
                    litstring = match(r"[_a-zA-Z0-9]+", line[startcol:end]).match
                    push!(tokens, Tokenized(startline, startcol, "Identifier", string(litstring)))
                end
                withintoken = length(litstring) - 1
            end
            lastc = c
        end
    end
    push!(tokens, Tokenized(length(lines), length(lines[end]) + 1, "End_of_input", nothing))
    tokens
end
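
# Test input (the task's "all lexical tokens" example): every token kind appears once.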
const test3txt = raw"""
/*
  All lexical tokens - not syntactically correct, but that will
  have to wait until syntax analysis
*/
/* Print   */  print    /* Sub     */  -
/* Putc    */  putc     /* Lss     */  <
/* If      */  if       /* Gtr     */  >
/* Else    */  else     /* Leq     */  <=
/* While   */  while    /* Geq     */  >=
/* Lbrace  */  {        /* Eq      */  ==
/* Rbrace  */  }        /* Neq     */  !=
/* Lparen  */  (        /* And     */  &&
/* Rparen  */  )        /* Or      */  ||
/* Uminus  */  -        /* Semi    */  ;
/* Not     */  !        /* Comma   */  ,
/* Mul     */  *        /* Assign  */  =
/* Div     */  /        /* Integer */  42
/* Mod     */  %        /* String  */  "String literal"
/* Add     */  +        /* Ident   */  variable_name
/* character literal */  '\n'
/* character literal */  '\\'
/* character literal */  ' '
"""
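
# Print the token stream as the Line/Col/Name/Value table required by the task.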
println("Line Col Name Value") |
|||
for tok in tokenize(test3txt) |
|||
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "") |
|||
end |
|||
</lang>{{output}}<pre> |
Line Col Name Value
  5   16     Keyword_print
  5   40       Op_subtract
  6   16      Keyword_putc
  6   40           Op_less
  7   16        Keyword_if
  7   40        Op_greater
  8   16      Keyword_else
  8   40      Op_lessequal
  9   16     Keyword_while
  9   40   Op_greaterequal
 10   16         LeftBrace
 10   40          Op_equal
 11   16        RightBrace
 11   40       Op_notequal
 12   16         LeftParen
 12   40            Op_and
 13   16        RightParen
 13   40             Op_or
 14   16       Op_subtract
 14   40         Semicolon
 15   16            Op_not
 15   40             Comma
 16   16       Op_multiply
 16   40         Op_assign
 17   16         Op_divide
 17   40           Integer 42
 18   16            Op_mod
 18   40            String "String literal"
 19   16            Op_add
 19   40        Identifier variable_name
 20   26           Integer 10
 21   26           Integer 92
 22   26           Integer 32
 23    1      End_of_input
</pre>

=={{header|Go}}==