Compiler/lexical analyzer
22 30 End_of_input</pre>
</b>

=={{header|Julia}}==
<lang julia>struct Tokenized
    startline::Int
    startcol::Int
    name::String
    value::Union{Nothing, Int, String}
end
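
# Lookup tables mapping operator, keyword, and punctuation spellings to the
# token names required by the task.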
const optokens = Dict("*" => "Op_multiply", "/" => "Op_divide", "%" => "Op_mod", "+" => "Op_add",
                      "-" => "Op_subtract", "!" => "Op_not", "<" => "Op_less", "<=" => "Op_lessequal",
                      ">" => "Op_greater", ">=" => "Op_greaterequal", "==" => "Op_equal", "!=" => "Op_notequal",
                      "=" => "Op_assign", "&&" => "Op_and", "||" => "Op_or")

const keywordtokens = Dict("if" => "Keyword_if", "else" => "Keyword_else", "while" => "Keyword_while",
                           "print" => "Keyword_print", "putc" => "Keyword_putc")

const symboltokens = Dict("(" => "LeftParen", ")" => "RightParen", "{" => "LeftBrace",
                          "}" => "RightBrace", ";" => "Semicolon", "," => "Comma")

const errors = ["Empty character constant.", "Unknown escape sequence.", "Multi-character constant.",
                "End-of-file in comment. Closing comment characters not found.",
                "End-of-file while scanning string literal. Closing string character not found.",
                "End-of-line while scanning string literal. Closing string character not found before end-of-line.",
                "Unrecognized character.", "Invalid number. Starts like a number, but ends in non-numeric characters."]
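
# asws turns a matched comment into whitespace of the same length and newline count,
# so line numbers (and, for one-line comments, column positions) are unchanged;
# comment2ws applies it to every /* ... */ comment. The remaining one-liners detect
# the error conditions listed in `errors`.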
asws(s) = (nnl = length(findall(x -> x == '\n', s)); " " ^ (length(s) - nnl) * "\n" ^ nnl)
comment2ws(t) = (while occursin("/*", t) t = replace(t, r"\/\* .+? (?: \*\/)"xs => asws; count = 1) end; t)
hasinvalidescapes(t) = ((m = match(r"\\.", t)) != nothing && m.match != "\\\\" && m.match != "\\n")
hasemptycharconstant(t) = (match(r"\'\'", t) != nothing)
hasmulticharconstant(t) = ((m = match(r"\'[^\'][^\']+\'", t)) != nothing && m.match != "\'\\\\\'" && m.match != "\'\\n\'")
hasunbalancedquotes(t) = isodd(length(findall(x -> x == '\"', t)))
hasunrecognizedchar(t) = match(r"[^\w\s\d\*\/\%\+\-\<\>\=\!\&\|\(\)\{\}\;\,\"\'\\]", t) != nothing
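
# Check one (comment-stripped) line and throw the matching error message from `errors`.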
function throwiferror(line, n)
    if hasemptycharconstant(line)
        throw("Tokenizer error line $n: " * errors[1])
    end
    if hasinvalidescapes(line)
        throw("Tokenizer error line $n: " * errors[2])
    end
    if hasmulticharconstant(line)
        println("error at ", match(r"\'[^\'][^\']+\'", line).match)
        throw("Tokenizer error line $n: " * errors[3])
    end
    if occursin("/*", line)
        throw("Tokenizer error line $n: " * errors[4])
    end
    if hasunrecognizedchar(line)
        throw("Tokenizer error line $n: " * errors[7])
    end
end
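
# Tokenize the whole program: comments are blanked out first, then each line is
# scanned column by column. `withintoken` counts columns already consumed by a
# multi-character token so they are skipped on later iterations of the loop.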
function tokenize(txt)
    tokens = Vector{Tokenized}()
    txt = comment2ws(txt)
    lines = split(txt, "\n")
    if hasunbalancedquotes(txt)
        throw("Tokenizer error: $(errors[5])")
    end
    for (startline, line) in enumerate(lines)
        if strip(line) == ""
            continue
        end
        throwiferror(line, startline)
        lastc = Char(0)
        withintoken = 0
        for (startcol, c) in enumerate(line)
            if withintoken > 0
                withintoken -= 1
                continue
            elseif isspace(c)
                continue
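            # A '=' that follows '<', '>', '=' or '!' upgrades the single-character
            # token just pushed into the corresponding two-character operator.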
            elseif (c == '=') && (startcol > 1) && ((c2 = line[startcol - 1]) in ['<', '>', '=', '!'])
                tokens[end] = Tokenized(startline, startcol - 1, optokens[c2 * c], nothing)
            elseif (c == '&') || (c == '|')
                if length(line) > startcol && line[startcol + 1] == c
                    push!(tokens, Tokenized(startline, startcol, optokens[c * c], nothing))
                    withintoken = 1
                else
                    throw("Tokenizer error line $startline: $(errors[7])")
                end
            elseif haskey(optokens, string(c))
                push!(tokens, Tokenized(startline, startcol, optokens[string(c)], nothing))
            elseif haskey(symboltokens, string(c))
                push!(tokens, Tokenized(startline, startcol, symboltokens[string(c)], nothing))
            elseif isdigit(c)
                integerstring = match(r"^\d+", line[startcol:end]).match
                i = parse(Int, integerstring)
                push!(tokens, Tokenized(startline, startcol, "Integer", i))
                withintoken = length(integerstring) - 1
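            # Character constant: a single character or a \n or \\ escape between
            # single quotes, emitted as an Integer token holding the character code.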
            elseif c == Char(39) # single quote
                if (m = match(r"([^\\\'\n]|\\n|\\\\)\'", line[startcol+1:end])) != nothing
                    chs = m.captures[1]
                    i = (chs == "\\n") ? Int('\n') : (chs == "\\\\" ? Int('\\') : Int(chs[1]))
                    push!(tokens, Tokenized(startline, startcol, "Integer", i))
                    withintoken = length(chs) + 1
                else
                    println("line $startline: bad match with ", line[startcol+1:end])
                end
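            # String literal: everything up to the closing double quote on this line.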
            elseif c == Char(34) # double quote
                if (m = match(r"([^\"\n]+)\"", line[startcol+1:end])) == nothing
                    throw("Tokenizer error line $startline: " * errors[6])
                end
                litstring = m.captures[1]
                push!(tokens, Tokenized(startline, startcol, "String", "\"$litstring\""))
                withintoken = length(litstring) + 1
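            # Alphabetic run: a keyword if present in the keyword table, otherwise an
            # identifier (which may also contain digits and underscores).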
            elseif (cols = findfirst(r"[a-zA-Z]+", line[startcol:end])) != nothing
                litstring = line[cols .+ startcol .- 1]
                if haskey(keywordtokens, string(litstring))
                    push!(tokens, Tokenized(startline, startcol, keywordtokens[litstring], nothing))
                else
                    litstring = match(r"[_a-zA-Z0-9]+", line[startcol:end]).match
                    push!(tokens, Tokenized(startline, startcol, "Identifier", string(litstring)))
                end
                withintoken = length(litstring) - 1
            end
            lastc = c
        end
    end
    push!(tokens, Tokenized(length(lines), length(lines[end]) + 1, "End_of_input", nothing))
    tokens
end
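
# Test input (the task's "all lexical tokens" example): every token kind appears once.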
const test3txt = raw"""
/*
  All lexical tokens - not syntactically correct, but that will
  have to wait until syntax analysis
*/
/* Print   */  print    /* Sub     */  -
/* Putc    */  putc     /* Lss     */  <
/* If      */  if       /* Gtr     */  >
/* Else    */  else     /* Leq     */  <=
/* While   */  while    /* Geq     */  >=
/* Lbrace  */  {        /* Eq      */  ==
/* Rbrace  */  }        /* Neq     */  !=
/* Lparen  */  (        /* And     */  &&
/* Rparen  */  )        /* Or      */  ||
/* Uminus  */  -        /* Semi    */  ;
/* Not     */  !        /* Comma   */  ,
/* Mul     */  *        /* Assign  */  =
/* Div     */  /        /* Integer */  42
/* Mod     */  %        /* String  */  "String literal"
/* Add     */  +        /* Ident   */  variable_name
/* character literal */  '\n'
/* character literal */  '\\'
/* character literal */  ' '
"""
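
# Print the token stream as the Line/Col/Name/Value table required by the task.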
println("Line Col Name Value") |
|||
for tok in tokenize(test3txt) |
|||
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "") |
|||
end |
|||
</lang>{{output}}<pre> |
Line Col Name Value
  5   16     Keyword_print
  5   40       Op_subtract
  6   16      Keyword_putc
  6   40           Op_less
  7   16        Keyword_if
  7   40        Op_greater
  8   16      Keyword_else
  8   40      Op_lessequal
  9   16     Keyword_while
  9   40   Op_greaterequal
 10   16         LeftBrace
 10   40          Op_equal
 11   16        RightBrace
 11   40       Op_notequal
 12   16         LeftParen
 12   40            Op_and
 13   16        RightParen
 13   40             Op_or
 14   16       Op_subtract
 14   40         Semicolon
 15   16            Op_not
 15   40             Comma
 16   16       Op_multiply
 16   40         Op_assign
 17   16         Op_divide
 17   40           Integer 42
 18   16            Op_mod
 18   40            String "String literal"
 19   16            Op_add
 19   40        Identifier variable_name
 20   26           Integer 10
 21   26           Integer 92
 22   26           Integer 32
 23    1      End_of_input
</pre>

=={{header|Go}}==