 
=={{header|Lua}}==
===Using LPeg library===
This version uses LPeg, a parsing-expression-grammar library developed by one of the authors of Lua.
The source is broken into several modules, in part to make it easier to present the "vanilla Lua" version afterwards.
Tested with Lua 5.3.5 and LPeg 1.0.2-1.
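
For readers who have not used LPeg before, here is a tiny standalone sketch of its pattern objects (illustrative only, not part of the lexer):
<lang Lua>local lpeg = require 'lpeg'
local digit  = lpeg.R'09'            -- a character range, like [0-9]
local number = lpeg.C(digit^1)       -- capture a run of one or more digits
print(lpeg.match(number, '123abc'))  --> 123
print(lpeg.match(number, 'abc'))     --> nil  (no digits at position 1)</lang>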
 
The first module is simply a table defining the names of tokens which don't have an associated value.
<lang Lua>-- module token_name (in a file "token_name.lua")
local token_name = {
['*'] = 'Op_multiply',
['/'] = 'Op_divide',
['%'] = 'Op_mod',
['+'] = 'Op_add',
['-'] = 'Op_subtract',
['<'] = 'Op_less',
['<='] = 'Op_lessequal',
['>'] = 'Op_greater',
['>='] = 'Op_greaterequal',
['=='] = 'Op_equal',
['!='] = 'Op_notequal',
['!'] = 'Op_not',
['='] = 'Op_assign',
['&&'] = 'Op_and',
['||'] = 'Op_or',
['('] = 'LeftParen',
[')'] = 'RightParen',
['{'] = 'LeftBrace',
['}'] = 'RightBrace',
[';'] = 'Semicolon',
[','] = 'Comma',
['if'] = 'Keyword_if',
['else'] = 'Keyword_else',
['while'] = 'Keyword_while',
['print'] = 'Keyword_print',
['putc'] = 'Keyword_putc',
}
return token_name</lang>
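
Its use is a plain table lookup; for example (illustrative only):
<lang Lua>local token_name = require 'token_name'
print(token_name['<='])     --> Op_lessequal
print(token_name['while'])  --> Keyword_while
print(token_name['foo'])    --> nil  (identifiers are not fixed tokens)</lang>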
 
This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line.
<lang Lua>-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, concat = table, table.concat
local error, tonumber = error, tonumber
 
local lpeg = require 'lpeg' -- see http://www.inf.puc-rio.br/~roberto/lpeg/
local token_name = require 'token_name'
_ENV = {}
 
local imports = 'P R S C Carg Cb Cc Cf Cg Cp Cs Ct Cmt V'
for w in imports:gmatch('%a+') do _ENV[w] = lpeg[w] end
 
------------------- Define patterns to match tokens -----------------------
 
alpha = R'az' + R'AZ' + P'_'
digit = R'09'
alnum = alpha + digit
space = S' \t\r\n'
 
function ptok(text) return {name=token_name[text]} end
op2c = C(P'<=' + P'>=' + P'==' + P'!=' + P'&&' + P'||') / ptok
op1c = C(S'*/%+-<>!=') / ptok
symbol = C(S'(){};,') / ptok
 
keyword_or_identifier = C(alpha * alnum^0) / function(text)
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
 
integer = C(digit^1) * -alpha / function(text)
return {name='Integer', value=tonumber(text)}
end
 
Cline = Carg(1) -- call to 'match' sets the first extra argument to source line number
 
bad_escseq_err = Cmt(Cline, function (_,pos,line)
error{err='bad_escseq', line=line, column=pos-1}
end)
 
esc_subst = {['\\'] = '\\', ['n'] = '\n'}
escseq = P'\\' * C(S'\\n' + bad_escseq_err) / esc_subst -- accept only \\ and \n; map to the real character
 
qchar = P"'" * ( C( P(1) - S"'\n\\" ) + escseq ) * P"'" / function (text)
return {name='Integer', value=text:byte()}
end
 
qstr = P'"' * ( C((P(1) - S'"\n\\')^1) + escseq )^0 * P'"' / function(...)
return {name='String', value=concat{...}}
end
 
-- two-character operators are tried before one-character ones, so '<=' is not read as '<' then '='
Ctoken = symbol + op2c + op1c + keyword_or_identifier + integer + qstr + qchar
 
unfinished_comment_err = Cmt(Cline * Cb('SOC'), function (_, pos, line, socpos)
error{err='unfinished_comment', line=line, column=socpos}
end)
commentstart = Cg(Cp() * P'/*', 'SOC')
commentrest = (P(1) - P'*/')^0 * (P'*/' + unfinished_comment_err)
comment = commentstart * commentrest
morecomment = Cg(Cp(), 'SOC') * commentrest
 
ws = (space^1 + comment)^0
 
bad_token_err = Cmt(Cline, function (_, pos, line)
error{err='invalid_token', line=line, column=pos}
end)
tokenpat = ws * Cline * Cp() * (C(-1) + Ctoken + bad_token_err) * Cp() /
function (line, pos, token, nextpos)
if pos == nextpos then -- at end of line; no token
return nil
else
token.line, token.column = line, pos
return token, nextpos
end
end
 
closecomment_tokenpat = morecomment * tokenpat -- used when the line begins inside an open block comment
 
function M.find_token(line, line_pos, line_number, in_comment)
local pattern = in_comment and closecomment_tokenpat or tokenpat
return lpeg.match(pattern, line, line_pos, line_number)
end
return M</lang>
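
In normal use the <i>lexer</i> module (next) drives <i>find_token</i>, but it can also be called directly; a minimal sketch:
<lang Lua>local finder = require 'lpeg_token_finder'
-- scan one line, starting at position 1, reporting it as source line 1
local token, nextpos = finder.find_token('count = 1;', 1, 1, nil)
print(token.name, token.value, nextpos)  --> Identifier  count  6</lang>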
 
The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source.
<lang Lua>-- module lexer
local M = {} -- only items added to M will be publicly available (via 'return M' at end)
local string, io, coroutine, yield = string, io, coroutine, coroutine.yield
local error, pcall, type = error, pcall, type
 
local finder = require 'lpeg_token_finder'
_ENV = {}
 
-- produces a token iterator given a source line iterator
function M.tokenize_lineiter(lineiter)
local function fatal(err)
local msgtext = {
unfinished_comment = "EOF inside comment started",
invalid_token = "Invalid token",
bad_escseq = "Invalid escape sequence",
}
local fmt = "LEX ERROR: %s at line %d, column %d"
error(string.format(fmt, msgtext[err.err], err.line, err.column))
end
return coroutine.wrap(function()
local line_number = 0
local line_pos
local in_comment -- where unfinished comment started
for line in lineiter do
line_number = line_number + 1
line_pos = 1
local function scanline() -- yield current line's tokens
repeat
local token, pos =
finder.find_token(line, line_pos, line_number, in_comment)
if token then
line_pos = pos
in_comment = nil
yield(token)
end
until token == nil
end
 
if line then
local ok, err = pcall(scanline)
if ok then
in_comment = nil
elseif type(err) == 'table' and err.err=='unfinished_comment' then
if not(in_comment and err.column==1) then
in_comment = err
end
elseif type(err) == 'table' then
fatal(err)
else
error(err) -- some internal error
end
end
end
if in_comment then
fatal(in_comment)
else
yield{name='End_of_input', line=line_number+1, column=1}
end
return nil
end)
end
 
------------------- exports -----------------------------
 
lexer = M.tokenize_lineiter
 
function M.tokenize_file(filename)
return lexer(io.lines(filename))
end
 
function M.tokenize_text(text)
-- iterate over every line, including empty ones, so line numbers stay accurate
return lexer((text..'\n'):gmatch('(.-)\n'))
end
 
-- M._INTERNALS = _ENV
return M
</lang>
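
For quick experiments the iterator can also be consumed directly with a generic <i>for</i>; a minimal sketch, where the file name is only a placeholder:
<lang Lua>local lexer = require 'lexer'
-- 'hello.t' is an illustrative name; any source file in the tiny language works
for tok in lexer.tokenize_file('hello.t') do
   print(tok.line, tok.column, tok.name, tok.value)
end</lang>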
 
This script uses <i>lexer.tokenize_text</i> to display the tokens produced by tokenizing a text.
 
<lang Lua>lexer = require 'lexer'
format, gsub = string.format, string.gsub
 
function printf(fmt, ...) print(format(fmt, ...)) end
 
function stringrep(str)
local subst = {['\n'] = "\\n", ['\\'] = '\\\\'}
return format('"%s"', gsub(str, '[\n\\]', subst))
end
 
function display(text)
for t in lexer.tokenize_text(text) do
local value = (t.name=='String') and stringrep(t.value) or t.value or ''
printf("%4d %3d %-15s %s", t.line, t.column, t.name, value)
end
end
 
----------------------- test cases from Rosetta spec ------------------------
testing = true
if testing then
-- test case 1
display[[
/*
Hello world
*/
print("Hello, World!\n");]]
print()
 
-- test case 2
display[[
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");]]
print()
-- etc.
end
</lang>
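
Running the first test case should produce output along these lines:
<pre>   4   1 Keyword_print
   4   6 LeftParen
   4   7 String          "Hello, World!\n"
   4  24 RightParen
   4  25 Semicolon
   5   1 End_of_input</pre>
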
===Using only standard libraries===
This version replaces the <i>lpeg_token_finder</i> module with the <i>basic_token_finder</i> module below, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly (see the snippet after the module). Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.)
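
The module leans on Lua's frontier pattern <i>%f</i> to ensure an integer is not immediately followed by an identifier character; a standalone illustration of that pattern (not part of the module):
<lang Lua>-- '%f[^%w_]' matches the empty transition from a [%w_] character
-- to a non-[%w_] character (or the end of the string)
print(('42;'):find('^(%d+)%f[^%w_]'))  --> 1   2   42
print(('42x'):find('^(%d+)%f[^%w_]'))  --> nil (the digits run into an identifier)</lang>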
 
<lang Lua>-- module basic_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, string = table, string
local error, tonumber, select, assert = error, tonumber, select, assert
 
local token_name = require 'token_name'
_ENV = {}
 
function next_token(line, pos, line_num) -- match a token at line,pos
local function m(pat)
local from, to, capture = line:find(pat, pos)
if from then
pos = to + 1
return capture
end
end
local function ptok(str)
return {name=token_name[str]}
end
local function op2c()
local text = m'^([<>=!]=)' or m'^(&&)' or m'^(||)'
if text then return ptok(text) end
end
 
local function op1c_or_symbol()
local char = m'^([%*/%%%+%-<>!=%(%){};,])'
if char then return ptok(char) end
end
local function keyword_or_identifier()
local text = m'^([%a_][%w_]*)'
if text then
local name = token_name[text]
return name and {name=name} or {name='Identifier', value=text}
end
end
local function integer()
local text = m'^(%d+)%f[^%w_]'
if text then return {name='Integer', value=tonumber(text)} end
end
local subst = {['\\\\'] = '\\', ['\\n'] = '\n'}
local function qchar()
local text = m"^'([^\\])'" or m"^'(\\[\\n])'"
if text then
local value = #text==1 and text:byte() or subst[text]:byte()
return {name='Integer', value=value}
end
end
local function qstr()
local spos = pos -- column of the opening quote; 'm' advances pos past the token
local text = m'^"([^"\n]*\\?)"'
if text then
local value = text:gsub('()(\\.?)', function(at, esc)
local replace = subst[esc]
if replace then
return replace
else
error{err='bad_escseq', line=line_num, column=spos+at}
end
end)
return {name='String', value=value}
end
end
local found = (op2c() or op1c_or_symbol() or
keyword_or_identifier() or integer() or qchar() or qstr())
if found then
return found, pos
end
end
 
function find_commentrest(line, pos, line_num, socpos)
local sfrom, sto = line:find('%*%/', pos)
if sfrom then
return socpos, sto
else
error{err='unfinished_comment', line=line_num, column=socpos}
end
end
 
function find_comment(line, pos, line_num)
local sfrom, sto = line:find('^%/%*', pos)
if sfrom then
local efrom, eto = find_commentrest(line, sto+1, line_num, sfrom)
return sfrom, eto
end
end
 
function find_morecomment(line, pos, line_num)
assert(pos==1)
return find_commentrest(line, pos, line_num, pos)
end
 
function find_whitespace(line, pos, line_num)
local spos = pos
repeat
local eto = select(2, line:find('^%s+', pos))
if not eto then
eto = select(2, find_comment(line, pos, line_num))
end
if eto then pos = eto + 1 end
until not eto
return spos, pos - 1
end
 
function M.find_token(line, pos, line_num, in_comment)
if in_comment then
pos = 1 + select(2, find_morecomment(line, pos, line_num))
end
pos = 1 + select(2, find_whitespace(line, pos, line_num))
if pos > #line then
return nil
else
local token, nextpos = next_token(line, pos, line_num)
if token then
token.line, token.column = line_num, pos
return token, nextpos
else
error{err='invalid_token', line=line_num, column=pos}
end
end
end
 
-- M._ENV = _ENV
return M</lang>
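
Switching the <i>lexer</i> module over to this finder is the one-line change mentioned above:
<lang Lua>-- in module lexer, replace the line
--   local finder = require 'lpeg_token_finder'
-- with
local finder = require 'basic_token_finder'</lang>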
 
=={{header|M2000 Interpreter}}==