Parse EBNF: Difference between revisions

10,027 bytes added ,  5 years ago
(→‎{{header|Perl 6}}: Updated to work with latest version of Rakudo. Minor syntax fixes)
Line 848:
 
*******************************************************************************</pre>
 
=={{header|Phix}}==
<lang Phix>string src
integer ch, sdx
object token
 
bool error = false
function invalid(string msg)
error = true
?msg
sdx = length(src)+1 -- (set to eof)
return -1
end function
 
procedure skip_spaces()
while 1 do
if sdx>length(src) then exit end if
ch = src[sdx]
if not find(ch," \t\r\n") then exit end if
sdx += 1
end while
end procedure
 
procedure get_token()
-- yeilds a single character token, one of {}()[]|=.;
-- or {"terminal",string} or {"ident", string} or -1.
skip_spaces()
if sdx>length(src) then
token = -1
return
end if
integer tokstart = sdx
if find(ch,"{}()[]|=.;") then
sdx += 1
token = ch
elsif ch='\"'
or ch='\'' then
integer closech = ch
for tokend=sdx+1 to length(src) do
if src[tokend] = closech then
sdx = tokend+1
token = {"terminal",src[tokstart+1..tokend-1]}
return
end if
end for
token = invalid("no closing quote")
elsif ch>='a' and ch<='z' then
-- to simplify things for the purposes of this task,
-- identifiers are strictly a-z only, not A-Z or 1-9
while 1 do
sdx += 1
if sdx>length(src) then exit end if
ch = src[sdx]
if ch<'a' or ch>'z' then exit end if
end while
token = {"ident",src[tokstart..sdx-1]}
else
token = invalid("invalid ebnf")
end if
end procedure
 
procedure match_token(integer ch)
if token!=ch then
token = invalid(sprintf("invalid ebnf (%c expected)",ch))
else
get_token()
end if
end procedure
 
sequence idents = {},
ididx = {}
 
function add_ident(string ident)
integer k = find(ident,idents)
if k=0 then
idents = append(idents,ident)
k = length(idents)
ididx = append(ididx,0)
end if
return k
end function
 
forward function expression()
 
function factor()
object res
if sequence(token) then
if token[1]="ident" then
token &= add_ident(token[2])
end if
res = token
get_token()
elsif token='[' then
get_token()
res = {"optional",expression()}
match_token(']')
elsif token='(' then
get_token()
res = expression()
match_token(')')
elsif token='{' then
get_token()
res = {"repeat",expression()}
match_token('}')
else
?9/0 -- erm??
-- res = -1 -- ""
end if
return res
end function
 
function term()
-- sequence res = {"factors",factor()} -- (opted against this)
sequence res = {factor()}
while not find(token,{-1,'|','.',';',')',']','}'}) do
res = append(res,factor())
end while
if length(res)=1 then res = res[1] end if
return res
end function
 
function expression()
object res = term()
if token='|' then
res = {"or",res}
while token='|' do
get_token()
res = append(res,term())
end while
end if
return res
end function
 
sequence productions = {}
 
function production()
-- returns a token or -1; the real result is left in productions[] etc.
get_token()
if token!='}' then
if token=-1 then
return invalid("invalid ebnf (missing closing })")
end if
if not sequence(token)
or token[1]!="ident" then
return -1
end if
string ident = token[2]
integer idx = add_ident(ident)
get_token()
match_token('=')
if token=-1 then
return -1
end if
sequence res = expression()
productions = append(productions,{ident,idx,res})
ididx[idx] = length(productions)
end if
return token
end function
 
sequence extras = {}
 
function parse(string ebnf)
-- returns: +1 if ok, -1 if bad
puts(1,"parse: "&ebnf&" ===>\n")
error = false
src = ebnf
sdx = 1
idents = {}
ididx = {}
productions = {}
extras = {}
get_token()
if sequence(token) then
token[1] = "title"
extras = append(extras,token)
get_token()
end if
if token!='{' then
return invalid("invalid ebnf (missing opening {)")
end if
while 1 do
token = production()
if token='}' or token=-1 then exit end if
end while
get_token()
if sequence(token) then
token[1] = "comment"
extras = append(extras,token)
get_token()
end if
if token!=-1 then
return invalid("invalid ebnf (missing eof?)")
end if
if error then return -1 end if
integer k = find(0,ididx)
if k!=0 then
return invalid("invalid ebnf (undefined:"&idents[k]&")")
end if
ppOpt({pp_Pause,0})
pp(productions)
-- pp(idents)
-- pp(ididx)
-- pp(extras)
return 1
end function
 
-- The rules that applies() has to deal with are:
-- {factors} - if rule[1] is not string, just apply one after the other,
-- (as per term() above, originally I had a "factors" there,
-- but getting rid of it made the syntax tree much clearer)
-- {"terminal", "a1"} -- literal constants
-- {"or", <e1>, <e2>, ...} -- (any) one of n
-- {"repeat", <e1>} -- as per "{}" in ebnf
-- {"optional", <e1>} -- as per "()" in ebnf
-- {"ident", <name>, idx} -- apply the sub-rule
 
function applies(sequence rule)
integer was_sdx = sdx -- in case of failure
object r1 = rule[1]
if not string(r1) then
for i=1 to length(rule) do
if not applies(rule[i]) then
sdx = was_sdx
return false
end if
end for
elsif r1="terminal" then
skip_spaces()
for i=1 to length(rule[2]) do
if sdx>length(src)
or src[sdx]!=rule[2][i] then
sdx = was_sdx
return false
end if
sdx += 1
end for
elsif r1="or" then
for i=2 to length(rule) do
if applies(rule[i]) then
return true
end if
end for
sdx = was_sdx
return false
elsif r1="repeat" then
while applies(rule[2]) do
end while
elsif r1="optional" then
if applies(rule[2]) then
end if
elsif r1="ident" then
integer i = rule[3],
ii = ididx[i]
if not applies(productions[ii][3]) then
sdx = was_sdx
return false
end if
else
?9/0
end if
return true
end function
 
procedure check_valid(string test)
src = test
sdx = 1
bool res = applies(productions[1][3])
skip_spaces()
if sdx<=length(src) then res = false end if
?{test,{"pass","fail"}[2-res]}
end procedure
constant ebnf = {"""
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" """,
"""
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
plus = "+" | "-" .
times = "*" | "/" .
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}""",
`a = "1"`,
`{ a = "1" ;`,
`{ hello world = "1"; }`,
`{ foo = bar . }`
}
 
constant tests = { {"a1a3a4a4a5a6",
"a1 a2a6",
"a1 a3 a4 a6",
"a1 a4 a5 a6",
"a1 a2 a4 a5 a5 a6",
"a1 a2 a4 a5 a6 a7",
"your ad here"},
{"2",
"2*3 + 4/23 - 7",
"(3 + 4) * 6-2+(4*(4))",
"-2",
"3 +",
"(4 + 3"}}
 
for i=1 to length(ebnf) do
if parse(ebnf[i])=+1 then
?"tests:"
for j=1 to length(tests[i]) do
check_valid(tests[i][j])
end for
end if
end for</lang>
In real use, I would be tempted to use numeric literals rather than string tags in such structures, but the latter certainly make things ten times easier to debug, plus I got an instantly legible dump practically for free.
{{out}}
<pre>
parse:
"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z" ===>
{{"a", 1,
{{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}},
{"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}},
{"terminal", "a6"}}}}
"tests:"
{"a1a3a4a4a5a6","pass"}
{"a1 a2a6","pass"}
{"a1 a3 a4 a6","pass"}
{"a1 a4 a5 a6","fail"}
{"a1 a2 a4 a5 a5 a6","fail"}
{"a1 a2 a4 a5 a6 a7","fail"}
{"your ad here","fail"}
parse:
{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
 
plus = "+" | "-" .
times = "*" | "/" .
 
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
} ===>
{{"expr", 1,
{{"ident", "term", 2},
{"repeat", {{"ident", "plus", 3}, {"ident", "term", 2}}}}},
{"term", 2,
{{"ident", "factor", 4},
{"repeat", {{"ident", "times", 5}, {"ident", "factor", 4}}}}},
{"factor", 4,
{"or", {"ident", "number", 6},
{{"terminal", "("}, {"ident", "expr", 1}, {"terminal", ")"}}}},
{"plus", 3, {"or", {"terminal", "+"}, {"terminal", "-"}}},
{"times", 5, {"or", {"terminal", "*"}, {"terminal", "/"}}},
{"number", 6, {{"ident", "digit", 7}, {"repeat", {"ident", "digit", 7}}}},
{"digit", 7,
{"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"},
{"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"},
{"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"},
{"terminal", "9"}}}}
"tests:"
{"2","pass"}
{"2*3 + 4/23 - 7","pass"}
{"(3 + 4) * 6-2+(4*(4))","pass"}
{"-2","fail"}
{"3 +","fail"}
{"(4 + 3","fail"}
parse: a = "1" ===>
"invalid ebnf (missing opening {)"
parse: { a = "1" ; ===>
"invalid ebnf (missing closing })"
parse: { hello world = "1"; } ===>
"invalid ebnf (= expected)"
parse: { foo = bar . } ===>
"invalid ebnf (undefined:bar)"
</pre>
 
=={{header|PicoLisp}}==
7,806

edits