Parse EBNF: Difference between revisions
Content added Content deleted
Thundergnat (talk | contribs) (→{{header|Perl 6}}: Updated to work with latest version of Rakudo. Minor syntax fixes) |
|||
Line 848: | Line 848: | ||
*******************************************************************************</pre> |
*******************************************************************************</pre> |
||
=={{header|Phix}}== |
|||
<lang Phix>string src |
|||
integer ch, sdx |
|||
object token |
|||
bool error = false |
|||
function invalid(string msg) |
|||
error = true |
|||
?msg |
|||
sdx = length(src)+1 -- (set to eof) |
|||
return -1 |
|||
end function |
|||
procedure skip_spaces() |
|||
while 1 do |
|||
if sdx>length(src) then exit end if |
|||
ch = src[sdx] |
|||
if not find(ch," \t\r\n") then exit end if |
|||
sdx += 1 |
|||
end while |
|||
end procedure |
|||
procedure get_token() |
|||
-- yeilds a single character token, one of {}()[]|=.; |
|||
-- or {"terminal",string} or {"ident", string} or -1. |
|||
skip_spaces() |
|||
if sdx>length(src) then |
|||
token = -1 |
|||
return |
|||
end if |
|||
integer tokstart = sdx |
|||
if find(ch,"{}()[]|=.;") then |
|||
sdx += 1 |
|||
token = ch |
|||
elsif ch='\"' |
|||
or ch='\'' then |
|||
integer closech = ch |
|||
for tokend=sdx+1 to length(src) do |
|||
if src[tokend] = closech then |
|||
sdx = tokend+1 |
|||
token = {"terminal",src[tokstart+1..tokend-1]} |
|||
return |
|||
end if |
|||
end for |
|||
token = invalid("no closing quote") |
|||
elsif ch>='a' and ch<='z' then |
|||
-- to simplify things for the purposes of this task, |
|||
-- identifiers are strictly a-z only, not A-Z or 1-9 |
|||
while 1 do |
|||
sdx += 1 |
|||
if sdx>length(src) then exit end if |
|||
ch = src[sdx] |
|||
if ch<'a' or ch>'z' then exit end if |
|||
end while |
|||
token = {"ident",src[tokstart..sdx-1]} |
|||
else |
|||
token = invalid("invalid ebnf") |
|||
end if |
|||
end procedure |
|||
procedure match_token(integer ch) |
|||
if token!=ch then |
|||
token = invalid(sprintf("invalid ebnf (%c expected)",ch)) |
|||
else |
|||
get_token() |
|||
end if |
|||
end procedure |
|||
sequence idents = {}, |
|||
ididx = {} |
|||
function add_ident(string ident) |
|||
integer k = find(ident,idents) |
|||
if k=0 then |
|||
idents = append(idents,ident) |
|||
k = length(idents) |
|||
ididx = append(ididx,0) |
|||
end if |
|||
return k |
|||
end function |
|||
forward function expression() |
|||
function factor() |
|||
object res |
|||
if sequence(token) then |
|||
if token[1]="ident" then |
|||
token &= add_ident(token[2]) |
|||
end if |
|||
res = token |
|||
get_token() |
|||
elsif token='[' then |
|||
get_token() |
|||
res = {"optional",expression()} |
|||
match_token(']') |
|||
elsif token='(' then |
|||
get_token() |
|||
res = expression() |
|||
match_token(')') |
|||
elsif token='{' then |
|||
get_token() |
|||
res = {"repeat",expression()} |
|||
match_token('}') |
|||
else |
|||
?9/0 -- erm?? |
|||
-- res = -1 -- "" |
|||
end if |
|||
return res |
|||
end function |
|||
function term() |
|||
-- sequence res = {"factors",factor()} -- (opted against this) |
|||
sequence res = {factor()} |
|||
while not find(token,{-1,'|','.',';',')',']','}'}) do |
|||
res = append(res,factor()) |
|||
end while |
|||
if length(res)=1 then res = res[1] end if |
|||
return res |
|||
end function |
|||
function expression() |
|||
object res = term() |
|||
if token='|' then |
|||
res = {"or",res} |
|||
while token='|' do |
|||
get_token() |
|||
res = append(res,term()) |
|||
end while |
|||
end if |
|||
return res |
|||
end function |
|||
sequence productions = {} |
|||
function production() |
|||
-- returns a token or -1; the real result is left in productions[] etc. |
|||
get_token() |
|||
if token!='}' then |
|||
if token=-1 then |
|||
return invalid("invalid ebnf (missing closing })") |
|||
end if |
|||
if not sequence(token) |
|||
or token[1]!="ident" then |
|||
return -1 |
|||
end if |
|||
string ident = token[2] |
|||
integer idx = add_ident(ident) |
|||
get_token() |
|||
match_token('=') |
|||
if token=-1 then |
|||
return -1 |
|||
end if |
|||
sequence res = expression() |
|||
productions = append(productions,{ident,idx,res}) |
|||
ididx[idx] = length(productions) |
|||
end if |
|||
return token |
|||
end function |
|||
sequence extras = {} |
|||
function parse(string ebnf) |
|||
-- returns: +1 if ok, -1 if bad |
|||
puts(1,"parse: "&ebnf&" ===>\n") |
|||
error = false |
|||
src = ebnf |
|||
sdx = 1 |
|||
idents = {} |
|||
ididx = {} |
|||
productions = {} |
|||
extras = {} |
|||
get_token() |
|||
if sequence(token) then |
|||
token[1] = "title" |
|||
extras = append(extras,token) |
|||
get_token() |
|||
end if |
|||
if token!='{' then |
|||
return invalid("invalid ebnf (missing opening {)") |
|||
end if |
|||
while 1 do |
|||
token = production() |
|||
if token='}' or token=-1 then exit end if |
|||
end while |
|||
get_token() |
|||
if sequence(token) then |
|||
token[1] = "comment" |
|||
extras = append(extras,token) |
|||
get_token() |
|||
end if |
|||
if token!=-1 then |
|||
return invalid("invalid ebnf (missing eof?)") |
|||
end if |
|||
if error then return -1 end if |
|||
integer k = find(0,ididx) |
|||
if k!=0 then |
|||
return invalid("invalid ebnf (undefined:"&idents[k]&")") |
|||
end if |
|||
ppOpt({pp_Pause,0}) |
|||
pp(productions) |
|||
-- pp(idents) |
|||
-- pp(ididx) |
|||
-- pp(extras) |
|||
return 1 |
|||
end function |
|||
-- The rules that applies() has to deal with are: |
|||
-- {factors} - if rule[1] is not string, just apply one after the other, |
|||
-- (as per term() above, originally I had a "factors" there, |
|||
-- but getting rid of it made the syntax tree much clearer) |
|||
-- {"terminal", "a1"} -- literal constants |
|||
-- {"or", <e1>, <e2>, ...} -- (any) one of n |
|||
-- {"repeat", <e1>} -- as per "{}" in ebnf |
|||
-- {"optional", <e1>} -- as per "()" in ebnf |
|||
-- {"ident", <name>, idx} -- apply the sub-rule |
|||
function applies(sequence rule) |
|||
integer was_sdx = sdx -- in case of failure |
|||
object r1 = rule[1] |
|||
if not string(r1) then |
|||
for i=1 to length(rule) do |
|||
if not applies(rule[i]) then |
|||
sdx = was_sdx |
|||
return false |
|||
end if |
|||
end for |
|||
elsif r1="terminal" then |
|||
skip_spaces() |
|||
for i=1 to length(rule[2]) do |
|||
if sdx>length(src) |
|||
or src[sdx]!=rule[2][i] then |
|||
sdx = was_sdx |
|||
return false |
|||
end if |
|||
sdx += 1 |
|||
end for |
|||
elsif r1="or" then |
|||
for i=2 to length(rule) do |
|||
if applies(rule[i]) then |
|||
return true |
|||
end if |
|||
end for |
|||
sdx = was_sdx |
|||
return false |
|||
elsif r1="repeat" then |
|||
while applies(rule[2]) do |
|||
end while |
|||
elsif r1="optional" then |
|||
if applies(rule[2]) then |
|||
end if |
|||
elsif r1="ident" then |
|||
integer i = rule[3], |
|||
ii = ididx[i] |
|||
if not applies(productions[ii][3]) then |
|||
sdx = was_sdx |
|||
return false |
|||
end if |
|||
else |
|||
?9/0 |
|||
end if |
|||
return true |
|||
end function |
|||
procedure check_valid(string test) |
|||
src = test |
|||
sdx = 1 |
|||
bool res = applies(productions[1][3]) |
|||
skip_spaces() |
|||
if sdx<=length(src) then res = false end if |
|||
?{test,{"pass","fail"}[2-res]} |
|||
end procedure |
|||
constant ebnf = {""" |
|||
"a" { |
|||
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ; |
|||
} "z" """, |
|||
""" |
|||
{ |
|||
expr = term { plus term } . |
|||
term = factor { times factor } . |
|||
factor = number | '(' expr ')' . |
|||
plus = "+" | "-" . |
|||
times = "*" | "/" . |
|||
number = digit { digit } . |
|||
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" . |
|||
}""", |
|||
`a = "1"`, |
|||
`{ a = "1" ;`, |
|||
`{ hello world = "1"; }`, |
|||
`{ foo = bar . }` |
|||
} |
|||
constant tests = { {"a1a3a4a4a5a6", |
|||
"a1 a2a6", |
|||
"a1 a3 a4 a6", |
|||
"a1 a4 a5 a6", |
|||
"a1 a2 a4 a5 a5 a6", |
|||
"a1 a2 a4 a5 a6 a7", |
|||
"your ad here"}, |
|||
{"2", |
|||
"2*3 + 4/23 - 7", |
|||
"(3 + 4) * 6-2+(4*(4))", |
|||
"-2", |
|||
"3 +", |
|||
"(4 + 3"}} |
|||
for i=1 to length(ebnf) do |
|||
if parse(ebnf[i])=+1 then |
|||
?"tests:" |
|||
for j=1 to length(tests[i]) do |
|||
check_valid(tests[i][j]) |
|||
end for |
|||
end if |
|||
end for</lang> |
|||
In real use, I would be tempted to use numeric literals rather than string tags in such structures, but the latter certainly make things ten times easier to debug, plus I got an instantly legible dump practically for free. |
|||
{{out}} |
|||
<pre> |
|||
parse: |
|||
"a" { |
|||
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ; |
|||
} "z" ===> |
|||
{{"a", 1, |
|||
{{"terminal", "a1"}, {"or", {"terminal", "a2"}, {"terminal", "a3"}}, |
|||
{"repeat", {"terminal", "a4"}}, {"optional", {"terminal", "a5"}}, |
|||
{"terminal", "a6"}}}} |
|||
"tests:" |
|||
{"a1a3a4a4a5a6","pass"} |
|||
{"a1 a2a6","pass"} |
|||
{"a1 a3 a4 a6","pass"} |
|||
{"a1 a4 a5 a6","fail"} |
|||
{"a1 a2 a4 a5 a5 a6","fail"} |
|||
{"a1 a2 a4 a5 a6 a7","fail"} |
|||
{"your ad here","fail"} |
|||
parse: |
|||
{ |
|||
expr = term { plus term } . |
|||
term = factor { times factor } . |
|||
factor = number | '(' expr ')' . |
|||
plus = "+" | "-" . |
|||
times = "*" | "/" . |
|||
number = digit { digit } . |
|||
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" . |
|||
} ===> |
|||
{{"expr", 1, |
|||
{{"ident", "term", 2}, |
|||
{"repeat", {{"ident", "plus", 3}, {"ident", "term", 2}}}}}, |
|||
{"term", 2, |
|||
{{"ident", "factor", 4}, |
|||
{"repeat", {{"ident", "times", 5}, {"ident", "factor", 4}}}}}, |
|||
{"factor", 4, |
|||
{"or", {"ident", "number", 6}, |
|||
{{"terminal", "("}, {"ident", "expr", 1}, {"terminal", ")"}}}}, |
|||
{"plus", 3, {"or", {"terminal", "+"}, {"terminal", "-"}}}, |
|||
{"times", 5, {"or", {"terminal", "*"}, {"terminal", "/"}}}, |
|||
{"number", 6, {{"ident", "digit", 7}, {"repeat", {"ident", "digit", 7}}}}, |
|||
{"digit", 7, |
|||
{"or", {"terminal", "0"}, {"terminal", "1"}, {"terminal", "2"}, |
|||
{"terminal", "3"}, {"terminal", "4"}, {"terminal", "5"}, |
|||
{"terminal", "6"}, {"terminal", "7"}, {"terminal", "8"}, |
|||
{"terminal", "9"}}}} |
|||
"tests:" |
|||
{"2","pass"} |
|||
{"2*3 + 4/23 - 7","pass"} |
|||
{"(3 + 4) * 6-2+(4*(4))","pass"} |
|||
{"-2","fail"} |
|||
{"3 +","fail"} |
|||
{"(4 + 3","fail"} |
|||
parse: a = "1" ===> |
|||
"invalid ebnf (missing opening {)" |
|||
parse: { a = "1" ; ===> |
|||
"invalid ebnf (missing closing })" |
|||
parse: { hello world = "1"; } ===> |
|||
"invalid ebnf (= expected)" |
|||
parse: { foo = bar . } ===> |
|||
"invalid ebnf (undefined:bar)" |
|||
</pre> |
|||
=={{header|PicoLisp}}== |
=={{header|PicoLisp}}== |