Parse EBNF/ALGOL 68
Parse EBNF/ALGOL 68
This is an Algol 68 sample for the Parse EBNF task.
BEGIN # parse an EBNF grammar and then parse a text that should conform to it #
INT max literals = 100; # maximum number of literals allowed in EBNF #
INT max rules = 100; # maximum number of rules allowed in EBNF #
# MODE defining an input text #
MODE INPUT = STRUCT( INT pos, STRING text );
# possible EBNF token types #
INT end of input tk = 1
, literal tk = 2
, identifier tk = 3
, equal tk = 4
, or tk = 5
, open group tk = 6
, close group tk = 7
, open option tk = 8
, close option tk = 9
, open list tk = 10
, close list tk = 11
, end of production tk = 12
;
[]STRING ebnf token name = []STRING( "(eof)", "literal", "identifier"
, "=", "|", "(", ")", "[", "]", "{", "}", "./;"
);
# EBNF rule types #
INT literal rule = 1 # match a literal #
, numbered rule = 2 # match a numbered rule #
, sequence rule = 3 # match a sequence of rules #
, choice rule = 4 # match a | b ... " #
, list rule = 5 # match 0 or more occurences of a rule #
, option rule = 6 # match 0 or 1 occurences of a rule #
;
[]STRING ebnf rule name = []STRING( "literal", "rule", "seq", "oneof", "list", "opt" );
STRING ebnf special = "=|()[]{}.;"; # special characters #
[]INT ebnf tokens = # tokens corresponding to the special characters #
( equal tk, or tk, open group tk, close group tk
, open option tk, close option tk
, open list tk, close list tk
, end of production tk, end of production tk
);
# mode defining an EBNF rule #
MODE RULE = STRUCT( STRING name, INT type, UNION( REF[]RULE, INT ) rules );
# get the INT value of a RULE, if it has one #
OP INDEXOF = ( RULE r )INT:
CASE rules OF r IN ( INT index ): index OUT -1 ESAC;
CHAR eos char = REPR 0; # end of source character #
# MODE defining an EBNF token #
MODE EBNFTOKEN = STRUCT( STRING text, INT type );
# MODE defining an EBNF grammar #
MODE EBNF = STRUCT( STRING title, comment
, REF[]STRING literals
, REF[]RULE productions
, INPUT source
, BOOL ok
);
PROC error = ( STRING msg, INPUT source )VOID: # reports an error #
print( ( "**** ", msg, newline ) );
OP ISWHITESPACE = ( CHAR c )BOOL: c <= " " AND c /= eos char;
OP ISSPECIAL = ( CHAR c )BOOL: char in string( c, NIL, ebnf special );
OP SPECIALTOKEN = ( CHAR c )INT:
BEGIN
INT token := -1;
char in string( c, token, ebnf special );
ebnf tokens[ token ]
END # SPECIALTOKEN # ;
PRIO INIT = 1;
OP INIT = ( REF INPUT source, STRING text )REF INPUT:
BEGIN
text OF source := text;
pos OF source := LWB text OF source;
source
END # INIT # ;
OP CURR = ( REF INPUT source )CHAR: # get current character from source #
IF pos OF source > UPB text OF source
OR pos OF source < LWB text OF source
THEN # past the end/before the start of the source text #
eos char
ELSE # not past the end/before the start of the source text #
CHAR c = ( text OF source )[ pos OF source ];
# treat embedded control characters ( including NUKs ) as spaces #
IF c <= " " THEN " " ELSE c FI
FI # CURR # ;
OP GET = ( REF INPUT source )CHAR: # get next character from source #
IF pos OF source >= UPB text OF source
THEN # past the end of the source text #
pos OF source := UPB text OF source + 1;
eos char
ELSE # still have more source text #
pos OF source +:= 1;
CURR source
FI # GET # ;
# returns the number of characters remaining in source #
OP CHARSLEFT = ( INPUT source )INT: ( UPB text OF source - pos OF source ) + 1;
OP EBNFNEXT = ( REF INPUT source )EBNFTOKEN: # EBNF lexical analyser #
IF CHAR c := CURR source;
WHILE ISWHITESPACE c DO c := GET source OD;
c = eos char
THEN # at end of source #
EBNFTOKEN( "(eof)", end of input tk )
ELIF c = """" OR c = "'"
THEN # literal #
STRING literal := "";
CHAR delimiter = c;
WHILE c := GET source;
c /= delimiter AND c /= eos char
DO
literal +:= c
OD;
IF c = eos char THEN # no closing delimiter #
error( ( "Missing literal delimiter: ["
+ delimiter
+ "] for literal: "
+ literal
)
, source
)
FI;
c := GET source;
( literal, literal tk )
ELIF ISSPECIAL c
THEN # single character token #
CHAR x = GET source;
( c, SPECIALTOKEN c )
ELSE # must be an identifier #
STRING identifier := c;
c := GET source;
WHILE NOT ISWHITESPACE c
AND NOT ISSPECIAL c
AND c /= eos char
DO
identifier +:= c;
c := GET source
OD;
( identifier, identifier tk )
FI # EBNFNEXT # ;
OP PARSEEBNF = ( REF INPUT source )EBNF: # parses the EBNF in source #
BEGIN
# syntax: [ title ] "{" { production } "}" [ comment ] (";"|".") #
# production = identifier "=" expression (";"|".") #
# expression = term { "|" term } #
# term = factor { factor } #
# factor = identifier | literal #
# | "(" expression ")" #
# | "[" expression "]" #
# | "[" expression "}" #
# title = literal #
# comment = literal #
BOOL ok := TRUE;
[ 1 : max literals ]STRING literals;
INT l max := 0; # current maximum number of literals/identifiers #
[ 1 : max rules ]RULE productions;
INT p max := 0; # current maximum number or productions #
EBNFTOKEN tk := EBNFNEXT source;
# replace the indexes of identifier rules with the index of the #
# referenced ptoduction #
PROC resolve ebnf identifiers = ( REF RULE r )VOID:
CASE rules OF r
IN ( INT index ): # have a literal or numbered rule #
IF type OF r = numbered rule THEN
# the index of r is currently an index into the #
# literals, replace it with an indexc into the #
# productions #
BOOL found := FALSE;
INT p pos := 0;
FOR i FROM LWB productions TO p max WHILE NOT found DO
IF found := name OF productions[ i ] = literals[ index ]
THEN
rules OF r := i
FI
OD;
IF NOT found THEN # undefined name #
ok := FALSE;
error( "Rule """ + literals[ index ] + """ not defined", source )
FI
FI
, ( REF[]RULE rules ): # resolve references in the #
FOR i FROM LWB rules TO UPB rules DO # sub rules #
resolve ebnf identifiers( rules[ i ] )
OD
ESAC # resolve ebnf references # ;
OP EXPECT = ( INT type )VOID: # check the current token is type #
IF type OF tk = type THEN # and skip it if it is #
tk := EBNFNEXT source
ELIF ok THEN # issue an error if not the expected token #
# and we haven't had an error before #
error( "Expected """ + ebnf token name[ type ]
+ """, not """ + text OF tk + """"
, source
);
ok := FALSE
FI # EXPECT # ;
PROC add text = INT: # add the text of a literal or identifier #
BEGIN
BOOL found := FALSE;
INT l pos := 0;
FOR i FROM LWB literals TO l max WHILE NOT found DO
IF found := text OF tk = literals[ i ] THEN
l pos := i
FI
OD;
IF NOT found THEN # have a new literal/identifier #
IF l max >= max literals THEN
error( "Too many identifiers/literals", source );
ok := FALSE
ELSE
literals[ l pos := ( l max +:= 1 ) ] := text OF tk
FI
FI;
l pos
END # add text # ;
# add a literal/idntifier/sequence/option/list rule to rules #
PROC add = ( RULE r, REF[]RULE rules, REF INT r max ) VOID:
IF r max >= UPB rules THEN
error( "Too many rules", source );
ok := FALSE
ELSE
rules[ r max +:= 1 ] := r
FI # add # ;
# add a sub expression to rules #
PROC add sub = ( INT type, RULE r, REF[]RULE rules, REF INT r max )VOID:
IF type = sequence rule
AND ( type OF r = literal rule OR type OF r = numbered rule )
THEN
add( r, rules, r max ) # single element sequence #
ELIF type /= sequence rule AND type OF r = sequence rule THEN
# add a sequence as an option, choice or list #
add( ( "", type, rules OF r ), rules, r max )
ELSE
# add a sequence rule and we already have one #
# or somethine else as a sequence #
add( r, rules, r max )
FI # add sub # ;
# add a literal or identifier rulw to rules #
PROC add simple factor = ( REF[]RULE rules, REF INT r max )VOID:
IF INT l pos := add text;
ok
THEN # ok to try storing the rule #
RULE r = ( ""
, IF type OF tk = literal tk
THEN literal rule
ELSE numbered rule
FI
, l pos
);
add( r, rules, r max )
FI # add simple factor # ;
PROC ebnf production = RULE: # parse a production rule #
BEGIN
PROC ebnf expression = RULE: # parse an expression #
IF
PROC ebnf term = RULE: # parse a term #
BEGIN
[ 1 : max rules ]RULE rules;
INT r max := 0;
WHILE
BOOL have factor := TRUE;
IF type OF tk = literal tk
OR type OF tk = identifier tk
THEN
add simple factor( rules, r max );
tk := EBNFNEXT source
ELIF type OF tk = open group tk
OR type OF tk = open option tk
OR type OF tk = open list tk
THEN
IF type OF tk = open group tk
THEN tk := EBNFNEXT source;
add sub( sequence rule
, ebnf expression
, rules
, r max
);
EXPECT close group tk
ELIF type OF tk = open option tk
THEN tk := EBNFNEXT source;
add sub( option rule
, ebnf expression
, rules
, r max
);
EXPECT close option tk
ELSE tk := EBNFNEXT source;
add sub( list rule
, ebnf expression
, rules
, r max
);
EXPECT close list tk
FI
ELSE
have factor := FALSE
FI;
have factor
DO SKIP OD;
IF r max = 0 AND ok THEN # no factors... #
# ...and this is the first error #
error( "No factors in term", source )
FI;
( ""
, sequence rule
, HEAP[ 1 : r max ]RULE := rules[ 1 : r max ]
)
END # ebnf term # ;
RULE first term := ebnf term;
type OF tk /= or tk
THEN
# no "|", so not a choice #
first term
ELSE
# at least two choices #
[ 1 : max rules ]RULE rules;
INT r max := 0;
add sub( sequence rule, first term, rules, r max );
WHILE type OF tk = or tk DO
tk := EBNFNEXT source;
add sub( sequence rule, ebnf term, rules, r max )
OD;
( "", choice rule
, HEAP[ 1 : r max ]RULE := rules[ 1 : r max ]
)
FI # ebnf expression # ;
STRING name := text OF tk;
EXPECT identifier tk;
EXPECT equal tk;
RULE r := ebnf expression;
name OF r := name;
EXPECT end of production tk;
r
END # ebnf production # ;
PROC optional literal = STRING: # get the text of an optional #
IF type OF tk /= literal tk # literal or "" of it is #
THEN "" # not present #
ELSE STRING text := text OF tk;
tk := EBNFNEXT source;
text
FI # optional literal # ;
STRING title := optional literal;
EXPECT open list tk; # must be "{" at the start of the grammar #
WHILE type OF tk /= close list tk
AND type OF tk /= end of input tk
AND ok
DO
add( ebnf production, productions, p max )
OD;
EXPECT close list tk; # must be "}" after the productions #
STRING comment := optional literal; # ...then an optional comment #
EXPECT end of input tk; # ...followed by end of source #
IF NOT ok THEN # invalid EBNF #
SKIP
ELIF p max = 0 THEN # no productions #
error( "Grammat has no production rules", source )
ELSE # EBNF is OK so far, must check all named rules are defined #
# and replace their index into the literals array with the #
# index in the productions array #
FOR i TO p max DO
resolve ebnf identifiers( productions[ i ] )
OD
FI;
( title
, comment
, HEAP[ 1 : l max ]STRING := literals[ 1 : l max ]
, HEAP[ 1 : p max ]RULE := productions[ 1 : p max ]
, source
, ok
)
END # PARSEEBNF # ;
# parses text with the EBNF grammar g, returns TRUE/FALSE depending on #
# whether the parse succeeded or not #
PRIO PARSE = 1;
OP PARSE = ( EBNF g, STRING text )BOOL:
BEGIN
INPUT source;
source INIT text;
CHAR c := CURR source;
INT furthest match pos := LWB text OF source - 1;
OP MATCH = ( RULE r )BOOL:
BEGIN
BOOL matched := FALSE;
INT start = pos OF source;
INT type = type OF r;
IF type = literal rule THEN # match a literal #
WHILE ISWHITESPACE c DO c := GET source OD;
INT pos = pos OF source;
STRING literal = ( literals OF g )[ INDEXOF r ];
INT length = ( UPB literal - LWB literal ) + 1;
IF length <= CHARSLEFT source THEN
IF matched
:= literal = ( text OF source )[ pos : ( pos + length ) - 1 ]
THEN
# the literal appears at the start of the #
# remaining source - skip over it #
pos OF source +:= length;
c := CURR source
FI
FI
ELIF type = numbered rule THEN # match a production #
matched := MATCH ( productions OF g )[ INDEXOF r ]
ELSE # matching a rule with sub-rules #
CASE rules OF r
IN ( REF[]RULE rules ):
IF type = sequence rule THEN
# match all sub-rules of r #
matched := MATCH rules[ LWB rules ];
FOR i FROM LWB rules + 1 TO UPB rules WHILE matched DO
matched := MATCH rules[ i ]
OD
ELIF type = choice rule THEN
# find one matching sub-rule in r #
matched := MATCH rules[ LWB rules ];
FOR i FROM LWB rules + 1 TO UPB rules WHILE NOT matched DO
matched := MATCH rules[ i ]
OD
ELIF type = list rule THEN
# match all the sub-rules 0 or more times #
WHILE
INT list start = pos OF source;
matched := MATCH rules[ LWB rules ];
FOR i FROM LWB rules + 1 TO UPB rules WHILE matched DO
matched := MATCH rules[ i ]
OD;
IF NOT matched THEN
pos OF source := list start;
c := CURR source
FI;
matched
DO SKIP OD;
# a list rule always matches #
matched := TRUE
ELIF type = option rule THEN
# optionally match all the sub-rules #
INT option start = pos OF source;
matched := MATCH rules[ LWB rules ];
FOR i FROM LWB rules + 1 TO UPB rules WHILE matched DO
matched := MATCH rules[ i ]
OD;
IF NOT matched THEN
pos OF source := option start;
c := CURR source
FI;
# an option rule always matches #
matched := TRUE
ELSE
# invalid rule #
error( "Unexpected rule: " + whole( type, 0 ), source )
FI
ESAC
FI;
IF NOT matched THEN
# the rule didn't match, restore the source position #
pos OF source := start;
c := CURR source
ELIF pos OF source > furthest match pos THEN
# have matched more of the source that before #
furthest match pos := pos OF source
FI;
matched
END # MATCH # ;
# attempt to match the first production rule #
IF NOT MATCH ( productions OF g )[ 1 ] THEN # source didn't match #
error( ( "Syntax error near """
+ ( text OF source )
[ IF furthest match pos <= LWB text OF source - 1 THEN
LWB text OF source
ELSE
furthest match pos - 2
FI
:
]
+ """"
)
, source
);
FALSE
ELSE
# matched so far - must have consumed the whole source #
WHILE ISWHITESPACE c DO c := GET source OD;
IF c /= eos char THEN
error( ( "Unexpected text: """
+ ( text OF source )[ pos OF source : ]
+ """ at the end of source"
)
, source
)
FI;
c = eos char
FI
END # PARSE # ;
# display the details of an EBNF rule and its sub-rules #
PROC show rule = ( STRING indent, RULE r, EBNF g )VOID:
BEGIN
print( ( indent ) );
IF name OF r /= "" THEN print( ( name OF r, ": " ) ) FI;
print( ( ebnf rule name[ type OF r ], " " ) );
IF type OF r = literal rule THEN
print( ( " """, ( literals OF g )[ INDEXOF r ], """" ) )
ELIF type OF r = numbered rule THEN
print( ( " ", name OF ( productions OF g )[ INDEXOF r ] ) )
FI;
print( ( newline ) );
CASE rules OF r
IN ( REF[]RULE rules ):
FOR i FROM LWB rules TO UPB rules DO
show rule( " " + indent, rules[ i ], g )
OD
ESAC
END # show rule # ;
OP SHOW = ( EBNF g )VOID: # show the details of an EBNF grammat #
IF print( ( IF ok OF g THEN "Valid" ELSE "Invalid" FI
, " EBNF: "
, title OF g
, IF comment OF g /= "" THEN "/" + comment OF g ELSE "" FI
, IF comment OF g /= "" OR title OF g /= "" THEN ": " ELSE "" FI
, text OF source OF g
, newline
)
);
ok OF g
THEN # valid EBNF #
FOR p FROM LWB productions OF g TO UPB productions OF g DO
show rule( " ", ( productions OF g )[ p ], g )
OD;
print( ( newline ) )
FI # SHOW # ;
# parse text with the grammar g and show whether it matches or not #
# and whether it matching/not matching is not the expected result #
PROC test ebnf = ( EBNF g, STRING text, BOOL expected result )VOID:
BEGIN
BOOL matched := g PARSE text;
print( ( """", text, """ is "
, IF matched THEN "valid" ELSE "not valid" FI
, " according to ", title OF g
)
);
IF matched /= expected result THEN print( ( " UNEXPECTED RESULT" ) ) FI;
print( ( newline, newline ) )
END # test ebnf # ;
CHAR nl = REPR 10; # newline character #
[]STRING tests = ( ( "'a' { a = 'a1' ( 'a2' | 'a3' )"
+ " { 'a4' } [ 'a5' ] 'a6' ; } 'z'"
)
, ( nl + "'Arithmetic expressions'"
+ nl + "{"
+ nl + " expr = term { plus term } ."
+ nl + " term = factor { times factor } ."
+ nl + " factor = number | '(' expr ')' ."
+ nl + " plus = '+' | '-' ."
+ nl + " times = '*' | '/' ."
+ nl + " number = digit { digit } ."
+ nl + " digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ."
+ nl + "}"
)
, "a = '1';"
, "{ a = '1' ;"
, "{ hello world = '1'; }"
, "{ foo = bar . }"
);
FOR t FROM LWB tests TO UPB tests DO
INPUT ebnf source;
EBNF grammar := PARSEEBNF ( ebnf source INIT tests[ t ] );
SHOW grammar;
print( ( newline ) );
# test cases for some grammars #
IF title OF grammar = "a" THEN
test ebnf( grammar, "a1a3a4a4a5a6", TRUE );
test ebnf( grammar, "a1 a2a6", TRUE );
test ebnf( grammar, "a1 a3 a4 a6", TRUE );
test ebnf( grammar, "a1 a4 a5 a6", FALSE );
test ebnf( grammar, "a1 a2 a4 a5 a5 a6", FALSE );
test ebnf( grammar, "a1 a2 a4 a5 a6 a7", FALSE );
test ebnf( grammar, "your ad here", FALSE );
print( ( newline ) )
ELIF title OF grammar = "Arithmetic expressions" THEN
test ebnf( grammar, "2", TRUE );
test ebnf( grammar, "2*3 + 4/23 - 7", TRUE );
test ebnf( grammar, "(3 + 4) * 6-2+(4*(4))", TRUE );
test ebnf( grammar, "-2", FALSE );
test ebnf( grammar, "3 +", FALSE );
test ebnf( grammar, "(4 + 3", FALSE );
print( ( newline ) )
FI
OD
END