Parse EBNF/ALGOL 68

From Rosetta Code

Parse EBNF/ALGOL 68

This is an Algol 68 sample for the Parse EBNF task.

BEGIN # parse an EBNF grammar and then parse a text that should conform to it #

    INT max literals = 100;      # maximum number of literals allowed in EBNF #
    INT max rules    = 100;         # maximum number of rules allowed in EBNF #

    # MODE defining an input text                                             #
    MODE INPUT = STRUCT( INT pos, STRING text );

    # possible EBNF token types                                               #
    INT       end of input tk =  1
      ,            literal tk =  2
      ,         identifier tk =  3
      ,              equal tk =  4
      ,                 or tk =  5
      ,         open group tk =  6
      ,        close group tk =  7
      ,        open option tk =  8
      ,       close option tk =  9
      ,          open list tk = 10
      ,         close list tk = 11
      ,  end of production tk = 12
      ;
    []STRING ebnf token name = []STRING( "(eof)", "literal", "identifier"
                                       , "=", "|", "(", ")", "[", "]", "{", "}", "./;"
                                       );
    # EBNF rule types                                                         #
    INT        literal rule = 1                             # match a literal #
      ,       numbered rule = 2                       # match a numbered rule #
      ,       sequence rule = 3                   # match a sequence of rules #
      ,         choice rule = 4                           # match a | b ... " #
      ,           list rule = 5        # match 0 or more occurences of a rule #
      ,         option rule = 6           # match 0 or 1 occurences of a rule #
      ;
    []STRING ebnf rule name = []STRING( "literal", "rule", "seq", "oneof", "list", "opt" );

    STRING ebnf special = "=|()[]{}.;";                  # special characters #
    []INT  ebnf tokens  =    # tokens corresponding to the special characters #
                          ( equal tk, or tk, open group tk, close group tk
                          , open option tk, close option tk
                          , open list tk, close list tk
                          , end of production tk, end of production tk
                          );

    # mode defining an EBNF rule                                              #
    MODE RULE = STRUCT( STRING name, INT type, UNION( REF[]RULE, INT ) rules );

    # get the INT value of a RULE, if it has one                              #
    OP   INDEXOF = ( RULE r )INT:
         CASE rules OF r IN ( INT index ): index OUT -1 ESAC;

    CHAR eos char = REPR 0;                         # end of source character #

    # MODE defining an EBNF token                                             #
    MODE EBNFTOKEN = STRUCT( STRING text, INT type );

    # MODE defining an EBNF grammar                                           #
    MODE EBNF = STRUCT( STRING      title, comment
                      , REF[]STRING literals
                      , REF[]RULE   productions
                      , INPUT       source
                      , BOOL        ok
                      );

    PROC error = ( STRING msg, INPUT source )VOID:         # reports an error #
         print( ( "**** ", msg, newline ) );

    OP   ISWHITESPACE = ( CHAR c )BOOL: c <= " " AND c /= eos char;
    OP   ISSPECIAL    = ( CHAR c )BOOL: char in string( c, NIL, ebnf special );
    OP   SPECIALTOKEN = ( CHAR c )INT:
         BEGIN
            INT token := -1;
            char in string( c, token, ebnf special );
            ebnf tokens[ token ]
         END # SPECIALTOKEN # ;
    PRIO INIT = 1;
    OP   INIT = ( REF INPUT source, STRING text )REF INPUT:
         BEGIN
            text OF source := text;
            pos OF source  := LWB text OF source;
            source
         END # INIT # ;
    OP   CURR = ( REF INPUT source )CHAR: # get current character from source #
         IF pos OF source > UPB text OF source
         OR pos OF source < LWB text OF source
         THEN              # past the end/before the start of the source text #
             eos char
         ELSE          # not past the end/before the start of the source text #
             CHAR c = ( text OF source )[ pos OF source ];
             # treat embedded control characters ( including NUKs ) as spaces #
             IF c <= " " THEN " " ELSE c FI
         FI # CURR # ;
    OP   GET  = ( REF INPUT source )CHAR:    # get next character from source #
         IF pos OF source >= UPB text OF source
         THEN                               # past the end of the source text #
             pos OF source := UPB text OF source + 1;
             eos char
         ELSE                                   # still have more source text #
             pos OF source +:= 1;
             CURR source
         FI # GET # ;
    # returns the number of characters remaining in source                    #
    OP   CHARSLEFT = ( INPUT source )INT: ( UPB text OF source - pos OF source ) + 1;

    OP   EBNFNEXT = ( REF INPUT source )EBNFTOKEN:    # EBNF lexical analyser #
         IF CHAR c := CURR source;
            WHILE ISWHITESPACE c DO c := GET source OD;
            c = eos char
         THEN                                              # at end of source #
            EBNFTOKEN( "(eof)", end of input tk )
         ELIF c = """" OR c = "'"
         THEN                                                       # literal #
            STRING literal  := "";
            CHAR   delimiter = c;
            WHILE c := GET source;
                  c /= delimiter AND c /= eos char
            DO
                literal +:= c
            OD;
            IF c = eos char THEN                       # no closing delimiter #
                error( ( "Missing literal delimiter: ["
                       + delimiter
                       + "] for literal: "
                       + literal
                       )
                     , source
                     )
            FI;
            c := GET source;
            ( literal, literal tk )
         ELIF ISSPECIAL c
         THEN                                        # single character token #
            CHAR x = GET source;
            ( c, SPECIALTOKEN c )
         ELSE                                         # must be an identifier #
             STRING identifier := c;
             c := GET source;
             WHILE NOT ISWHITESPACE c
               AND NOT ISSPECIAL c
               AND c /= eos char
             DO
                 identifier +:= c;
                 c := GET source
             OD;
             ( identifier, identifier tk )
         FI # EBNFNEXT # ;

    OP   PARSEEBNF = ( REF INPUT source )EBNF:    # parses the EBNF in source #
         BEGIN
            # syntax: [ title ] "{" { production } "}" [ comment ] (";"|".")  #
            #         production = identifier "=" expression       (";"|".")  #
            #         expression = term { "|" term }                          #
            #         term       = factor { factor }                          #
            #         factor     = identifier | literal                       #
            #                    | "(" expression ")"                         #
            #                    | "[" expression "]"                         #
            #                    | "[" expression "}"                         #
            #         title      = literal                                    #
            #         comment    = literal                                    #
            BOOL ok := TRUE;
            [ 1 : max literals ]STRING literals;
            INT l max := 0;  # current maximum number of literals/identifiers #
            [ 1 : max rules    ]RULE   productions;
            INT p max := 0;           # current maximum number or productions #
            EBNFTOKEN tk := EBNFNEXT source;

            # replace the indexes of identifier rules with the index of the   #
            #         referenced ptoduction                                   #
            PROC resolve ebnf identifiers = ( REF RULE r )VOID:
                 CASE rules OF r
                   IN ( INT index ):        # have a literal or numbered rule #
                      IF type OF r = numbered rule THEN
                          # the index of r is currently an index into the     #
                          # literals, replace it with an indexc into the      #
                          # productions                                       #
                          BOOL found := FALSE;
                          INT  p pos := 0;
                          FOR i FROM LWB productions TO p max WHILE NOT found DO
                              IF found := name OF productions[ i ] = literals[ index ]
                              THEN
                                  rules OF r := i
                              FI
                          OD;
                          IF NOT found THEN                  # undefined name #
                              ok := FALSE;
                              error( "Rule """ + literals[ index ] + """ not defined", source )
                          FI
                      FI
                    , ( REF[]RULE rules ):        # resolve references in the #
                      FOR i FROM LWB rules TO UPB rules DO        # sub rules #
                          resolve ebnf identifiers( rules[ i ] )
                      OD
                 ESAC # resolve ebnf references # ;

            OP   EXPECT = ( INT type )VOID: # check the current token is type #
                 IF type OF tk = type THEN             # and skip it if it is #
                    tk := EBNFNEXT source
                 ELIF ok THEN      # issue an error if not the expected token #
                                         # and we haven't had an error before #
                    error( "Expected """ + ebnf token name[ type ]
                         + """, not """  + text OF tk + """"
                         , source
                         );
                    ok := FALSE
                 FI # EXPECT # ;

            PROC add text = INT:    # add the text of a literal or identifier #
                 BEGIN
                    BOOL found := FALSE;
                    INT  l pos := 0;
                    FOR i FROM LWB literals TO l max WHILE NOT found DO
                        IF found := text OF tk = literals[ i ] THEN
                            l pos := i
                        FI
                    OD;
                    IF NOT found THEN         # have a new literal/identifier #
                        IF l max >= max literals THEN
                            error( "Too many identifiers/literals", source );
                            ok := FALSE
                        ELSE
                            literals[ l pos := ( l max +:= 1 ) ] := text OF tk
                        FI
                    FI;
                    l pos
                 END # add text # ;

            # add a literal/idntifier/sequence/option/list rule to rules      #
            PROC add = ( RULE r, REF[]RULE rules, REF INT r max ) VOID:
                 IF r max >= UPB rules THEN
                    error( "Too many rules", source );
                    ok := FALSE
                 ELSE
                    rules[ r max +:= 1 ] := r
                 FI # add # ;
            # add a sub expression to rules                                   #
            PROC add sub = ( INT type, RULE r, REF[]RULE rules, REF INT r max )VOID:
                  IF type = sequence rule
                  AND ( type OF r = literal rule OR type OF r = numbered rule )
                  THEN
                     add( r, rules, r max )         # single element sequence #
                  ELIF type /= sequence rule AND type OF r = sequence rule THEN
                     # add a sequence as an option, choice or list            #
                     add( ( "", type, rules OF r ), rules, r max )
                  ELSE
                     # add a sequence rule and we already have one            #
                     # or somethine else as a sequence                        #
                     add( r, rules, r max )
                  FI # add sub # ;
            # add a literal or identifier rulw to rules                       #
            PROC add simple factor = ( REF[]RULE rules, REF INT r max )VOID:
                 IF INT l pos := add text;
                    ok
                 THEN                            # ok to try storing the rule #
                    RULE r = ( ""
                             , IF type OF tk = literal tk
                               THEN literal rule
                               ELSE numbered rule
                               FI
                             , l pos
                             );
                    add( r, rules, r max )
                 FI # add simple factor # ;

            PROC ebnf production = RULE:            # parse a production rule #
                 BEGIN
                    PROC ebnf expression = RULE:        # parse an expression #
                         IF
                            PROC ebnf term = RULE:             # parse a term #
                                 BEGIN
                                    [ 1 : max rules ]RULE rules;
                                    INT r max := 0;
                                    WHILE
                                        BOOL have factor := TRUE;
                                        IF type OF tk = literal tk
                                        OR type OF tk = identifier tk
                                        THEN
                                           add simple factor( rules, r max );
                                           tk := EBNFNEXT source
                                        ELIF type OF tk = open group tk
                                          OR type OF tk = open option tk
                                          OR type OF tk = open list tk
                                        THEN
                                            IF   type OF tk = open group tk
                                            THEN tk := EBNFNEXT source;
                                                 add sub( sequence rule
                                                        , ebnf expression
                                                        , rules
                                                        , r max
                                                        );
                                                 EXPECT close group tk
                                            ELIF type OF tk = open option tk
                                            THEN tk := EBNFNEXT source;
                                                 add sub( option rule
                                                        , ebnf expression
                                                        , rules
                                                        , r max
                                                        );
                                                 EXPECT close option tk
                                            ELSE tk := EBNFNEXT source;
                                                 add sub( list rule
                                                        , ebnf expression
                                                        , rules
                                                        , r max
                                                        );
                                                 EXPECT close list tk
                                            FI
                                        ELSE
                                            have factor := FALSE
                                        FI;
                                        have factor
                                    DO SKIP OD;
                                    IF r max = 0 AND ok THEN  # no factors... #
                                             # ...and this is the first error #
                                        error( "No factors in term", source )
                                    FI;
                                    ( ""
                                    , sequence rule
                                    , HEAP[ 1 : r max ]RULE := rules[ 1 : r max ]
                                    )
                                 END # ebnf term # ;
                            RULE first term := ebnf term;
                            type OF tk /= or tk
                         THEN
                            # no "|", so not a choice                         #
                            first term
                         ELSE
                            # at least two choices                            #
                            [ 1 : max rules ]RULE rules;
                            INT r max := 0;
                            add sub( sequence rule, first term, rules, r max );
                            WHILE type OF tk = or tk DO
                                tk := EBNFNEXT source;
                                add sub( sequence rule, ebnf term, rules, r max )
                            OD;
                            ( "", choice rule
                            , HEAP[ 1 : r max ]RULE := rules[ 1 : r max ]
                            )
                         FI # ebnf expression # ;

                    STRING name := text OF tk;
                    EXPECT identifier tk;
                    EXPECT equal tk;
                    RULE r := ebnf expression;
                    name OF r := name;
                    EXPECT end of production tk;
                    r
                 END # ebnf production # ;

            PROC optional literal = STRING:     # get the text of an optional #
                 IF type OF tk /= literal tk         # literal or "" of it is #
                 THEN ""                                        # not present #
                 ELSE STRING text := text OF tk;
                      tk          := EBNFNEXT source;
                      text
                 FI # optional literal # ;

            STRING title := optional literal;
            EXPECT open list tk;    # must be "{" at the start of the grammar #
            WHILE type OF tk /= close list tk
              AND type OF tk /= end of input tk
              AND ok
            DO
                add( ebnf production, productions, p max )
            OD;
            EXPECT close list tk;         # must be "}" after the productions #
            STRING comment := optional literal; # ...then an optional comment #
            EXPECT end of input tk;            # ...followed by end of source #
            IF   NOT ok THEN                                   # invalid EBNF #
                 SKIP
            ELIF p max = 0 THEN                              # no productions #
                error( "Grammat has no production rules", source )
            ELSE  # EBNF is OK so far, must check all named rules are defined #
                  # and replace their index into the literals array with the  #
                  # index in the productions array                            #
                FOR i TO p max DO
                    resolve ebnf identifiers( productions[ i ] )
                OD
            FI;
            ( title
            , comment
            , HEAP[ 1 : l max ]STRING := literals[    1 : l max ]
            , HEAP[ 1 : p max ]RULE   := productions[ 1 : p max ]
            , source
            , ok
            )
         END # PARSEEBNF # ;

    # parses text with the EBNF grammar g, returns TRUE/FALSE depending on    #
    # whether the parse succeeded or not                                      #
    PRIO PARSE = 1;
    OP   PARSE = ( EBNF g, STRING text )BOOL:
         BEGIN
            INPUT source;
            source INIT text;
            CHAR c := CURR source;
            INT  furthest match pos := LWB text OF source - 1;

            OP   MATCH = ( RULE r )BOOL:
                 BEGIN
                    BOOL matched := FALSE;
                    INT  start    = pos OF source;
                    INT  type     = type OF r;
                    IF   type = literal rule THEN           # match a literal #
                        WHILE ISWHITESPACE c DO c := GET source OD;
                        INT    pos     = pos OF source;
                        STRING literal = ( literals OF g )[ INDEXOF r ];
                        INT    length  = ( UPB literal - LWB literal ) + 1;
                        IF length <= CHARSLEFT source THEN
                            IF matched
                                  := literal = ( text OF source )[ pos : ( pos + length ) - 1 ]
                            THEN
                                # the literal appears at the start of the     #
                                # remaining source - skip over it             #
                                pos OF source +:= length;
                                c := CURR source
                            FI
                        FI
                    ELIF type = numbered rule THEN       # match a production #
                        matched := MATCH ( productions OF g )[ INDEXOF r ]
                    ELSE                     # matching a rule with sub-rules #
                        CASE rules OF r
                          IN ( REF[]RULE rules ):
                             IF    type = sequence rule THEN
                                 # match all sub-rules of r                   #
                                 matched := MATCH rules[ LWB rules ];
                                 FOR i FROM LWB rules + 1 TO UPB rules WHILE matched DO
                                     matched := MATCH rules[ i ]
                                 OD
                             ELIF type = choice rule THEN
                                 # find one matching sub-rule in r            #
                                 matched := MATCH rules[ LWB rules ];
                                 FOR i FROM LWB rules + 1 TO UPB rules WHILE NOT matched DO
                                     matched := MATCH rules[ i ]
                                 OD
                             ELIF type = list rule THEN
                                 # match all the sub-rules 0 or more times    #
                                 WHILE
                                     INT list start = pos OF source;
                                     matched := MATCH rules[ LWB rules ];
                                     FOR i FROM LWB rules + 1 TO UPB rules WHILE matched DO
                                         matched := MATCH rules[ i ]
                                     OD;
                                     IF NOT matched THEN
                                         pos OF source := list start;
                                         c             := CURR source
                                     FI;
                                     matched
                                 DO SKIP OD;
                                 # a list rule always matches                 #
                                 matched := TRUE
                             ELIF type = option rule THEN
                                 # optionally match all the sub-rules         #
                                 INT option start = pos OF source;
                                 matched := MATCH rules[ LWB rules ];
                                 FOR i FROM LWB rules + 1 TO UPB rules WHILE matched DO
                                     matched := MATCH rules[ i ]
                                 OD;
                                 IF NOT matched THEN
                                     pos OF source := option start;
                                     c             := CURR source
                                 FI;
                                 # an option rule always matches              #
                                 matched := TRUE
                             ELSE
                                 # invalid rule                               #
                                 error( "Unexpected rule: " + whole( type, 0 ), source )
                             FI
                        ESAC
                    FI;
                    IF NOT matched THEN
                        # the rule didn't match, restore the source position  #
                        pos OF source := start;
                        c             := CURR source
                    ELIF pos OF source > furthest match pos THEN
                        # have matched more of the source that before         #
                        furthest match pos := pos OF source
                    FI;
                    matched
                 END # MATCH # ;
                        
            # attempt to match the first production rule                      #
            IF NOT MATCH ( productions OF g )[ 1 ] THEN # source didn't match #
                error( ( "Syntax error near """
                       + ( text OF source )
                            [ IF furthest match pos <= LWB text OF source - 1 THEN
                                  LWB text OF source
                              ELSE
                                  furthest match pos - 2
                              FI
                            :
                            ]
                       + """"
                       )
                     , source
                     );
                FALSE
            ELSE
                # matched so far - must have consumed the whole source        #
                WHILE ISWHITESPACE c DO c := GET source OD;
                IF c /= eos char THEN
                    error( ( "Unexpected text: """
                           + ( text OF source )[ pos OF source : ]
                           + """ at the end of source"
                           )
                         , source
                         )
                FI;
                c = eos char
            FI
         END # PARSE # ;

    # display the details of an EBNF rule and its sub-rules                   #
    PROC show rule = ( STRING indent, RULE r, EBNF g )VOID:
         BEGIN
            print( ( indent ) );
            IF name OF r /= "" THEN print( ( name OF r, ": " ) ) FI;
            print( ( ebnf rule name[ type OF r ], " " ) );
            IF   type OF r = literal rule THEN
                print( ( " """, ( literals OF g )[ INDEXOF r ], """" ) )
            ELIF type OF r = numbered rule THEN
                print( ( " ", name OF ( productions OF g )[ INDEXOF r ] ) )
            FI;
            print( ( newline ) );
            CASE rules OF r
              IN ( REF[]RULE rules ):
                 FOR i FROM LWB rules TO UPB rules DO
                     show rule( "  " + indent, rules[ i ], g )
                 OD
            ESAC
         END # show rule # ;
    OP   SHOW = ( EBNF g )VOID:         # show the details of an EBNF grammat #
         IF print( ( IF ok OF g THEN "Valid" ELSE "Invalid" FI
                   , " EBNF: "
                   , title OF g
                   , IF comment OF g /= "" THEN "/" + comment OF g ELSE "" FI
                   , IF comment OF g /= "" OR title OF g /= "" THEN ": " ELSE "" FI
                   , text OF source OF g
                   , newline
                   )
                 );
            ok OF g
         THEN                                                    # valid EBNF #
            FOR p FROM LWB productions OF g TO UPB productions OF g DO
                show rule( " ", ( productions OF g )[ p ], g )
            OD;
            print( ( newline ) )
         FI # SHOW # ;

    # parse text with the grammar g and show whether it matches or not        #
    # and whether it matching/not matching is not the expected result         #
    PROC test ebnf = ( EBNF g, STRING text, BOOL expected result )VOID:
         BEGIN
            BOOL matched := g PARSE text;
            print( ( """", text, """ is "
                   , IF matched THEN "valid" ELSE "not valid" FI
                   , " according to ", title OF g
                   )
                 );
            IF matched /= expected result THEN print( ( " UNEXPECTED RESULT" ) ) FI;
            print( ( newline, newline ) )
         END # test ebnf # ;

    CHAR nl = REPR 10;                                    # newline character #
    []STRING tests = ( ( "'a' { a = 'a1' ( 'a2' | 'a3' )"
                       + " { 'a4' } [ 'a5' ] 'a6' ; } 'z'"
                       )
                     , ( nl + "'Arithmetic expressions'"
                       + nl + "{"
                       + nl + "    expr = term { plus term } ." 
                       + nl + "    term = factor { times factor } ."
                       + nl + "    factor = number | '(' expr ')' ."
                       + nl + "    plus = '+' | '-' ."
                       + nl + "    times = '*' | '/' ."
                       + nl + "    number = digit { digit } ."
                       + nl + "    digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ."
                       + nl + "}"
                       )
                     , "a = '1';"
                     , "{ a = '1' ;"
                     , "{ hello world = '1'; }"
                     , "{ foo = bar . }"
                     );

    FOR t FROM LWB tests TO UPB tests DO
        INPUT ebnf source;
        EBNF grammar := PARSEEBNF ( ebnf source INIT tests[ t ] );
        SHOW grammar;
        print( ( newline ) );
        # test cases for some grammars                                        #
        IF title OF grammar = "a" THEN
            test ebnf( grammar, "a1a3a4a4a5a6",           TRUE );
            test ebnf( grammar, "a1 a2a6",                TRUE );
            test ebnf( grammar, "a1 a3 a4 a6",            TRUE );
            test ebnf( grammar, "a1 a4 a5 a6",           FALSE );
            test ebnf( grammar, "a1 a2 a4 a5 a5 a6",     FALSE );
            test ebnf( grammar, "a1 a2 a4 a5 a6 a7",     FALSE );
            test ebnf( grammar, "your ad here",          FALSE );
            print( ( newline ) )
        ELIF title OF grammar = "Arithmetic expressions" THEN
            test ebnf( grammar, "2",                      TRUE );
            test ebnf( grammar, "2*3 + 4/23 - 7",         TRUE );
            test ebnf( grammar, "(3 + 4) * 6-2+(4*(4))",  TRUE );
            test ebnf( grammar, "-2",                    FALSE );
            test ebnf( grammar, "3 +",                   FALSE );
            test ebnf( grammar, "(4 + 3",                FALSE );
            print( ( newline ) )
        FI
    OD

END