Parse EBNF

From Rosetta Code
Parse EBNF is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Create a simple parser for EBNF grammars. Here is an ebnf grammar in itself and a parser for it in php.

Here is part of the simple parser rules for a calculator taken from the antlr tutorial

expr	: term ( ( PLUS | MINUS )  term )* ;

term	: factor ( ( MULT | DIV ) factor )* ;

factor	: NUMBER ;

Modula-2

<lang Modula-2> MODULE EBNF;

FROM ASCII IMPORT EOL; FROM InOut IMPORT Done, Read, Write, WriteLn, WriteInt, WriteString; FROM EBNFScanner IMPORT Symbol, sym, id, Ino, GetSym, MarkError, SkipLine; FROM TableHandler IMPORT WordLength, Table, overflow, InitTable, Record, Tabulate;

VAR T0, T1  : Table;

PROCEDURE skip (n : INTEGER);

BEGIN

 MarkError (n);
 WHILE  (sym < lpar) OR (sym > period)  DO  GetSym  END

END skip;

PROCEDURE Expression;

 PROCEDURE Term;
   PROCEDURE Factor;
   BEGIN
     IF  sym = ident  THEN
       Record (T0, id, Ino);
       GetSym
     ELSIF  sym = literal  THEN
       Record (T1, id, Ino);
       GetSym
     ELSIF  sym = lpar  THEN
       GetSym;
       Expression;
       IF  sym = rpar  THEN  GetSym  ELSE  skip (2)  END
     ELSIF  sym = lbk  THEN
       GetSym;
       Expression;
       IF  sym = rbk  THEN  GetSym  ELSE  skip (3)  END
     ELSIF  sym = lbr  THEN
       GetSym;
       Expression;
       IF  sym = rbr  THEN  GetSym  ELSE  skip (4)  END
     ELSE
       skip (5)
     END
   END Factor;
 BEGIN
   Factor;
   WHILE  sym < bar  DO  Factor  END
 END Term;

BEGIN

 Term;
 WHILE  sym = bar  DO
   GetSym;
   Term
 END

END Expression;


PROCEDURE Production;

BEGIN

 Record (T0, id, - INTEGER (Ino));
 GetSym;
 IF  sym = eql  THEN  GetSym  ELSE  skip (7)  END;
 Expression;
 IF  sym # period  THEN
   MarkError (8);
   SkipLine
 END;
 GetSym

END Production;


BEGIN

 InitTable (T0);
 InitTable (T1);
 GetSym;
 WHILE  (sym = ident) AND (overflow = 0)  DO  Production  END;
 IF  overflow > 0  THEN
   WriteLn;
   WriteString ("Table overflow");
   WriteInt (overflow, 6);
 END;
 Write (35C);
 Tabulate (T0);
 Tabulate (T1);

END EBNF.</lang> And the source for the EBNF scanner. I hope you like nested procedures. <lang Modula-2> IMPLEMENTATION MODULE EBNFScanner;

FROM ASCII IMPORT LF; FROM InOut IMPORT Read, Write, WriteLn, WriteInt, WriteBf, EOF;

VAR ch  : CHAR;

MODULE LineHandler;

 IMPORT   LF, EOF, ch, Ino, Read, Write, WriteLn, WriteInt, WriteBf;
 EXPORT   GetCh, MarkError, SkipLine;
 CONST    LineWidth = 100;
 VAR      cc        : INTEGER;
          cc1       : INTEGER;
          cc2       : INTEGER;
          line      : ARRAY [0..LineWidth - 1] OF CHAR;
 PROCEDURE GetLine;
 BEGIN
   IF  cc2 > 0  THEN
     WriteLn;
     cc2 := 0
   END;
   Read (ch);
   IF  EOF ()  THEN
     line [0] := 177C;
     cc1 := 1
   ELSE
     INC (Ino);
     WriteInt (Ino, 5);
     Write (' ');
     cc1 := 0;
     LOOP
       Write (ch);
       line [cc1] := ch;
       INC (cc1);
       IF  ch = LF  THEN  EXIT  END;
       Read (ch)
     END
   END
 END GetLine;


   PROCEDURE GetCh;
   BEGIN
     WHILE  cc = cc1  DO
       cc := 0;
       GetLine
     END;
     ch := line [cc];
     INC (cc)
   END GetCh;


   PROCEDURE MarkError (n  : INTEGER);
   BEGIN
     IF  cc2 = 0  THEN
       Write ('*');
       cc2 := 3;
       REPEAT
         Write (' ');
         DEC (cc2)
       UNTIL  cc2 = 0;
     END;
     WHILE  cc2 < cc  DO
       Write (' ');
       INC (cc2)
     END;
     Write ('^');
     WriteInt (n, 1);
     INC (cc2, 2)
   END MarkError;
   PROCEDURE SkipLine;
   BEGIN
     WHILE  ch # LF  DO  GetCh  END;
     GetCh
   END SkipLine;
 BEGIN          (* BEGIN of LineHandler        *)
   cc  := 0;
   cc1 := 0;
   cc2 := 0
 END LineHandler;

PROCEDURE GetSym;

VAR i  : CARDINAL;

BEGIN

 WHILE  ch <= ' '  DO  GetCh  END;
 IF  ch = '/'  THEN
   SkipLine;
   WHILE  ch <= ' '  DO  GetCh  END
 END;
 IF  (CAP (ch) <= 'Z') AND (CAP (ch) >= 'A')  THEN
   i := 0;
   sym := literal;
   REPEAT
     IF  i < IdLength  THEN
       id [i] := ch;
       INC (i)
     END;
     IF  ch > 'Z' THEN  sym := ident  END;
     GetCh
   UNTIL  (CAP (ch) < 'A') OR (CAP (ch) > 'Z');
   id [i] := ' '
 ELSIF  ch = "'"  THEN
   i := 0;
   GetCh;
   sym := literal;
   WHILE  ch # "'"  DO
     IF  i < IdLength  THEN
       id [i] := ch;
       INC (i)
     END;
     GetCh
   END;
   GetCh;
   id [i] := ' '
   WHILE  ch # "'"  DO
     IF  i < IdLength  THEN
       id [i] := ch;
       INC (i)
     END;
     GetCh
   END;
   GetCh;
   id [i] := ' '
 ELSIF  ch = '"'  THEN
   i := 0;
   GetCh;
   sym := literal;
   WHILE  ch # '"'  DO
     IF  i < IdLength  THEN
       id [i] := ch;
       INC (i)
     END;
     GetCh
   END;
   GetCh;
   id [i] := ' '
 ELSIF  ch = '='  THEN  sym := eql;   GetCh
 ELSIF  ch = '('  THEN  sym := lpar;  GetCh
 ELSIF  ch = ')'  THEN  sym := rpar;  GetCh
 ELSIF  ch = '['  THEN  sym := lbk;   GetCh
 ELSIF  ch = ']'  THEN  sym := rbk;   GetCh
 ELSIF  ch = '{'  THEN  sym := lbr;   GetCh
 ELSIF  ch = '}'  THEN  sym := rbr;   GetCh
 ELSIF  ch = '|'  THEN  sym := bar;   GetCh
 ELSIF  ch = '.'  THEN  sym := period;  GetCh
 ELSIF  ch = 177C THEN  sym := other;  GetCh
 ELSE
   sym := other;
   GetCh
 END

END GetSym;

BEGIN

 Ino := 0;
 ch := ' '

END EBNFScanner.</lang>


PicoLisp

<lang PicoLisp>(de EBNF

  "expr  : term ( ( PLUS | MINUS )  term )* ;"
  "term  : factor ( ( MULT | DIV ) factor )* ;"
  "factor   : NUMBER ;" )

(for E EBNF

  (use (@S @E)
     (unless (and (match '(@S : @E ;) (str E)) (not (cdr @S)))
        (quit "Invalid EBNF" E) )
     (put (car @S) 'ebnf @E) ) )</lang>

<lang PicoLisp>(de matchEbnf (Pat)

  (cond
     ((asoq Pat '((PLUS . +) (MINUS . -) (MULT . *) (DIV . /)))
        (let Op (cdr @)
           (when (= Op (car *Lst))
              (pop '*Lst)
              Op ) ) )
     ((== 'NUMBER Pat)
        (cond
           ((num? (car *Lst))
              (pop '*Lst)
              @ )
           ((and (= "-" (car *Lst)) (num? (cadr *Lst)))
              (setq *Lst (cddr *Lst))
              (- @) ) ) )
     ((get Pat 'ebnf) (parseLst @))
     ((atom Pat))
     (T
        (loop
           (T (matchEbnf (pop 'Pat)) @)
           (NIL Pat)
           (NIL (== '| (pop 'Pat)))
           (NIL Pat) ) ) ) )

(de parseLst (Pat)

  (let (P (pop 'Pat)  X (matchEbnf P))
     (loop
        (NIL Pat)
        (if (n== '* (cadr Pat))
           (if (matchEbnf (pop 'Pat))
              (setq X (list @ X))
              (throw) )
           (loop
              (NIL *Lst)
              (NIL (matchEbnf (car Pat)))
              (setq X (list @ X (or (matchEbnf P) (throw)))) )
           (setq Pat (cddr Pat)) ) )
     X ) )

(de parseEbnf (Str)

  (let *Lst (str Str "")
     (catch NIL
        (parseLst (get 'expr 'ebnf)) ) ) )</lang>

Output:

: (parseEbnf "1 + 2 * -3 / 7 - 3 * 4")
-> (- (+ 1 (/ (* 2 -3) 7)) (* 3 4))

Ruby

This example is under development. It was marked thus on 12/May/2011. Please help complete the example.
This example is incomplete. The tokenizer is here, but the parser is very incomplete. Please ensure that it meets all task requirements and remove this message.

<lang ruby>#--

  1. The tokenizer splits the input into Tokens like "identifier",
  2. ":", ")*" and so on. This design uses a StringScanner on each line of
  3. input, therefore a Token can never span more than one line.
  4. Each Token knows its original line and position, so an error message
  5. can locate a bad token.
  6. ++

require 'strscan'

  1. A line of input.
  2. where:: A location like "file.txt:3"
  3. str:: String of this line

Line = Struct.new :where, :str

  1. A token.
  2. cat:: A category like :colon, :ident or so on
  3. str:: String of this token
  4. line:: Line containing this token
  5. pos:: Position of this token within this line

Token = Struct.new :cat, :str, :line, :pos

  1. Reads and returns the next Token. At end of file, returns nil.
  2. --
  3. Needs @filename and @in.
  4. ++

def next_token

 # Loop until we reach a Token.
 loop do
   # If at end of line, then get next line, or else declare end of
   # file.
   if @scanner.eos?
     if s = @in.gets
       # Each line needs a new Line object. Tokens can hold references
       # to old Line objects.
       @line = Line.new("#{@filename}:#{@in.lineno}", s)
       @scanner.string = s
     else
       return nil  # End of file
     end
   end
   # Skip whitespace.
   break unless @scanner.skip(/space:+/)
 end
 # Read token by regular expression.
 if s = @scanner.scan(/:/)
   c = :colon
 elsif s = @scanner.scan(/;/)
   c = :semicolon
 elsif s = @scanner.scan(/\(/)
   c = :paren
 elsif s = @scanner.scan(/\)\?/)
   c = :option
 elsif s = @scanner.scan(/\)\*/)
   c = :repeat
 elsif s = @scanner.scan(/\)/)
   c = :group
 elsif s = @scanner.scan(/\|/)
   c = :bar
 elsif s = @scanner.scan(/alpha:alnum:*/)
   c = :ident
 elsif s = @scanner.scan(/'[^']*'|"[^"]*"/)
   # Fix syntax highlighting for Rosetta Code. => '
   c = :string
 elsif s = @scanner.scan(/'[^']*|"[^"]*/)
   c = :bad_string
 elsif s = @scanner.scan(/.*/)
   c = :unknown
 end
 Token.new(c, s, @line, (@scanner.pos - s.length))

end

  1. Prints a _message_ to standard error, along with location of _token_.

def error(token, message)

 line = token.line
 # We print a caret ^ pointing at the bad token. We make a very crude
 # attempt to align the caret ^ in the correct column. If the input
 # line has a non-[:print:] character, like a tab, then we print it as
 # a space.
 STDERR.puts <<EOF
  1. {line.where}: #{message}
  2. {line.str.gsub(/[^[:print:]]/, " ")}
  3. {" " * token.pos}^

EOF end


  1. --
  2. The parser converts Tokens to a Grammar object. The parser also
  3. detects syntax errors.
  4. ++
  1. A parsed EBNF grammar. It is an Array of Productions.

class Grammar < Array; end

  1. A production.
  2. ident:: The identifier
  3. alts:: An Array of Alternatives

Production = Struct.new :ident, :alts

  1. An array of Alternatives, as from "(a | b)".

class Group < Array; end

  1. An optional group, as from "(a | b)?".

class OptionalGroup < Group; end

  1. A repeated group, as from "(a | b)*".

class RepeatedGroup < Group; end

  1. An array of identifiers and string literals.

class Alternative < Array; end

  1. --
  2. Needs @filename and @in.
  3. ++

def parse

 # TODO: this only dumps the tokens.
 while t = next_token
   error(t, "#{t.cat}")
 end

end

  1. Set @filename and @in. Parse input.

case ARGV.length when 0 then @filename = "-" when 1 then @filename = ARGV[0] else fail "Too many arguments" end open(@filename) do |f|

 @in = f
 @scanner = StringScanner.new("")
 parse

end </lang>

Tcl

This example is in need of improvement:

This is not an EBNF parser. It never uses EBNF. It is a calculator parser, but there is already a calculator parser at Arithmetic evaluation#Tcl. One should adjust this solution to parse the EBNF language, not the calculator language.

Demonstration lexer and parser. Note that this parser supports parenthesized expressions, making the grammar recursive. <lang tcl>package require Tcl 8.6

  1. Utilities to make the coroutine easier to use

proc provide args {while {![yield $args]} {yield}} proc next lexer {$lexer 1} proc pushback lexer {$lexer 0}

  1. Lexical analyzer coroutine core

proc lexer {str} {

   yield [info coroutine]
   set symbols {+ PLUS - MINUS * MULT / DIV ( LPAR ) RPAR}
   set idx 0
   while 1 {

switch -regexp -matchvar m -- $str { {^\s+} { # No special action for whitespace } {^([-+*/()])} { provide [dict get $symbols [lindex $m 1]] [lindex $m 1] $idx } {^(\d+)} { provide NUMBER [lindex $m 1] $idx } {^$} { provide EOT "EOT" $idx return } . { provide PARSE_ERROR [lindex $m 0] $idx } } # Trim the matched string set str [string range $str [string length [lindex $m 0]] end] incr idx [string length [lindex $m 0]]

   }

}

  1. Utility functions to help with making an LL(1) parser; ParseLoop handles
  2. EBNF looping constructs, ParseSeq handles sequence constructs.

proc ParseLoop {lexer def} {

   upvar 1 token token payload payload index index
   foreach {a b} $def {

if {$b ne "-"} {set b [list set c $b]} lappend m $a $b

   }
   lappend m default {pushback $lexer; break}
   while 1 {

lassign [next $lexer] token payload index switch -- $token {*}$m if {[set c [catch {uplevel 1 $c} res opt]]} { dict set opt -level [expr {[dict get $opt -level]+1}] return -options $opt $res }

   }

} proc ParseSeq {lexer def} {

   upvar 1 token token payload payload index index
   foreach {t s} $def {

lassign [next $lexer] token payload index switch -- $token $t { if {[set c [catch {uplevel 1 $s} res opt]]} { dict set opt -level [expr {[dict get $opt -level]+1}] return -options $opt $res } } EOT { throw SYNTAX "end of text at position $index" } default { throw SYNTAX "\"$payload\" at position $index" }

   }

}

  1. Main parser driver; contains "master" grammar that ensures that the whole
  2. text is matched and not just a prefix substring. Note also that the parser
  3. runs the lexer as a coroutine (with a fixed name in this basic demonstration
  4. code).

proc parse {str} {

   set lexer [coroutine l lexer $str]
   try {

set parsed [parse.expr $lexer] ParseLoop $lexer { EOT { return $parsed } } throw SYNTAX "\"$payload\" at position $index"

   } trap SYNTAX msg {

return -code error "syntax error: $msg"

   } finally {

catch {rename $lexer ""}

   }

}

  1. Now the descriptions of how to match each production in the grammar...

proc parse.expr {lexer} {

   set expr [parse.term $lexer]
   ParseLoop $lexer {

PLUS - MINUS { set expr [list $token $expr [parse.term $lexer]] }

   }
   return $expr

} proc parse.term {lexer} {

   set term [parse.factor $lexer]
   ParseLoop $lexer {

MULT - DIV { set term [list $token $term [parse.factor $lexer]] }

   }
   return $term

} proc parse.factor {lexer} {

   ParseLoop $lexer {

NUMBER { return $payload } MINUS { ParseSeq $lexer { NUMBER {return -$payload} } } LPAR { set result [parse.expr $lexer] ParseSeq $lexer { RPAR {return $result} } break } EOT { throw SYNTAX "end of text at position $index" }

   }
   throw SYNTAX "\"$payload\" at position $index"

}</lang>

<lang tcl># Demonstration code puts [parse "1 - 2 - -3 * 4 + 5"] puts [parse "1 - 2 - -3 * (4 + 5)"]</lang> Output:

PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5
MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}