Parse EBNF

From Rosetta Code
This task has been clarified. Its programming examples are in need of review to ensure that they still fit the requirements of the task.
Parse EBNF is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

Write a program that can parse a grammar in Extended Backus–Naur Form and then parse something else according to the grammar. The program is only required to decide whether or not the something else belongs to the language described by the grammar, but for extra credit, it can output a syntax tree. See the tests.

Haskell[edit]

We use Parsec to generate Parsec.

import Control.Applicative
import Control.Monad
import Data.Maybe
import qualified Data.Map as M
import System.Environment (getArgs)
import Text.Parsec hiding (many, optional, (<|>))
import Text.Parsec.String
import Text.Parsec.Error
 
-----------------------------------------------------------------
-- Main
-----------------------------------------------------------------
 
main = do
{- Uses the EBNF grammar contained in the first file to parse
the second file, then prints a parse tree. -}

[grammar_file, other_file] <- getArgs
ebnf_text <- readFile grammar_file
case parseGrammar grammar_file ebnf_text of
Left err ->
putStrLn $ "Failed to parse EBNF grammar: " ++ show err
Right g -> do
putStrLn "Successfully parsed EBNF grammar."
o <- readFile other_file
case parseWithGrammar g other_file o of
Left err ->
putStrLn $ "Failed to parse second file: " ++ show err
Right tree ->
print tree
 
-----------------------------------------------------------------
-- Types and user functions
-----------------------------------------------------------------
 
parseGrammar :: FilePath -> String -> Either ParseError Grammar
parseGrammar fp s =
case runParser ebnf M.empty fp s of
Left e ->
Left e
Right (Grammar g, usedNames) ->
let undefinedRules = foldl (flip M.delete) usedNames $ map fst g
(undefName, undefNamePos) = M.findMin undefinedRules
in if M.null undefinedRules
then Right $ Grammar g
else Left $ newErrorMessage
(Message $ "Undefined rule: " ++ undefName)
undefNamePos
 
parseWithGrammar :: Grammar -> FilePath -> String -> Either ParseError ParseTree
parseWithGrammar g@(Grammar ((_, firstR) : _)) fp s =
runParser (liftA cleanTree $ firstR <* eof) g fp s
 
type GParser = Parsec String UsedNames
type UsedNames = M.Map String SourcePos
 
type Rule = Parsec String Grammar ParseTree
-- We need to keep the Grammar around as a Parsec user state
-- to look up named rules.
data Grammar = Grammar [(String, Rule)]
-- Grammar would be a type synonym instead of an ADT, but
-- infinite types aren't allowed in Haskell.
 
data ParseTree =
ParseBranch String [ParseTree] |
ParseLeaf String
 
instance Show ParseTree where
show = showIndented 0
-- show (ParseBranch "" t) = '[' : concatMap ((' ' :) . show) t ++ "]"
-- show (ParseBranch s t) = '(' : s ++ concatMap ((' ' :) . show) t ++ ")"
-- show (ParseLeaf s) = show s
 
showIndented :: Int -> ParseTree -> String
showIndented i (ParseBranch "" []) =
indent i "[]"
showIndented i (ParseBranch "" t) =
indent i "[" ++
concatMap (showIndented (i + 2)) t ++
"]"
showIndented i (ParseBranch s t) =
indent i ("(" ++ s) ++
concatMap (showIndented (i + 2)) t ++
")"
showIndented i (ParseLeaf s) =
indent i $ show s
 
indent :: Int -> String -> String
indent i s = "\n" ++ replicate i ' ' ++ s
 
cleanTree :: ParseTree -> ParseTree
-- Removes empty anonymous branches.
cleanTree (ParseBranch i ts) =
ParseBranch i $ map cleanTree $ filter p ts
where p (ParseBranch "" []) = False
p _ = True
cleanTree x = x
 
-----------------------------------------------------------------
-- GParser definitions
-----------------------------------------------------------------
 
ebnf :: GParser (Grammar, UsedNames)
ebnf = liftA2 (,) (ws *> syntax <* eof) getState
 
syntax :: GParser Grammar
syntax = liftA Grammar $
optional title *>
lcbtw '{' '}' (many production) <*
optional comment
 
production :: GParser (String, Rule)
production = do
i <- identifier
lc '='
r <- expression
oneOf ".;"
ws
return (i, liftM (nameBranch i) r)
where nameBranch i (ParseBranch _ l) = ParseBranch i l
 
expression, term :: GParser Rule
expression = liftA (foldl1 (<|>)) $ term `sepBy1` (lc '|')
term = liftA (branch . sequence) $ many1 factor
 
factor :: GParser Rule
factor = liftA try $
liftA ruleByName rememberName <|>
liftA (leaf . (<* ws) . string) literal <|>
liftA perhaps (lcbtw '[' ']' expression) <|>
lcbtw '(' ')' expression <|>
liftA (branch . many) (lcbtw '{' '}' expression)
where rememberName :: GParser String
rememberName = do
i <- identifier
p <- getPosition
modifyState $ M.insertWith (flip const) i p
{- Adds i → p to the state only if i doesn't
already have an entry. This ensures we report the
*first* usage of each unknown identifier. -}

return i
 
ruleByName :: String -> Rule
ruleByName name = do
Grammar g <- getState
fromJust (lookup name g) <?> name
 
perhaps = option $ ParseLeaf ""
 
identifier :: GParser String
identifier = many1 (noneOf " \t\n=|(){}[].;\"'") <* ws
 
title = literal
 
comment = literal
 
literal =
(lc '\'' *> manyTill anyChar (lc '\'')) <|>
(lc '"' *> manyTill anyChar (lc '"'))
<* ws
 
-----------------------------------------------------------------
-- Miscellany
-----------------------------------------------------------------
 
leaf = liftA ParseLeaf
branch = liftA $ ParseBranch ""
 
lcbtw c1 c2 = between (lc c1) (lc c2)
 
lc :: Char -> GParser Char
lc c = char c <* ws
 
ws = many $ oneOf " \n\t"

Modula-2[edit]

MODULE EBNF;
 
FROM ASCII IMPORT EOL;
FROM InOut IMPORT Done, Read, Write, WriteLn, WriteInt, WriteString;
FROM EBNFScanner IMPORT Symbol, sym, id, Ino, GetSym, MarkError, SkipLine;
FROM TableHandler IMPORT WordLength, Table, overflow, InitTable, Record, Tabulate;
 
VAR T0, T1 : Table;
 
PROCEDURE skip (n : INTEGER);
 
BEGIN
MarkError (n);
WHILE (sym < lpar) OR (sym > period) DO GetSym END
END skip;
 
PROCEDURE Expression;
 
PROCEDURE Term;
 
PROCEDURE Factor;
 
BEGIN
IF sym = ident THEN
Record (T0, id, Ino);
GetSym
ELSIF sym = literal THEN
Record (T1, id, Ino);
GetSym
ELSIF sym = lpar THEN
GetSym;
Expression;
IF sym = rpar THEN GetSym ELSE skip (2) END
ELSIF sym = lbk THEN
GetSym;
Expression;
IF sym = rbk THEN GetSym ELSE skip (3) END
ELSIF sym = lbr THEN
GetSym;
Expression;
IF sym = rbr THEN GetSym ELSE skip (4) END
ELSE
skip (5)
END
END Factor;
 
BEGIN
Factor;
WHILE sym < bar DO Factor END
END Term;
 
BEGIN
Term;
WHILE sym = bar DO
GetSym;
Term
END
END Expression;
 
 
PROCEDURE Production;
 
BEGIN
Record (T0, id, - INTEGER (Ino));
GetSym;
IF sym = eql THEN GetSym ELSE skip (7) END;
Expression;
IF sym # period THEN
MarkError (8);
SkipLine
END;
GetSym
END Production;
 
 
BEGIN
InitTable (T0);
InitTable (T1);
GetSym;
WHILE (sym = ident) AND (overflow = 0) DO Production END;
IF overflow > 0 THEN
WriteLn;
WriteString ("Table overflow");
WriteInt (overflow, 6);
END;
Write (35C);
Tabulate (T0);
Tabulate (T1);
END EBNF.

And the source for the EBNF scanner. I hope you like nested procedures.

IMPLEMENTATION MODULE EBNFScanner;
 
FROM ASCII IMPORT LF;
FROM InOut IMPORT Read, Write, WriteLn, WriteInt, WriteBf, EOF;
 
VAR ch : CHAR;
 
MODULE LineHandler;
 
IMPORT LF, EOF, ch, Ino, Read, Write, WriteLn, WriteInt, WriteBf;
EXPORT GetCh, MarkError, SkipLine;
 
CONST LineWidth = 100;
 
VAR cc : INTEGER;
cc1 : INTEGER;
cc2 : INTEGER;
line : ARRAY [0..LineWidth - 1] OF CHAR;
 
PROCEDURE GetLine;
 
BEGIN
IF cc2 > 0 THEN
WriteLn;
cc2 := 0
END;
Read (ch);
IF EOF () THEN
line [0] := 177C;
cc1 := 1
ELSE
INC (Ino);
WriteInt (Ino, 5);
Write (' ');
cc1 := 0;
LOOP
Write (ch);
line [cc1] := ch;
INC (cc1);
IF ch = LF THEN EXIT END;
Read (ch)
END
END
END GetLine;
 
 
PROCEDURE GetCh;
 
BEGIN
WHILE cc = cc1 DO
cc := 0;
GetLine
END;
ch := line [cc];
INC (cc)
END GetCh;
 
 
PROCEDURE MarkError (n : INTEGER);
 
BEGIN
IF cc2 = 0 THEN
Write ('*');
cc2 := 3;
REPEAT
Write (' ');
DEC (cc2)
UNTIL cc2 = 0;
END;
WHILE cc2 < cc DO
Write (' ');
INC (cc2)
END;
Write ('^');
WriteInt (n, 1);
INC (cc2, 2)
END MarkError;
 
PROCEDURE SkipLine;
 
BEGIN
WHILE ch # LF DO GetCh END;
GetCh
END SkipLine;
 
BEGIN (* BEGIN of LineHandler *)
cc := 0;
cc1 := 0;
cc2 := 0
END LineHandler;
 
PROCEDURE GetSym;
 
VAR i : CARDINAL;
 
BEGIN
WHILE ch <= ' ' DO GetCh END;
IF ch = '/' THEN
SkipLine;
WHILE ch <= ' ' DO GetCh END
END;
IF (CAP (ch) <= 'Z') AND (CAP (ch) >= 'A') THEN
i := 0;
sym := literal;
REPEAT
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
IF ch > 'Z' THEN sym := ident END;
GetCh
UNTIL (CAP (ch) < 'A') OR (CAP (ch) > 'Z');
id [i] := ' '
ELSIF ch = "'" THEN
i := 0;
GetCh;
sym := literal;
WHILE ch # "'" DO
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
GetCh
END;
GetCh;
id [i] := ' '
WHILE ch # "'" DO
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
GetCh
END;
GetCh;
id [i] := ' '
ELSIF ch = '"' THEN
i := 0;
GetCh;
sym := literal;
WHILE ch # '"' DO
IF i < IdLength THEN
id [i] := ch;
INC (i)
END;
GetCh
END;
GetCh;
id [i] := ' '
ELSIF ch = '=' THEN sym := eql; GetCh
ELSIF ch = '(' THEN sym := lpar; GetCh
ELSIF ch = ')' THEN sym := rpar; GetCh
ELSIF ch = '[' THEN sym := lbk; GetCh
ELSIF ch = ']' THEN sym := rbk; GetCh
ELSIF ch = '{' THEN sym := lbr; GetCh
ELSIF ch = '}' THEN sym := rbr; GetCh
ELSIF ch = '|' THEN sym := bar; GetCh
ELSIF ch = '.' THEN sym := period; GetCh
ELSIF ch = 177C THEN sym := other; GetCh
ELSE
sym := other;
GetCh
END
END GetSym;
 
BEGIN
Ino := 0;
ch := ' '
END EBNFScanner.

Perl 6[edit]

Works with: Rakudo version 2013.09

This parses the EBNF rule set using a perl 6 grammar, then if it parses as valid EBNF, constructs a grammar and parses the test strings with that. EBNF rule sets that are naively syntactically correct but missing rules will parse as valid but will give a runtime failure warning about missing methods. It is implemented and exercised using the flavor of EBNF and test cases specified on the test page.

# A perl 6 grammar to parse EBNF
grammar EBNF {
rule TOP { ^ <title>? '{' [ <production> ]+ '}' <comment>? $ }
rule production { <name> '=' <expression> <[.;]> }
rule expression { <term> +% "|" }
rule term { <factor>+ }
rule factor { <group> | <repeat> | <optional> | <identifier> | <literal> }
rule group { '(' <expression> ')' }
rule repeat { '{' <expression> '}' }
rule optional { '[' <expression> ']' }
token identifier { <-[\|\(\)\{\}\[\]\.\;\"\'\s]>+ } #"
token literal { ["'" <-[']>+ "'" | '"' <-["]>+ '"'] } #"
token title { <literal> }
token comment { <literal> }
token name { <identifier> <?before \h* '='> }
}
 
class EBNF::Actions {
method TOP($/) {
say "Syntax Tree:\n", $/; # Dump the syntax tree to STDOUT
make 'grammar ' ~
($<title> ?? $<title>.subst(/\W/, '', :g) !! 'unnamed') ~
" \{\n rule TOP \{^[<" ~ $/<production>[0]<name> ~
">]+\$\}\n " ~ $<production>>>.ast ~ "\}"
}
method production($/) {
make 'token ' ~ $<name> ~ ' {' ~
$<expression>.ast ~ "}\n"
}
method expression($/) { make join '|', $<term>>>.ast }
method term($/) { make join '\h*', $<factor>>>.ast }
method factor($/) {
make $<literal> ?? $<literal> !!
$<group> ?? '[' ~ $<group>.ast ~ ']' !!
$<repeat> ?? '[' ~ $<repeat>.ast ~ '\\s*]*' !!
$<optional> ?? '[' ~ $<optional>.ast ~ ']?' !!
'<' ~ $<identifier> ~ '>'
}
method repeat($/) { make $<expression>.ast }
method optional($/) { make $<expression>.ast }
method group($/) { make $<expression>.ast }
}
 
# An array of test cases
my @tests = (
{
ebnf =>
q<"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z">
,
teststrings => [
'a1a3a4a4a5a6',
'a1 a2a6',
'a1 a3 a4 a6',
'a1 a4 a5 a6',
'a1 a2 a4 a4 a5 a6',
'a1 a2 a4 a5 a5 a6',
'a1 a2 a4 a5 a6 a7',
'your ad here'
]
},
{
ebnf =>
q<{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .
 
plus = "+" | "-" .
times = "*" | "/" .
 
number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}>
,
teststrings => [
'2',
'2*3 + 4/23 - 7',
'(3 + 4) * 6-2+(4*(4))',
'-2',
'3 +',
'(4 + 3'
]
},
{
ebnf => q<a = "1";>,
teststrings => ['foobar']
},
{
ebnf => q<{ a = "1" ;>,
teststrings => ['foobar']
},
{
ebnf => q<{ hello world = "1"; }>,
teststrings => ['foobar']
},
{
ebnf => q<{ foo = bar . }>,
teststrings => ['foobar']
}
);
 
# Test the parser.
my $i = 1;
for @tests -> $test {
unless EBNF.parse($test<ebnf>) {
say "Parsing EBNF grammar:\n";
say "{$test<ebnf>.subst(/^^\h*/,'',:g)}\n";
say "Invalid syntax. Can not be parsed.\n";
say '*' x 79;
next;
}
my $p = EBNF.parse($test<ebnf>, :actions(EBNF::Actions));
my $grammar = $p.ast;
$grammar ~~ m/^'grammar '(\w+)/;
my $title = $0;
my $fn = 'EBNFtest'~$i++;
my $fh = open($fn, :w) or die "$!\n";
$fh.say( "\{\n", $grammar );
$fh.say( qq|say "Parsing EBNF grammar '$title':\\n";| );
$fh.say( qq|say q<{$test<ebnf>.subst(/^^\h*/,'',:g)}>;| );
$fh.say( q|say "\nValid syntax.\n\nTesting:\n";| );
$fh.say( q|CATCH { default { say " - $_" } };| );
my $len = [max] $test<teststrings>.flat>>.chars;
for $test<teststrings>.flat -> $s {
$fh.say( qq|printf "%{$len}s", '{$s}';| ~
qq|printf " - %s\\n", {$title}.parse('{$s}')| ~
qq| ?? 'valid.' !! 'NOT valid.';|
);
}
$fh.say( qq| "\\n"} |);
$fh.close;
say qqx/perl6 $fn/;
say '*' x 79, "\n";
unlink $fn;
}

Output:

Syntax Tree:
「"a" {
                a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
            } "z"」
 title => 「"a"」
  literal => 「"a"」
 production => 「a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
            」
  name => 「a」
   identifier => 「a」
  expression => 「"a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" 」
   term => 「"a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" 」
    factor => 「"a1" 」
     literal => 「"a1"」
    factor => 「( "a2" | "a3" ) 」
     group => 「( "a2" | "a3" ) 」
      expression => 「"a2" | "a3" 」
       term => 「"a2" 」
        factor => 「"a2" 」
         literal => 「"a2"」
       term => 「 "a3" 」
        factor => 「"a3" 」
         literal => 「"a3"」
    factor => 「{ "a4" } 」
     repeat => 「{ "a4" } 」
      expression => 「"a4" 」
       term => 「"a4" 」
        factor => 「"a4" 」
         literal => 「"a4"」
    factor => 「[ "a5" ] 」
     optional => 「[ "a5" ] 」
      expression => 「"a5" 」
       term => 「"a5" 」
        factor => 「"a5" 」
         literal => 「"a5"」
    factor => 「"a6" 」
     literal => 「"a6"」
 comment => 「"z"」
  literal => 「"z"」

Parsing EBNF grammar 'a':

"a" {
a = "a1" ( "a2" | "a3" ) { "a4" } [ "a5" ] "a6" ;
} "z"

Valid syntax.

Testing:

     a1a3a4a4a5a6 - valid.
          a1 a2a6 - valid.
      a1 a3 a4 a6 - valid.
      a1 a4 a5 a6 - NOT valid.
a1 a2 a4 a4 a5 a6 - valid.
a1 a2 a4 a5 a5 a6 - NOT valid.
a1 a2 a4 a5 a6 a7 - NOT valid.
     your ad here - NOT valid.

*******************************************************************************

Syntax Tree:
「{
                expr = term { plus term } .
                term = factor { times factor } .
                factor = number | '(' expr ')' .

                plus = "+" | "-" .
                times = "*" | "/" .

                number = digit { digit } .
                digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
            }」
 production => 「expr = term { plus term } .
                」
  name => 「expr」
   identifier => 「expr」
  expression => 「term { plus term } 」
   term => 「term { plus term } 」
    factor => 「term 」
     identifier => 「term」
    factor => 「{ plus term } 」
     repeat => 「{ plus term } 」
      expression => 「plus term 」
       term => 「plus term 」
        factor => 「plus 」
         identifier => 「plus」
        factor => 「term 」
         identifier => 「term」
 production => 「term = factor { times factor } .
                」
  name => 「term」
   identifier => 「term」
  expression => 「factor { times factor } 」
   term => 「factor { times factor } 」
    factor => 「factor 」
     identifier => 「factor」
    factor => 「{ times factor } 」
     repeat => 「{ times factor } 」
      expression => 「times factor 」
       term => 「times factor 」
        factor => 「times 」
         identifier => 「times」
        factor => 「factor 」
         identifier => 「factor」
 production => 「factor = number | '(' expr ')' .

                」
  name => 「factor」
   identifier => 「factor」
  expression => 「number | '(' expr ')' 」
   term => 「number 」
    factor => 「number 」
     identifier => 「number」
   term => 「 '(' expr ')' 」
    factor => 「'(' 」
     literal => 「'('」
    factor => 「expr 」
     identifier => 「expr」
    factor => 「')' 」
     literal => 「')'」
 production => 「plus = "+" | "-" .
                」
  name => 「plus」
   identifier => 「plus」
  expression => 「"+" | "-" 」
   term => 「"+" 」
    factor => 「"+" 」
     literal => 「"+"」
   term => 「 "-" 」
    factor => 「"-" 」
     literal => 「"-"」
 production => 「times = "*" | "/" .

                」
  name => 「times」
   identifier => 「times」
  expression => 「"*" | "/" 」
   term => 「"*" 」
    factor => 「"*" 」
     literal => 「"*"」
   term => 「 "/" 」
    factor => 「"/" 」
     literal => 「"/"」
 production => 「number = digit { digit } .
                」
  name => 「number」
   identifier => 「number」
  expression => 「digit { digit } 」
   term => 「digit { digit } 」
    factor => 「digit 」
     identifier => 「digit」
    factor => 「{ digit } 」
     repeat => 「{ digit } 」
      expression => 「digit 」
       term => 「digit 」
        factor => 「digit 」
         identifier => 「digit」
 production => 「digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
            」
  name => 「digit」
   identifier => 「digit」
  expression => 「"0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" 」
   term => 「"0" 」
    factor => 「"0" 」
     literal => 「"0"」
   term => 「 "1" 」
    factor => 「"1" 」
     literal => 「"1"」
   term => 「 "2" 」
    factor => 「"2" 」
     literal => 「"2"」
   term => 「 "3" 」
    factor => 「"3" 」
     literal => 「"3"」
   term => 「 "4" 」
    factor => 「"4" 」
     literal => 「"4"」
   term => 「 "5" 」
    factor => 「"5" 」
     literal => 「"5"」
   term => 「 "6" 」
    factor => 「"6" 」
     literal => 「"6"」
   term => 「 "7" 」
    factor => 「"7" 」
     literal => 「"7"」
   term => 「 "8" 」
    factor => 「"8" 」
     literal => 「"8"」
   term => 「 "9" 」
    factor => 「"9" 」
     literal => 「"9"」

Parsing EBNF grammar 'unnamed':

{
expr = term { plus term } .
term = factor { times factor } .
factor = number | '(' expr ')' .

plus = "+" | "-" .
times = "*" | "/" .

number = digit { digit } .
digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" .
}

Valid syntax.

Testing:

                    2 - valid.
       2*3 + 4/23 - 7 - valid.
(3 + 4) * 6-2+(4*(4)) - valid.
                   -2 - NOT valid.
                  3 + - NOT valid.
               (4 + 3 - NOT valid.

*******************************************************************************

Parsing EBNF grammar:

a = "1";

Invalid syntax. Can not be parsed.

*******************************************************************************
Parsing EBNF grammar:

{ a = "1" ;

Invalid syntax. Can not be parsed.

*******************************************************************************
Parsing EBNF grammar:

{ hello world = "1"; }

Invalid syntax. Can not be parsed.

*******************************************************************************
Syntax Tree:
「{ foo = bar . }」
 production => 「foo = bar . 」
  name => 「foo」
   identifier => 「foo」
  expression => 「bar 」
   term => 「bar 」
    factor => 「bar 」
     identifier => 「bar」

Parsing EBNF grammar 'unnamed':

{ foo = bar . }

Valid syntax.

Testing:

foobar - No such method 'bar' for invocant of type 'unnamed'

*******************************************************************************

PicoLisp[edit]

(de EBNF
"expr  : term ( ( PLUS | MINUS ) term )* ;"
"term  : factor ( ( MULT | DIV ) factor )* ;"
"factor  : NUMBER ;" )
 
(for E EBNF
(use (@S @E)
(unless (and (match '(@S : @E ;) (str E)) (not (cdr @S)))
(quit "Invalid EBNF" E) )
(put (car @S) 'ebnf @E) ) )
(de matchEbnf (Pat)
(cond
((asoq Pat '((PLUS . +) (MINUS . -) (MULT . *) (DIV . /)))
(let Op (cdr @)
(when (= Op (car *Lst))
(pop '*Lst)
Op ) ) )
((== 'NUMBER Pat)
(cond
((num? (car *Lst))
(pop '*Lst)
@ )
((and (= "-" (car *Lst)) (num? (cadr *Lst)))
(setq *Lst (cddr *Lst))
(- @) ) ) )
((get Pat 'ebnf) (parseLst @))
((atom Pat))
(T
(loop
(T (matchEbnf (pop 'Pat)) @)
(NIL Pat)
(NIL (== '| (pop 'Pat)))
(NIL Pat) ) ) ) )
 
(de parseLst (Pat)
(let (P (pop 'Pat) X (matchEbnf P))
(loop
(NIL Pat)
(if (n== '* (cadr Pat))
(if (matchEbnf (pop 'Pat))
(setq X (list @ X))
(throw) )
(loop
(NIL *Lst)
(NIL (matchEbnf (car Pat)))
(setq X (list @ X (or (matchEbnf P) (throw)))) )
(setq Pat (cddr Pat)) ) )
X ) )
 
(de parseEbnf (Str)
(let *Lst (str Str "")
(catch NIL
(parseLst (get 'expr 'ebnf)) ) ) )

Output:

: (parseEbnf "1 + 2 * -3 / 7 - 3 * 4")
-> (- (+ 1 (/ (* 2 -3) 7)) (* 3 4))

Ruby[edit]

This example is under development. It was marked thus on 12/May/2011. Please help complete the example.
This example is incomplete. The tokenizer is here, but the parser is very incomplete. Please ensure that it meets all task requirements and remove this message.
#--
# The tokenizer splits the input into Tokens like "identifier",
# ":", ")*" and so on. This design uses a StringScanner on each line of
# input, therefore a Token can never span more than one line.
#
# Each Token knows its original line and position, so an error message
# can locate a bad token.
#++
 
require 'strscan'
 
# A line of input.
# where:: A location like "file.txt:3"
# str:: String of this line
Line = Struct.new :where, :str
 
# A token.
# cat:: A category like :colon, :ident or so on
# str:: String of this token
# line:: Line containing this token
# pos:: Position of this token within this line
Token = Struct.new :cat, :str, :line, :pos
 
# Reads and returns the next Token. At end of file, returns nil.
#--
# Needs @filename and @in.
#++
def next_token
# Loop until we reach a Token.
loop do
# If at end of line, then get next line, or else declare end of
# file.
if @scanner.eos?
if s = @in.gets
# Each line needs a new Line object. Tokens can hold references
# to old Line objects.
@line = Line.new("[email protected]}:[email protected]}", s)
@scanner.string = s
else
return nil # End of file
end
end
 
# Skip whitespace.
break unless @scanner.skip(/[[:space:]]+/)
end
 
# Read token by regular expression.
if s = @scanner.scan(/:/)
c = :colon
elsif s = @scanner.scan(/;/)
c = :semicolon
elsif s = @scanner.scan(/\(/)
c = :paren
elsif s = @scanner.scan(/\)\?/)
c = :option
elsif s = @scanner.scan(/\)\*/)
c = :repeat
elsif s = @scanner.scan(/\)/)
c = :group
elsif s = @scanner.scan(/\|/)
c = :bar
elsif s = @scanner.scan(/[[:alpha:]][[:alnum:]]*/)
c = :ident
elsif s = @scanner.scan(/'[^']*'|"[^"]*"/)
# Fix syntax highlighting for Rosetta Code. => '

c = :string
elsif s = @scanner.scan(/'[^']*|"[^"]*/)
c = :bad_string
elsif s = @scanner.scan(/.*/)
c = :unknown
end
 
Token.new(c, s, @line, (@scanner.pos - s.length))
end
 
# Prints a _message_ to standard error, along with location of _token_.
def error(token, message)
line = token.line
 
# We print a caret ^ pointing at the bad token. We make a very crude
# attempt to align the caret ^ in the correct column. If the input
# line has a non-[:print:] character, like a tab, then we print it as
# a space.
STDERR.puts <<EOF
#{line.where}: #{message}
#{line.str.gsub(/[^[:print:]]/, " ")}
#{" " * token.pos}^
EOF

end
 
 
#--
# The parser converts Tokens to a Grammar object. The parser also
# detects syntax errors.
#++
 
# A parsed EBNF grammar. It is an Array of Productions.
class Grammar < Array; end
 
# A production.
# ident:: The identifier
# alts:: An Array of Alternatives
Production = Struct.new :ident, :alts
 
# An array of Alternatives, as from "(a | b)".
class Group < Array; end
 
# An optional group, as from "(a | b)?".
class OptionalGroup < Group; end
 
# A repeated group, as from "(a | b)*".
class RepeatedGroup < Group; end
 
# An array of identifiers and string literals.
class Alternative < Array; end
 
#--
# Needs @filename and @in.
#++
def parse
# TODO: this only dumps the tokens.
while t = next_token
error(t, "#{t.cat}")
end
end
 
# Set @filename and @in. Parse input.
case ARGV.length
when 0 then @filename = "-"
when 1 then @filename = ARGV[0]
else fail "Too many arguments"
end
open(@filename) do |f|
@in = f
@scanner = StringScanner.new("")
parse
end
 

Tcl[edit]

This example is in need of improvement:
This is not an EBNF parser. It never uses EBNF. It is a calculator parser, but there is already a calculator parser at Arithmetic evaluation#Tcl. One should adjust this solution to parse the EBNF language, not the calculator language.

Demonstration lexer and parser. Note that this parser supports parenthesized expressions, making the grammar recursive.

package require Tcl 8.6
 
# Utilities to make the coroutine easier to use
proc provide args {while {![yield $args]} {yield}}
proc next lexer {$lexer 1}
proc pushback lexer {$lexer 0}
 
# Lexical analyzer coroutine core
proc lexer {str} {
yield [info coroutine]
set symbols {+ PLUS - MINUS * MULT / DIV ( LPAR ) RPAR}
set idx 0
while 1 {
switch -regexp -matchvar m -- $str {
{^\s+} {
# No special action for whitespace
}
{^([-+*/()])} {
provide [dict get $symbols [lindex $m 1]] [lindex $m 1] $idx
}
{^(\d+)} {
provide NUMBER [lindex $m 1] $idx
}
{^$} {
provide EOT "EOT" $idx
return
}
. {
provide PARSE_ERROR [lindex $m 0] $idx
}
}
# Trim the matched string
set str [string range $str [string length [lindex $m 0]] end]
incr idx [string length [lindex $m 0]]
}
}
 
# Utility functions to help with making an LL(1) parser; ParseLoop handles
# EBNF looping constructs, ParseSeq handles sequence constructs.
proc ParseLoop {lexer def} {
upvar 1 token token payload payload index index
foreach {a b} $def {
if {$b ne "-"} {set b [list set c $b]}
lappend m $a $b
}
lappend m default {pushback $lexer; break}
while 1 {
lassign [next $lexer] token payload index
switch -- $token {*}$m
if {[set c [catch {uplevel 1 $c} res opt]]} {
dict set opt -level [expr {[dict get $opt -level]+1}]
return -options $opt $res
}
}
}
proc ParseSeq {lexer def} {
upvar 1 token token payload payload index index
foreach {t s} $def {
lassign [next $lexer] token payload index
switch -- $token $t {
if {[set c [catch {uplevel 1 $s} res opt]]} {
dict set opt -level [expr {[dict get $opt -level]+1}]
return -options $opt $res
}
} EOT {
throw SYNTAX "end of text at position $index"
} default {
throw SYNTAX "\"$payload\" at position $index"
}
}
}
 
# Main parser driver; contains "master" grammar that ensures that the whole
# text is matched and not just a prefix substring. Note also that the parser
# runs the lexer as a coroutine (with a fixed name in this basic demonstration
# code).
proc parse {str} {
set lexer [coroutine l lexer $str]
try {
set parsed [parse.expr $lexer]
ParseLoop $lexer {
EOT {
return $parsed
}
}
throw SYNTAX "\"$payload\" at position $index"
} trap SYNTAX msg {
return -code error "syntax error: $msg"
} finally {
catch {rename $lexer ""}
}
}
 
# Now the descriptions of how to match each production in the grammar...
proc parse.expr {lexer} {
set expr [parse.term $lexer]
ParseLoop $lexer {
PLUS - MINUS {
set expr [list $token $expr [parse.term $lexer]]
}
}
return $expr
}
proc parse.term {lexer} {
set term [parse.factor $lexer]
ParseLoop $lexer {
MULT - DIV {
set term [list $token $term [parse.factor $lexer]]
}
}
return $term
}
proc parse.factor {lexer} {
ParseLoop $lexer {
NUMBER {
return $payload
}
MINUS {
ParseSeq $lexer {
NUMBER {return -$payload}
}
}
LPAR {
set result [parse.expr $lexer]
ParseSeq $lexer {
RPAR {return $result}
}
break
}
EOT {
throw SYNTAX "end of text at position $index"
}
}
throw SYNTAX "\"$payload\" at position $index"
}
# Demonstration code
puts [parse "1 - 2 - -3 * 4 + 5"]
puts [parse "1 - 2 - -3 * (4 + 5)"]

Output:

PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5
MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}