Recursive descent parser generator: Difference between revisions
m (J is on github now) |
|||
Line 608: | Line 608: | ||
└── return %4 |
└── return %4 |
||
))) |
))) |
||
</pre> |
|||
=={{header|Phix}}== |
|||
Technically the task is asking for code which generates something like the following, so I suppose |
|||
it would actually meet the spec if it began with <code>constant src = """</code> and ended with |
|||
<code>""" puts(1,src)</code>... |
|||
<lang Phix>string src |
|||
integer ch, sdx |
|||
sequence tok |
|||
procedure skip_spaces() |
|||
while 1 do |
|||
if sdx>length(src) then exit end if |
|||
ch = src[sdx] |
|||
if not find(ch," \t\r\n") then exit end if |
|||
sdx += 1 |
|||
end while |
|||
end procedure |
|||
procedure next_token() |
|||
-- yeilds one of: |
|||
-- {"SYMBOL",ch} where ch is one of "()+-/*", or |
|||
-- {"IDENT",string}, or {"EOF"} |
|||
skip_spaces() |
|||
integer tokstart = sdx |
|||
if sdx>length(src) then |
|||
tok = {"EOF"} |
|||
elsif find(ch,"()+-/*") then |
|||
sdx += 1 |
|||
tok = {"SYMBOL",ch&""} |
|||
elsif (ch>='a' and ch<='z') |
|||
or (ch>='A' and ch<='Z') then |
|||
while true do |
|||
sdx += 1 |
|||
if sdx>length(src) then exit end if |
|||
ch = src[sdx] |
|||
if ch!='_' |
|||
and (ch<'a' or ch>'z') |
|||
and (ch<'A' or ch>'Z') |
|||
and (ch<'0' or ch>'9') then |
|||
exit |
|||
end if |
|||
end while |
|||
tok = {"IDENT",src[tokstart..sdx-1]} |
|||
else |
|||
?9/0 |
|||
end if |
|||
end procedure |
|||
forward function sum_expr() |
|||
function primary() |
|||
sequence res |
|||
if tok[1]="IDENT" then |
|||
res = tok |
|||
next_token() |
|||
elsif tok={"SYMBOL","("} then |
|||
next_token() |
|||
res = sum_expr() |
|||
if tok!={"SYMBOL",")"} then ?9/0 end if |
|||
next_token() |
|||
else |
|||
?9/0 |
|||
end if |
|||
return res |
|||
end function |
|||
function mul_expr() |
|||
sequence res = primary() |
|||
while true do |
|||
if tok[1]!="SYMBOL" or not find(tok[2],{"*","/"}) then exit end if |
|||
res = {tok,res,NULL} |
|||
next_token() |
|||
res[3] = primary() |
|||
end while |
|||
return res |
|||
end function |
|||
function sum_expr() |
|||
sequence res = mul_expr() |
|||
while true do |
|||
if tok[1]!="SYMBOL" or not find(tok[2],{"+","-"}) then exit end if |
|||
res = {tok,res,NULL} |
|||
next_token() |
|||
res[3] = mul_expr() |
|||
end while |
|||
return res |
|||
end function |
|||
integer nxt = 1 |
|||
function show_ast(sequence ast) |
|||
if ast[1][1]="SYMBOL" then |
|||
string op = ast[1][2], |
|||
lhs = show_ast(ast[2]), |
|||
rhs = show_ast(ast[3]), |
|||
this = sprintf("_%04d",nxt) |
|||
printf(1,"%s = %s %s %s\n",{this,lhs,op,rhs}) |
|||
nxt += 1 |
|||
return this |
|||
elsif ast[1]="IDENT" then |
|||
return ast[2] |
|||
end if |
|||
?9/0 |
|||
end function |
|||
procedure parse(string source) |
|||
src = source |
|||
sdx = 1 |
|||
next_token() |
|||
sequence ast = sum_expr() |
|||
if tok!={"EOF"} then ?9/0 end if |
|||
pp(ast,{pp_Nest,4}) |
|||
{} = show_ast(ast) |
|||
end procedure |
|||
parse("(one + two) * three - four * five")</lang> |
|||
{{out}} |
|||
<pre> |
|||
{{`SYMBOL`, |
|||
`-`}, |
|||
{{`SYMBOL`, |
|||
`*`}, |
|||
{{`SYMBOL`, |
|||
`+`}, |
|||
{`IDENT`, |
|||
`one`}, |
|||
{`IDENT`, |
|||
`two`}}, |
|||
{`IDENT`, |
|||
`three`}}, |
|||
{{`SYMBOL`, |
|||
`*`}, |
|||
{`IDENT`, |
|||
`four`}, |
|||
{`IDENT`, |
|||
`five`}}} |
|||
_0001 = one + two |
|||
_0002 = _0001 * three |
|||
_0003 = four * five |
|||
_0004 = _0002 - _0003 |
|||
</pre> |
</pre> |
Revision as of 22:18, 5 January 2020
Write a recursive descent parser generator that takes a description of a grammar as input and outputs the source code for a parser in the same language as the generator. (So a generator written in C++ would output C++ source code for the parser.) You can assume that all of the rules have been preprocessed into a form suitable for the construction of a recursive descent parser. Check the following links for more details.
- http://www.cs.engr.uky.edu/~lewis/essays/compilers/rec-des.html
- http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm
Use the parser generator and a grammar file to build a parser that takes an arithmetic expression and turns it in to three address code. The resulting parser should take this (or something similar) as input:
(one + two) * three - four * five
And generate this (or something similar) as output:
_0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003
C++
This program translates an annotated LL(1) grammar into a C++ lexer plus parser. Each rule is required to return a string of some kind and the return values of the already matched nonterminals (and matched text of terminals) can be accessed with $1, $2, etc. which are replaced by the appropriate string variable.
It can't handle newlines as part of the grammar, the error checking is fairly limited and the error reporting is basically non-existent, but the parser it generates (not shown below) is human readable.
<lang cpp>
- include <iostream>
- include <fstream>
- include <string>
- include <sstream>
- include <map>
- include <set>
- include <regex>
using namespace std;
map<string, string> terminals; map<string, vector<vector<string>>> nonterminalRules; map<string, set<string>> nonterminalFirst; map<string, vector<string>> nonterminalCode;
int main(int argc, char **argv) { if (argc < 3) { cout << "Usage: <input file> <output file>" << endl; return 1; }
ifstream inFile(argv[1]); ofstream outFile(argv[2]);
regex blankLine(R"(^\s*$)"); regex terminalPattern(R"((\w+)\s+(.+))"); regex rulePattern(R"(^!!\s*(\w+)\s*->\s*((?:\w+\s*)*)$)"); regex argPattern(R"(\$(\d+))"); smatch results;
// Read terminal patterns string line; while (true) { getline(inFile, line);
// Terminals section ends with a blank line if (regex_match(line, blankLine)) break;
regex_match(line, results, terminalPattern); terminals[results[1]] = results[2]; }
outFile << "#include <iostream>" << endl; outFile << "#include <fstream>" << endl; outFile << "#include <string>" << endl; outFile << "#include <regex>" << endl; outFile << "using namespace std;" << endl << endl;
// Generate the token processing functions outFile << "string input, nextToken, nextTokenValue;" << endl; outFile << "string prevToken, prevTokenValue;" << endl << endl;
outFile << "void advanceToken() {" << endl; outFile << " static smatch results;" << endl << endl;
outFile << " prevToken = nextToken;" << endl; outFile << " prevTokenValue = nextTokenValue;" << endl << endl;
for (auto i = terminals.begin(); i != terminals.end(); ++i) { string name = i->first + "_pattern"; string pattern = i->second;
outFile << " static regex " << name << "(R\"(^\\s*(" << pattern << "))\");" << endl; outFile << " if (regex_search(input, results, " << name << ", regex_constants::match_continuous)) {" << endl; outFile << " nextToken = \"" << i->first << "\";" << endl; outFile << " nextTokenValue = results[1];" << endl; outFile << " input = regex_replace(input, " << name << ", \"\");" << endl; outFile << " return;" << endl; outFile << " }" << endl << endl; }
outFile << " static regex eof(R\"(\\s*)\");" << endl; outFile << " if (regex_match(input, results, eof, regex_constants::match_continuous)) {" << endl; outFile << " nextToken = \"\";" << endl; outFile << " nextTokenValue = \"\";" << endl; outFile << " return;" << endl; outFile << " }" << endl << endl;
outFile << " throw \"Unknown token\";" << endl; outFile << "}" << endl << endl;
outFile << "bool same(string symbol) {" << endl; outFile << " if (symbol == nextToken) {" << endl; outFile << " advanceToken();" << endl; outFile << " return true;" << endl; outFile << " }" << endl; outFile << " return false;" << endl; outFile << "}" << endl << endl;
// Copy the header code to the output while (true) { getline(inFile, line);
// Copy lines until we reach the first rule if (regex_match(line, results, rulePattern)) break;
outFile << line << endl; }
// Build the nonterminal table while (true) { // results already contains the last matched rule string name = results[1]; stringstream ss(results[2]);
string tempString; vector<string> tempVector; while (ss >> tempString) tempVector.push_back(tempString); nonterminalRules[name].push_back(tempVector);
// Read code until another rule is found string code = ""; while (true) { getline(inFile, line);
if (!inFile || regex_match(line, results, rulePattern)) break;
// Replace $1 with results[1], etc. line = regex_replace(line, argPattern, "results[$1]");
code += line + "\n"; } nonterminalCode[name].push_back(code);
// Stop when we reach the end of the file if (!inFile) break; }
// Generate the first sets, inefficiently bool done = false; while (!done) for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) { string name = i->first; done = true;
if (nonterminalFirst.find(i->first) == nonterminalFirst.end()) nonterminalFirst[i->first] = set<string>();
for (int j = 0; j < i->second.size(); ++j) { if (i->second[j].size() == 0) nonterminalFirst[i->first].insert(""); else { string first = i->second[j][0]; if (nonterminalFirst.find(first) != nonterminalFirst.end()) { for (auto k = nonterminalFirst[first].begin(); k != nonterminalFirst[first].end(); ++k) { if (nonterminalFirst[name].find(*k) == nonterminalFirst[name].end()) { nonterminalFirst[name].insert(*k); done = false; } } } else if (nonterminalFirst[name].find(first) == nonterminalFirst[name].end()) { nonterminalFirst[name].insert(first); done = false; } } } }
// Generate function signatures for the nonterminals for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) { string name = i->first + "_rule"; outFile << "string " << name << "();" << endl; } outFile << endl;
// Generate the nonterminal functions for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) { string name = i->first + "_rule"; outFile << "string " << name << "() {" << endl; outFile << " vector<string> results;" << endl; outFile << " results.push_back(\"\");" << endl << endl;
// Check if this rule can match an empty string int epsilon = -1; for (int j = 0; epsilon == -1 && j < i->second.size(); ++j) if (i->second[j].size() == 0) epsilon = j;
// Generate each production for (int j = 0; j < i->second.size(); ++j) { // Nothing to generate for an empty rule if (j == epsilon) continue;
string token = i->second[j][0]; if (terminals.find(token) != terminals.end()) outFile << " if (nextToken == \"" << i->second[j][0] << "\") {" << endl; else { outFile << " if ("; bool first = true; for (auto k = nonterminalFirst[token].begin(); k != nonterminalFirst[token].end(); ++k, first = false) { if (!first) outFile << " || "; outFile << "nextToken == \"" << (*k) << "\""; } outFile << ") {" << endl; }
for (int k = 0; k < i->second[j].size(); ++k) { if (terminals.find(i->second[j][k]) != terminals.end()) { outFile << " if(same(\"" << i->second[j][k] << "\"))" << endl; outFile << " results.push_back(prevTokenValue);" << endl; outFile << " else" << endl; outFile << " throw \"Syntax error - mismatched token\";" << endl; } else outFile << " results.push_back(" << i->second[j][k] << "_rule());" << endl; }
// Copy rule code to output outFile << nonterminalCode[i->first][j];
outFile << " }" << endl << endl; }
if (epsilon == -1) outFile << " throw \"Syntax error - unmatched token\";" << endl; else outFile << nonterminalCode[i->first][epsilon];
outFile << "}" << endl << endl; }
// Generate the main function outFile << "int main(int argc, char **argv) {" << endl; outFile << " if(argc < 2) {" << endl; outFile << " cout << \"Usage: <input file>\" << endl;" << endl; outFile << " return 1;" << endl; outFile << " }" << endl << endl;
outFile << " ifstream file(argv[1]);" << endl; outFile << " string line;" << endl; outFile << " input = \"\";" << endl << endl;
outFile << " while(true) {" << endl; outFile << " getline(file, line);" << endl; outFile << " if(!file) break;" << endl; outFile << " input += line + \"\\n\";" << endl; outFile << " }" << endl << endl;
outFile << " advanceToken();" << endl << endl;
outFile << " start_rule();" << endl; outFile << "}" << endl; } </lang>
Example grammar:
plus \+ minus - times \* div / open \( close \) num [0-9]+ var [a-z]+ string nextLabel() { static string label = "0000"; for(int i = label.length() - 1; i >= 0; --i) { if(label[i] == '9') label[i] = '0'; else { ++label[i]; break; } } return "_" + label; } !! start -> expr start2 if($2 == "") return $1; else { string label = nextLabel(); cout << label << " = " << $1 << " " << $2 << endl; return label; } !! start2 -> plus start return "+ " + $2; !! start2 -> minus start return "- " + $2; !! start2 -> return ""; !! expr -> term expr2 if($2 == "") return $1; else { string label = nextLabel(); cout << label << " = " << $1 << " " << $2 << endl; return label; } !! expr2 -> times expr return "* " + $2; !! expr2 -> div expr return "/ " + $2; !! expr2 -> return ""; !! term -> var return $1; !! term -> num return $1; !! term -> open start close return $2;
Example input to parser (filename passed through command line):
(one + two) * three + four * five
Output (to standard out):
_0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 + _0003
Go
The standard library already contains a recursive descent parser for Go programs or expressions, written in Go itself, whose output is an abstract syntax tree (AST) representing such code.
I've therefore applied this parser to the expression designated by the task, which only involves binary expressions and identifiers, and written a separate routine to convert the resulting AST into three-address code. <lang go>package main
import (
"fmt" "go/ast" "go/parser" "log"
)
func labelStr(label int) string {
return fmt.Sprintf("_%04d", label)
}
type binexp struct {
op, left, right string kind, index int
}
func main() {
x := "(one + two) * three - four * five" fmt.Println("Expression to parse: ", x) f, err := parser.ParseExpr(x) if err != nil { log.Fatal(err) }
fmt.Println("\nThe abstract syntax tree for this expression:") ast.Print(nil, f)
fmt.Println("\nThe corresponding three-address code:") var binexps []binexp // Inspect nodes in depth-first order. ast.Inspect(f, func(n ast.Node) bool { switch x := n.(type) { case *ast.BinaryExpr: sx, ok1 := x.X.(*ast.Ident) sy, ok2 := x.Y.(*ast.Ident) op := x.Op.String() if ok1 && ok2 { binexps = append(binexps, binexp{op, sx.Name, sy.Name, 3, 0}) } else if !ok1 && ok2 { binexps = append(binexps, binexp{op, "<addr>", sy.Name, 2, 0}) } else if ok1 && !ok2 { binexps = append(binexps, binexp{op, sx.Name, "<addr>", 1, 0}) } else { binexps = append(binexps, binexp{op, "<addr>", "<addr>", 0, 0}) } } return true })
for i := 0; i < len(binexps); i++ { binexps[i].index = i }
label, last := 0, -1 var ops, args []binexp var labels []string for i, be := range binexps { if be.kind == 0 { ops = append(ops, be) } if be.kind != 3 { continue } label++ ls := labelStr(label) fmt.Printf(" %s = %s %s %s\n", ls, be.left, be.op, be.right) for j := i - 1; j > last; j-- { be2 := binexps[j] if be2.kind == 2 { label++ ls2 := labelStr(label) fmt.Printf(" %s = %s %s %s\n", ls2, ls, be2.op, be2.right) ls = ls2 be = be2 } else if be2.kind == 1 { label++ ls2 := labelStr(label) fmt.Printf(" %s = %s %s %s\n", ls2, be2.left, be2.op, ls) ls = ls2 be = be2 } } args = append(args, be) labels = append(labels, ls) lea, leo := len(args), len(ops) for lea >= 2 { if i < len(binexps)-1 && args[lea-2].index <= ops[leo-1].index { break } label++ ls2 := labelStr(label) fmt.Printf(" %s = %s %s %s\n", ls2, labels[lea-2], ops[leo-1].op, labels[lea-1]) ops = ops[0 : leo-1] args = args[0 : lea-1] labels = labels[0 : lea-1] lea-- leo-- args[lea-1] = be labels[lea-1] = ls2 } last = i }
}</lang>
- Output:
Expression to parse: (one + two) * three - four * five The abstract syntax tree for this expression: 0 *ast.BinaryExpr { 1 . X: *ast.BinaryExpr { 2 . . X: *ast.ParenExpr { 3 . . . Lparen: 1 4 . . . X: *ast.BinaryExpr { 5 . . . . X: *ast.Ident { 6 . . . . . NamePos: 2 7 . . . . . Name: "one" 8 . . . . . Obj: *ast.Object { 9 . . . . . . Kind: bad 10 . . . . . . Name: "" 11 . . . . . } 12 . . . . } 13 . . . . OpPos: 6 14 . . . . Op: + 15 . . . . Y: *ast.Ident { 16 . . . . . NamePos: 8 17 . . . . . Name: "two" 18 . . . . . Obj: *(obj @ 8) 19 . . . . } 20 . . . } 21 . . . Rparen: 11 22 . . } 23 . . OpPos: 13 24 . . Op: * 25 . . Y: *ast.Ident { 26 . . . NamePos: 15 27 . . . Name: "three" 28 . . . Obj: *(obj @ 8) 29 . . } 30 . } 31 . OpPos: 21 32 . Op: - 33 . Y: *ast.BinaryExpr { 34 . . X: *ast.Ident { 35 . . . NamePos: 23 36 . . . Name: "four" 37 . . . Obj: *(obj @ 8) 38 . . } 39 . . OpPos: 28 40 . . Op: * 41 . . Y: *ast.Ident { 42 . . . NamePos: 30 43 . . . Name: "five" 44 . . . Obj: *(obj @ 8) 45 . . } 46 . } 47 } The corresponding three-address code: _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003
J
J's native recursive descent parser is adequate for this task, if we map names appropriately.
Implementation:
<lang J>cocurrent 'base'
inlocale=: 4 :0 L:0
x,'_',y,'_'
)
parse=: 3 :0
sentence=. ;:y opinds=. (;:'+*-')i.sentence opfuns=. (;:'plus times minus') inlocale 'base' scratch=. cocreate coinsert__scratch 'base' names=. ~.sentence#~_1<:nc sentence (names inlocale scratch)=: names r=. do__scratch ;:inv opinds}((#sentence)#"0 opfuns),sentence codestroy__scratch r
)
term=: 1 :0
2 :('m,m,expr n')
)
expr=:1 :0
r=. genname emit r,'=:',x,m,y r
)
plus=: '+' expr times=: '*' term minus=: '-' expr
N=: 10000 genname=: 3 :0
'z',}.":N=: N+1
)
emit=: smoutput </lang>
Task example:
<lang J> parse '(one + two) * three - four * five' z0001=:four*five z0002=:one+two z0003=:z0002*three z0004=:z0003-z0001 z0004</lang>
See also https://github.com/jsoftware/general_misc/blob/master/trace.ijs for a model of the underlying parser being employed here.
Julia
The Julia compiler's own parser is a recursive descent parser, and can be used directly here. <lang julia>const one, two, three, four, five, six, seven, eight, nine = collect(1:9)
function testparser(s)
cod = Meta.parse(s) println(Meta.lower(Main, cod))
end
testparser("(one + two) * three - four * five")
</lang>
- Output:
$(Expr(:thunk, CodeInfo( @ none within `top-level scope' 1 ─ %1 = one + two │ %2 = %1 * three │ %3 = four * five │ %4 = %2 - %3 └── return %4 )))
Phix
Technically the task is asking for code which generates something like the following, so I suppose
it would actually meet the spec if it began with constant src = """
and ended with
""" puts(1,src)
...
<lang Phix>string src
integer ch, sdx
sequence tok
procedure skip_spaces()
while 1 do if sdx>length(src) then exit end if ch = src[sdx] if not find(ch," \t\r\n") then exit end if sdx += 1 end while
end procedure
procedure next_token() -- yeilds one of: -- {"SYMBOL",ch} where ch is one of "()+-/*", or -- {"IDENT",string}, or {"EOF"}
skip_spaces() integer tokstart = sdx if sdx>length(src) then tok = {"EOF"} elsif find(ch,"()+-/*") then sdx += 1 tok = {"SYMBOL",ch&""} elsif (ch>='a' and ch<='z') or (ch>='A' and ch<='Z') then while true do sdx += 1 if sdx>length(src) then exit end if ch = src[sdx] if ch!='_' and (ch<'a' or ch>'z') and (ch<'A' or ch>'Z') and (ch<'0' or ch>'9') then exit end if end while tok = {"IDENT",src[tokstart..sdx-1]} else ?9/0 end if
end procedure
forward function sum_expr()
function primary()
sequence res if tok[1]="IDENT" then res = tok next_token() elsif tok={"SYMBOL","("} then next_token() res = sum_expr() if tok!={"SYMBOL",")"} then ?9/0 end if next_token() else ?9/0 end if return res
end function
function mul_expr()
sequence res = primary() while true do if tok[1]!="SYMBOL" or not find(tok[2],{"*","/"}) then exit end if res = {tok,res,NULL} next_token() res[3] = primary() end while return res
end function
function sum_expr()
sequence res = mul_expr() while true do if tok[1]!="SYMBOL" or not find(tok[2],{"+","-"}) then exit end if res = {tok,res,NULL} next_token() res[3] = mul_expr() end while return res
end function
integer nxt = 1 function show_ast(sequence ast)
if ast[1][1]="SYMBOL" then string op = ast[1][2], lhs = show_ast(ast[2]), rhs = show_ast(ast[3]), this = sprintf("_%04d",nxt) printf(1,"%s = %s %s %s\n",{this,lhs,op,rhs}) nxt += 1 return this elsif ast[1]="IDENT" then return ast[2] end if ?9/0
end function
procedure parse(string source)
src = source sdx = 1 next_token() sequence ast = sum_expr() if tok!={"EOF"} then ?9/0 end if pp(ast,{pp_Nest,4}) {} = show_ast(ast)
end procedure
parse("(one + two) * three - four * five")</lang>
- Output:
{{`SYMBOL`, `-`}, {{`SYMBOL`, `*`}, {{`SYMBOL`, `+`}, {`IDENT`, `one`}, {`IDENT`, `two`}}, {`IDENT`, `three`}}, {{`SYMBOL`, `*`}, {`IDENT`, `four`}, {`IDENT`, `five`}}} _0001 = one + two _0002 = _0001 * three _0003 = four * five _0004 = _0002 - _0003