Recursive descent parser generator: Difference between revisions
Recursive descent parser generator (view source)
Revision as of 10:53, 2 February 2024
, 3 months ago→{{header|Wren}}: Changed to Wren S/H
m (→{{header|Phix}}: added syntax colouring, made p2js compatible) |
m (→{{header|Wren}}: Changed to Wren S/H) |
||
(2 intermediate revisions by 2 users not shown) | |||
Line 24:
It can't handle newlines as part of the grammar, the error checking is fairly limited and the error reporting is basically non-existent, but the parser it generates (not shown below) is human readable.
<
#include <fstream>
#include <
#include <map>
#include <regex>
#include <set>
#include <sstream>
#include <string>
using namespace std;
Line 40:
int main(int argc, char **argv) {
<< endl;
<< endl
<< "void advanceToken() {" << endl
<< " static smatch results;" << endl
<< endl
<< " prevToken = nextToken;" << endl
<< " prevTokenValue = nextTokenValue;" << endl
<< endl;
for (auto i = terminals.begin(); i != terminals.end(); ++i) {
string name = i->first + "_pattern";
string pattern = i->second;
<< " if (regex_search(input, results, " << name << ", regex_constants::match_continuous)) {" << endl
<< " nextToken = \"" << i->first << "\";" << endl
<< " nextTokenValue = results[1];" << endl
<< " input = regex_replace(input, " << name << ", \"\");" << endl
<< " return;" << endl
<< " }" << endl
<< endl;
}
outFile << " static regex eof(R\"(\\s*)\");" << endl
<< " if (regex_match(input, results, eof, regex_constants::match_continuous)) {" << endl
<< " nextToken = \"\";" << endl
<< " nextTokenValue = \"\";" << endl
<< " return;" << endl
<< " }" << endl
<< endl
<< " throw \"Unknown token\";" << endl
<< "}" << endl
<< endl
<< "bool same(string symbol) {" << endl
<< " if (symbol == nextToken) {" << endl
<< " advanceToken();" << endl
<< " return true;" << endl
<< " }" << endl
<< " return false;" << endl
<< "}" << endl
<< endl;
// Copy the header code to the output
while (true) {
getline(inFile, line);
// Copy lines until we reach the first rule
break;
}
// Build the nonterminal table
while (true) {
// results already contains the last matched rule
string name = results[1];
stringstream ss(results[2]);
string tempString;
vector<string> tempVector;
while (ss >> tempString)
tempVector.push_back(tempString);
nonterminalRules[name].push_back(tempVector);
// Read code until another rule is found
string code = "";
while (true) {
getline(inFile, line);
if (!inFile || regex_match(line, results, rulePattern))
break;
// Replace $1 with results[1], etc.
line = regex_replace(line, argPattern, "results[$1]");
code += line + "\n";
}
nonterminalCode[name].push_back(code);
// Stop when we reach the end of the file
if (!inFile)
break;
}
// Generate the first sets, inefficiently
bool done = false;
while (!done)
for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {
string name = i->first;
done = true;
if (nonterminalFirst.find(i->first) == nonterminalFirst.end())
nonterminalFirst[i->first] = set<string>();
for (int j = 0; j < i->second.size(); ++j) {
if (i->second[j].size() == 0)
nonterminalFirst[i->first].insert("");
else {
string first = i->second[j][0];
if (nonterminalFirst.find(first) != nonterminalFirst.end()) {
for (auto k = nonterminalFirst[first].begin(); k != nonterminalFirst[first].end(); ++k) {
if (nonterminalFirst[name].find(*k) == nonterminalFirst[name].end()) {
nonterminalFirst[name].insert(*k);
done = false;
}
}
} else if (nonterminalFirst[name].find(first) == nonterminalFirst[name].end()) {
nonterminalFirst[name].insert(first);
done = false;
}
}
}
}
for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {
string name = i->first + "_rule";
outFile << "string " << name << "();" << endl;
}
outFile << endl;
// Generate the nonterminal functions
for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {
string name = i->first + "_rule";
outFile << "string " << name << "() {" << endl
<< " vector<string> results;" << endl
<< " results.push_back(\"\");" << endl
<< endl;
// Check if this rule can match an empty string
int epsilon = -1;
for (int j = 0; epsilon == -1 && j < i->second.size(); ++j)
if (i->second[j].size() == 0)
epsilon = j;
// Generate each production
// Nothing to generate for an empty rule
if (j == epsilon)
continue;
string token = i->second[j][0];
outFile << " if (nextToken == \"" << i->second[j][0] << "\") {" << endl;
else {
bool first = true;
for (auto k = nonterminalFirst[token].begin(); k != nonterminalFirst[token].end(); ++k, first = false) {
if (!first)
outFile << " || ";
outFile << "nextToken == \"" << (*k) << "\"";
}
outFile << ") {" << endl;
}
for (int k = 0; k < i->second[j].size(); ++k) {
outFile << " if(same(\"" << i->second[j][k] << "\"))" << endl
<< " results.push_back(prevTokenValue);" << endl
<< " else" << endl
<< " throw \"Syntax error - mismatched token\";" << endl;
} else
outFile << " results.push_back(" << i->second[j][k] << "_rule());" << endl;
}
// Copy rule code to output
outFile << nonterminalCode[i->first][j];
outFile << " }" << endl << endl;
}
if (epsilon == -1)
outFile << " throw \"Syntax error - unmatched token\";" << endl;
else
outFile << nonterminalCode[i->first][epsilon];
outFile << "}" << endl << endl;
}
// Generate the main function
<< " if(argc < 2) {" << endl
<< " cout << \"Usage: <input file>\" << endl;" << endl
<< " return 1;" << endl
<< " }" << endl
<< endl
<< " ifstream file(argv[1]);" << endl
<< " string line;" << endl
<< " input = \"\";" << endl
<< endl
<< " while(true) {" << endl
<< " getline(file, line);" << endl
<< " if(!file) break;" << endl
<< " input += line + \"\\n\";" << endl
<< " }" << endl
<< endl
<< " advanceToken();" << endl
<< endl
<< " start_rule();" << endl
<< "}" << endl;
}
</syntaxhighlight>
Example grammar:
Line 362 ⟶ 367:
I've therefore applied this parser to the expression designated by the task, which only involves binary expressions and identifiers, and written a separate routine to convert the resulting AST into three-address code.
<
import (
Line 466 ⟶ 471:
last = i
}
}</
{{out}}
Line 535 ⟶ 540:
Implementation:
<
inlocale=: 4 :0 L:0
Line 575 ⟶ 580:
emit=: smoutput
</syntaxhighlight>
Task example:
<
z0001=:four*five
z0002=:one+two
z0003=:z0002*three
z0004=:z0003-z0001
z0004</
See also https://github.com/jsoftware/general_misc/blob/master/trace.ijs for a model of the underlying parser being employed here.
Line 590 ⟶ 595:
=={{header|Julia}}==
The Julia compiler's own parser is a recursive descent parser, and can be used directly here.
<
function testparser(s)
Line 598 ⟶ 603:
testparser("(one + two) * three - four * five")
</
<pre>
$(Expr(:thunk, CodeInfo(
Line 611 ⟶ 616:
=={{header|Perl}}==
<
use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator
Line 742 ⟶ 747:
@stack >= 3 or die "not enough on stack";
splice @stack, -3, 3, $stack[-2];
}</
Running the above with no arguments uses a default grammar that will solve the specified example. It produces
the following perl script (and then runs it).
<
use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator
use warnings; # WARNING: this code is generated
Line 852 ⟶ 857:
Rule::startsymbol();
eval { end() }; # eval because it is optional
/\G\s*\z/ or die "ERROR: incomplete parse\n";</
The above script can also be run stand-alone and produces the following output.
{{out}}
Line 868 ⟶ 873:
</pre>
and giving this file as "arithexpr.y"
<
expr = term { '+' term <fadd> | '-' term <fsub> } .
Line 882 ⟶ 887:
sub fdiv { splice @stack, -3, 3, $stack[-3] / $stack[-1] }
sub begin { print "expr = $_\n" }
sub end { print "answer = @{[pop @stack]}\n" }</
will produce the following
{{out}}
Line 896 ⟶ 901:
it would actually meet the spec if it began with <code>constant src = """</code> and ended with
<code>""" puts(1,src)</code>... (and like several/most other entries on this page, this does not use a formal grammer)
<!--<
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\RecursiveDescentParser.exw
Line 1,013 ⟶ 1,018:
<span style="color: #000000;">parse</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"(one + two) * three - four * five"</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span>
<!--</
{{out}}
<pre>
Line 1,066 ⟶ 1,071:
{{libheader|Wren-str}}
{{libheader|Wren-fmt}}
<
import "./fmt" for Fmt
Line 1,184 ⟶ 1,189:
}
RDP.parse("(one + two) * three - four * five")</
{{out}}
|