Recursive descent parser generator: Difference between revisions

← Older edit

Recursive descent parser generator (view source)

Revision as of 10:53, 2 February 2024

794 bytes added , 3 months ago

m

→‎{{header|Wren}}: Changed to Wren S/H

PureFox

9,476

edits

Revision as of 15:23, 10 January 2022 (view source) Petelomax (talk \| contribs) m (→‎{{header\|Phix}}: added syntax colouring, made p2js compatible) ← Older edit		Latest revision as of 10:53, 2 February 2024 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Changed to Wren S/H)
(2 intermediate revisions by 2 users not shown)
Line 24: It can't handle newlines as part of the grammar, the error checking is fairly limited and the error reporting is basically non-existent, but the parser it generates (not shown below) is human readable. <~~lang~~syntaxhighlight lang="cpp"> ~~#include <iostream>~~ #include <fstream> #include <~~string~~iostream> ~~#include <sstream>~~ #include <map> ~~#include <set>~~ #include <regex> #include <set> #include <sstream> #include <string> using namespace std; Line 40: int main(int argc, char *argv) { if (argc < 3) { cout << "Usage: <input file> <output file>" << endl; return 1; } ifstream inFile(argv[1]); ofstream outFile(argv[2]); regex blankLine(R"(^\s$)"); regex terminalPattern(R"((\w+)\s+(.+))"); regex rulePattern(R"(^!!\s(\w+)\s->\s((?:\w+\s))$)"); regex argPattern(R"(\$(\d+))"); smatch results; // Read terminal patterns string line; while (true) { getline(inFile, line); // Terminals section ends with a blank line if (regex_match(line, blankLine)) break; regex_match(line, results, terminalPattern); terminals[results[1]] = results[2]; } outFile << "#include <iostream>" << endl; ~~outFile~~ << "#include <fstream>" << endl; ~~outFile~~ << "#include <string>" << endl; ~~outFile~~ << "#include <regex>" << endl; ~~outFile~~ << "using namespace std;" << endl ~~<< endl;~~ << endl; // Generate the token processing functions outFile << "string input, nextToken, nextTokenValue;" << endl; ~~outFile~~ << "string prevToken, prevTokenValue;" << endl ~~<< endl;~~ << endl << "void advanceToken() {" << endl << " static smatch results;" << endl << endl << " prevToken = nextToken;" << endl << " prevTokenValue = nextTokenValue;" << endl << endl; for (auto i = terminals.begin(); i != terminals.end(); ++i) { ~~outFile << "void advanceToken() {" << endl;~~ string name = i->first + "_pattern"; ~~outFile << " static smatch results;" << endl << endl;~~ string pattern = i->second; outFile << " ~~prevToken~~static =regex ~~nextToken~~" << name << "(R\"(^\\s(" << pattern << "))\");" << endl; << " if (regex_search(input, results, " << name << ", regex_constants::match_continuous)) {" << endl ~~outFile << " prevTokenValue = nextTokenValue;" << endl << endl;~~ << " nextToken = \"" << i->first << "\";" << endl << " nextTokenValue = results[1];" << endl << " input = regex_replace(input, " << name << ", \"\");" << endl << " return;" << endl << " }" << endl << endl; } outFile << " static regex eof(R\"(\\s)\");" << endl ~~for (auto i = terminals.begin(); i != terminals.end(); ++i) {~~ << " if (regex_match(input, results, eof, regex_constants::match_continuous)) {" << endl ~~string name = i->first + "_pattern";~~ << " nextToken = \"\";" << endl ~~string pattern = i->second;~~ << " nextTokenValue = \"\";" << endl << " return;" << endl << " }" << endl << endl << " throw \"Unknown token\";" << endl << "}" << endl << endl << "bool same(string symbol) {" << endl << " if (symbol == nextToken) {" << endl << " advanceToken();" << endl << " return true;" << endl << " }" << endl << " return false;" << endl << "}" << endl << endl; // Copy the header code to the output ~~outFile << " static regex " << name << "(R\"(^\\s(" << pattern << "))\");" << endl;~~ while (true) { ~~outFile << " if (regex_search(input, results, " << name << ", regex_constants::match_continuous)) {" << endl;~~ getline(inFile, line); ~~outFile << " nextToken = \"" << i->first << "\";" << endl;~~ ~~outFile << " nextTokenValue = results[1];" << endl;~~ ~~outFile << " input = regex_replace(input, " << name << ", \"\");" << endl;~~ ~~outFile << " return;" << endl;~~ ~~outFile << " }" << endl << endl;~~ } // Copy lines until we reach the first rule ~~outFile << " static regex eof(R\"(\\s)\");" << endl;~~ ~~outFile~~ << " if (regex_match(~~input~~line, results, ~~eof, regex_constants::match_continuous~~rulePattern)) ~~{" << endl;~~ break; ~~outFile << " nextToken = \"\";" << endl;~~ ~~outFile << " nextTokenValue = \"\";" << endl;~~ ~~outFile << " return;" << endl;~~ ~~outFile << " }" << endl << endl;~~ outFile << ~~" throw \"Unknown token\";"~~line << endl; } ~~outFile << "}" << endl << endl;~~ // Build the nonterminal table ~~outFile << "bool same(string symbol) {" << endl;~~ while (true) { ~~outFile << " if (symbol == nextToken) {" << endl;~~ // results already contains the last matched rule ~~outFile << " advanceToken();" << endl;~~ string name = results[1]; ~~outFile << " return true;" << endl;~~ stringstream ss(results[2]); ~~outFile << " }" << endl;~~ ~~outFile << " return false;" << endl;~~ ~~outFile << "}" << endl << endl;~~ string tempString; ~~// Copy the header code to the output~~ vector<string> tempVector; ~~while (true) {~~ while (ss >> tempString) ~~getline(inFile, line);~~ tempVector.push_back(tempString); nonterminalRules[name].push_back(tempVector); ~~// Copy lines until we reach the first rule~~ ~~if (regex_match(line, results, rulePattern))~~ ~~break;~~ // Read code until another rule is found ~~outFile << line << endl;~~ string code = ""; } while (true) { getline(inFile, line); if (!inFile \|\| regex_match(line, results, rulePattern)) ~~// Build the nonterminal table~~ break; ~~while (true) {~~ ~~// results already contains the last matched rule~~ ~~string name = results[1];~~ ~~stringstream ss(results[2]);~~ // Replace $1 with results[1], etc. ~~string tempString;~~ line = regex_replace(line, argPattern, "results[$1]"); ~~vector<string> tempVector;~~ ~~while (ss >> tempString)~~ ~~tempVector.push_back(tempString);~~ ~~nonterminalRules[name].push_back(tempVector);~~ code += line + "\n"; ~~// Read code until another rule is found~~ } ~~string code = "";~~ nonterminalCode[name].push_back(code); ~~while (true) {~~ ~~getline(inFile, line);~~ // Stop when we reach the end of the file ~~if (!inFile \|\| regex_match(line, results, rulePattern))~~ if (!inFile) ~~break;~~ break; } // Generate the first sets, inefficiently ~~// Replace $1 with results[1], etc.~~ bool done = false; ~~line = regex_replace(line, argPattern, "results[$1]");~~ while (!done) for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) { string name = i->first; done = true; if (nonterminalFirst.find(i->first) == nonterminalFirst.end()) ~~code += line + "\n";~~ nonterminalFirst[i->first] = set<string>(); } ~~nonterminalCode[name].push_back(code);~~ for (int j = 0; j < i->second.size(); ++j) { ~~// Stop when we reach the end of the file~~ if (i->second[j].size() == 0) ~~if (!inFile)~~ nonterminalFirst[i->first].insert(""); ~~break;~~ else { } string first = i->second[j][0]; if (nonterminalFirst.find(first) != nonterminalFirst.end()) { for (auto k = nonterminalFirst[first].begin(); k != nonterminalFirst[first].end(); ++k) { if (nonterminalFirst[name].find(k) == nonterminalFirst[name].end()) { nonterminalFirst[name].insert(k); done = false; } } } else if (nonterminalFirst[name].find(first) == nonterminalFirst[name].end()) { nonterminalFirst[name].insert(first); done = false; } } } } // Generate ~~the~~function ~~first~~signatures ~~sets,~~for the ~~inefficiently~~nonterminals for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) { ~~bool done = false;~~ string name = i->first + "_rule"; ~~while (!done)~~ outFile << "string " << name << "();" << endl; ~~for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) {~~ } ~~string name = i->first;~~ outFile << endl; ~~done = true;~~ // Generate the nonterminal functions ~~if (nonterminalFirst.find(i->first) == nonterminalFirst.end())~~ for (auto i = nonterminalRules.begin(); i != nonterminalRules.end(); ++i) { ~~nonterminalFirst[i->first] = set<string>();~~ string name = i->first + "_rule"; outFile << "string " << name << "() {" << endl << " vector<string> results;" << endl << " results.push_back(\"\");" << endl << endl; // Check if this rule can match an empty string ~~for (int j = 0; j < i->second.size(); ++j) {~~ int epsilon = -1; ~~if (i->second[j].size() == 0)~~ for (int j = 0; epsilon == -1 && j < i->second.size(); ++j) ~~nonterminalFirst[i->first].insert("");~~ if (i->second[j].size() == 0) ~~else {~~ epsilon = j; ~~string first = i->second[j][0];~~ ~~if (nonterminalFirst.find(first) != nonterminalFirst.end()) {~~ ~~for (auto k = nonterminalFirst[first].begin(); k != nonterminalFirst[first].end(); ++k) {~~ ~~if (nonterminalFirst[name].find(k) == nonterminalFirst[name].end()) {~~ ~~nonterminalFirst[name].insert(k);~~ ~~done = false;~~ } } ~~} else if (nonterminalFirst[name].find(first) == nonterminalFirst[name].end()) {~~ ~~nonterminalFirst[name].insert(first);~~ ~~done = false;~~ } } } } // Generate each production ~~// Generate function signatures for the nonterminals~~ for (~~auto~~int ij = ~~nonterminalRules.begin()~~0; ij !=< ~~nonterminalRules~~i->second.~~end~~size(); ++ij) { // Nothing to generate for an empty rule ~~string name = i->first + "_rule";~~ if (j == epsilon) ~~outFile << "string " << name << "();" << endl;~~ continue; } ~~outFile << endl;~~ string token = i->second[j][0]; ~~// Generate the nonterminal functions~~ ~~for~~ ~~(auto~~ i = ~~nonterminalRules~~ if (terminals.~~begin~~find(token)~~; i~~ != ~~nonterminalRules~~terminals.end()~~; ++i~~) { outFile << " if (nextToken == \"" << i->second[j][0] << "\") {" << endl; ~~string name = i->first + "_rule";~~ else { ~~outFile << "string " << name << "() {" << endl;~~ outFile << " ~~vector<string>~~if ~~results;~~(" ~~<< endl~~; bool first = true; ~~outFile << " results.push_back(\"\");" << endl << endl;~~ for (auto k = nonterminalFirst[token].begin(); k != nonterminalFirst[token].end(); ++k, first = false) { if (!first) ~~// Check if this rule can match an empty string~~ outFile << " \|\| "; ~~int epsilon = -1;~~ outFile << "nextToken == \"" << (k) << "\""; ~~for (int j = 0; epsilon == -1 && j < i->second.size(); ++j)~~ } ~~if (i->second[j].size() == 0)~~ outFile << ") {" << endl; ~~epsilon = j;~~ } for (int k = 0; k < i->second[j].size(); ++k) { ~~// Generate each production~~ ~~for~~ ~~(int~~ j = 0; j < if (terminals.find(i->second[j][k]) != terminals.~~size~~end()~~; ++j~~) { outFile << " if(same(\"" << i->second[j][k] << "\"))" << endl ~~// Nothing to generate for an empty rule~~ << " results.push_back(prevTokenValue);" << endl ~~if (j == epsilon)~~ << " else" << endl ~~continue;~~ << " throw \"Syntax error - mismatched token\";" << endl; } else outFile << " results.push_back(" << i->second[j][k] << "_rule());" << endl; } // Copy rule code to output ~~string token = i->second[j][0];~~ outFile << nonterminalCode[i->first][j]; ~~if (terminals.find(token) != terminals.end())~~ ~~outFile << " if (nextToken == \"" << i->second[j][0] << "\") {" << endl;~~ ~~else {~~ ~~outFile << " if (";~~ ~~bool first = true;~~ ~~for (auto k = nonterminalFirst[token].begin(); k != nonterminalFirst[token].end(); ++k, first = false) {~~ ~~if (!first)~~ ~~outFile << " \|\| ";~~ ~~outFile << "nextToken == \"" << (k) << "\"";~~ } ~~outFile << ") {" << endl;~~ } ~~for (int k = 0; k < i->second[j].size(); ++k) {~~ ~~if (terminals.find(i->second[j][k]) != terminals.end()) {~~ ~~outFile << " if(same(\"" << i->second[j][k] << "\"))" << endl;~~ ~~outFile << " results.push_back(prevTokenValue);" << endl;~~ ~~outFile << " else" << endl;~~ ~~outFile << " throw \"Syntax error - mismatched token\";" << endl;~~ ~~} else~~ ~~outFile << " results.push_back(" << i->second[j][k] << "_rule());" << endl;~~ } outFile << " }" << endl << endl; ~~// Copy rule code to output~~ } ~~outFile << nonterminalCode[i->first][j];~~ if (epsilon == -1) ~~outFile << " }" << endl << endl;~~ outFile << " throw \"Syntax error - unmatched token\";" << endl; } else outFile << nonterminalCode[i->first][epsilon]; outFile << "}" << endl << endl; ~~if (epsilon == -1)~~ } ~~outFile << " throw \"Syntax error - unmatched token\";" << endl;~~ ~~else~~ ~~outFile << nonterminalCode[i->first][epsilon];~~ ~~outFile << "}" << endl << endl;~~ } ~~// Generate the main function~~ ~~outFile << "int main(int argc, char argv) {" << endl;~~ ~~outFile << " if(argc < 2) {" << endl;~~ ~~outFile << " cout << \"Usage: <input file>\" << endl;" << endl;~~ ~~outFile << " return 1;" << endl;~~ ~~outFile << " }" << endl << endl;~~ ~~outFile << " ifstream file(argv[1]);" << endl;~~ ~~outFile << " string line;" << endl;~~ ~~outFile << " input = \"\";" << endl << endl;~~ ~~outFile << " while(true) {" << endl;~~ ~~outFile << " getline(file, line);" << endl;~~ ~~outFile << " if(!file) break;" << endl;~~ ~~outFile << " input += line + \"\\n\";" << endl;~~ ~~outFile << " }" << endl << endl;~~ ~~outFile << " advanceToken();" << endl << endl;~~ // Generate the main function ~~outFile << " start_rule();" << endl;~~ outFile << "}int main(int argc, char argv) {" << endl; << " if(argc < 2) {" << endl << " cout << \"Usage: <input file>\" << endl;" << endl << " return 1;" << endl << " }" << endl << endl << " ifstream file(argv[1]);" << endl << " string line;" << endl << " input = \"\";" << endl << endl << " while(true) {" << endl << " getline(file, line);" << endl << " if(!file) break;" << endl << " input += line + \"\\n\";" << endl << " }" << endl << endl << " advanceToken();" << endl << endl << " start_rule();" << endl << "}" << endl; } </syntaxhighlight> ~~</lang>~~ Example grammar: Line 362 ⟶ 367: I've therefore applied this parser to the expression designated by the task, which only involves binary expressions and identifiers, and written a separate routine to convert the resulting AST into three-address code. <~~lang~~syntaxhighlight lang="go">package main import ( Line 466 ⟶ 471: last = i } }</~~lang~~syntaxhighlight> {{out}} Line 535 ⟶ 540: Implementation: <~~lang~~syntaxhighlight Jlang="j">cocurrent 'base' inlocale=: 4 :0 L:0 Line 575 ⟶ 580: emit=: smoutput </syntaxhighlight> ~~</lang>~~ Task example: <~~lang~~syntaxhighlight Jlang="j"> parse '(one + two) three - four * five' z0001=:fourfive z0002=:one+two z0003=:z0002three z0004=:z0003-z0001 z0004</~~lang~~syntaxhighlight> See also https://github.com/jsoftware/general_misc/blob/master/trace.ijs for a model of the underlying parser being employed here. Line 590 ⟶ 595: =={{header\|Julia}}== The Julia compiler's own parser is a recursive descent parser, and can be used directly here. <~~lang~~syntaxhighlight lang="julia">const one, two, three, four, five, six, seven, eight, nine = collect(1:9) function testparser(s) Line 598 ⟶ 603: testparser("(one + two) * three - four * five") </~~lang~~syntaxhighlight>{{out}} <pre> $(Expr(:thunk, CodeInfo( Line 611 ⟶ 616: =={{header\|Perl}}== <~~lang~~syntaxhighlight lang="perl">#!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator Line 742 ⟶ 747: @stack >= 3 or die "not enough on stack"; splice @stack, -3, 3, $stack[-2]; }</~~lang~~syntaxhighlight> Running the above with no arguments uses a default grammar that will solve the specified example. It produces the following perl script (and then runs it). <~~lang~~syntaxhighlight lang="perl">#!/usr/bin/perl use strict; # https://rosettacode.org/wiki/Recursive_descent_parser_generator use warnings; # WARNING: this code is generated Line 852 ⟶ 857: Rule::startsymbol(); eval { end() }; # eval because it is optional /\G\s\z/ or die "ERROR: incomplete parse\n";</~~lang~~syntaxhighlight> The above script can also be run stand-alone and produces the following output. {{out}} Line 868 ⟶ 873: </pre> and giving this file as "arithexpr.y" <~~lang~~syntaxhighlight lang="perl"># test arith expr expr = term { '+' term <fadd> \| '-' term <fsub> } . Line 882 ⟶ 887: sub fdiv { splice @stack, -3, 3, $stack[-3] / $stack[-1] } sub begin { print "expr = $_\n" } sub end { print "answer = @{[pop @stack]}\n" }</~~lang~~syntaxhighlight> will produce the following {{out}} Line 896 ⟶ 901: it would actually meet the spec if it began with <code>constant src = """</code> and ended with <code>""" puts(1,src)</code>... (and like several/most other entries on this page, this does not use a formal grammer) <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\RecursiveDescentParser.exw Line 1,013 ⟶ 1,018: <span style="color: #000000;">parse</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"(one + two) three - four * five"</span><span style="color: #0000FF;">)</span> <span style="color: #0000FF;">{}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">wait_key</span><span style="color: #0000FF;">()</span> <!--</~~lang~~syntaxhighlight>--> {{out}} <pre> Line 1,066 ⟶ 1,071: {{libheader\|Wren-str}} {{libheader\|Wren-fmt}} <~~lang~~syntaxhighlight ~~ecmascript~~lang="wren">import "./str" for Char import "./fmt" for Fmt Line 1,184 ⟶ 1,189: } RDP.parse("(one + two) * three - four * five")</~~lang~~syntaxhighlight> {{out}}