Compiler/syntax analyzer: Difference between revisions

Add Zig language implementation
(Add Zig language implementation)
Line 6,350:
;
</pre>
 
=={{header|Zig}}==
<lang zig>
const std = @import("std");
 
pub const NodeValue = union(enum) {
integer: i32,
string: []const u8,
 
fn fromToken(token: Token) ?NodeValue {
if (token.value) |value| {
switch (value) {
.integer => |int| return NodeValue{ .integer = int },
.string => |str| return NodeValue{ .string = str },
}
} else {
return null;
}
}
};
 
pub const Tree = struct {
left: ?*Tree,
right: ?*Tree,
typ: NodeType,
value: ?NodeValue = null,
};
 
pub const ParserError = error{
OutOfMemory,
ExpectedNotFound,
} || std.fmt.ParseIntError;
 
pub const Parser = struct {
token_it: LexerOutputTokenizer,
curr: Token,
allocator: *std.mem.Allocator,
 
const Self = @This();
 
pub fn init(allocator: *std.mem.Allocator, str: []const u8) Self {
return Self{
.token_it = LexerOutputTokenizer.init(str),
.curr = Token{ .line = 0, .col = 0, .typ = .unknown },
.allocator = allocator,
};
}
 
fn makeNode(self: *Self, typ: NodeType, left: ?*Tree, right: ?*Tree) !*Tree {
const result = try self.allocator.create(Tree);
result.* = Tree{ .left = left, .right = right, .typ = typ };
return result;
}
 
fn makeLeaf(self: *Self, typ: NodeType, value: ?NodeValue) !*Tree {
const result = try self.allocator.create(Tree);
result.* = Tree{ .left = null, .right = null, .typ = typ, .value = value };
return result;
}
 
pub fn parse(self: *Self) ParserError!?*Tree {
try self.next();
var result: ?*Tree = null;
while (true) {
const stmt = try self.parseStmt();
result = try self.makeNode(.sequence, result, stmt);
if (self.curr.typ == .eof) break;
}
return result;
}
 
/// Classic "Recursive descent" statement parser.
fn parseStmt(self: *Self) ParserError!?*Tree {
var result: ?*Tree = null;
switch (self.curr.typ) {
.kw_print => {
try self.next();
try self.expect(.left_paren);
// Parse each print's argument as an expression delimited by commas until we reach
// a closing parens.
while (true) {
var expr: ?*Tree = null;
if (self.curr.typ == .string) {
expr = try self.makeNode(
.prts,
try self.makeLeaf(.string, NodeValue.fromToken(self.curr)),
null,
);
try self.next();
} else {
expr = try self.makeNode(.prti, try self.parseExpr(0), null);
}
result = try self.makeNode(.sequence, result, expr);
if (self.curr.typ != .comma) break;
try self.next();
}
try self.expect(.right_paren);
try self.expect(.semicolon);
},
.kw_putc => {
try self.next();
result = try self.makeNode(.prtc, try self.parseParenExpr(), null);
try self.expect(.semicolon);
},
.kw_while => {
try self.next();
const expr = try self.parseParenExpr();
result = try self.makeNode(.kw_while, expr, try self.parseStmt());
},
.kw_if => {
try self.next();
const expr = try self.parseParenExpr();
const if_stmt = try self.parseStmt();
const else_stmt = blk: {
if (self.curr.typ == .kw_else) {
try self.next();
break :blk try self.parseStmt();
} else {
break :blk null;
}
};
const stmt_node = try self.makeNode(.kw_if, if_stmt, else_stmt);
// If-statement uses `.kw_if` node for both first node with `expr` on the left
// and statements on the right and also `.kw_if` node which goes to the right
// and contains both if-branch and else-branch.
result = try self.makeNode(.kw_if, expr, stmt_node);
},
.left_brace => {
try self.next();
while (self.curr.typ != .right_brace and self.curr.typ != .eof) {
result = try self.makeNode(.sequence, result, try self.parseStmt());
}
try self.expect(.right_brace);
},
.identifier => {
const identifer = try self.makeLeaf(.identifier, NodeValue.fromToken(self.curr));
try self.next();
try self.expect(.assign);
const expr = try self.parseExpr(0);
result = try self.makeNode(.assign, identifer, expr);
try self.expect(.semicolon);
},
.semicolon => try self.next(),
else => {
std.debug.print("\nSTMT: UNKNOWN {}\n", .{self.curr});
std.os.exit(1);
},
}
return result;
}
 
/// "Precedence climbing" expression parser.
fn parseExpr(self: *Self, precedence: i8) ParserError!?*Tree {
var result: ?*Tree = null;
switch (self.curr.typ) {
.left_paren => {
result = try self.parseParenExpr();
},
.subtract => {
try self.next();
const metadata = NodeMetadata.find(.negate);
const expr = try self.parseExpr(metadata.precedence);
result = try self.makeNode(.negate, expr, null);
},
.not => {
try self.next();
const metadata = NodeMetadata.find(.not);
const expr = try self.parseExpr(metadata.precedence);
result = try self.makeNode(.not, expr, null);
},
.add => {
try self.next();
result = try self.parseExpr(precedence);
},
.integer, .identifier => {
const node_type = NodeMetadata.find(self.curr.typ).node_type;
result = try self.makeLeaf(node_type, NodeValue.fromToken(self.curr));
try self.next();
},
else => {
std.debug.print("\nEXPR: UNKNOWN {}\n", .{self.curr});
std.os.exit(1);
},
}
 
var curr_metadata = NodeMetadata.find(self.curr.typ);
while (curr_metadata.binary and curr_metadata.precedence >= precedence) {
const new_precedence =
if (curr_metadata.right_associative)
curr_metadata.precedence
else
curr_metadata.precedence + 1;
try self.next();
const sub_expr = try self.parseExpr(new_precedence);
result = try self.makeNode(curr_metadata.node_type, result, sub_expr);
curr_metadata = NodeMetadata.find(self.curr.typ);
}
return result;
}
 
fn parseParenExpr(self: *Self) ParserError!?*Tree {
try self.expect(.left_paren);
const result = try self.parseExpr(0);
try self.expect(.right_paren);
return result;
}
 
fn next(self: *Self) ParserError!void {
const token = try self.token_it.next();
if (token) |tok| {
self.curr = tok;
} else {
self.curr = Token{ .line = 0, .col = 0, .typ = .unknown };
}
}
 
fn expect(self: *Self, token_type: TokenType) ParserError!void {
if (self.curr.typ != token_type) {
const expected_str = NodeMetadata.find(token_type).token_str;
const found_str = NodeMetadata.find(self.curr.typ).token_str;
std.debug.print(
"({d}, {d}) error: Expecting '{s}', found '{s}'\n",
.{ self.curr.line, self.curr.col, expected_str, found_str },
);
return ParserError.ExpectedNotFound;
}
try self.next();
}
};
 
pub fn parse(allocator: *std.mem.Allocator, str: []const u8) !?*Tree {
var parser = Parser.init(allocator, str);
return try parser.parse();
}
 
pub fn main() !void {
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
defer arena.deinit();
var allocator = &arena.allocator;
 
var arg_it = std.process.args();
_ = try arg_it.next(allocator) orelse unreachable; // program name
const file_name = arg_it.next(allocator);
// We accept both files and standard input.
var file_handle = blk: {
if (file_name) |file_name_delimited| {
const fname: []const u8 = try file_name_delimited;
break :blk try std.fs.cwd().openFile(fname, .{});
} else {
break :blk std.io.getStdIn();
}
};
defer file_handle.close();
const input_content = try file_handle.readToEndAlloc(allocator, std.math.maxInt(usize));
 
const result: ?*Tree = try parse(allocator, input_content);
const result_str = try astToFlattenedString(allocator, result);
_ = try std.io.getStdOut().write(result_str);
}
 
const NodeMetadata = struct {
token_type: TokenType,
right_associative: bool,
binary: bool,
unary: bool,
precedence: i8,
node_type: NodeType,
token_str: []const u8,
 
const self = [_]NodeMetadata{
.{ .token_type = .multiply, .right_associative = false, .binary = true, .unary = false, .precedence = 13, .node_type = .multiply, .token_str = "*" },
.{ .token_type = .divide, .right_associative = false, .binary = true, .unary = false, .precedence = 13, .node_type = .divide, .token_str = "/" },
.{ .token_type = .mod, .right_associative = false, .binary = true, .unary = false, .precedence = 13, .node_type = .mod, .token_str = "%" },
.{ .token_type = .add, .right_associative = false, .binary = true, .unary = false, .precedence = 12, .node_type = .add, .token_str = "+" },
.{ .token_type = .subtract, .right_associative = false, .binary = true, .unary = false, .precedence = 12, .node_type = .subtract, .token_str = "-" },
.{ .token_type = .negate, .right_associative = false, .binary = false, .unary = true, .precedence = 14, .node_type = .negate, .token_str = "-" },
.{ .token_type = .less, .right_associative = false, .binary = true, .unary = false, .precedence = 10, .node_type = .less, .token_str = "<" },
.{ .token_type = .less_equal, .right_associative = false, .binary = true, .unary = false, .precedence = 10, .node_type = .less_equal, .token_str = "<=" },
.{ .token_type = .greater, .right_associative = false, .binary = true, .unary = false, .precedence = 10, .node_type = .greater, .token_str = ">" },
.{ .token_type = .greater_equal, .right_associative = false, .binary = true, .unary = false, .precedence = 10, .node_type = .greater_equal, .token_str = ">=" },
.{ .token_type = .equal, .right_associative = false, .binary = true, .unary = false, .precedence = 9, .node_type = .equal, .token_str = "=" },
.{ .token_type = .not_equal, .right_associative = false, .binary = true, .unary = false, .precedence = 9, .node_type = .not_equal, .token_str = "!=" },
.{ .token_type = .not, .right_associative = false, .binary = false, .unary = true, .precedence = 14, .node_type = .not, .token_str = "!" },
.{ .token_type = .assign, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .assign, .token_str = "=" },
.{ .token_type = .bool_and, .right_associative = false, .binary = true, .unary = false, .precedence = 5, .node_type = .bool_and, .token_str = "&&" },
.{ .token_type = .bool_or, .right_associative = false, .binary = true, .unary = false, .precedence = 4, .node_type = .bool_or, .token_str = "||" },
.{ .token_type = .left_paren, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "(" },
.{ .token_type = .right_paren, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = ")" },
.{ .token_type = .left_brace, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "{" },
.{ .token_type = .right_brace, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "}" },
.{ .token_type = .semicolon, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = ";" },
.{ .token_type = .comma, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "," },
.{ .token_type = .kw_if, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .kw_if, .token_str = "if" },
.{ .token_type = .kw_else, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "else" },
.{ .token_type = .kw_while, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .kw_while, .token_str = "while" },
.{ .token_type = .kw_print, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "print" },
.{ .token_type = .kw_putc, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "putc" },
.{ .token_type = .identifier, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .identifier, .token_str = "Identifier" },
.{ .token_type = .integer, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .integer, .token_str = "Integer literal" },
.{ .token_type = .string, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .string, .token_str = "String literal" },
.{ .token_type = .eof, .right_associative = false, .binary = false, .unary = false, .precedence = -1, .node_type = .unknown, .token_str = "End of line" },
};
 
pub fn find(token_type: TokenType) NodeMetadata {
for (self) |metadata| {
if (metadata.token_type == token_type) return metadata;
} else {
unreachable;
}
}
};
 
pub const NodeType = enum {
unknown,
identifier,
string,
integer,
sequence,
kw_if,
prtc,
prts,
prti,
kw_while,
assign,
negate,
not,
multiply,
divide,
mod,
add,
subtract,
less,
less_equal,
greater,
greater_equal,
equal,
not_equal,
bool_and,
bool_or,
 
pub fn toString(self: NodeType) []const u8 {
return switch (self) {
.unknown => "UNKNOWN",
.identifier => "Identifier",
.string => "String",
.integer => "Integer",
.sequence => "Sequence",
.kw_if => "If",
.prtc => "Prtc",
.prts => "Prts",
.prti => "Prti",
.kw_while => "While",
.assign => "Assign",
.negate => "Negate",
.not => "Not",
.multiply => "Multiply",
.divide => "Divide",
.mod => "Mod",
.add => "Add",
.subtract => "Subtract",
.less => "Less",
.less_equal => "LessEqual",
.greater => "Greater",
.greater_equal => "GreaterEqual",
.equal => "Equal",
.not_equal => "NotEqual",
.bool_and => "And",
.bool_or => "Or",
};
}
};
 
fn astToFlattenedString(allocator: *std.mem.Allocator, tree: ?*Tree) ![]const u8 {
var result = std.ArrayList(u8).init(allocator);
var writer = result.writer();
try treeToString(allocator, writer, tree);
return result.items;
}
 
pub const TokenType = enum {
unknown,
multiply,
divide,
mod,
add,
subtract,
negate,
less,
less_equal,
greater,
greater_equal,
equal,
not_equal,
not,
assign,
bool_and,
bool_or,
left_paren,
right_paren,
left_brace,
right_brace,
semicolon,
comma,
kw_if,
kw_else,
kw_while,
kw_print,
kw_putc,
identifier,
integer,
string,
eof,
 
const from_string_map = std.ComptimeStringMap(TokenType, .{
.{ "Op_multiply", .multiply },
.{ "Op_divide", .divide },
.{ "Op_mod", .mod },
.{ "Op_add", .add },
.{ "Op_subtract", .subtract },
.{ "Op_negate", .negate },
.{ "Op_less", .less },
.{ "Op_lessequal", .less_equal },
.{ "Op_greater", .greater },
.{ "Op_greaterequal", .greater_equal },
.{ "Op_equal", .equal },
.{ "Op_notequal", .not_equal },
.{ "Op_not", .not },
.{ "Op_assign", .assign },
.{ "Op_and", .bool_and },
.{ "Op_or", .bool_or },
.{ "LeftParen", .left_paren },
.{ "RightParen", .right_paren },
.{ "LeftBrace", .left_brace },
.{ "RightBrace", .right_brace },
.{ "Semicolon", .semicolon },
.{ "Comma", .comma },
.{ "Keyword_if", .kw_if },
.{ "Keyword_else", .kw_else },
.{ "Keyword_while", .kw_while },
.{ "Keyword_print", .kw_print },
.{ "Keyword_putc", .kw_putc },
.{ "Identifier", .identifier },
.{ "Integer", .integer },
.{ "String", .string },
.{ "End_of_input", .eof },
});
 
pub fn fromString(str: []const u8) TokenType {
return from_string_map.get(str).?;
}
};
 
pub const TokenValue = union(enum) {
integer: i32,
string: []const u8,
};
 
pub const Token = struct {
line: usize,
col: usize,
typ: TokenType = .unknown,
value: ?TokenValue = null,
};
 
const TreeToStringError = error{OutOfMemory};
 
fn treeToString(
allocator: *std.mem.Allocator,
writer: std.ArrayList(u8).Writer,
tree: ?*Tree,
) TreeToStringError!void {
if (tree) |t| {
_ = try writer.write(try std.fmt.allocPrint(
allocator,
"{s}",
.{t.typ.toString()},
));
switch (t.typ) {
.string, .identifier => _ = try writer.write(try std.fmt.allocPrint(
allocator,
" {s}\n",
.{t.value.?.string},
)),
.integer => _ = try writer.write(try std.fmt.allocPrint(
allocator,
" {d}\n",
.{t.value.?.integer},
)),
else => {
_ = try writer.write(try std.fmt.allocPrint(
allocator,
"\n",
.{},
));
try treeToString(allocator, writer, t.left);
try treeToString(allocator, writer, t.right);
},
}
} else {
_ = try writer.write(try std.fmt.allocPrint(
allocator,
";\n",
.{},
));
}
}
 
pub const LexerOutputTokenizer = struct {
it: std.mem.SplitIterator,
 
const Self = @This();
 
pub fn init(str: []const u8) Self {
return Self{ .it = std.mem.split(str, "\n") };
}
 
pub fn next(self: *Self) std.fmt.ParseIntError!?Token {
if (self.it.next()) |line| {
if (line.len == 0) return null;
var tokens_it = std.mem.tokenize(line, " ");
const lineNumber = try std.fmt.parseInt(usize, tokens_it.next().?, 10);
const colNumber = try std.fmt.parseInt(usize, tokens_it.next().?, 10);
const typ_text = tokens_it.next().?;
const typ = TokenType.fromString(typ_text);
const pre_value_index = tokens_it.index;
const value = tokens_it.next();
var token = Token{ .line = lineNumber, .col = colNumber, .typ = typ };
if (value) |val| {
const token_value = blk: {
switch (typ) {
.string, .identifier => {
tokens_it.index = pre_value_index;
break :blk TokenValue{ .string = tokens_it.rest() };
},
.integer => break :blk TokenValue{ .integer = try std.fmt.parseInt(i32, val, 10) },
else => unreachable,
}
};
token.value = token_value;
}
return token;
} else {
return null;
}
}
};
 
fn stringToTokenList(allocator: *std.mem.Allocator, str: []const u8) !std.ArrayList(Token) {
var result = std.ArrayList(Token).init(allocator);
var lexer_output_it = LexerOutputTokenizer.init(str);
while (try lexer_output_it.next()) |token| {
try result.append(token);
}
return result;
}
</lang>
Anonymous user