// SPDX-License-Identifier: MIT
// Copyright (c) 2015-2020 Zig Contributors
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ast = @import("ast.zig");
const Node = ast.Node;
const Tree = ast.Tree;
const AstError = ast.Error;
const TokenIndex = ast.TokenIndex;
const NodeIndex = ast.NodeIndex;
const lexer = @import("tokenizer.zig");
const Token = lexer.Token;
const Tokenizer = lexer.Tokenizer;

pub const Error = error{ParseError} || Allocator.Error;

/// Result should be freed with tree.deinit() when there are
/// no more references to any of the tokens or nodes.
pub fn parse(gpa: *Allocator, source: []const u8) Allocator.Error!*Tree {
    var token_ids = std.ArrayList(Token.Id).init(gpa);
    defer token_ids.deinit();
    var token_locs = std.ArrayList(Token.Loc).init(gpa);
    defer token_locs.deinit();

    // Reserve roughly enough memory up front to avoid repeated reallocations.
    const estimated_token_count = source.len / 8;
    try token_ids.ensureCapacity(estimated_token_count);
    try token_locs.ensureCapacity(estimated_token_count);

    // Use the lexer to get all the tokens from the source code.
    var tokenizer = Tokenizer.init(source);
    while (true) {
        const token = tokenizer.next();
        try token_ids.append(token.id);
        try token_locs.append(token.loc);
        if (token.id == .Eof) break;
    }

    // Create a Parser structure.
    var parser: Parser = .{
        .source = source, // Source code.
        .arena = std.heap.ArenaAllocator.init(gpa), // Arena allocator.
        .gpa = gpa, // General Purpose Allocator.
        .token_ids = token_ids.items, // IDs of the tokens.
        .token_locs = token_locs.items, // Locations of the tokens.
        .errors = .{}, // List of errors encountered while parsing.
        .tok_i = 0, // Index of the current token being analyzed.
        .indent = 0, // Indentation for debug output.
    };
    defer parser.errors.deinit(gpa);
    errdefer parser.arena.deinit();

    // Skip the leading line comments of the code.
    while (token_ids.items[parser.tok_i] == .LineComment) parser.tok_i += 1;

    // Perform parsing, called once.
    const root_node = try parser.parseRoot();

    // Create the parsing Tree from the nodes parsed above.
    // toOwnedSlice transfers ownership of each list's memory to the tree:
    // the lists end up empty and the slices are owned (and freed) by the
    // tree from now on.
    const tree = try parser.arena.allocator.create(Tree);
    tree.* = .{
        .gpa = gpa,
        .source = source,
        .token_ids = token_ids.toOwnedSlice(),
        .token_locs = token_locs.toOwnedSlice(),
        .errors = parser.errors.toOwnedSlice(gpa),
        .root_node = root_node,
        .arena = parser.arena.state,
    };
    return tree;
}
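// Usage sketch (not part of the original test suite): assumes
// std.testing.allocator coerces to *Allocator in this Zig version and that
// ast.Tree provides the deinit() documented above.
test "parse: usage sketch" {
    // An empty source tokenizes to a single .Eof token and should yield a
    // root node with zero declarations and no recorded errors.
    const tree = try parse(std.testing.allocator, "");
    defer tree.deinit();
    assert(tree.errors.len == 0);
}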
const Assignment = struct {
    id_attribute: TokenIndex,
    id_value: TokenIndex,
};

/// Represents in-progress parsing, will be converted to an ast.Tree after completion.
const Parser = struct {
    arena: std.heap.ArenaAllocator,
    gpa: *Allocator,
    source: []const u8,
    token_ids: []const Token.Id,
    token_locs: []const Token.Loc,
    tok_i: TokenIndex,
    errors: std.ArrayListUnmanaged(AstError),
    indent: u16, // Indentation for debug output.

    /// Root <- skip ContainerMembers eof
    fn parseRoot(p: *Parser) Allocator.Error!*Node.Root {
        // Parse declarations.
        const decls = try parseContainerMembers(p, true);
        defer p.gpa.free(decls);

        // parseContainerMembers skips as many invalid tokens as it can,
        // so the only token left here can be the EOF.
        // eatToken consumes the current token and returns its index, or
        // null if the current token id isn't the one given as parameter.
        const eof_token = p.eatToken(.Eof).?;

        // The number of declarations becomes an ast.NodeIndex integer (usize).
        const decls_len = @intCast(NodeIndex, decls.len);
        const node = try Node.Root.create(&p.arena.allocator, decls_len, eof_token);

        // std.mem.copy: T, dest, src
        std.mem.copy(*Node, node.decls(), decls);

        return node; // Root node.
    }

    /// ContainerMembers
    ///     <- TestDecl ContainerMembers
    ///      / TopLevelComptime ContainerMembers
    ///      / KEYWORD_pub? TopLevelDecl ContainerMembers
    ///      / ContainerField COMMA ContainerMembers
    ///      / ContainerField
    ///      /
    // parseContainerMembers: actual parsing code starts here.
    fn parseContainerMembers(p: *Parser, top_level: bool) ![]*Node {
        std.debug.print("parseContainerMembers: is top? {}\n", .{top_level});

        // list: all the nodes of the ast.
        var list = std.ArrayList(*Node).init(p.gpa);
        defer list.deinit();

        // field_state: a tagged union (union of enum).
        // Tagged unions can be used in switch expressions and coerce to
        // their value.
        // Example: switch (some_tagged_union) { SomeType => |value| print("{}\n", .{value}), }
        // If a '*' is placed before the capture name, it is a pointer to
        // the value inside the tagged union.
        // Example: switch (some_tagged_union) { SomeType => |*value| value.* += 1, }
        // @TagType can be used to get the underlying enum type.
        var field_state: union(enum) {
            /// No fields have been seen.
            none,
            /// Currently parsing fields.
            seen,
            /// Saw fields and then a declaration after them.
            /// Payload is the first token of the previous declaration.
            end: TokenIndex, // TokenIndex is defined as usize in std.zig.ast.
            /// There was a declaration between fields, don't report more errors.
            err,
        } = .none;

        // Grammar of the language being parsed:
        //
        // Start => Requires Thing => End
        // Requires => require StringLiteral Requires | nil
        // Thing => Definition Thing | ClassHdr Thing | nil
        // Definition => define Identifier ClassHdrSimple
        // ClassHdr => ClassHdrFull LBrace ClassCon RBrace |
        //             ClassHdrSimple LBrace ClassCon RBrace
        // ClassHdrSimple: a class without an identifier,
        //                 for a definition or when it won't be referenced later.
        // ClassHdrSimple => Identifier
        // ClassHdrFull: a class with an identifier.
        // ClassHdrFull => Identifier LParen Identifier RParen
        // ClassCon => ClassHdr | statement ClassCon | nil
        //
        // The property keyword adds a new property to the definition;
        // without it, only default property values can be changed.
        // statement => Keyword_property Identifier Identifier Colon value
        // value => StringLiteral | Keyword_null | IntegerLiteral | FloatLiteral
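        // A hypothetical input following this grammar could look like
        // (names and literals invented for illustration; the grammar above
        // is the reference, not this sketch):
        //
        //   require "colors"
        //
        //   define TitleBar Rectangle
        //
        //   Window (main) {
        //     property String title: "hello"
        //     TitleBar { title: null }
        //   }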
        // True start of parsing.
        while (true) {
            const token = p.nextToken();
            switch (p.token_ids[token]) {
                .Keyword_require => {
                    p.putBackToken(token);
                    // TODO: read the required file and parse its content.
                    p.parseRequire();
                },
                .Keyword_define => {
                    p.putBackToken(token);
                    // TODO: definitions.
                    p.parseDefine() catch |err| switch (err) {
                        // Propagate memory errors.
                        error.OutOfMemory => return error.OutOfMemory,
                        error.ParseError => continue,
                    };
                },
                .Identifier => {
                    // An identifier at the top level means a class.
                    p.putBackToken(token);
                    p.parseClass() catch |err| switch (err) {
                        // Propagate memory errors.
                        error.OutOfMemory => return error.OutOfMemory,
                        error.ParseError => {
                            p.say("we caught a ParseError on token: {}\n", .{p.giveTokenContent(p.tok_i)});
                            continue;
                        },
                    };
                },
                .Eof => {
                    p.putBackToken(token);
                    break;
                },
                else => {
                    std.debug.print("token: {}\n", .{p.token_ids[token]});
                    continue;
                },
            }
        }

        return list.toOwnedSlice();
    }

    // say: print a debug message, indented by the current parsing depth.
    fn say(p: *Parser, comptime fmt: []const u8, args: anytype) void {
        var i = p.indent;
        while (i > 0) {
            std.debug.print("\t", .{});
            i -= 1;
        }
        std.debug.print(fmt, args);
    }

    // TODO: require "file"
    // The file should be read and parsed, and a loop detection should take place.
    fn parseRequire(p: *Parser) void {
        const require_token = p.eatToken(.Keyword_require);
        const file_to_read = p.eatToken(.StringLiteral);
        std.debug.print("TODO: file required: {}\n", .{file_to_read});
    }

    // TODO: class definition (inheritance).
    // fn parseDefine(p: *Parser) !?Definition {
    fn parseDefine(p: *Parser) !void {
        const define_token = p.eatToken(.Keyword_define);
        const new_class_name = p.eatToken(.Identifier);
        const parent_class_name = p.eatToken(.Identifier);
        std.debug.print("TODO: class inheritance: {} < {}\n", .{ new_class_name, parent_class_name });
        try p.parseClass();
        // TODO: get the old class definition,
        //       create a new definition,
        //       then add old and new properties and children to it.
    }

    // parseClass: read a class header (simple or full), then its content.
    fn parseClass(p: *Parser) !void {
        if (p.indent > 0) {
            p.say("INNER parseClass\n", .{});
        } else {
            p.say("TOP parseClass\n", .{});
        }
        p.indent += 1;
        defer p.indent -= 1;

        const class_name = p.eatToken(.Identifier);
        if (class_name == null) return;

        // Either a simple or a full header.
        const identifier: ?[]const u8 = try p.parseFullClassHeader();
        p.say("TODO: read class: {}", .{p.giveTokenContent(class_name.?)});
        if (identifier) |id| {
            std.debug.print(", id: {}\n", .{id});
        } else {
            std.debug.print("\n", .{});
        }

        // Starting the class body.
        // const lbrace = p.nextToken();
        const ignored = try p.expectToken(.LBrace);

        while (true) {
            // TODO: parsing class content.
            const token = p.nextToken();
            p.say("(top class reading) reading token: {}\n", .{p.giveTokenContent(token)});
            switch (p.token_ids[token]) {
                .Identifier => {
                    const following = p.nextToken();
                    switch (p.token_ids[following]) {
                        // Class header (with or without id).
                        .LParen, .LBrace => {
                            p.putBackToken(following);
                            p.putBackToken(token);
                            p.say("reading a new class\n", .{});
                            // WARNING: recursion, this may cause errors.
                            const res = p.parseClass();
                            continue;
                        },
                        .Colon => {
                            p.putBackToken(following);
                            p.putBackToken(token);
                            const assignment = try p.parseAssignment();
                            p.say("redefining an attribute {} => {}\n", .{
                                p.giveTokenContent(assignment.id_attribute),
                                p.giveTokenContent(assignment.id_value),
                            });
                        },
                        else => {
                            // Wasn't expected: couldn't understand what was
                            // in this class.
                            p.say("did not understand {} then {}\n", .{
                                p.giveTokenContent(token),
                                p.giveTokenContent(following),
                            });
                            p.putBackToken(following);
                            p.putBackToken(token);
                            break;
                        },
                    }
                },
                .Keyword_property => {
                    p.say("Reading a property\n", .{});
                    p.putBackToken(token);
                    try p.parseProperty();
                    p.say("Done reading a property\n", .{});
                    continue;
                },
                .LBrace => {
                    p.say("Reading a LBrace\n", .{});
                },
                .RBrace => {
                    p.say("Reading a RBrace\n", .{});
                    p.putBackToken(token);
                    break;
                },
                else => {
                    p.putBackToken(token);
                    p.say("reading {} in a class, backing up\n", .{p.giveTokenContent(token)});
                    break;
                },
            }
        }

        // A class definition or instance ends with a RBrace.
        const end_of_class = try p.expectToken(.RBrace);
    }
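    // Recursion sketch with a hypothetical input: in "A { B { } }", the
    // inner header "B {" re-enters parseClass through the .LParen/.LBrace
    // case above. Assumes the tokenizer lexes these as Identifier, LBrace
    // and RBrace tokens.
    test "parseClass: nested class sketch" {
        const tree = try parse(std.testing.allocator, "A { B { } }");
        defer tree.deinit();
        assert(tree.errors.len == 0);
    }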
    fn parseProperty(p: *Parser) !void {
        const property = try p.expectToken(.Keyword_property);
        const class_name = try p.expectToken(.Identifier);
        const attribute_name = try p.expectToken(.Identifier);
        const colon = try p.expectToken(.Colon);
        const id_value = p.nextToken();
        switch (p.token_ids[id_value]) {
            .Keyword_null, .StringLiteral, .IntegerLiteral, .FloatLiteral => {
                p.say("property: {} {} = {}\n", .{
                    p.giveTokenContent(class_name),
                    p.giveTokenContent(attribute_name),
                    p.giveTokenContent(id_value),
                });
                return;
                // return Assignment{ .id_attribute = ia, .id_value = id_value };
            },
            else => return error.ParseError,
        }
    }

    // statement => Keyword_property Identifier Identifier Colon value
    // value => StringLiteral | Keyword_null | IntegerLiteral | FloatLiteral
    fn parseAssignment(p: *Parser) !Assignment {
        const id_attribute = p.eatToken(.Identifier);
        const ignored = try p.expectToken(.Colon);
        const id_value = p.nextToken();
        switch (p.token_ids[id_value]) {
            .Keyword_null, .StringLiteral, .IntegerLiteral, .FloatLiteral => {
                if (id_attribute) |ia| {
                    return Assignment{ .id_attribute = ia, .id_value = id_value };
                }
            },
            else => return error.ParseError,
        }
        return error.ParseError;
    }

    fn parseFullClassHeader(p: *Parser) !?[]const u8 {
        if (p.eatToken(.LParen) == null) return null;

        // Once we know this is a full header, an identifier then a right
        // parenthesis are expected, and must trigger a parsing error when
        // missing.
        const identifier = try p.expectToken(.Identifier);
        const rparen = try p.expectToken(.RParen);

        return p.giveTokenContent(identifier);
    }

    // fn parseStatement(p: *Parser) Error!?*Node {
    // }

    // eatToken: if the current token has the given id, consume it and
    // return its index, otherwise return null.
    fn eatToken(p: *Parser, id: Token.Id) ?TokenIndex {
        return if (p.token_ids[p.tok_i] == id) p.nextToken() else null;
    }

    // expectToken: either returns the token or an error.
    fn expectToken(p: *Parser, id: Token.Id) Error!TokenIndex {
        return (try p.expectTokenRecoverable(id)) orelse error.ParseError;
    }

    // expectTokenRecoverable: either returns the token or null if it isn't
    // the expected one. Also appends the error to p.errors.
    fn expectTokenRecoverable(p: *Parser, id: Token.Id) !?TokenIndex {
        const token = p.nextToken();
        if (p.token_ids[token] != id) {
            try p.errors.append(p.gpa, .{
                .ExpectedToken = .{ .token = token, .expected_id = id },
            });
            // Go back so that we can recover properly.
            p.putBackToken(token);
            return null;
        }
        return token;
    }

    // nextToken: return the TokenIndex of the current token, and advance
    // tok_i (past any line comments) inside the Parser structure.
    fn nextToken(p: *Parser) TokenIndex {
        const result = p.tok_i;
        p.tok_i += 1;
        assert(p.token_ids[result] != .LineComment);
        if (p.tok_i >= p.token_ids.len) return result;

        while (true) {
            if (p.token_ids[p.tok_i] != .LineComment) return result;
            p.tok_i += 1;
        }
    }

    // putBackToken: go back one token (line comments are skipped, since the
    // parser ignores them).
    // Example: a function searching for a declaration reads the token
    // "my-variable", then puts the token back and returns. The caller now
    // has the start of a declaration in its parsing structure.
    fn putBackToken(p: *Parser, putting_back: TokenIndex) void {
        while (p.tok_i > 0) {
            p.tok_i -= 1;
            if (p.token_ids[p.tok_i] == .LineComment) continue;
            assert(putting_back == p.tok_i);
            return;
        }
    }
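    // Cursor-mechanics sketch on a hand-built token stream (hypothetical,
    // bypasses the tokenizer): nextToken returns the current index then
    // advances, putBackToken rewinds by one token.
    test "nextToken/putBackToken: cursor sketch" {
        var p = Parser{
            .arena = std.heap.ArenaAllocator.init(std.testing.allocator),
            .gpa = std.testing.allocator,
            .source = "",
            .token_ids = &[_]Token.Id{ .Identifier, .Colon, .Eof },
            .token_locs = &[_]Token.Loc{},
            .tok_i = 0,
            .errors = .{},
            .indent = 0,
        };
        defer p.arena.deinit();

        const first = p.nextToken(); // Returns index 0, advances tok_i.
        assert(first == 0);
        assert(p.token_ids[first] == .Identifier);

        p.putBackToken(first); // Rewinds tok_i back to 0.
        assert(p.tok_i == 0);
    }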
    /// TODO Delete this function. I don't like the inversion of control.
    fn expectNode(
        p: *Parser,
        parseFn: NodeParseFn,
        /// if parsing fails
        err: AstError,
    ) Error!*Node {
        return (try p.expectNodeRecoverable(parseFn, err)) orelse return error.ParseError;
    }

    /// TODO Delete this function. I don't like the inversion of control.
    fn expectNodeRecoverable(
        p: *Parser,
        parseFn: NodeParseFn,
        /// if parsing fails
        err: AstError,
    ) !?*Node {
        return (try parseFn(p)) orelse {
            try p.errors.append(p.gpa, err);
            return null;
        };
    }

    // WARNING: a VALID token index is expected.
    fn giveTokenContent(p: *Parser, id: TokenIndex) []const u8 {
        const loc = p.token_locs[id];
        return p.source[loc.start..loc.end];
    }
};

fn ParseFn(comptime T: type) type {
    return fn (p: *Parser) Error!T;
}

// NodeParseFn was referenced by expectNode and expectNodeRecoverable but
// never defined; define it through ParseFn.
const NodeParseFn = ParseFn(?*Node);

test "std.zig.parser" {
    _ = @import("parser_test.zig");
}
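// Error-recovery sketch with a hypothetical input: "A ( ) { }" misses the
// identifier of a full class header, so expectTokenRecoverable should record
// an ExpectedToken error while parse() still returns a tree.
test "parse: error recovery sketch" {
    const tree = try parse(std.testing.allocator, "A ( ) { }");
    defer tree.deinit();
    assert(tree.errors.len >= 1);
}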