// guid/src/parse.zig
// SPDX-License-Identifier: MIT
// Copyright (c) 2015-2020 Zig Contributors
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ast = @import("ast.zig");
const Node = ast.Node;
const Tree = ast.Tree;
const AstError = ast.Error;
const TokenIndex = ast.TokenIndex;
const NodeIndex = ast.NodeIndex;
const lexer = @import("tokenizer.zig");
const Token = lexer.Token;
const Tokenizer = lexer.Tokenizer;
pub const Error = error{ParseError} || Allocator.Error;
/// Result should be freed with tree.deinit() when there are
/// no more references to any of the tokens or nodes.
pub fn parse(gpa: *Allocator, source: []const u8) Allocator.Error!*Tree {
var token_ids = std.ArrayList(Token.Id).init(gpa);
defer token_ids.deinit();
var token_locs = std.ArrayList(Token.Loc).init(gpa);
defer token_locs.deinit();
// Preallocate a rough estimate of the token count to reduce reallocations.
const estimated_token_count = source.len / 8;
try token_ids.ensureCapacity(estimated_token_count);
try token_locs.ensureCapacity(estimated_token_count);
// Use the lexer to get all the tokens from the source code.
var tokenizer = Tokenizer.init(source);
while (true) {
const token = tokenizer.next();
try token_ids.append(token.id);
try token_locs.append(token.loc);
if (token.id == .Eof) break;
}
// Create a Parser structure.
var parser: Parser = .{
.source = source, // Source code.
.arena = std.heap.ArenaAllocator.init(gpa), // Arena allocator.
.gpa = gpa, // General Purpose Allocator.
.token_ids = token_ids.items, // IDs of the tokens.
.token_locs = token_locs.items, // Location of the tokens.
.errors = .{}, // List of errors in our parsing.
.tok_i = 0, // Index of current token being analyzed.
.indent = 0, // Indentation for debug.
};
defer parser.errors.deinit(gpa);
errdefer parser.arena.deinit();
// Skip any leading line comments in the code.
while (token_ids.items[parser.tok_i] == .LineComment) parser.tok_i += 1;
// Perform parsing, called once.
const root_node = try parser.parseRoot();
// Create the parse Tree, handing it the nodes parsed above.
// toOwnedSlice transfers ownership of each buffer to the tree; the
// ArrayLists are left empty, so the deferred deinit calls free nothing.
const tree = try parser.arena.allocator.create(Tree);
tree.* = .{
.gpa = gpa,
.source = source,
.token_ids = token_ids.toOwnedSlice(),
.token_locs = token_locs.toOwnedSlice(),
.errors = parser.errors.toOwnedSlice(gpa),
.root_node = root_node,
.arena = parser.arena.state,
};
return tree;
}
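// A minimal usage sketch (the source text is hypothetical; assumes
// Tree.deinit frees everything, as the doc comment above describes).
test "parse: usage sketch" {
    const tree = try parse(std.testing.allocator, "Window { }");
    defer tree.deinit();
}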
/// A parsed `attribute: value` statement; both fields index into the token list.
const Assignment = struct {
id_attribute: TokenIndex,
id_value: TokenIndex,
};
/// Represents in-progress parsing, will be converted to an ast.Tree after completion.
const Parser = struct {
arena: std.heap.ArenaAllocator,
gpa: *Allocator,
source: []const u8,
token_ids: []const Token.Id,
token_locs: []const Token.Loc,
tok_i: TokenIndex,
errors: std.ArrayListUnmanaged(AstError),
indent: u16, // Indentation for debug.
/// Root <- skip ContainerMembers eof
fn parseRoot(p: *Parser) Allocator.Error!*Node.Root {
// Parse declarations.
const decls = try parseContainerMembers(p, true);
defer p.gpa.free(decls);
// parseContainerMembers will try to skip as many invalid
// tokens as it can, so this can only be the EOF.
// eatToken returns the next token index, or null if the current
// token's id doesn't match the parameter.
const eof_token = p.eatToken(.Eof).?;
// Cast the number of declarations to a NodeIndex (a usize).
const decls_len = @intCast(NodeIndex, decls.len);
const node = try Node.Root.create(&p.arena.allocator, decls_len, eof_token);
// std.mem.copy: T, dest, src
std.mem.copy(*Node, node.decls(), decls);
return node; // Root node.
}
/// ContainerMembers
/// <- TestDecl ContainerMembers
/// / TopLevelComptime ContainerMembers
/// / KEYWORD_pub? TopLevelDecl ContainerMembers
/// / ContainerField COMMA ContainerMembers
/// / ContainerField
/// /
// parseContainerMembers: actual parsing code starts here.
fn parseContainerMembers(p: *Parser, top_level: bool) ![]*Node {
std.debug.print("parseContainerMembers: is top? {}\n", .{top_level});
// list: top-level declaration nodes of the ast.
var list = std.ArrayList(*Node).init(p.gpa);
defer list.deinit();
// field_state: a tagged union.
// Tagged unions can be switched on, capturing the active payload:
//   switch (some_tagged_union) { .SomeTag => |value| std.debug.print("{}\n", .{value}), }
// A '*' before the capture name gives a pointer to the payload:
//   switch (some_tagged_union) { .SomeTag => |*value| value.* += 1, }
// @TagType can be used to get the tag's enum type.
var field_state: union(enum) {
/// no fields have been seen
none,
/// currently parsing fields
seen,
/// saw fields and then a declaration after them.
/// payload is first token of previous declaration.
end: TokenIndex, // TokenIndex is defined as usize in std.zig.ast.
/// there was a declaration between fields, don't report more errors
err,
} = .none;
// Start => Requires Thing => End
// Requires => require StringLiteral Requires | nil
// Thing => Definition Thing | ClassHdr Thing | nil
// Definition => define Identifier ClassHdrSimple
// ClassHdr => ClassHdrFull LBrace ClassCon RBrace |
// ClassHdrSimple LBrace ClassCon RBrace
// ClassHdrSimple: a class without identifier,
// for a definition or when this won't be used later.
// ClassHdrSimple => Identifier
// ClassHdrFull: a class with an identifier.
// ClassHdrFull => Identifier LParen Identifier RParen
// ClassCon => ClassHdr | statement ClassCon | nil
// Property keyword adds a new property to the definition,
// without it, default property values can be changed.
// statement => Keyword_property Identifier Identifier Colon value
// value => StringLiteral | Keyword_null | IntegerLiteral | FloatLiteral
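// A hypothetical input matching this grammar:
//   require "base.guid"
//   define TitledWindow Window { property Str title: "untitled" }
//   TitledWindow (main) {
//     title: "hello"
//     Button { }
//   }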
// True start of parsing.
while (true) {
const token = p.nextToken();
switch (p.token_ids[token]) {
.Keyword_require => {
p.putBackToken(token);
// TODO: read file required and parse its content.
p.parseRequire();
},
.Keyword_define => {
p.putBackToken(token);
// TODO: definitions.
p.parseDefine() catch |err| switch (err) {
// Propagate memory errors.
error.OutOfMemory => return error.OutOfMemory,
// On a parse error, skip this definition and keep going.
error.ParseError => continue,
};
},
.Identifier => {
// On the top level, an identifier means a class.
p.putBackToken(token);
p.parseClass() catch |err| switch (err) {
// Propagate memory errors.
error.OutOfMemory => return error.OutOfMemory,
error.ParseError => {
p.say("caught a ParseError on token: {}\n", .{p.giveTokenContent(p.tok_i)});
continue;
},
};
},
.Eof => {
p.putBackToken(token);
break;
},
else => {
// Unrecognized at the top level: report and skip this token.
std.debug.print("token: {}\n", .{p.token_ids[token]});
continue;
},
}
}
// NOTE: nothing is appended to `list` yet; building nodes is still TODO.
return list.toOwnedSlice();
}
fn say(p: *Parser, comptime fmt: []const u8, args: anytype) void {
var i = p.indent;
while (i > 0) : (i -= 1) {
std.debug.print("\t", .{});
}
std.debug.print(fmt, args);
}
// TODO: require "file"
// file should be read, parsed and a loop detection should take place.
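// e.g. a hypothetical source line: require "base.guid"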
fn parseRequire(p: *Parser) void {
_ = p.eatToken(.Keyword_require);
const file_to_read = p.eatToken(.StringLiteral);
std.debug.print("TODO: file required: {}\n", .{file_to_read});
}
// TODO: class definition (inheritance).
// fn parseDefine(p: *Parser) !?Definition {
fn parseDefine(p: *Parser) !void {
_ = p.eatToken(.Keyword_define);
const new_class_name = p.eatToken(.Identifier);
const parent_class_name = p.eatToken(.Identifier);
std.debug.print("TODO: class inheritance: {} < {}\n",
.{new_class_name, parent_class_name});
try p.parseClass();
// TODO: get the old class definition,
// create a new definition,
// then add old and new properties and children to it.
}
// TODO: class definition (inheritance).
fn parseClass(p: *Parser) !void {
if (p.indent > 0) {
p.say("INNER parseClass\n", .{});
}
else {
p.say("TOP parseClass\n", .{});
}
p.indent += 1;
defer { p.indent -= 1; }
const class_name = p.eatToken(.Identifier);
if (class_name == null) {
return;
}
// Either simple or full header.
const identifier: ?[]const u8 = try p.parseFullClassHeader();
p.say("TODO: read class: {}", .{p.giveTokenContent(class_name.?)});
if (identifier) |id| {
std.debug.print(", id: {}\n", .{id});
}
else {
std.debug.print("\n", .{});
}
// Starting the class.
_ = try p.expectToken(.LBrace);
while (true) {
// TODO: parsing class content.
// TODO: loop over this.
const token = p.nextToken();
p.say("(top class reading) reading token: {}\n", .{p.giveTokenContent(token)});
switch (p.token_ids[token]) {
.Identifier => {
const following = p.nextToken();
switch (p.token_ids[following]) {
// Class header (with or without id).
.LParen,
.LBrace => {
p.putBackToken(following);
p.putBackToken(token);
p.say("reading a new class\n", .{});
// WARNING: recursion; nested classes are parsed by the same routine.
// Propagate errors instead of silently discarding them.
try p.parseClass();
continue;
},
.Colon => {
p.putBackToken(following);
p.putBackToken(token);
const assignment = try p.parseAssignment();
p.say("redefining an attribute {} => {}\n"
, .{ p.giveTokenContent(assignment.id_attribute)
, p.giveTokenContent(assignment.id_value)});
},
else => {
// Wasn't expected: couldn't make sense of this class body.
p.say("did not understand {} then {}\n", .{
p.giveTokenContent(token),
p.giveTokenContent(following),
});
p.putBackToken(following);
p.putBackToken(token);
break;
},
}
},
.Keyword_property => {
p.say("Reading a property\n", .{});
p.putBackToken(token);
try p.parseProperty();
p.say("Done reading a property\n", .{});
continue;
},
.LBrace => {
p.say("Reading an LBrace\n", .{});
},
.RBrace => {
p.say("Reading an RBrace\n", .{});
p.putBackToken(token);
break;
},
else => {
p.putBackToken(token);
p.say("reading {} in a class, backing up\n"
, .{p.giveTokenContent(token)});
break;
}
}
}
// Class definition or instance ends with a RBrace.
_ = try p.expectToken(.RBrace);
}
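// parseProperty: statement => Keyword_property Identifier Identifier Colon value
// e.g. a hypothetical statement `property Str title: "untitled"` declares a
// new attribute `title` of class `Str` with the default value "untitled".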
fn parseProperty(p: *Parser) !void {
_ = try p.expectToken(.Keyword_property);
const class_name = try p.expectToken(.Identifier);
const attribute_name = try p.expectToken(.Identifier);
_ = try p.expectToken(.Colon);
const id_value = p.nextToken();
switch (p.token_ids[id_value]) {
.Keyword_null,
.StringLiteral,
.IntegerLiteral,
.FloatLiteral => {
p.say("property: {} {} = {}\n"
, .{p.giveTokenContent(class_name), p.giveTokenContent(attribute_name), p.giveTokenContent(id_value)});
return ;
// return Assignment{.id_attribute = ia, .id_value = id_value};
},
else => {
return error.ParseError;
}
}
}
// statement => Keyword_property Identifier Identifier Colon value
// value => StringLiteral | Keyword_null | IntegerLiteral | FloatLiteral
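// e.g. a hypothetical statement `title: "hello"` yields an Assignment whose
// id_attribute points at `title` and whose id_value points at `"hello"`.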
fn parseAssignment(p: *Parser) !Assignment {
const id_attribute = p.eatToken(.Identifier);
_ = try p.expectToken(.Colon);
const id_value = p.nextToken();
switch (p.token_ids[id_value]) {
.Keyword_null,
.StringLiteral,
.IntegerLiteral,
.FloatLiteral => {
if (id_attribute) |ia| {
return Assignment{.id_attribute = ia, .id_value = id_value};
}
},
else => {
return error.ParseError;
}
}
return error.ParseError;
}
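// parseFullClassHeader: reads the optional `(identifier)` part of a class
// header. e.g. for a hypothetical `Window (main) { ... }` (after the class
// name has been eaten), this returns "main"; for `Window { ... }`, null.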
fn parseFullClassHeader(p: *Parser) !?[]const u8 {
if (p.eatToken(.LParen) == null)
return null;
// Once we know this is a full header, an identifier then a right
// parenthesis are expected, and should trigger a parsing error if not there.
const identifier = try p.expectToken(.Identifier);
_ = try p.expectToken(.RParen);
return p.giveTokenContent(identifier);
}
// fn parseStatement(p: *Parser) Error!?*Node {
// }
fn eatToken(p: *Parser, id: Token.Id) ?TokenIndex {
return if (p.token_ids[p.tok_i] == id) p.nextToken() else null;
}
// expectToken: either returns the token or an error.
fn expectToken(p: *Parser, id: Token.Id) Error!TokenIndex {
return (try p.expectTokenRecoverable(id)) orelse error.ParseError;
}
// expectTokenRecoverable: either returns the token or null if not the one expected.
// Also, appends the error inside p.errors.
fn expectTokenRecoverable(p: *Parser, id: Token.Id) !?TokenIndex {
const token = p.nextToken();
if (p.token_ids[token] != id) {
try p.errors.append(p.gpa, .{
.ExpectedToken = .{ .token = token, .expected_id = id },
});
// go back so that we can recover properly
p.putBackToken(token);
return null;
}
return token;
}
// nextToken: returns the index of the current token and advances tok_i
// past it, skipping any line comments that follow.
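// e.g. with tokens [Identifier, LineComment, Colon], two successive calls
// return the Identifier and then the Colon; the comment is skipped over.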
fn nextToken(p: *Parser) TokenIndex {
const result = p.tok_i;
p.tok_i += 1;
assert(p.token_ids[result] != .LineComment);
if (p.tok_i >= p.token_ids.len) return result;
while (true) {
if (p.token_ids[p.tok_i] != .LineComment) return result;
p.tok_i += 1;
}
}
// putBackToken: step back one token (line comments are skipped).
// Example: a function looking for a declaration reads the token
// "my-variable"; it puts the token back and returns, so the caller
// now has the start of the declaration in its parsing structure.
fn putBackToken(p: *Parser, putting_back: TokenIndex) void {
while (p.tok_i > 0) {
p.tok_i -= 1;
if (p.token_ids[p.tok_i] == .LineComment) continue;
assert(putting_back == p.tok_i);
return;
}
}
/// TODO Delete this function. I don't like the inversion of control.
fn expectNode(
p: *Parser,
parseFn: NodeParseFn,
/// if parsing fails
err: AstError,
) Error!*Node {
return (try p.expectNodeRecoverable(parseFn, err)) orelse return error.ParseError;
}
/// TODO Delete this function. I don't like the inversion of control.
fn expectNodeRecoverable(
p: *Parser,
parseFn: NodeParseFn,
/// if parsing fails
err: AstError,
) !?*Node {
return (try parseFn(p)) orelse {
try p.errors.append(p.gpa, err);
return null;
};
}
// WARNING: expects a VALID token index.
fn giveTokenContent(p: *Parser, id: TokenIndex) []const u8 {
const loc = p.token_locs[id];
return p.source[loc.start..loc.end];
}
};
fn ParseFn(comptime T: type) type {
return fn (p: *Parser) Error!T;
}
// Parser function type used by expectNode/expectNodeRecoverable above.
const NodeParseFn = ParseFn(?*Node);
test "std.zig.parser" {
_ = @import("parser_test.zig");
}