// guid/src/parse.zig

// SPDX-License-Identifier: MIT
// Copyright (c) 2015-2020 Zig Contributors
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const ast = @import("ast.zig");
const Node = ast.Node;
const AstError = ast.Error;
const TokenIndex = ast.TokenIndex;
const NodeIndex = ast.NodeIndex;
const lexer = @import("tokenizer.zig");
const Token = lexer.Token;
const Tokenizer = lexer.Tokenizer;
const cs = @import("common-structures.zig");
pub const Error = error{ParseError} || Allocator.Error;
// Start => Requires Thing => End
// Requires => require StringLiteral Requires | nil
// Thing => Definition Thing | NodeHdr Thing | nil
// Definition => define Identifier NodeHdrSimple
// NodeHdr => NodeHdrFull LBrace NodeContent RBrace | NodeHdrSimple LBrace NodeContent RBrace
// NodeHdrSimple: a node without identifier, for a definition or when this won't be used later.
// NodeHdrSimple => Identifier
// NodeHdrFull: a node with an identifier.
// NodeHdrFull => Identifier LParen Identifier RParen
// NodeContent => NodeHdr NodeContent | statement NodeContent | nil
// Property keyword adds a new property to the definition,
// without it, default property values can be changed.
// statement => Keyword_property Identifier Identifier Colon value
// value => StringLiteral | Keyword_null | IntegerLiteral | FloatLiteral
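//
// Illustrative example of an input the grammar above accepts (a hypothetical
// sketch, not taken from the test suite):
//
//   require "base.gui"
//
//   define FancyButton Button {
//     property int padding: 4
//   }
//
//   Window (main-window) {
//     title: "hello"
//     Button { text: "ok" }
//   }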
/// Result should be freed with tree.deinit() when there are
/// no more references to any of the tokens or nodes.
pub fn parse(gpa: *Allocator, source: []const u8) Allocator.Error!cs.Tree {
var token_ids = std.ArrayList(Token.Id).init(gpa);
defer token_ids.deinit();
var token_locs = std.ArrayList(Token.Loc).init(gpa);
defer token_locs.deinit();
// Pre-allocate a rough estimate of the token count to avoid repeated reallocations.
const estimated_token_count = source.len / 8;
try token_ids.ensureCapacity(estimated_token_count);
try token_locs.ensureCapacity(estimated_token_count);
// Use the lexer to get all the tokens from the source code.
var tokenizer = Tokenizer.init(source);
while (true) {
const token = tokenizer.next();
try token_ids.append(token.id);
try token_locs.append(token.loc);
if (token.id == .Eof) break;
}
// Create a Parser structure.
var parser: Parser = .{
.source = source, // Source code.
.arena = std.heap.ArenaAllocator.init(gpa), // Arena allocator.
.gpa = gpa, // General Purpose Allocator.
.token_ids = token_ids.items, // IDs of the tokens.
.token_locs = token_locs.items, // Location of the tokens.
.errors = .{}, // List of errors in our parsing.
.tok_i = 0, // Index of current token being analyzed.
.indent = 0, // Indentation for debug.
};
defer parser.errors.deinit(gpa);
errdefer parser.arena.deinit();
// Skip any leading line comments.
while (token_ids.items[parser.tok_i] == .LineComment) parser.tok_i += 1;
// Parse the token stream and build the resulting Tree.
// The deferred deinits above only run after parseTree returns,
// so the token slices stay valid for the whole parse.
return parser.parseTree();
}
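// Usage sketch (hypothetical caller; assumes only what the doc comment above
// states about tree.deinit()):
//   var tree = try parse(allocator, source);
//   defer tree.deinit();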
const Assignment = struct {
id_attribute: TokenIndex,
id_value: TokenIndex,
};
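// NOTE: Assignment is currently only referenced by the commented-out
// parseAssignment helper further down.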
/// Represents in-progress parsing; will be converted to a cs.Tree after completion.
const Parser = struct {
arena: std.heap.ArenaAllocator,
gpa: *Allocator,
source: []const u8,
token_ids: []const Token.Id,
token_locs: []const Token.Loc,
tok_i: TokenIndex,
errors: std.ArrayListUnmanaged(AstError),
indent: u16, // Indentation for debug.
// parseTree: create a cs.Tree with all its content.
fn parseTree(p: *Parser) Allocator.Error!cs.Tree {
// Create a tree.
var tree = try cs.Tree.create(&p.arena.allocator);
// Parse the content.
try parseTopLevel(p, &tree);
// parseTopLevel skips as many invalid tokens as it can,
// so the only token left at this point must be .Eof.
_ = p.eatToken(.Eof).?;
return tree;
}
// parseTopLevel: actual parsing code starts here.
fn parseTopLevel(p: *Parser, tree: *cs.Tree) !void {
// list: all nodes in the ast (TODO: nothing is appended to it yet).
var list = std.ArrayList(*Node).init(p.gpa);
defer list.deinit();
// True start of parsing.
while (true) {
const token = p.nextToken();
switch (p.token_ids[token]) {
.Keyword_require => {
p.putBackToken(token);
// TODO: read file required and parse its content.
p.parseRequire();
},
.Keyword_define => {
p.putBackToken(token);
// TODO: definitions.
p.parseDefine() catch |err| switch (err) {
// Propagate memory errors; skip recoverable parsing problems.
error.OutOfMemory => return error.OutOfMemory,
error.InvalidCharacter, error.NoSpaceLeft, error.Overflow, error.ParseError => continue,
};
},
.Identifier => {
// Identifier => on top level this means a node.
p.putBackToken(token);
p.parseNode() catch |err| switch (err) {
// Propagate memory errors; skip recoverable parsing problems.
error.OutOfMemory => return error.OutOfMemory,
error.InvalidCharacter, error.NoSpaceLeft, error.Overflow => continue,
error.ParseError => {
p.say("caught a ParseError on token: {}\n"
, .{p.giveTokenContent(p.tok_i)});
continue;
},
};
},
.Eof => {
p.putBackToken(token);
break;
},
else => {
p.say("unexpected token: {}\n", .{p.token_ids[token]});
continue;
},
}
}
}
fn say(p: *Parser, comptime fmt: []const u8, args: anytype) void {
var i = p.indent;
while (i > 0) : (i -= 1) {
std.debug.print(" ", .{});
}
std.debug.print(fmt, args);
}
// TODO: require "file"
// file should be read, parsed and a loop detection should take place.
fn parseRequire(p: *Parser) void {
_ = p.eatToken(.Keyword_require);
const file_to_read = p.eatToken(.StringLiteral);
std.debug.print("TODO: file required: {}\n", .{file_to_read});
}
// TODO: node definition (inheritance).
// fn parseDefine(p: *Parser) !?Definition {
fn parseDefine(p: *Parser) !void {
const define_token = try p.expectToken(.Keyword_define);
const new_class_name = try p.expectToken(.Identifier);
const parent_node_name = try p.expectToken(.Identifier);
std.debug.print("TODO: node inheritance: {} < {}\n",
.{p.giveTokenContent(new_class_name), p.giveTokenContent(parent_node_name)});
p.putBackToken(parent_node_name);
try p.parseNode();
// TODO: get the old node definition,
// create a new definition,
// then add old and new properties and children to it.
}
// TODO: node definition (inheritance).
fn parseNode(p: *Parser) !void {
const node_name = p.eatToken(.Identifier);
if (node_name == null) {
return;
}
// Either simple or full header.
const identifier: ?[] const u8 = try p.parseFullNodeHeader();
if (p.indent > 0) {
p.say("Child {}", .{p.giveTokenContent(node_name.?)});
}
else {
p.say("Node {}", .{p.giveTokenContent(node_name.?)});
}
if (identifier) |id| {
std.debug.print(", id: {}\n", .{id});
}
else {
std.debug.print("\n", .{});
}
p.indent += 1;
defer { p.indent -= 1; }
// Starting the node.
_ = try p.expectToken(.LBrace);
while (true) {
const token = p.nextToken();
// p.say("within a node, token: {}\n", .{p.giveTokenContent(token)});
switch (p.token_ids[token]) {
.Keyword_null,
.StringLiteral,
.FloatLiteral,
.IntegerLiteral => {
p.say("reading (and ignoring) a literal: {}\n", .{p.giveTokenContent(token)});
continue;
},
.Identifier => {
const following = p.nextToken();
switch (p.token_ids[following]) {
// Node header (with or without id).
.LParen,
.LBrace => {
p.putBackToken(following);
p.putBackToken(token);
// p.say("reading a new node\n", .{});
// WARNING: RECURSION: this may cause errors.
const res = p.parseNode();
continue;
},
.Colon => {
const value: cs.PropertyValue = try p.parseValue();
p.say("attribute {:>20} => {}\n"
, .{ p.giveTokenContent(token)
, value});
},
.Period => {
p.putBackToken(following);
p.putBackToken(token);
// Hacking a bit, this is not a value but an identifier.
const attribute_loc: Token.Loc = try p.parseReference();
_ = try p.expectToken(.Colon);
const value: cs.PropertyValue = try p.parseValue();
p.say("attribute {:>20} => {}\n"
, .{ p.source[attribute_loc.start..attribute_loc.end]
, value});
},
else => {
// The token pair wasn't expected: couldn't understand this node content.
p.say("did not understand {} then {}\n"
, .{ p.giveTokenContent(token)
, p.giveTokenContent(following)});
p.putBackToken(following);
p.putBackToken(token);
break;
}
}
},
.Keyword_property => {
p.putBackToken(token);
try p.parseProperty();
continue;
},
.RBrace => {
p.putBackToken(token);
break;
},
else => {
p.putBackToken(token);
p.say("reading {} in a node, backing up\n"
, .{p.giveTokenContent(token)});
break;
}
}
}
// Node definition or instance ends with a RBrace.
_ = try p.expectToken(.RBrace);
}
// A property value can either be a simple value,
// or a reference (for property binding stuff).
// Simple values are copied and cast to their real type.
// For a reference, we keep a copy of the string representation.
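// For example (illustrative): `"hello"` becomes .string, `12` becomes .integer,
// `1.5` becomes .float, and `other-node.width` becomes a .reference holding
// the raw text of the dotted chain.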
fn parseValue(p: *Parser) !cs.PropertyValue {
while (true) {
const token = p.nextToken();
switch (p.token_ids[token]) {
// TODO: handle Keyword_null (the grammar lists it as a valid value).
.StringLiteral => {
return cs.PropertyValue{.string = p.giveTokenContent(token)};
},
.IntegerLiteral => {
return cs.PropertyValue{
.integer = try std.fmt.parseInt(u64, p.giveTokenContent(token), 10)
};
},
.FloatLiteral => {
// p.say("property: {} {} = {}\n"
// , .{p.giveTokenContent(node_name)
// , p.giveTokenContent(attribute_name)
// , p.giveTokenContent(id_value)});
return cs.PropertyValue{
.float = try std.fmt.parseFloat(f64, p.giveTokenContent(token))
};
},
.Identifier => {
// A reference: an identifier optionally followed by `.identifier` segments.
p.putBackToken(token);
const loc: Token.Loc = try p.parseReference();
return cs.PropertyValue{
.reference = p.source[loc.start..loc.end]
};
},
else => {
return error.ParseError;
}
}
}
}
// // statement => Keyword_property Identifier Identifier Colon value
// // value => StringLiteral | Keyword_null | IntegerLiteral | FloatLiteral
// fn parseAssignment(p: *Parser) !Assignment {
// const id_attribute = p.eatToken(.Identifier);
// const ignored = try p.expectToken(.Colon);
// const id_value = p.nextToken();
// switch (p.token_ids[id_value]) {
// .Keyword_null,
// .StringLiteral,
// .IntegerLiteral,
// .FloatLiteral => {
// if (id_attribute) |ia| {
// return Assignment{.id_attribute = ia, .id_value = id_value};
// }
// },
// else => {
// return error.ParseError;
// }
// }
// return error.ParseError;
// }
// Parse a dotted reference and return its location in the source.
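// For example (illustrative): for the input `player.position.x`, the returned
// Loc spans the whole dotted chain, so slicing the source with it yields
// "player.position.x".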
fn parseReference(p: *Parser) !Token.Loc {
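// NOTE: the textual representation built in `fbs` below is currently unused;
// only the source location is returned (see the commented-out return below).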
var representation: [100]u8 = undefined;
var fbs = std.io.fixedBufferStream(representation[0..]);
// First part of the reference has to be an Identifier.
const id = try p.expectToken(.Identifier);
// try std.fmt.format(fbs.writer(), "{}", .{p.giveTokenContent(id)});
var loc: Token.Loc = Token.Loc{
.start = p.token_locs[id].start,
.end = p.token_locs[id].end,
};
while (true) {
const token = p.nextToken();
switch (p.token_ids[token]) {
.Period => {
// Following token is expected to be an Identifier.
const following = try p.expectToken(.Identifier);
try std.fmt.format(fbs.writer(), ".{}", .{p.giveTokenContent(following)});
loc.end = p.token_locs[following].end;
},
else => {
p.putBackToken(token);
break;
},
}
}
// return representation[0..fbs.pos];
return loc;
}
fn parseProperty(p: *Parser) !void {
_ = try p.expectToken(.Keyword_property);
const type_name = try p.expectToken(.Identifier);
const attribute_name = try p.expectToken(.Identifier);
_ = try p.expectToken(.Colon);
const value: cs.PropertyValue = try p.parseValue();
p.say("- {} (type {}) = {}\n"
, .{ p.giveTokenContent(attribute_name)
, p.giveTokenContent(type_name)
, value});
}
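// parseFullNodeHeader: parse the optional `(identifier)` part of a node header.
// E.g. (illustrative) for `Button (ok) { ... }` this returns "ok"; for
// `Button { ... }` there is no LParen, so it returns null.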
fn parseFullNodeHeader(p: *Parser) !?[]const u8 {
if (p.eatToken(.LParen) == null)
return null;
// Once we know this is a full header, an identifier then a right
// parenthesis are expected, and should trigger a parsing error if not there.
const identifier = try p.expectToken(.Identifier);
_ = try p.expectToken(.RParen);
return p.giveTokenContent(identifier);
}
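// eatToken: if the current token has the given id, consume and return it;
// otherwise return null and stay in place.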
fn eatToken(p: *Parser, id: Token.Id) ?TokenIndex {
return if (p.token_ids[p.tok_i] == id) p.nextToken() else null;
}
// expectToken: either returns the token or an error.
fn expectToken(p: *Parser, id: Token.Id) Error!TokenIndex {
return (try p.expectTokenRecoverable(id)) orelse error.ParseError;
}
// expectTokenRecoverable: either returns the token or null if not the one expected.
// Also, appends the error inside p.errors.
fn expectTokenRecoverable(p: *Parser, id: Token.Id) !?TokenIndex {
const token = p.nextToken();
if (p.token_ids[token] != id) {
try p.errors.append(p.gpa, .{
.ExpectedToken = .{ .token = token, .expected_id = id },
});
// go back so that we can recover properly
p.putBackToken(token);
return null;
}
return token;
}
// nextToken: return the index of the current token and advance tok_i past it,
// skipping any line comments that follow.
fn nextToken(p: *Parser) TokenIndex {
const result = p.tok_i;
p.tok_i += 1;
assert(p.token_ids[result] != .LineComment);
if (p.tok_i >= p.token_ids.len) return result;
while (true) {
if (p.token_ids[p.tok_i] != .LineComment) return result;
p.tok_i += 1;
}
}
// putBackToken: step back one token (line comments are skipped over).
// Example: a function looking for a declaration reads the token "my-variable";
// it puts the token back and returns, so the caller starts its own parsing
// right at the beginning of the declaration.
fn putBackToken(p: *Parser, putting_back: TokenIndex) void {
while (p.tok_i > 0) {
p.tok_i -= 1;
if (p.token_ids[p.tok_i] == .LineComment) continue;
assert(putting_back == p.tok_i);
return;
}
}
/// TODO Delete this function. I don't like the inversion of control.
fn expectNode(
p: *Parser,
parseFn: NodeParseFn,
/// if parsing fails
err: AstError,
) Error!*Node {
return (try p.expectNodeRecoverable(parseFn, err)) orelse return error.ParseError;
}
/// TODO Delete this function. I don't like the inversion of control.
fn expectNodeRecoverable(
p: *Parser,
parseFn: NodeParseFn,
/// if parsing fails
err: AstError,
) !?*Node {
return (try parseFn(p)) orelse {
try p.errors.append(p.gpa, err);
return null;
};
}
// WARNING: expects a valid token index.
fn giveTokenContent(p: *Parser, id: TokenIndex) []const u8 {
const loc = p.token_locs[id];
return p.source[loc.start..loc.end];
}
};
fn ParseFn(comptime T: type) type {
return fn (p: *Parser) Error!T;
}
// NodeParseFn: the signature expected by expectNode/expectNodeRecoverable above.
const NodeParseFn = ParseFn(?*Node);
test "std.zig.parser" {
_ = @import("parser_test.zig");
}