//! YAML parse tree: tokenizes a source buffer and parses it into documents
//! of maps, lists, and scalar values.
const std = @import("std");
const assert = std.debug.assert;
const log = std.log.scoped(.yaml);
const mem = std.mem;

const Allocator = mem.Allocator;
const Tokenizer = @import("Tokenizer.zig");
const Token = Tokenizer.Token;
const TokenIndex = Tokenizer.TokenIndex;
const TokenIterator = Tokenizer.TokenIterator;

/// Errors that can occur while parsing a YAML document, in addition to
/// allocation failures.
pub const ParseError = error{
    InvalidEscapeSequence,
    MalformedYaml,
    NestedDocuments,
    UnexpectedEof,
    UnexpectedToken,
    Unhandled,
} || Allocator.Error;

 21pub const Node = struct {
 22    tag: Tag,
 23    tree: *const Tree,
 24    start: TokenIndex,
 25    end: TokenIndex,
 26
 27    pub const Tag = enum {
 28        doc,
 29        map,
 30        list,
 31        value,
 32
 33        pub fn Type(comptime tag: Tag) type {
 34            return switch (tag) {
 35                .doc => Doc,
 36                .map => Map,
 37                .list => List,
 38                .value => Value,
 39            };
 40        }
 41    };
 42
 43    pub fn cast(self: *const Node, comptime T: type) ?*const T {
 44        if (self.tag != T.base_tag) {
 45            return null;
 46        }
 47        return @fieldParentPtr("base", self);
 48    }
 49
 50    pub fn deinit(self: *Node, allocator: Allocator) void {
 51        switch (self.tag) {
 52            inline else => |tag| {
 53                const parent: *tag.Type() = @fieldParentPtr("base", self);
 54                parent.deinit(allocator);
 55                allocator.destroy(parent);
 56            },
 57        }
 58    }
 59
 60    pub fn format(self: *const Node, writer: *std.Io.Writer) std.Io.Writer.Error!void {
 61        switch (self.tag) {
 62            inline else => |tag| return @as(*tag.Type(), @fieldParentPtr("base", self)).format(writer),
 63        }
 64    }
 65
 66    pub const Doc = struct {
 67        base: Node = Node{
 68            .tag = Tag.doc,
 69            .tree = undefined,
 70            .start = undefined,
 71            .end = undefined,
 72        },
 73        directive: ?TokenIndex = null,
 74        value: ?*Node = null,
 75
 76        pub const base_tag: Node.Tag = .doc;
 77
 78        pub fn deinit(self: *Doc, allocator: Allocator) void {
 79            if (self.value) |node| {
 80                node.deinit(allocator);
 81            }
 82        }
 83
 84        pub fn format(self: *const Doc, writer: *std.Io.Writer) std.Io.Writer.Error!void {
 85            if (self.directive) |id| {
 86                try writer.print("{{ ", .{});
 87                const directive = self.base.tree.getRaw(id, id);
 88                try writer.print(".directive = {s}, ", .{directive});
 89            }
 90            if (self.value) |node| {
 91                try writer.print("{}", .{node});
 92            }
 93            if (self.directive != null) {
 94                try writer.print(" }}", .{});
 95            }
 96        }
 97    };
 98
 99    pub const Map = struct {
100        base: Node = Node{
101            .tag = Tag.map,
102            .tree = undefined,
103            .start = undefined,
104            .end = undefined,
105        },
106        values: std.ArrayList(Entry) = .empty,
107
108        pub const base_tag: Node.Tag = .map;
109
110        pub const Entry = struct {
111            key: TokenIndex,
112            value: ?*Node,
113        };
114
115        pub fn deinit(self: *Map, allocator: Allocator) void {
116            for (self.values.items) |entry| {
117                if (entry.value) |value| {
118                    value.deinit(allocator);
119                }
120            }
121            self.values.deinit(allocator);
122        }
123
124        pub fn format(self: *const Map, writer: *std.Io.Writer) std.Io.Writer.Error!void {
125            try std.fmt.format(writer, "{{ ", .{});
126            for (self.values.items) |entry| {
127                const key = self.base.tree.getRaw(entry.key, entry.key);
128                if (entry.value) |value| {
129                    try std.fmt.format(writer, "{s} => {}, ", .{ key, value });
130                } else {
131                    try std.fmt.format(writer, "{s} => null, ", .{key});
132                }
133            }
134            return std.fmt.format(writer, " }}", .{});
135        }
136    };
137
138    pub const List = struct {
139        base: Node = Node{
140            .tag = Tag.list,
141            .tree = undefined,
142            .start = undefined,
143            .end = undefined,
144        },
145        values: std.ArrayList(*Node) = .empty,
146
147        pub const base_tag: Node.Tag = .list;
148
149        pub fn deinit(self: *List, allocator: Allocator) void {
150            for (self.values.items) |node| {
151                node.deinit(allocator);
152            }
153            self.values.deinit(allocator);
154        }
155
156        pub fn format(self: *const List, writer: *std.Io.Writer) std.Io.Writer.Error!void {
157            try std.fmt.format(writer, "[ ", .{});
158            for (self.values.items) |node| {
159                try std.fmt.format(writer, "{}, ", .{node});
160            }
161            return std.fmt.format(writer, " ]", .{});
162        }
163    };
164
165    pub const Value = struct {
166        base: Node = Node{
167            .tag = Tag.value,
168            .tree = undefined,
169            .start = undefined,
170            .end = undefined,
171        },
172        string_value: std.ArrayList(u8) = .empty,
173
174        pub const base_tag: Node.Tag = .value;
175
176        pub fn deinit(self: *Value, allocator: Allocator) void {
177            self.string_value.deinit(allocator);
178        }
179
180        pub fn format(self: *const Value, writer: *std.Io.Writer) std.Io.Writer.Error!void {
181            const raw = self.base.tree.getRaw(self.base.start, self.base.end);
182            return std.fmt.format(writer, "{s}", .{raw});
183        }
184    };
185};
186
/// Zero-based line/column position of a token, precomputed during `Tree.parse`.
pub const LineCol = struct {
    line: usize,
    col: usize,
};

/// Parse tree for a YAML source buffer: owns the token slice, per-token
/// line/column info, and the list of parsed documents.
pub const Tree = struct {
    allocator: Allocator,
    // Borrowed view of the source passed to parse(); not owned or freed.
    source: []const u8,
    // Owned token slice, allocated by parse() and freed by deinit().
    tokens: []Token,
    line_cols: std.AutoHashMap(TokenIndex, LineCol),
    docs: std.ArrayList(*Node) = .empty,

    /// NOTE(review): `source` and `tokens` stay undefined until `parse` runs,
    /// and `deinit` unconditionally frees `tokens` — calling `deinit` on a
    /// tree that was never parsed looks unsafe; confirm callers always parse
    /// first.
    pub fn init(allocator: Allocator) Tree {
        return .{
            .allocator = allocator,
            .source = undefined,
            .tokens = undefined,
            .line_cols = std.AutoHashMap(TokenIndex, LineCol).init(allocator),
        };
    }

    /// Frees the token slice, line/column map, and all parsed documents.
    pub fn deinit(self: *Tree) void {
        self.allocator.free(self.tokens);
        self.line_cols.deinit();
        for (self.docs.items) |doc| {
            doc.deinit(self.allocator);
        }
        self.docs.deinit(self.allocator);
    }

    /// Returns the raw text of document `doc_index`'s directive, or null if
    /// that document has none.
    pub fn getDirective(self: Tree, doc_index: usize) ?[]const u8 {
        assert(doc_index < self.docs.items.len);
        const doc = self.docs.items[doc_index].cast(Node.Doc) orelse return null;
        const id = doc.directive orelse return null;
        return self.getRaw(id, id);
    }

    /// Returns the source slice spanning tokens `start`..`end` (inclusive).
    pub fn getRaw(self: Tree, start: TokenIndex, end: TokenIndex) []const u8 {
        assert(start <= end);
        assert(start < self.tokens.len and end < self.tokens.len);
        const start_token = self.tokens[start];
        const end_token = self.tokens[end];
        return self.source[start_token.start..end_token.end];
    }

    /// Tokenizes `source`, records line/column info for every token, then
    /// parses every document into `self.docs`. `source` must outlive the
    /// tree (it is referenced, not copied).
    pub fn parse(self: *Tree, source: []const u8) !void {
        var tokenizer = Tokenizer{ .buffer = source };
        var tokens = std.array_list.Managed(Token).init(self.allocator);
        defer tokens.deinit();

        var line: usize = 0;
        // Source offset just past the most recent newline token; a token's
        // column is its start offset relative to this.
        var prev_line_last_col: usize = 0;

        while (true) {
            const token = tokenizer.next();
            const tok_id = tokens.items.len;
            try tokens.append(token);

            try self.line_cols.putNoClobber(tok_id, .{
                .line = line,
                .col = token.start - prev_line_last_col,
            });

            switch (token.id) {
                .eof => break,
                .new_line => {
                    line += 1;
                    prev_line_last_col = token.end;
                },
                else => {},
            }
        }

        self.source = source;
        self.tokens = try tokens.toOwnedSlice();

        var it = TokenIterator{ .buffer = self.tokens };
        var parser = Parser{
            .allocator = self.allocator,
            .tree = self,
            .token_it = &it,
            .line_cols = &self.line_cols,
        };

        parser.eatCommentsAndSpace(&.{});

        // Each iteration parses one document until EOF.
        while (true) {
            parser.eatCommentsAndSpace(&.{});
            const token = parser.token_it.next() orelse break;

            log.debug("(main) next {s}@{d}", .{ @tagName(token.id), parser.token_it.pos - 1 });

            switch (token.id) {
                .eof => break,
                else => {
                    // Step back so the document parser sees this token.
                    parser.token_it.seekBy(-1);
                    const doc = try parser.doc();
                    try self.docs.append(self.allocator, doc);
                },
            }
        }
    }
};

/// Recursive-descent parser over the token stream. Nodes are allocated with
/// `allocator` and linked back to `tree`.
const Parser = struct {
    allocator: Allocator,
    tree: *Tree,
    token_it: *TokenIterator,
    // Borrowed from the owning Tree; used for indentation (column) checks.
    line_cols: *const std.AutoHashMap(TokenIndex, LineCol),

    /// Parses the next value (map, list, or leaf scalar). Returns null when
    /// the upcoming token cannot start a value; in that case the token has
    /// been consumed and callers compensate with seekBy(-1).
    fn value(self: *Parser) ParseError!?*Node {
        self.eatCommentsAndSpace(&.{});

        const pos = self.token_it.pos;
        const token = self.token_it.next() orelse return error.UnexpectedEof;

        log.debug("  next {s}@{d}", .{ @tagName(token.id), pos });

        switch (token.id) {
            // A literal followed by ':' opens a map; otherwise it is a scalar.
            .literal => if (self.eatToken(.map_value_ind, &.{ .new_line, .comment })) |_| {
                // map
                self.token_it.seekTo(pos);
                return self.map();
            } else {
                // leaf value
                self.token_it.seekTo(pos);
                return self.leaf_value();
            },
            .single_quoted, .double_quoted => {
                // leaf value
                self.token_it.seekBy(-1);
                return self.leaf_value();
            },
            .seq_item_ind => {
                // list
                self.token_it.seekBy(-1);
                return self.list();
            },
            .flow_seq_start => {
                // list
                self.token_it.seekBy(-1);
                return self.list_bracketed();
            },
            else => return null,
        }
    }

    /// Parses one document: an optional explicit `---` header (with optional
    /// tag directive), a root value, and a footer (`...`, a following `---`,
    /// or EOF).
    fn doc(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.Doc);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;

        log.debug("(doc) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        // Parse header
        const explicit_doc: bool = if (self.eatToken(.doc_start, &.{})) |doc_pos| explicit_doc: {
            // Document markers must start at column 0.
            if (self.getCol(doc_pos) > 0) return error.MalformedYaml;
            if (self.eatToken(.tag, &.{ .new_line, .comment })) |_| {
                node.directive = try self.expectToken(.literal, &.{ .new_line, .comment });
            }
            break :explicit_doc true;
        } else false;

        // Parse value
        node.value = try self.value();
        if (node.value == null) {
            // value() consumed a token it could not use; put it back for the
            // footer check below.
            self.token_it.seekBy(-1);
        }
        errdefer if (node.value) |val| {
            val.deinit(self.allocator);
        };

        // Parse footer
        footer: {
            if (self.eatToken(.doc_end, &.{})) |pos| {
                if (!explicit_doc) return error.UnexpectedToken;
                if (self.getCol(pos) > 0) return error.MalformedYaml;
                node.base.end = pos;
                break :footer;
            }
            // A following `---` ends this document but also starts the next,
            // so it is left in the stream for the caller.
            if (self.eatToken(.doc_start, &.{})) |pos| {
                if (!explicit_doc) return error.UnexpectedToken;
                if (self.getCol(pos) > 0) return error.MalformedYaml;
                self.token_it.seekBy(-1);
                node.base.end = pos - 1;
                break :footer;
            }
            if (self.eatToken(.eof, &.{})) |pos| {
                node.base.end = pos - 1;
                break :footer;
            }
            return error.UnexpectedToken;
        }

        log.debug("(doc) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a block mapping whose scope is delimited by indentation: a key
    /// indented left of the map's first key ends the map.
    fn map(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.Map);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer {
            for (node.values.items) |entry| {
                if (entry.value) |val| {
                    val.deinit(self.allocator);
                }
            }
            node.values.deinit(self.allocator);
        }

        log.debug("(map) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        const col = self.getCol(node.base.start);

        while (true) {
            self.eatCommentsAndSpace(&.{});

            // Parse key
            const key_pos = self.token_it.pos;
            // Dedent below the map's opening column closes the map.
            if (self.getCol(key_pos) < col) {
                break;
            }

            const key = self.token_it.next() orelse return error.UnexpectedEof;
            switch (key.id) {
                .literal => {},
                .doc_start, .doc_end, .eof => {
                    // Document boundary ends the map; leave the token for doc().
                    self.token_it.seekBy(-1);
                    break;
                },
                else => {
                    // TODO key not being a literal
                    return error.Unhandled;
                },
            }

            log.debug("(map) key {s}@{d}", .{ self.tree.getRaw(key_pos, key_pos), key_pos });

            // Separator
            _ = try self.expectToken(.map_value_ind, &.{ .new_line, .comment });

            // Parse value
            const val = try self.value();
            errdefer if (val) |v| {
                v.deinit(self.allocator);
            };

            if (val) |v| {
                // A value must not be outdented past its key, and a scalar at
                // exactly the key's column would be a sibling key instead.
                if (self.getCol(v.start) < self.getCol(key_pos)) {
                    return error.MalformedYaml;
                }
                if (v.cast(Node.Value)) |_| {
                    if (self.getCol(v.start) == self.getCol(key_pos)) {
                        return error.MalformedYaml;
                    }
                }
            }

            try node.values.append(self.allocator, .{
                .key = key_pos,
                .value = val,
            });
        }

        node.base.end = self.token_it.pos - 1;

        log.debug("(map) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a block sequence: consecutive `- value` items.
    fn list(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.List);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer {
            for (node.values.items) |val| {
                val.deinit(self.allocator);
            }
            node.values.deinit(self.allocator);
        }

        log.debug("(list) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        while (true) {
            self.eatCommentsAndSpace(&.{});

            // No further `-` indicator means the sequence has ended.
            _ = self.eatToken(.seq_item_ind, &.{}) orelse break;

            const val = (try self.value()) orelse return error.MalformedYaml;
            try node.values.append(self.allocator, val);
        }

        node.base.end = self.token_it.pos - 1;

        log.debug("(list) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a flow sequence: `[ a, b, ... ]`.
    fn list_bracketed(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.List);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer {
            for (node.values.items) |val| {
                val.deinit(self.allocator);
            }
            node.values.deinit(self.allocator);
        }

        log.debug("(list) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        _ = try self.expectToken(.flow_seq_start, &.{});

        while (true) {
            // Comment tokens are excluded from the skip inside flow context.
            self.eatCommentsAndSpace(&.{.comment});

            if (self.eatToken(.flow_seq_end, &.{.comment})) |pos| {
                node.base.end = pos;
                break;
            }
            // Separator comma is optional here (also tolerated before the
            // first element).
            _ = self.eatToken(.comma, &.{.comment});

            const val = (try self.value()) orelse return error.MalformedYaml;
            try node.values.append(self.allocator, val);
        }

        log.debug("(list) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a scalar leaf. Quoted scalars are unescaped into
    /// `string_value`; plain scalars are copied verbatim. A plain scalar ends
    /// at the first non-literal token following a space run.
    fn leaf_value(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.Value);
        errdefer self.allocator.destroy(node);
        node.* = .{ .string_value = .{} };
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer node.string_value.deinit(self.allocator);

        // TODO handle multiline strings in new block scope
        while (self.token_it.next()) |tok| {
            switch (tok.id) {
                .single_quoted => {
                    node.base.end = self.token_it.pos - 1;
                    const raw = self.tree.getRaw(node.base.start, node.base.end);
                    try self.parseSingleQuoted(node, raw);
                    break;
                },
                .double_quoted => {
                    node.base.end = self.token_it.pos - 1;
                    const raw = self.tree.getRaw(node.base.start, node.base.end);
                    try self.parseDoubleQuoted(node, raw);
                    break;
                },
                .literal => {},
                .space => {
                    // Index of the literal token preceding this space run.
                    const trailing = self.token_it.pos - 2;
                    self.eatCommentsAndSpace(&.{});
                    if (self.token_it.peek()) |peek| {
                        if (peek.id != .literal) {
                            // The scalar ends at the last literal; trailing
                            // whitespace is not part of the value.
                            node.base.end = trailing;
                            const raw = self.tree.getRaw(node.base.start, node.base.end);
                            try node.string_value.appendSlice(self.allocator, raw);
                            break;
                        }
                    }
                },
                else => {
                    // Any other token terminates the scalar and is pushed back.
                    self.token_it.seekBy(-1);
                    node.base.end = self.token_it.pos - 1;
                    const raw = self.tree.getRaw(node.base.start, node.base.end);
                    try node.string_value.appendSlice(self.allocator, raw);
                    break;
                },
            }
        }

        log.debug("(leaf) {s}", .{self.tree.getRaw(node.base.start, node.base.end)});

        return &node.base;
    }

    /// Skips comment/space/newline tokens. Any id listed in `exclusions`
    /// stops the skip and is left unconsumed, as is the first
    /// non-whitespace/comment token.
    fn eatCommentsAndSpace(self: *Parser, comptime exclusions: []const Token.Id) void {
        log.debug("eatCommentsAndSpace", .{});
        outer: while (self.token_it.next()) |token| {
            log.debug("  (token '{s}')", .{@tagName(token.id)});
            switch (token.id) {
                .comment, .space, .new_line => |space| {
                    inline for (exclusions) |excl| {
                        if (excl == space) {
                            self.token_it.seekBy(-1);
                            break :outer;
                        }
                    } else continue;
                },
                else => {
                    self.token_it.seekBy(-1);
                    break;
                },
            }
        }
    }

    /// Consumes the next significant token if it has id `id` and returns its
    /// index; otherwise leaves the stream positioned at that token and
    /// returns null.
    fn eatToken(self: *Parser, id: Token.Id, comptime exclusions: []const Token.Id) ?TokenIndex {
        log.debug("eatToken('{s}')", .{@tagName(id)});
        self.eatCommentsAndSpace(exclusions);
        const pos = self.token_it.pos;
        const token = self.token_it.next() orelse return null;
        if (token.id == id) {
            log.debug("  (found at {d})", .{pos});
            return pos;
        } else {
            log.debug("  (not found)", .{});
            self.token_it.seekBy(-1);
            return null;
        }
    }

    /// Like `eatToken`, but a missing token is an error.
    fn expectToken(self: *Parser, id: Token.Id, comptime exclusions: []const Token.Id) ParseError!TokenIndex {
        log.debug("expectToken('{s}')", .{@tagName(id)});
        return self.eatToken(id, exclusions) orelse error.UnexpectedToken;
    }

    /// Zero-based line of the token at `index`.
    fn getLine(self: *Parser, index: TokenIndex) usize {
        return self.line_cols.get(index).?.line;
    }

    /// Zero-based column of the token at `index`.
    fn getCol(self: *Parser, index: TokenIndex) usize {
        return self.line_cols.get(index).?.col;
    }

    /// Unescapes a single-quoted scalar (`''` -> `'`) into `string_value`.
    /// `raw` must include the surrounding quotes.
    fn parseSingleQuoted(self: *Parser, node: *Node.Value, raw: []const u8) ParseError!void {
        assert(raw[0] == '\'' and raw[raw.len - 1] == '\'');

        const raw_no_quotes = raw[1 .. raw.len - 1];
        // Output is never longer than the input, so reserve once and use
        // appendAssumeCapacity below.
        try node.string_value.ensureTotalCapacity(self.allocator, raw_no_quotes.len);

        var state: enum {
            start,
            escape,
        } = .start;
        var index: usize = 0;

        while (index < raw_no_quotes.len) : (index += 1) {
            const c = raw_no_quotes[index];
            switch (state) {
                .start => switch (c) {
                    '\'' => {
                        state = .escape;
                    },
                    else => {
                        node.string_value.appendAssumeCapacity(c);
                    },
                },
                .escape => switch (c) {
                    '\'' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity(c);
                    },
                    else => return error.InvalidEscapeSequence,
                },
            }
        }
    }

    /// Unescapes a double-quoted scalar (supports `\n`, `\t`, `\"`) into
    /// `string_value`. `raw` must include the surrounding quotes.
    fn parseDoubleQuoted(self: *Parser, node: *Node.Value, raw: []const u8) ParseError!void {
        assert(raw[0] == '"' and raw[raw.len - 1] == '"');

        const raw_no_quotes = raw[1 .. raw.len - 1];
        // Output is never longer than the input, so reserve once up front.
        try node.string_value.ensureTotalCapacity(self.allocator, raw_no_quotes.len);

        var state: enum {
            start,
            escape,
        } = .start;

        var index: usize = 0;
        while (index < raw_no_quotes.len) : (index += 1) {
            const c = raw_no_quotes[index];
            switch (state) {
                .start => switch (c) {
                    '\\' => {
                        state = .escape;
                    },
                    else => {
                        node.string_value.appendAssumeCapacity(c);
                    },
                },
                .escape => switch (c) {
                    'n' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity('\n');
                    },
                    't' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity('\t');
                    },
                    '"' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity('"');
                    },
                    else => return error.InvalidEscapeSequence,
                },
            }
        }
    }
};

// Reference all nested decls (so their tests run) and pull in the parser
// test suite.
test {
    std.testing.refAllDecls(@This());
    _ = @import("parse/test.zig");
}