//! YAML parse tree and recursive-descent parser.
1const std = @import("std");
2const assert = std.debug.assert;
3const log = std.log.scoped(.yaml);
4const mem = std.mem;
5
6const Allocator = mem.Allocator;
7const Tokenizer = @import("Tokenizer.zig");
8const Token = Tokenizer.Token;
9const TokenIndex = Tokenizer.TokenIndex;
10const TokenIterator = Tokenizer.TokenIterator;
11
12pub const ParseError = error{
13 InvalidEscapeSequence,
14 MalformedYaml,
15 NestedDocuments,
16 UnexpectedEof,
17 UnexpectedToken,
18 Unhandled,
19} || Allocator.Error;
20
/// Base type embedded (as `base`) in every concrete parse-tree node.
/// `tag` identifies the concrete type; `start`/`end` are inclusive token
/// indices into `tree.tokens`.
pub const Node = struct {
    tag: Tag,
    /// Owning tree; used to resolve token indices back to source text.
    tree: *const Tree,
    /// First token of this node (inclusive).
    start: TokenIndex,
    /// Last token of this node (inclusive).
    end: TokenIndex,

    pub const Tag = enum {
        doc,
        map,
        list,
        value,

        /// Returns the concrete node type that embeds a `Node` with this tag.
        pub fn Type(comptime tag: Tag) type {
            return switch (tag) {
                .doc => Doc,
                .map => Map,
                .list => List,
                .value => Value,
            };
        }
    };

    /// Downcasts this base node to concrete type `T` (one of `Doc`, `Map`,
    /// `List`, `Value`). Returns null when `tag` does not match `T.base_tag`.
    pub fn cast(self: *const Node, comptime T: type) ?*const T {
        if (self.tag != T.base_tag) {
            return null;
        }
        return @fieldParentPtr("base", self);
    }

    /// Recursively frees the concrete node this base is embedded in, then
    /// destroys the concrete node's own allocation.
    pub fn deinit(self: *Node, allocator: Allocator) void {
        switch (self.tag) {
            inline else => |tag| {
                const parent: *tag.Type() = @fieldParentPtr("base", self);
                parent.deinit(allocator);
                allocator.destroy(parent);
            },
        }
    }

    /// Dispatches to the concrete node's `format` implementation.
    pub fn format(self: *const Node, writer: *std.Io.Writer) std.Io.Writer.Error!void {
        switch (self.tag) {
            inline else => |tag| return @as(*tag.Type(), @fieldParentPtr("base", self)).format(writer),
        }
    }

    /// A single YAML document: optional directive plus an optional root value.
    pub const Doc = struct {
        base: Node = Node{
            .tag = Tag.doc,
            .tree = undefined,
            .start = undefined,
            .end = undefined,
        },
        /// Token index of the directive literal (after `---` and a tag), if any.
        directive: ?TokenIndex = null,
        /// Root value of the document; null for an empty document.
        value: ?*Node = null,

        pub const base_tag: Node.Tag = .doc;

        pub fn deinit(self: *Doc, allocator: Allocator) void {
            if (self.value) |node| {
                node.deinit(allocator);
            }
        }

        pub fn format(self: *const Doc, writer: *std.Io.Writer) std.Io.Writer.Error!void {
            if (self.directive) |id| {
                try writer.print("{{ ", .{});
                const directive = self.base.tree.getRaw(id, id);
                try writer.print(".directive = {s}, ", .{directive});
            }
            if (self.value) |node| {
                // `{f}` invokes Node.format (required by the std.Io.Writer API).
                try writer.print("{f}", .{node});
            }
            if (self.directive != null) {
                try writer.print(" }}", .{});
            }
        }
    };

    /// A mapping node: ordered key/value entries. Keys are token indices;
    /// values may be null for empty values.
    pub const Map = struct {
        base: Node = Node{
            .tag = Tag.map,
            .tree = undefined,
            .start = undefined,
            .end = undefined,
        },
        values: std.ArrayList(Entry) = .empty,

        pub const base_tag: Node.Tag = .map;

        pub const Entry = struct {
            key: TokenIndex,
            value: ?*Node,
        };

        pub fn deinit(self: *Map, allocator: Allocator) void {
            for (self.values.items) |entry| {
                if (entry.value) |value| {
                    value.deinit(allocator);
                }
            }
            self.values.deinit(allocator);
        }

        pub fn format(self: *const Map, writer: *std.Io.Writer) std.Io.Writer.Error!void {
            // Use writer.print consistently (std.fmt.format predates *std.Io.Writer).
            try writer.print("{{ ", .{});
            for (self.values.items) |entry| {
                const key = self.base.tree.getRaw(entry.key, entry.key);
                if (entry.value) |value| {
                    try writer.print("{s} => {f}, ", .{ key, value });
                } else {
                    try writer.print("{s} => null, ", .{key});
                }
            }
            return writer.print(" }}", .{});
        }
    };

    /// A sequence node: ordered child values.
    pub const List = struct {
        base: Node = Node{
            .tag = Tag.list,
            .tree = undefined,
            .start = undefined,
            .end = undefined,
        },
        values: std.ArrayList(*Node) = .empty,

        pub const base_tag: Node.Tag = .list;

        pub fn deinit(self: *List, allocator: Allocator) void {
            for (self.values.items) |node| {
                node.deinit(allocator);
            }
            self.values.deinit(allocator);
        }

        pub fn format(self: *const List, writer: *std.Io.Writer) std.Io.Writer.Error!void {
            try writer.print("[ ", .{});
            for (self.values.items) |node| {
                try writer.print("{f}, ", .{node});
            }
            return writer.print(" ]", .{});
        }
    };

    /// A leaf scalar. `string_value` holds the unescaped text for quoted
    /// scalars; plain scalars are copied raw from the source.
    pub const Value = struct {
        base: Node = Node{
            .tag = Tag.value,
            .tree = undefined,
            .start = undefined,
            .end = undefined,
        },
        string_value: std.ArrayList(u8) = .empty,

        pub const base_tag: Node.Tag = .value;

        pub fn deinit(self: *Value, allocator: Allocator) void {
            self.string_value.deinit(allocator);
        }

        pub fn format(self: *const Value, writer: *std.Io.Writer) std.Io.Writer.Error!void {
            const raw = self.base.tree.getRaw(self.base.start, self.base.end);
            return writer.print("{s}", .{raw});
        }
    };
};
186
/// 0-based line and column of a token within the source buffer.
pub const LineCol = struct {
    line: usize,
    col: usize,
};
191
/// Owns the token stream and the parsed documents for one YAML source buffer.
pub const Tree = struct {
    allocator: Allocator,
    /// Borrowed source buffer; set by `parse` and not owned by the tree.
    source: []const u8,
    /// Owned token slice produced by `parse`; freed in `deinit`.
    tokens: []Token,
    /// Line/column of every token, keyed by token index; filled in `parse`.
    line_cols: std.AutoHashMap(TokenIndex, LineCol),
    /// Parsed top-level documents, in source order.
    docs: std.ArrayList(*Node) = .empty,

    /// NOTE(review): `source` and `tokens` stay undefined until `parse`
    /// succeeds; calling `deinit` before a successful `parse` frees an
    /// undefined slice — confirm callers always parse first.
    pub fn init(allocator: Allocator) Tree {
        return .{
            .allocator = allocator,
            .source = undefined,
            .tokens = undefined,
            .line_cols = std.AutoHashMap(TokenIndex, LineCol).init(allocator),
        };
    }

    /// Frees tokens, the line/column map, and all document nodes.
    pub fn deinit(self: *Tree) void {
        self.allocator.free(self.tokens);
        self.line_cols.deinit();
        for (self.docs.items) |doc| {
            doc.deinit(self.allocator);
        }
        self.docs.deinit(self.allocator);
    }

    /// Returns the raw directive text of document `doc_index`, or null when
    /// the document has no directive. Asserts the index is in range.
    pub fn getDirective(self: Tree, doc_index: usize) ?[]const u8 {
        assert(doc_index < self.docs.items.len);
        const doc = self.docs.items[doc_index].cast(Node.Doc) orelse return null;
        const id = doc.directive orelse return null;
        return self.getRaw(id, id);
    }

    /// Returns the source slice spanning tokens `start` through `end`,
    /// both inclusive.
    pub fn getRaw(self: Tree, start: TokenIndex, end: TokenIndex) []const u8 {
        assert(start <= end);
        assert(start < self.tokens.len and end < self.tokens.len);
        const start_token = self.tokens[start];
        const end_token = self.tokens[end];
        return self.source[start_token.start..end_token.end];
    }

    /// Tokenizes `source` (recording each token's line/column), then parses
    /// every document into `docs`. `source` must outlive the tree.
    pub fn parse(self: *Tree, source: []const u8) !void {
        var tokenizer = Tokenizer{ .buffer = source };
        var tokens = std.array_list.Managed(Token).init(self.allocator);
        defer tokens.deinit();

        var line: usize = 0;
        var prev_line_last_col: usize = 0;

        while (true) {
            const token = tokenizer.next();
            const tok_id = tokens.items.len;
            try tokens.append(token);

            try self.line_cols.putNoClobber(tok_id, .{
                .line = line,
                // Column = byte offset from the end of the previous newline token.
                .col = token.start - prev_line_last_col,
            });

            switch (token.id) {
                .eof => break,
                .new_line => {
                    line += 1;
                    prev_line_last_col = token.end;
                },
                else => {},
            }
        }

        self.source = source;
        self.tokens = try tokens.toOwnedSlice();

        // The iterator borrows self.tokens; the parser borrows both.
        var it = TokenIterator{ .buffer = self.tokens };
        var parser = Parser{
            .allocator = self.allocator,
            .tree = self,
            .token_it = &it,
            .line_cols = &self.line_cols,
        };

        parser.eatCommentsAndSpace(&.{});

        // One iteration per document until EOF.
        while (true) {
            parser.eatCommentsAndSpace(&.{});
            const token = parser.token_it.next() orelse break;

            log.debug("(main) next {s}@{d}", .{ @tagName(token.id), parser.token_it.pos - 1 });

            switch (token.id) {
                .eof => break,
                else => {
                    // Not EOF: rewind the inspected token and parse a document.
                    parser.token_it.seekBy(-1);
                    const doc = try parser.doc();
                    try self.docs.append(self.allocator, doc);
                },
            }
        }
    }
};
290
/// Recursive-descent parser over a `TokenIterator`. Nodes record token
/// indices only; text is resolved lazily through `tree.getRaw`.
const Parser = struct {
    allocator: Allocator,
    tree: *Tree,
    /// All parsing works by consuming and seeking this iterator.
    token_it: *TokenIterator,
    /// Borrowed from `Tree.line_cols`; used for indentation (column) checks.
    line_cols: *const std.AutoHashMap(TokenIndex, LineCol),

    /// Parses one value node (map, list, or leaf scalar) at the current
    /// position. Returns null when the lookahead token cannot start a value;
    /// note that the inspected token has then been consumed (callers seek
    /// back by one when they need it).
    fn value(self: *Parser) ParseError!?*Node {
        self.eatCommentsAndSpace(&.{});

        const pos = self.token_it.pos;
        const token = self.token_it.next() orelse return error.UnexpectedEof;

        log.debug("  next {s}@{d}", .{ @tagName(token.id), pos });

        switch (token.id) {
            .literal => if (self.eatToken(.map_value_ind, &.{ .new_line, .comment })) |_| {
                // Literal followed by ':' on the same line => start of a map.
                self.token_it.seekTo(pos);
                return self.map();
            } else {
                // Plain (unquoted) scalar.
                self.token_it.seekTo(pos);
                return self.leaf_value();
            },
            .single_quoted, .double_quoted => {
                // Quoted scalar.
                self.token_it.seekBy(-1);
                return self.leaf_value();
            },
            .seq_item_ind => {
                // Block-style list ("- item").
                self.token_it.seekBy(-1);
                return self.list();
            },
            .flow_seq_start => {
                // Flow-style list ("[a, b]").
                self.token_it.seekBy(-1);
                return self.list_bracketed();
            },
            else => return null,
        }
    }

    /// Parses one document: optional "---" header (with optional tag
    /// directive), an optional root value, then a "..."/"---"/EOF footer.
    fn doc(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.Doc);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;

        log.debug("(doc) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        // Parse header: an explicit document starts with "---" at column 0.
        const explicit_doc: bool = if (self.eatToken(.doc_start, &.{})) |doc_pos| explicit_doc: {
            if (self.getCol(doc_pos) > 0) return error.MalformedYaml;
            if (self.eatToken(.tag, &.{ .new_line, .comment })) |_| {
                node.directive = try self.expectToken(.literal, &.{ .new_line, .comment });
            }
            break :explicit_doc true;
        } else false;

        // Parse value (null means the document body is empty).
        node.value = try self.value();
        if (node.value == null) {
            // value() consumed the non-value token; put it back for the footer.
            self.token_it.seekBy(-1);
        }
        errdefer if (node.value) |val| {
            val.deinit(self.allocator);
        };

        // Parse footer: "..." or a following "---" are only valid after an
        // explicit header and must sit at column 0; plain EOF always ends it.
        footer: {
            if (self.eatToken(.doc_end, &.{})) |pos| {
                if (!explicit_doc) return error.UnexpectedToken;
                if (self.getCol(pos) > 0) return error.MalformedYaml;
                node.base.end = pos;
                break :footer;
            }
            if (self.eatToken(.doc_start, &.{})) |pos| {
                if (!explicit_doc) return error.UnexpectedToken;
                if (self.getCol(pos) > 0) return error.MalformedYaml;
                // Leave "---" for the next document; this one ends just before it.
                self.token_it.seekBy(-1);
                node.base.end = pos - 1;
                break :footer;
            }
            if (self.eatToken(.eof, &.{})) |pos| {
                node.base.end = pos - 1;
                break :footer;
            }
            return error.UnexpectedToken;
        }

        log.debug("(doc) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a block-style mapping. Entries continue while keys stay at or
    /// beyond the column of the first key; a shallower column ends the map.
    fn map(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.Map);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer {
            for (node.values.items) |entry| {
                if (entry.value) |val| {
                    val.deinit(self.allocator);
                }
            }
            node.values.deinit(self.allocator);
        }

        log.debug("(map) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        const col = self.getCol(node.base.start);

        while (true) {
            self.eatCommentsAndSpace(&.{});

            // Parse key: a key at a shallower column belongs to an outer map.
            const key_pos = self.token_it.pos;
            if (self.getCol(key_pos) < col) {
                break;
            }

            const key = self.token_it.next() orelse return error.UnexpectedEof;
            switch (key.id) {
                .literal => {},
                .doc_start, .doc_end, .eof => {
                    // Document boundary ends the map; rewind it for the caller.
                    self.token_it.seekBy(-1);
                    break;
                },
                else => {
                    // TODO key not being a literal
                    return error.Unhandled;
                },
            }

            log.debug("(map) key {s}@{d}", .{ self.tree.getRaw(key_pos, key_pos), key_pos });

            // Separator (':'), allowing it on a later line.
            _ = try self.expectToken(.map_value_ind, &.{ .new_line, .comment });

            // Parse value (may legitimately be null for an empty value).
            const val = try self.value();
            errdefer if (val) |v| {
                v.deinit(self.allocator);
            };

            if (val) |v| {
                // A value shallower than its key is malformed; a leaf scalar
                // at exactly the key's column would be a sibling key instead.
                if (self.getCol(v.start) < self.getCol(key_pos)) {
                    return error.MalformedYaml;
                }
                if (v.cast(Node.Value)) |_| {
                    if (self.getCol(v.start) == self.getCol(key_pos)) {
                        return error.MalformedYaml;
                    }
                }
            }

            try node.values.append(self.allocator, .{
                .key = key_pos,
                .value = val,
            });
        }

        node.base.end = self.token_it.pos - 1;

        log.debug("(map) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a block-style sequence: one "- value" entry per iteration,
    /// ending at the first token that is not a sequence-item indicator.
    fn list(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.List);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer {
            for (node.values.items) |val| {
                val.deinit(self.allocator);
            }
            node.values.deinit(self.allocator);
        }

        log.debug("(list) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        while (true) {
            self.eatCommentsAndSpace(&.{});

            _ = self.eatToken(.seq_item_ind, &.{}) orelse break;

            // A "-" with no value after it is malformed.
            const val = (try self.value()) orelse return error.MalformedYaml;
            try node.values.append(self.allocator, val);
        }

        node.base.end = self.token_it.pos - 1;

        log.debug("(list) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a flow-style sequence "[a, b, ...]" up to and including the
    /// closing bracket.
    fn list_bracketed(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.List);
        errdefer self.allocator.destroy(node);
        node.* = .{};
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer {
            for (node.values.items) |val| {
                val.deinit(self.allocator);
            }
            node.values.deinit(self.allocator);
        }

        log.debug("(list) begin {s}@{d}", .{ @tagName(self.tree.tokens[node.base.start].id), node.base.start });

        _ = try self.expectToken(.flow_seq_start, &.{});

        while (true) {
            // Comments are not skipped inside flow sequences (excluded below).
            self.eatCommentsAndSpace(&.{.comment});

            if (self.eatToken(.flow_seq_end, &.{.comment})) |pos| {
                node.base.end = pos;
                break;
            }
            // Optional separator between elements.
            _ = self.eatToken(.comma, &.{.comment});

            const val = (try self.value()) orelse return error.MalformedYaml;
            try node.values.append(self.allocator, val);
        }

        log.debug("(list) end {s}@{d}", .{ @tagName(self.tree.tokens[node.base.end].id), node.base.end });

        return &node.base;
    }

    /// Parses a leaf scalar. Quoted scalars are unescaped into
    /// `string_value`; plain scalars (possibly several space-separated
    /// literals) are copied raw.
    fn leaf_value(self: *Parser) ParseError!*Node {
        const node = try self.allocator.create(Node.Value);
        errdefer self.allocator.destroy(node);
        node.* = .{ .string_value = .{} };
        node.base.tree = self.tree;
        node.base.start = self.token_it.pos;
        errdefer node.string_value.deinit(self.allocator);

        // TODO handle multiline strings in new block scope
        while (self.token_it.next()) |tok| {
            switch (tok.id) {
                .single_quoted => {
                    node.base.end = self.token_it.pos - 1;
                    const raw = self.tree.getRaw(node.base.start, node.base.end);
                    try self.parseSingleQuoted(node, raw);
                    break;
                },
                .double_quoted => {
                    node.base.end = self.token_it.pos - 1;
                    const raw = self.tree.getRaw(node.base.start, node.base.end);
                    try self.parseDoubleQuoted(node, raw);
                    break;
                },
                .literal => {},
                .space => {
                    // Candidate end: the token just before this space.
                    const trailing = self.token_it.pos - 2;
                    self.eatCommentsAndSpace(&.{});
                    if (self.token_it.peek()) |peek| {
                        if (peek.id != .literal) {
                            // The space did not join two literals, so the
                            // scalar ended before it.
                            node.base.end = trailing;
                            const raw = self.tree.getRaw(node.base.start, node.base.end);
                            try node.string_value.appendSlice(self.allocator, raw);
                            break;
                        }
                    }
                },
                else => {
                    // Any other token terminates the scalar; put it back.
                    self.token_it.seekBy(-1);
                    node.base.end = self.token_it.pos - 1;
                    const raw = self.tree.getRaw(node.base.start, node.base.end);
                    try node.string_value.appendSlice(self.allocator, raw);
                    break;
                },
            }
        }

        log.debug("(leaf) {s}", .{self.tree.getRaw(node.base.start, node.base.end)});

        return &node.base;
    }

    /// Skips comment/space/new_line tokens, except any listed in
    /// `exclusions`, and stops (without consuming) at the first other token.
    fn eatCommentsAndSpace(self: *Parser, comptime exclusions: []const Token.Id) void {
        log.debug("eatCommentsAndSpace", .{});
        outer: while (self.token_it.next()) |token| {
            log.debug("  (token '{s}')", .{@tagName(token.id)});
            switch (token.id) {
                .comment, .space, .new_line => |space| {
                    // An excluded trivia token acts as a hard stop.
                    inline for (exclusions) |excl| {
                        if (excl == space) {
                            self.token_it.seekBy(-1);
                            break :outer;
                        }
                    } else continue;
                },
                else => {
                    self.token_it.seekBy(-1);
                    break;
                },
            }
        }
    }

    /// Skips trivia (honoring `exclusions`) then consumes the next token if
    /// it matches `id`, returning its index; otherwise restores the position
    /// and returns null.
    fn eatToken(self: *Parser, id: Token.Id, comptime exclusions: []const Token.Id) ?TokenIndex {
        log.debug("eatToken('{s}')", .{@tagName(id)});
        self.eatCommentsAndSpace(exclusions);
        const pos = self.token_it.pos;
        const token = self.token_it.next() orelse return null;
        if (token.id == id) {
            log.debug("  (found at {d})", .{pos});
            return pos;
        } else {
            log.debug("  (not found)", .{});
            self.token_it.seekBy(-1);
            return null;
        }
    }

    /// Like `eatToken` but a mismatch is an error.
    fn expectToken(self: *Parser, id: Token.Id, comptime exclusions: []const Token.Id) ParseError!TokenIndex {
        log.debug("expectToken('{s}')", .{@tagName(id)});
        return self.eatToken(id, exclusions) orelse error.UnexpectedToken;
    }

    /// Line of token `index`; asserts the index was recorded during parse.
    fn getLine(self: *Parser, index: TokenIndex) usize {
        return self.line_cols.get(index).?.line;
    }

    /// Column of token `index`; asserts the index was recorded during parse.
    fn getCol(self: *Parser, index: TokenIndex) usize {
        return self.line_cols.get(index).?.col;
    }

    /// Unescapes a single-quoted scalar into `node.string_value`:
    /// the only escape is '' -> '.
    fn parseSingleQuoted(self: *Parser, node: *Node.Value, raw: []const u8) ParseError!void {
        assert(raw[0] == '\'' and raw[raw.len - 1] == '\'');

        const raw_no_quotes = raw[1 .. raw.len - 1];
        try node.string_value.ensureTotalCapacity(self.allocator, raw_no_quotes.len);

        var state: enum {
            start,
            escape,
        } = .start;
        var index: usize = 0;

        while (index < raw_no_quotes.len) : (index += 1) {
            const c = raw_no_quotes[index];
            switch (state) {
                .start => switch (c) {
                    '\'' => {
                        state = .escape;
                    },
                    else => {
                        node.string_value.appendAssumeCapacity(c);
                    },
                },
                .escape => switch (c) {
                    '\'' => {
                        // '' collapses to a single quote.
                        state = .start;
                        node.string_value.appendAssumeCapacity(c);
                    },
                    else => return error.InvalidEscapeSequence,
                },
            }
        }
    }

    /// Unescapes a double-quoted scalar into `node.string_value`.
    /// Supported escapes: \n, \t, \".
    /// NOTE(review): '\\' and other YAML escapes (\0, \r, \uXXXX, ...) are
    /// rejected with InvalidEscapeSequence — confirm this is intended.
    fn parseDoubleQuoted(self: *Parser, node: *Node.Value, raw: []const u8) ParseError!void {
        assert(raw[0] == '"' and raw[raw.len - 1] == '"');

        const raw_no_quotes = raw[1 .. raw.len - 1];
        try node.string_value.ensureTotalCapacity(self.allocator, raw_no_quotes.len);

        var state: enum {
            start,
            escape,
        } = .start;

        var index: usize = 0;
        while (index < raw_no_quotes.len) : (index += 1) {
            const c = raw_no_quotes[index];
            switch (state) {
                .start => switch (c) {
                    '\\' => {
                        state = .escape;
                    },
                    else => {
                        node.string_value.appendAssumeCapacity(c);
                    },
                },
                .escape => switch (c) {
                    'n' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity('\n');
                    },
                    't' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity('\t');
                    },
                    '"' => {
                        state = .start;
                        node.string_value.appendAssumeCapacity('"');
                    },
                    else => return error.InvalidEscapeSequence,
                },
            }
        }
    }
};
706
// Reference every declaration (and the test sub-file) so nested tests run.
test {
    std.testing.refAllDecls(@This());
    _ = @import("parse/test.zig");
}