const Tokenizer = @This();

const std = @import("std");
const log = std.log.scoped(.yaml);
const testing = std.testing;

buffer: []const u8,
index: usize = 0,

pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,

    pub const Id = enum {
        // zig fmt: off
        eof,

        new_line,
        doc_start,      // ---
        doc_end,        // ...
        seq_item_ind,   // -
        map_value_ind,  // :
        flow_map_start, // {
        flow_map_end,   // }
        flow_seq_start, // [
        flow_seq_end,   // ]

        comma,
        space,
        tab,
        comment,        // #
        alias,          // *
        anchor,         // &
        tag,            // !

        single_quoted,  // '...'
        double_quoted,  // "..."
        literal,
        // zig fmt: on
    };
};

pub const TokenIndex = usize;

pub const TokenIterator = struct {
    buffer: []const Token,
    pos: TokenIndex = 0,

    pub fn next(self: *TokenIterator) ?Token {
        const token = self.peek() orelse return null;
        self.pos += 1;
        return token;
    }

    pub fn peek(self: TokenIterator) ?Token {
        if (self.pos >= self.buffer.len) return null;
        return self.buffer[self.pos];
    }

    pub fn reset(self: *TokenIterator) void {
        self.pos = 0;
    }

    pub fn seekTo(self: *TokenIterator, pos: TokenIndex) void {
        self.pos = pos;
    }

    pub fn seekBy(self: *TokenIterator, offset: isize) void {
        const new_pos = @as(isize, @bitCast(self.pos)) + offset;
        if (new_pos < 0) {
            self.pos = 0;
        } else {
            self.pos = @as(usize, @intCast(new_pos));
        }
    }
};
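
// A small sanity check for TokenIterator: `next` and `peek` walk the token
// buffer, `seekBy` clamps at the start instead of underflowing, and `reset`
// rewinds to the first token.
test "TokenIterator traversal" {
    const tokens = [_]Token{
        .{ .id = .literal, .start = 0, .end = 3 },
        .{ .id = .new_line, .start = 3, .end = 4 },
        .{ .id = .eof, .start = 4, .end = 4 },
    };
    var it = TokenIterator{ .buffer = &tokens };

    try testing.expectEqual(Token.Id.literal, it.next().?.id);
    try testing.expectEqual(Token.Id.new_line, it.peek().?.id);

    // Seeking far past the front clamps the position to 0.
    it.seekBy(-10);
    try testing.expectEqual(Token.Id.literal, it.next().?.id);

    it.seekTo(2);
    try testing.expectEqual(Token.Id.eof, it.next().?.id);
    try testing.expect(it.next() == null);

    it.reset();
    try testing.expectEqual(@as(TokenIndex, 0), it.pos);
}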

/// Returns true if `slice` begins with `pattern`. The pattern is
/// comptime-known, so the loop unrolls into direct byte comparisons.
fn stringMatchesPattern(comptime pattern: []const u8, slice: []const u8) bool {
    comptime var count: usize = 0;
    inline while (count < pattern.len) : (count += 1) {
        if (count >= slice.len) return false;
        const c = slice[count];
        if (pattern[count] != c) return false;
    }
    return true;
}

/// Returns true if the unconsumed input starts with `pattern`.
fn matchesPattern(self: Tokenizer, comptime pattern: []const u8) bool {
    return stringMatchesPattern(pattern, self.buffer[self.index..]);
}

/// Scans and returns the next token, advancing `index` past it. Once the
/// buffer is exhausted, every call returns an `.eof` token.
pub fn next(self: *Tokenizer) Token {
    var result = Token{
        .id = .eof,
        .start = self.index,
        .end = undefined,
    };

    var state: enum {
        start,
        new_line,
        space,
        tab,
        comment,
        single_quoted,
        double_quoted,
        literal,
    } = .start;

    while (self.index < self.buffer.len) : (self.index += 1) {
        const c = self.buffer[self.index];
        switch (state) {
            .start => switch (c) {
                ' ' => {
                    state = .space;
                },
                '\t' => {
                    state = .tab;
                },
                '\n' => {
                    result.id = .new_line;
                    self.index += 1;
                    break;
                },
                '\r' => {
                    state = .new_line;
                },

                '-' => if (self.matchesPattern("---")) {
                    result.id = .doc_start;
                    self.index += "---".len;
                    break;
                } else if (self.matchesPattern("- ")) {
                    result.id = .seq_item_ind;
                    self.index += "- ".len;
                    break;
                } else {
                    state = .literal;
                },

                '.' => if (self.matchesPattern("...")) {
                    result.id = .doc_end;
                    self.index += "...".len;
                    break;
                } else {
                    state = .literal;
                },

                ',' => {
                    result.id = .comma;
                    self.index += 1;
                    break;
                },
                '#' => {
                    state = .comment;
                },
                '*' => {
                    result.id = .alias;
                    self.index += 1;
                    break;
                },
                '&' => {
                    result.id = .anchor;
                    self.index += 1;
                    break;
                },
                '!' => {
                    result.id = .tag;
                    self.index += 1;
                    break;
                },
                '[' => {
                    result.id = .flow_seq_start;
                    self.index += 1;
                    break;
                },
                ']' => {
                    result.id = .flow_seq_end;
                    self.index += 1;
                    break;
                },
                ':' => {
                    result.id = .map_value_ind;
                    self.index += 1;
                    break;
                },
                '{' => {
                    result.id = .flow_map_start;
                    self.index += 1;
                    break;
                },
                '}' => {
                    result.id = .flow_map_end;
                    self.index += 1;
                    break;
                },
                '\'' => {
                    state = .single_quoted;
                },
                '"' => {
                    state = .double_quoted;
                },
                else => {
                    state = .literal;
                },
            },

            .comment => switch (c) {
                '\r', '\n' => {
                    result.id = .comment;
                    break;
                },
                else => {},
            },

            .space => switch (c) {
                ' ' => {},
                else => {
                    result.id = .space;
                    break;
                },
            },

            .tab => switch (c) {
                '\t' => {},
                else => {
                    result.id = .tab;
                    break;
                },
            },

            .new_line => switch (c) {
                '\n' => {
                    result.id = .new_line;
                    self.index += 1;
                    break;
                },
                else => {}, // TODO this should be an error condition
            },

            .single_quoted => switch (c) {
                // A doubled apostrophe ('') escapes a single quote, so only
                // an unpaired apostrophe terminates the token.
                '\'' => if (!self.matchesPattern("''")) {
                    result.id = .single_quoted;
                    self.index += 1;
                    break;
                } else {
                    self.index += "''".len - 1;
                },
                else => {},
            },

            .double_quoted => switch (c) {
                '"' => {
                    // A quote preceded by a backslash is escaped; skip past
                    // it instead of closing the token.
                    if (stringMatchesPattern("\\", self.buffer[self.index - 1 ..])) {
                        self.index += 1;
                    } else {
                        result.id = .double_quoted;
                        self.index += 1;
                        break;
                    }
                },
                else => {},
            },

            .literal => switch (c) {
                '\r', '\n', ' ', '\'', '"', ',', ':', ']', '}' => {
                    result.id = .literal;
                    break;
                },
                else => {
                    result.id = .literal;
                },
            },
        }
    }

    // A literal running to the end of the buffer never hits a terminating
    // character in the loop above, so it is finalized here.
    if (self.index >= self.buffer.len) {
        switch (state) {
            .literal => {
                result.id = .literal;
            },
            else => {},
        }
    }

    result.end = self.index;

    log.debug("{any}", .{result});
    log.debug(" | {s}", .{self.buffer[result.start..result.end]});

    return result;
}

fn testExpected(source: []const u8, expected: []const Token.Id) !void {
    var tokenizer = Tokenizer{
        .buffer = source,
    };

    var given = std.array_list.Managed(Token.Id).init(testing.allocator);
    defer given.deinit();

    while (true) {
        const token = tokenizer.next();
        try given.append(token.id);
        if (token.id == .eof) break;
    }

    try testing.expectEqualSlices(Token.Id, expected, given.items);
}

test {
    std.testing.refAllDecls(@This());
}
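
// Spot-check the comptime pattern matcher that `next` relies on for the
// "---", "...", "- ", and quote-escape lookaheads.
test "stringMatchesPattern" {
    try testing.expect(stringMatchesPattern("---", "--- !tapi-tbd"));
    try testing.expect(!stringMatchesPattern("---", "--"));
    try testing.expect(!stringMatchesPattern("- ", "-1"));
    try testing.expect(stringMatchesPattern("''", "''quoted"));
}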

test "empty doc" {
    try testExpected("", &[_]Token.Id{.eof});
}

test "empty doc with explicit markers" {
    try testExpected(
        \\---
        \\...
    , &[_]Token.Id{
        .doc_start, .new_line, .doc_end, .eof,
    });
}

test "empty doc with explicit markers and a directive" {
    try testExpected(
        \\--- !tbd-v1
        \\...
    , &[_]Token.Id{
        .doc_start,
        .space,
        .tag,
        .literal,
        .new_line,
        .doc_end,
        .eof,
    });
}

test "sequence of values" {
    try testExpected(
        \\- 0
        \\- 1
        \\- 2
    , &[_]Token.Id{
        .seq_item_ind,
        .literal,
        .new_line,
        .seq_item_ind,
        .literal,
        .new_line,
        .seq_item_ind,
        .literal,
        .eof,
    });
}

test "sequence of sequences" {
    try testExpected(
        \\- [ val1, val2]
        \\- [val3, val4 ]
    , &[_]Token.Id{
        .seq_item_ind,
        .flow_seq_start,
        .space,
        .literal,
        .comma,
        .space,
        .literal,
        .flow_seq_end,
        .new_line,
        .seq_item_ind,
        .flow_seq_start,
        .literal,
        .comma,
        .space,
        .literal,
        .space,
        .flow_seq_end,
        .eof,
    });
}

test "mappings" {
    try testExpected(
        \\key1: value1
        \\key2: value2
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .eof,
    });
}

test "inline mapped sequence of values" {
    try testExpected(
        \\key : [ val1, 
        \\ val2 ]
    , &[_]Token.Id{
        .literal,
        .space,
        .map_value_ind,
        .space,
        .flow_seq_start,
        .space,
        .literal,
        .comma,
        .space,
        .new_line,
        .space,
        .literal,
        .space,
        .flow_seq_end,
        .eof,
    });
}

test "part of tbd" {
    try testExpected(
        \\--- !tapi-tbd
        \\tbd-version: 4
        \\targets: [ x86_64-macos ]
        \\
        \\uuids:
        \\  - target: x86_64-macos
        \\    value: F86CC732-D5E4-30B5-AA7D-167DF5EC2708
        \\
        \\install-name: '/usr/lib/libSystem.B.dylib'
        \\...
    , &[_]Token.Id{
        .doc_start,
        .space,
        .tag,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .flow_seq_start,
        .space,
        .literal,
        .space,
        .flow_seq_end,
        .new_line,
        .new_line,
        .literal,
        .map_value_ind,
        .new_line,
        .space,
        .seq_item_ind,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .space,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .single_quoted,
        .new_line,
        .doc_end,
        .eof,
    });
}

test "Unindented list" {
    try testExpected(
        \\b:
        \\- foo: 1
        \\c: 1
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .new_line,
        .seq_item_ind,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .eof,
    });
}

test "escape sequences" {
    try testExpected(
        \\a: 'here''s an apostrophe'
        \\b: "a newline\nand a\ttab"
        \\c: "\"here\" and there"
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .single_quoted,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .double_quoted,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .double_quoted,
        .eof,
    });
}

test "comments" {
    try testExpected(
        \\key: # some comment about the key
        \\# first value
        \\- val1
        \\# second value
        \\- val2
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .comment,
        .new_line,
        .comment,
        .new_line,
        .seq_item_ind,
        .literal,
        .new_line,
        .comment,
        .new_line,
        .seq_item_ind,
        .literal,
        .eof,
    });
}

test "quoted literals" {
    try testExpected(
        \\'#000000'
        \\'[000000'
        \\"&someString"
    , &[_]Token.Id{
        .single_quoted,
        .new_line,
        .single_quoted,
        .new_line,
        .double_quoted,
        .eof,
    });
}
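
// Exercise the '\r'-driven .new_line state and the .tab state, which the
// YAML-flavored tests above never reach: a "\r\n" pair should yield a single
// .new_line token, and a run of tabs a single .tab token.
test "carriage return line endings" {
    try testExpected("key: 1\r\nother: 2", &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .eof,
    });
}

test "tabs" {
    try testExpected("a:\tb", &[_]Token.Id{
        .literal,
        .map_value_ind,
        .tab,
        .literal,
        .eof,
    });
}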