master
  1const Tokenizer = @This();
  2
  3const std = @import("std");
  4const log = std.log.scoped(.yaml);
  5const testing = std.testing;
  6
/// Full input being tokenized; token start/end offsets index into this slice.
buffer: []const u8,
/// Current scan position (byte offset into `buffer`); advanced by `next`.
index: usize = 0,
  9
/// A single lexical token: its kind plus the `[start, end)` byte range it
/// occupies in the tokenizer's input buffer.
pub const Token = struct {
    id: Id,
    /// Byte offset of the first byte of the token (inclusive).
    start: usize,
    /// Byte offset one past the last byte of the token (exclusive).
    end: usize,

    pub const Id = enum {
        // zig fmt: off
        eof,

        new_line,
        doc_start,      // ---
        doc_end,        // ...
        seq_item_ind,   // -
        map_value_ind,  // :
        flow_map_start, // {
        flow_map_end,   // }
        flow_seq_start, // [
        flow_seq_end,   // ]

        comma,
        space,
        tab,
        comment,        // #
        alias,          // *
        anchor,         // &
        tag,            // !

        single_quoted,   // '...'
        double_quoted,   // "..."
        literal,
        // zig fmt: on
    };
};
 43
 44pub const TokenIndex = usize;
 45
 46pub const TokenIterator = struct {
 47    buffer: []const Token,
 48    pos: TokenIndex = 0,
 49
 50    pub fn next(self: *TokenIterator) ?Token {
 51        const token = self.peek() orelse return null;
 52        self.pos += 1;
 53        return token;
 54    }
 55
 56    pub fn peek(self: TokenIterator) ?Token {
 57        if (self.pos >= self.buffer.len) return null;
 58        return self.buffer[self.pos];
 59    }
 60
 61    pub fn reset(self: *TokenIterator) void {
 62        self.pos = 0;
 63    }
 64
 65    pub fn seekTo(self: *TokenIterator, pos: TokenIndex) void {
 66        self.pos = pos;
 67    }
 68
 69    pub fn seekBy(self: *TokenIterator, offset: isize) void {
 70        const new_pos = @as(isize, @bitCast(self.pos)) + offset;
 71        if (new_pos < 0) {
 72            self.pos = 0;
 73        } else {
 74            self.pos = @as(usize, @intCast(new_pos));
 75        }
 76    }
 77};
 78
 79fn stringMatchesPattern(comptime pattern: []const u8, slice: []const u8) bool {
 80    comptime var count: usize = 0;
 81    inline while (count < pattern.len) : (count += 1) {
 82        if (count >= slice.len) return false;
 83        const c = slice[count];
 84        if (pattern[count] != c) return false;
 85    }
 86    return true;
 87}
 88
 89fn matchesPattern(self: Tokenizer, comptime pattern: []const u8) bool {
 90    return stringMatchesPattern(pattern, self.buffer[self.index..]);
 91}
 92
/// Scans forward from the current position and returns the next token,
/// advancing `self.index` past it. Once the buffer is exhausted an `.eof`
/// token is returned.
pub fn next(self: *Tokenizer) Token {
    var result = Token{
        .id = .eof,
        .start = self.index,
        .end = undefined, // patched below once the token's extent is known
    };

    // Scanner state: `.start` dispatches on the first significant character;
    // every other state consumes the tail of a multi-character token.
    var state: enum {
        start,
        new_line,
        space,
        tab,
        comment,
        single_quoted,
        double_quoted,
        literal,
    } = .start;

    while (self.index < self.buffer.len) : (self.index += 1) {
        const c = self.buffer[self.index];
        switch (state) {
            .start => switch (c) {
                ' ' => {
                    state = .space;
                },
                '\t' => {
                    state = .tab;
                },
                '\n' => {
                    result.id = .new_line;
                    self.index += 1;
                    break;
                },
                '\r' => {
                    // Carriage return: expect a following '\n' (CRLF pair),
                    // handled by the `.new_line` state below.
                    state = .new_line;
                },

                // '-' is ambiguous: document start marker ("---"), sequence
                // item indicator ("- "), or the first byte of a literal.
                '-' => if (self.matchesPattern("---")) {
                    result.id = .doc_start;
                    self.index += "---".len;
                    break;
                } else if (self.matchesPattern("- ")) {
                    result.id = .seq_item_ind;
                    self.index += "- ".len;
                    break;
                } else {
                    state = .literal;
                },

                // '.' starts either the document end marker ("...") or a literal.
                '.' => if (self.matchesPattern("...")) {
                    result.id = .doc_end;
                    self.index += "...".len;
                    break;
                } else {
                    state = .literal;
                },

                ',' => {
                    result.id = .comma;
                    self.index += 1;
                    break;
                },
                '#' => {
                    state = .comment;
                },
                '*' => {
                    result.id = .alias;
                    self.index += 1;
                    break;
                },
                '&' => {
                    result.id = .anchor;
                    self.index += 1;
                    break;
                },
                '!' => {
                    result.id = .tag;
                    self.index += 1;
                    break;
                },
                '[' => {
                    result.id = .flow_seq_start;
                    self.index += 1;
                    break;
                },
                ']' => {
                    result.id = .flow_seq_end;
                    self.index += 1;
                    break;
                },
                ':' => {
                    result.id = .map_value_ind;
                    self.index += 1;
                    break;
                },
                '{' => {
                    result.id = .flow_map_start;
                    self.index += 1;
                    break;
                },
                '}' => {
                    result.id = .flow_map_end;
                    self.index += 1;
                    break;
                },
                '\'' => {
                    state = .single_quoted;
                },
                '"' => {
                    state = .double_quoted;
                },
                else => {
                    state = .literal;
                },
            },

            // A comment runs to the end of the line; the terminating newline
            // is not consumed here and becomes the next token.
            .comment => switch (c) {
                '\r', '\n' => {
                    result.id = .comment;
                    break;
                },
                else => {},
            },

            // Runs of spaces collapse into a single `.space` token; the
            // first non-space byte is left for the next call.
            .space => switch (c) {
                ' ' => {},
                else => {
                    result.id = .space;
                    break;
                },
            },

            // Likewise for runs of tabs.
            .tab => switch (c) {
                '\t' => {},
                else => {
                    result.id = .tab;
                    break;
                },
            },

            // We saw '\r'; a following '\n' completes a CRLF newline.
            .new_line => switch (c) {
                '\n' => {
                    result.id = .new_line;
                    self.index += 1;
                    break;
                },
                else => {}, // TODO this should be an error condition
            },

            // Inside '...'; a doubled quote ('') is the escape for a single
            // quote and does not terminate the string.
            .single_quoted => switch (c) {
                '\'' => if (!self.matchesPattern("''")) {
                    result.id = .single_quoted;
                    self.index += 1;
                    break;
                } else {
                    // Skip the second quote of the pair; the loop increment
                    // advances past the first.
                    self.index += "''".len - 1;
                },
                else => {},
            },

            // Inside "..."; a quote directly preceded by a backslash is
            // treated as escaped and skipped.
            // NOTE(review): consecutive backslashes are not counted, so a
            // string whose content ends in an escaped backslash (e.g. "a\\")
            // appears to be mis-scanned; also the skip advances two bytes
            // past the escaped quote — confirm against upstream/YAML spec.
            .double_quoted => switch (c) {
                '"' => {
                    if (stringMatchesPattern("\\", self.buffer[self.index - 1 ..])) {
                        self.index += 1;
                    } else {
                        result.id = .double_quoted;
                        self.index += 1;
                        break;
                    }
                },
                else => {},
            },

            // A plain scalar ends at whitespace, a quote, or any flow /
            // mapping indicator character; that byte is left for next call.
            .literal => switch (c) {
                '\r', '\n', ' ', '\'', '"', ',', ':', ']', '}' => {
                    result.id = .literal;
                    break;
                },
                else => {
                    result.id = .literal;
                },
            },
        }
    }

    // Ran off the end of the buffer: only an in-progress literal is emitted
    // as a token here.
    // NOTE(review): trailing space/tab/comment/quoted states fall through
    // and are reported as `.eof` — verify this is intended.
    if (self.index >= self.buffer.len) {
        switch (state) {
            .literal => {
                result.id = .literal;
            },
            else => {},
        }
    }

    result.end = self.index;

    log.debug("{any}", .{result});
    log.debug("    | {s}", .{self.buffer[result.start..result.end]});

    return result;
}
294
/// Test helper: tokenizes `source` to completion and checks that the
/// produced token ids (including the final `.eof`) equal `expected`.
fn testExpected(source: []const u8, expected: []const Token.Id) !void {
    var tokenizer: Tokenizer = .{
        .buffer = source,
    };

    var actual = std.array_list.Managed(Token.Id).init(testing.allocator);
    defer actual.deinit();

    // Drain the tokenizer; `.eof` is appended before the loop terminates.
    while (true) {
        const token = tokenizer.next();
        try actual.append(token.id);
        if (token.id == .eof) break;
    }

    try testing.expectEqualSlices(Token.Id, expected, actual.items);
}
311
// Reference all container decls so their nested tests run too.
test {
    std.testing.refAllDecls(@This());
}

// Empty input yields only `.eof`.
test "empty doc" {
    try testExpected("", &[_]Token.Id{.eof});
}

// "---" / "..." markers tokenize as doc_start / doc_end.
test "empty doc with explicit markers" {
    try testExpected(
        \\---
        \\...
    , &[_]Token.Id{
        .doc_start, .new_line, .doc_end, .eof,
    });
}

// A tag directive after the doc start marker: '!' and the tag name
// are separate tokens.
test "empty doc with explicit markers and a directive" {
    try testExpected(
        \\--- !tbd-v1
        \\...
    , &[_]Token.Id{
        .doc_start,
        .space,
        .tag,
        .literal,
        .new_line,
        .doc_end,
        .eof,
    });
}
343
// "- " at line start tokenizes as seq_item_ind, not a literal.
test "sequence of values" {
    try testExpected(
        \\- 0
        \\- 1
        \\- 2
    , &[_]Token.Id{
        .seq_item_ind,
        .literal,
        .new_line,
        .seq_item_ind,
        .literal,
        .new_line,
        .seq_item_ind,
        .literal,
        .eof,
    });
}

// Flow sequences: brackets, commas, and interior spacing are all
// individually tokenized.
test "sequence of sequences" {
    try testExpected(
        \\- [ val1, val2]
        \\- [val3, val4 ]
    , &[_]Token.Id{
        .seq_item_ind,
        .flow_seq_start,
        .space,
        .literal,
        .comma,
        .space,
        .literal,
        .flow_seq_end,
        .new_line,
        .seq_item_ind,
        .flow_seq_start,
        .literal,
        .comma,
        .space,
        .literal,
        .space,
        .flow_seq_end,
        .eof,
    });
}

// Block mappings: key literal, ':' indicator, space, value literal.
test "mappings" {
    try testExpected(
        \\key1: value1
        \\key2: value2
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .eof,
    });
}

// Flow sequence spanning two lines, with a space before ':' and a
// trailing space after the comma on the first line.
test "inline mapped sequence of values" {
    try testExpected(
        \\key :  [ val1, 
        \\          val2 ]
    , &[_]Token.Id{
        .literal,
        .space,
        .map_value_ind,
        .space,
        .flow_seq_start,
        .space,
        .literal,
        .comma,
        .space,
        .new_line,
        .space,
        .literal,
        .space,
        .flow_seq_end,
        .eof,
    });
}
428
// Excerpt of a real Apple .tbd (text-based dylib stub) document exercising
// doc markers, tags, nested mappings, flow sequences, blank lines, and a
// single-quoted value.
test "part of tbd" {
    try testExpected(
        \\--- !tapi-tbd
        \\tbd-version:     4
        \\targets:         [ x86_64-macos ]
        \\
        \\uuids:
        \\  - target:          x86_64-macos
        \\    value:           F86CC732-D5E4-30B5-AA7D-167DF5EC2708
        \\
        \\install-name:    '/usr/lib/libSystem.B.dylib'
        \\...
    , &[_]Token.Id{
        .doc_start,
        .space,
        .tag,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .flow_seq_start,
        .space,
        .literal,
        .space,
        .flow_seq_end,
        .new_line,
        .new_line,
        .literal,
        .map_value_ind,
        .new_line,
        .space,
        .seq_item_ind,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .space,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .single_quoted,
        .new_line,
        .doc_end,
        .eof,
    });
}

// A sequence item at the same indentation as its parent key.
test "Unindented list" {
    try testExpected(
        \\b:
        \\- foo: 1
        \\c: 1
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .new_line,
        .seq_item_ind,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .literal,
        .eof,
    });
}

// Quote escapes: '' inside single quotes, \" inside double quotes; each
// quoted string is one token regardless of escapes.
test "escape sequences" {
    try testExpected(
        \\a: 'here''s an apostrophe'
        \\b: "a newline\nand a\ttab"
        \\c: "\"here\" and there"
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .single_quoted,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .double_quoted,
        .new_line,
        .literal,
        .map_value_ind,
        .space,
        .double_quoted,
        .eof,
    });
}

// '#' starts a comment token that runs to end of line, whether it trails
// content or begins the line.
test "comments" {
    try testExpected(
        \\key: # some comment about the key
        \\# first value
        \\- val1
        \\# second value
        \\- val2
    , &[_]Token.Id{
        .literal,
        .map_value_ind,
        .space,
        .comment,
        .new_line,
        .seq_item_ind,
        .literal,
        .new_line,
        .comment,
        .new_line,
        .seq_item_ind,
        .literal,
        .eof,
    });
}

// Indicator characters ('#', '[', '&') lose their special meaning inside
// quoted strings.
test "quoted literals" {
    try testExpected(
        \\'#000000'
        \\'[000000'
        \\"&someString"
    , &[_]Token.Id{
        .single_quoted,
        .new_line,
        .single_quoted,
        .new_line,
        .double_quoted,
        .eof,
    });
}
575}