//! A Markdown parser producing `Document`s.
//!
//! The parser operates at two levels: at the outer level, the parser accepts
//! the content of an input document line by line and begins building the _block
//! structure_ of the document. This creates a stack of currently open blocks.
//!
//! When the parser detects the end of a block, it closes the block, popping it
//! from the open block stack and completing any additional parsing of the
//! block's content. For blocks which contain parseable inline content, this
//! invokes the inner level of the parser, handling the _inline structure_ of
//! the block.
//!
//! Inline parsing scans through the collected inline content of a block. When
//! it encounters a character that could indicate the beginning of an inline, it
//! either handles the inline right away (if possible) or adds it to a pending
//! inlines stack. When an inline is completed, it is added to a list of
//! completed inlines, which (along with any surrounding text nodes) will become
//! the children of the parent inline or the block whose inline content is being
//! parsed.
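//!
//! A typical driver feeds the document line by line and then finalizes it.
//! The following is a minimal usage sketch (it assumes a `source` slice and an
//! `allocator` are in scope, and that the returned `Document` frees its memory
//! through a `deinit` taking the same allocator):
//!
//! ```
//! var parser = try Parser.init(allocator);
//! defer parser.deinit();
//! var lines = std.mem.splitScalar(u8, source, '\n');
//! while (lines.next()) |line| try parser.feedLine(line);
//! var doc = try parser.endInput();
//! defer doc.deinit(allocator);
//! ```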

const std = @import("std");
const mem = std.mem;
const assert = std.debug.assert;
const isWhitespace = std.ascii.isWhitespace;
const Allocator = mem.Allocator;
const expectEqual = std.testing.expectEqual;
const Document = @import("Document.zig");
const Node = Document.Node;
const ExtraIndex = Document.ExtraIndex;
const ExtraData = Document.ExtraData;
const StringIndex = Document.StringIndex;
const ArrayList = std.ArrayList;

nodes: Node.List = .{},
extra: ArrayList(u32) = .empty,
scratch_extra: ArrayList(u32) = .empty,
string_bytes: ArrayList(u8) = .empty,
scratch_string: ArrayList(u8) = .empty,
pending_blocks: ArrayList(Block) = .empty,
allocator: Allocator,

const Parser = @This();

/// An arbitrary limit on the maximum number of columns in a table so that
/// table-related metadata maintained by the parser does not require dynamic
/// memory allocation.
const max_table_columns = 128;

/// A block element which is still receiving children.
const Block = struct {
    tag: Tag,
    data: Data,
    extra_start: usize,
    string_start: usize,

    const Tag = enum {
        /// Data is `list`.
        list,
        /// Data is `list_item`.
        list_item,
        /// Data is `table`.
        table,
        /// Data is `none`.
        table_row,
        /// Data is `heading`.
        heading,
        /// Data is `code_block`.
        code_block,
        /// Data is `none`.
        blockquote,
        /// Data is `none`.
        paragraph,
        /// Data is `none`.
        thematic_break,
    };

    const Data = union {
        none: void,
        list: struct {
            marker: ListMarker,
            /// Between 0 and 999,999,999, inclusive.
            start: u30,
            tight: bool,
            last_line_blank: bool = false,
        },
        list_item: struct {
            continuation_indent: usize,
        },
        table: struct {
            column_alignments_buffer: [max_table_columns]Node.TableCellAlignment,
            column_alignments_len: usize,
        },
        heading: struct {
            /// Between 1 and 6, inclusive.
            level: u3,
        },
        code_block: struct {
            tag: StringIndex,
            fence_len: usize,
            indent: usize,
        },

        const ListMarker = enum {
            @"-",
            @"*",
            @"+",
            number_dot,
            number_paren,
        };
    };

    const ContentType = enum {
        blocks,
        inlines,
        raw_inlines,
        nothing,
    };

    fn canAccept(b: Block) ContentType {
        return switch (b.tag) {
            .list,
            .list_item,
            .table,
            .blockquote,
            => .blocks,

            .heading,
            .paragraph,
            => .inlines,

            .code_block,
            => .raw_inlines,

            .table_row,
            .thematic_break,
            => .nothing,
        };
    }

    /// Attempts to continue `b` using the contents of `line`. If successful,
    /// returns the remaining portion of `line` to be considered part of `b`
    /// (e.g. for a blockquote, this would be everything except the leading
    /// `>`). If unsuccessful, returns null.
    fn match(b: Block, line: []const u8) ?[]const u8 {
        const unindented = mem.trimStart(u8, line, " \t");
        const indent = line.len - unindented.len;
        return switch (b.tag) {
            .list => line,
            .list_item => if (indent >= b.data.list_item.continuation_indent)
                line[b.data.list_item.continuation_indent..]
            else if (unindented.len == 0)
                // Blank lines should not close list items, since there may be
                // more indented contents to follow after the blank line.
                ""
            else
                null,
            .table => if (unindented.len > 0) line else null,
            .table_row => null,
            .heading => null,
            .code_block => code_block: {
                const trimmed = mem.trimEnd(u8, unindented, " \t");
                if (mem.indexOfNone(u8, trimmed, "`") != null or trimmed.len != b.data.code_block.fence_len) {
                    const effective_indent = @min(indent, b.data.code_block.indent);
                    break :code_block line[effective_indent..];
                } else {
                    break :code_block null;
                }
            },
            .blockquote => if (mem.startsWith(u8, unindented, ">"))
                unindented[1..]
            else
                null,
            .paragraph => if (unindented.len > 0) line else null,
            .thematic_break => null,
        };
    }
};

pub fn init(allocator: Allocator) Allocator.Error!Parser {
    var p: Parser = .{ .allocator = allocator };
    try p.nodes.append(allocator, .{
        .tag = .root,
        .data = undefined,
    });
    try p.string_bytes.append(allocator, 0);
    return p;
}

pub fn deinit(p: *Parser) void {
    p.nodes.deinit(p.allocator);
    p.extra.deinit(p.allocator);
    p.scratch_extra.deinit(p.allocator);
    p.string_bytes.deinit(p.allocator);
    p.scratch_string.deinit(p.allocator);
    p.pending_blocks.deinit(p.allocator);
    p.* = undefined;
}

/// Accepts a single line of content. `line` should not have a trailing line
/// ending character.
pub fn feedLine(p: *Parser, line: []const u8) Allocator.Error!void {
    var rest_line = line;
    const first_unmatched = for (p.pending_blocks.items, 0..) |b, i| {
        if (b.match(rest_line)) |rest| {
            rest_line = rest;
        } else {
            break i;
        }
    } else p.pending_blocks.items.len;

    const in_code_block = p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .code_block;
    const code_block_end = in_code_block and
        first_unmatched + 1 == p.pending_blocks.items.len;
    // New blocks cannot be started if we are actively inside a code block or
    // are just closing one (to avoid interpreting the closing ``` as a new code
    // block start).
    var maybe_block_start = if (!in_code_block or first_unmatched + 2 <= p.pending_blocks.items.len)
        try p.startBlock(rest_line)
    else
        null;

    // This is a lazy continuation line if there are no new blocks to open and
    // the last open block is a paragraph.
    if (maybe_block_start == null and
        !isBlank(rest_line) and
        p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .paragraph)
    {
        try p.addScratchStringLine(mem.trimStart(u8, rest_line, " \t"));
        return;
    }

    // If a new block needs to be started, any paragraph needs to be closed,
    // even though this isn't detected as part of the closing condition for
    // paragraphs.
    if (maybe_block_start != null and
        p.pending_blocks.items.len > 0 and
        p.pending_blocks.getLast().tag == .paragraph)
    {
        try p.closeLastBlock();
    }

    while (p.pending_blocks.items.len > first_unmatched) {
        try p.closeLastBlock();
    }

    while (maybe_block_start) |block_start| : (maybe_block_start = try p.startBlock(rest_line)) {
        try p.appendBlockStart(block_start);
        // There may be more blocks to start within the same line.
        rest_line = block_start.rest;
        // Headings may only contain inline content.
        if (block_start.tag == .heading) break;
        // An opening code fence does not contain any additional block or inline
        // content to process.
        if (block_start.tag == .code_block) return;
    }

    // Do not append the end of a code block (```) as textual content.
    if (code_block_end) return;

    const can_accept = if (p.pending_blocks.getLastOrNull()) |last_pending_block|
        last_pending_block.canAccept()
    else
        .blocks;
    const rest_line_trimmed = mem.trimStart(u8, rest_line, " \t");
    switch (can_accept) {
        .blocks => {
            // If we're inside a list item and the rest of the line is blank, it
            // means that any subsequent child of the list item (or subsequent
            // item in the list) will cause the containing list to be considered
            // loose. However, we can't immediately declare that the list is
            // loose, since we might just be looking at a blank line after the
            // end of the last item in the list. The final determination will be
            // made when appending the next child of the list or list item.
            const maybe_containing_list_index = if (p.pending_blocks.items.len > 0 and p.pending_blocks.getLast().tag == .list_item)
                p.pending_blocks.items.len - 2
            else
                null;

            if (rest_line_trimmed.len > 0) {
                try p.appendBlockStart(.{
                    .tag = .paragraph,
                    .data = .{ .none = {} },
                    .rest = undefined,
                });
                try p.addScratchStringLine(rest_line_trimmed);
            }

            if (maybe_containing_list_index) |containing_list_index| {
                p.pending_blocks.items[containing_list_index].data.list.last_line_blank = rest_line_trimmed.len == 0;
            }
        },
        .inlines => try p.addScratchStringLine(rest_line_trimmed),
        .raw_inlines => try p.addScratchStringLine(rest_line),
        .nothing => {},
    }
}

/// Completes processing of the input and returns the parsed document.
pub fn endInput(p: *Parser) Allocator.Error!Document {
    while (p.pending_blocks.items.len > 0) {
        try p.closeLastBlock();
    }
    // There should be no inline content pending after closing the last open
    // block.
    assert(p.scratch_string.items.len == 0);

    const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items));
    p.nodes.items(.data)[0] = .{ .container = .{ .children = children } };
    p.scratch_string.items.len = 0;
    p.scratch_extra.items.len = 0;

    var nodes = p.nodes.toOwnedSlice();
    errdefer nodes.deinit(p.allocator);
    const extra = try p.extra.toOwnedSlice(p.allocator);
    errdefer p.allocator.free(extra);
    const string_bytes = try p.string_bytes.toOwnedSlice(p.allocator);
    errdefer p.allocator.free(string_bytes);

    return .{
        .nodes = nodes,
        .extra = extra,
        .string_bytes = string_bytes,
    };
}

/// Data describing the start of a new block element.
const BlockStart = struct {
    tag: Tag,
    data: Data,
    rest: []const u8,

    const Tag = enum {
        /// Data is `list_item`.
        list_item,
        /// Data is `table_row`.
        table_row,
        /// Data is `heading`.
        heading,
        /// Data is `code_block`.
        code_block,
        /// Data is `none`.
        blockquote,
        /// Data is `none`.
        paragraph,
        /// Data is `none`.
        thematic_break,
    };

    const Data = union {
        none: void,
        list_item: struct {
            marker: Block.Data.ListMarker,
            number: u30,
            continuation_indent: usize,
        },
        table_row: struct {
            cells_buffer: [max_table_columns][]const u8,
            cells_len: usize,
        },
        heading: struct {
            /// Between 1 and 6, inclusive.
            level: u3,
        },
        code_block: struct {
            tag: StringIndex,
            fence_len: usize,
            indent: usize,
        },
    };
};

fn appendBlockStart(p: *Parser, block_start: BlockStart) !void {
    if (p.pending_blocks.getLastOrNull()) |last_pending_block| {
        // Close the last block if it is a list and the new block is not a list item
        // or not of the same marker type.
        const should_close_list = last_pending_block.tag == .list and
            (block_start.tag != .list_item or
                block_start.data.list_item.marker != last_pending_block.data.list.marker);
        // The last block should also be closed if the new block is not a table
        // row, which is the only allowed child of a table.
        const should_close_table = last_pending_block.tag == .table and
            block_start.tag != .table_row;
        if (should_close_list or should_close_table) {
            try p.closeLastBlock();
        }
    }

    if (p.pending_blocks.getLastOrNull()) |last_pending_block| {
        // If the last block is a list or list item, check for tightness based
        // on the last line.
        const maybe_containing_list = switch (last_pending_block.tag) {
            .list => &p.pending_blocks.items[p.pending_blocks.items.len - 1],
            .list_item => &p.pending_blocks.items[p.pending_blocks.items.len - 2],
            else => null,
        };
        if (maybe_containing_list) |containing_list| {
            if (containing_list.data.list.last_line_blank) {
                containing_list.data.list.tight = false;
            }
        }
    }

    // Start a new list if the new block is a list item and there is no
    // containing list yet.
    if (block_start.tag == .list_item and
        (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .list))
    {
        try p.pending_blocks.append(p.allocator, .{
            .tag = .list,
            .data = .{ .list = .{
                .marker = block_start.data.list_item.marker,
                .start = block_start.data.list_item.number,
                .tight = true,
            } },
            .string_start = p.scratch_string.items.len,
            .extra_start = p.scratch_extra.items.len,
        });
    }

    if (block_start.tag == .table_row) {
        // Likewise, table rows start a table implicitly.
        if (p.pending_blocks.items.len == 0 or p.pending_blocks.getLast().tag != .table) {
            try p.pending_blocks.append(p.allocator, .{
                .tag = .table,
                .data = .{ .table = .{
                    .column_alignments_buffer = undefined,
                    .column_alignments_len = 0,
                } },
                .string_start = p.scratch_string.items.len,
                .extra_start = p.scratch_extra.items.len,
            });
        }

        const current_row = p.scratch_extra.items.len - p.pending_blocks.getLast().extra_start;
        if (current_row <= 1) {
            var buffer: [max_table_columns]Node.TableCellAlignment = undefined;
            const table_row = &block_start.data.table_row;
            if (parseTableHeaderDelimiter(table_row.cells_buffer[0..table_row.cells_len], &buffer)) |alignments| {
                const table = &p.pending_blocks.items[p.pending_blocks.items.len - 1].data.table;
                @memcpy(table.column_alignments_buffer[0..alignments.len], alignments);
                table.column_alignments_len = alignments.len;
                if (current_row == 1) {
                    // We need to go back and mark the header row and its column
                    // alignments.
                    const datas = p.nodes.items(.data);
                    const header_data = datas[p.scratch_extra.getLast()];
                    for (p.extraChildren(header_data.container.children), 0..) |header_cell, i| {
                        const alignment = if (i < alignments.len) alignments[i] else .unset;
                        const cell_data = &datas[@intFromEnum(header_cell)].table_cell;
                        cell_data.info.alignment = alignment;
                        cell_data.info.header = true;
                    }
                }
                return;
            }
        }
    }

    const tag: Block.Tag, const data: Block.Data = switch (block_start.tag) {
        .list_item => .{ .list_item, .{ .list_item = .{
            .continuation_indent = block_start.data.list_item.continuation_indent,
        } } },
        .table_row => .{ .table_row, .{ .none = {} } },
        .heading => .{ .heading, .{ .heading = .{
            .level = block_start.data.heading.level,
        } } },
        .code_block => .{ .code_block, .{ .code_block = .{
            .tag = block_start.data.code_block.tag,
            .fence_len = block_start.data.code_block.fence_len,
            .indent = block_start.data.code_block.indent,
        } } },
        .blockquote => .{ .blockquote, .{ .none = {} } },
        .paragraph => .{ .paragraph, .{ .none = {} } },
        .thematic_break => .{ .thematic_break, .{ .none = {} } },
    };

    try p.pending_blocks.append(p.allocator, .{
        .tag = tag,
        .data = data,
        .string_start = p.scratch_string.items.len,
        .extra_start = p.scratch_extra.items.len,
    });

    if (tag == .table_row) {
        // Table rows are unique, since we already have all the children
        // available in the BlockStart. We can immediately parse and append
        // these children now.
        const containing_table = p.pending_blocks.items[p.pending_blocks.items.len - 2];
        const table = &containing_table.data.table;
        const column_alignments = table.column_alignments_buffer[0..table.column_alignments_len];
        const table_row = &block_start.data.table_row;
        for (table_row.cells_buffer[0..table_row.cells_len], 0..) |cell_content, i| {
            const cell_children = try p.parseInlines(cell_content);
            const alignment = if (i < column_alignments.len) column_alignments[i] else .unset;
            const cell = try p.addNode(.{
                .tag = .table_cell,
                .data = .{ .table_cell = .{
                    .info = .{
                        .alignment = alignment,
                        .header = false,
                    },
                    .children = cell_children,
                } },
            });
            try p.addScratchExtraNode(cell);
        }
    }
}

fn startBlock(p: *Parser, line: []const u8) !?BlockStart {
    const unindented = mem.trimStart(u8, line, " \t");
    const indent = line.len - unindented.len;
    if (isThematicBreak(line)) {
        // Thematic breaks take precedence over list items.
        return .{
            .tag = .thematic_break,
            .data = .{ .none = {} },
            .rest = "",
        };
    } else if (startListItem(unindented)) |list_item| {
        return .{
            .tag = .list_item,
            .data = .{ .list_item = .{
                .marker = list_item.marker,
                .number = list_item.number,
                .continuation_indent = indent + list_item.marker_len,
            } },
            .rest = list_item.rest,
        };
    } else if (startTableRow(unindented)) |table_row| {
        return .{
            .tag = .table_row,
            .data = .{ .table_row = .{
                .cells_buffer = table_row.cells_buffer,
                .cells_len = table_row.cells_len,
            } },
            .rest = "",
        };
    } else if (startHeading(unindented)) |heading| {
        return .{
            .tag = .heading,
            .data = .{ .heading = .{
                .level = heading.level,
            } },
            .rest = heading.rest,
        };
    } else if (try p.startCodeBlock(unindented)) |code_block| {
        return .{
            .tag = .code_block,
            .data = .{ .code_block = .{
                .tag = code_block.tag,
                .fence_len = code_block.fence_len,
                .indent = indent,
            } },
            .rest = "",
        };
    } else if (startBlockquote(unindented)) |rest| {
        return .{
            .tag = .blockquote,
            .data = .{ .none = {} },
            .rest = rest,
        };
    } else {
        return null;
    }
}

const ListItemStart = struct {
    marker: Block.Data.ListMarker,
    number: u30,
    marker_len: usize,
    rest: []const u8,
};

fn startListItem(unindented_line: []const u8) ?ListItemStart {
    if (mem.startsWith(u8, unindented_line, "- ")) {
        return .{
            .marker = .@"-",
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    } else if (mem.startsWith(u8, unindented_line, "* ")) {
        return .{
            .marker = .@"*",
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    } else if (mem.startsWith(u8, unindented_line, "+ ")) {
        return .{
            .marker = .@"+",
            .number = undefined,
            .marker_len = 2,
            .rest = unindented_line[2..],
        };
    }

    const number_end = mem.indexOfNone(u8, unindented_line, "0123456789") orelse return null;
    const after_number = unindented_line[number_end..];
    const marker: Block.Data.ListMarker = if (mem.startsWith(u8, after_number, ". "))
        .number_dot
    else if (mem.startsWith(u8, after_number, ") "))
        .number_paren
    else
        return null;
    const number = std.fmt.parseInt(u30, unindented_line[0..number_end], 10) catch return null;
    if (number > 999_999_999) return null;
    return .{
        .marker = marker,
        .number = number,
        .marker_len = number_end + 2,
        .rest = after_number[2..],
    };
}
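
// Illustrative cases for the list item marker rules above (a minimal sketch,
// not exhaustive).
test startListItem {
    const bullet = startListItem("- item text").?;
    try expectEqual(.@"-", bullet.marker);
    try expectEqual(2, bullet.marker_len);
    try std.testing.expectEqualStrings("item text", bullet.rest);

    const ordered = startListItem("12) item text").?;
    try expectEqual(.number_paren, ordered.marker);
    try expectEqual(12, ordered.number);
    try expectEqual(4, ordered.marker_len);

    // A space must follow the marker, and ordered markers are limited to
    // 999,999,999.
    try std.testing.expect(startListItem("-item text") == null);
    try std.testing.expect(startListItem("1000000000. item text") == null);
}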

const TableRowStart = struct {
    cells_buffer: [max_table_columns][]const u8,
    cells_len: usize,
};

fn startTableRow(unindented_line: []const u8) ?TableRowStart {
    if (unindented_line.len < 2 or
        !mem.startsWith(u8, unindented_line, "|") or
        mem.endsWith(u8, unindented_line, "\\|") or
        !mem.endsWith(u8, unindented_line, "|")) return null;

    var cells_buffer: [max_table_columns][]const u8 = undefined;
    var cells: ArrayList([]const u8) = .initBuffer(&cells_buffer);
    const table_row_content = unindented_line[1 .. unindented_line.len - 1];
    var cell_start: usize = 0;
    var i: usize = 0;
    while (i < table_row_content.len) : (i += 1) {
        switch (table_row_content[i]) {
            '\\' => i += 1,
            '|' => {
                cells.appendBounded(table_row_content[cell_start..i]) catch return null;
                cell_start = i + 1;
            },
            '`' => {
                // Ignoring pipes in code spans allows table cells to contain
                // code using ||, for example.
                const open_start = i;
                i = mem.indexOfNonePos(u8, table_row_content, i, "`") orelse return null;
                const open_len = i - open_start;
                while (mem.indexOfScalarPos(u8, table_row_content, i, '`')) |close_start| {
                    i = mem.indexOfNonePos(u8, table_row_content, close_start, "`") orelse return null;
                    const close_len = i - close_start;
                    if (close_len == open_len) break;
                } else return null;
            },
            else => {},
        }
    }
    cells.appendBounded(table_row_content[cell_start..]) catch return null;

    return .{ .cells_buffer = cells_buffer, .cells_len = cells.items.len };
}
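
// An added sketch of the row-splitting behavior above: cells keep their
// surrounding spaces, and pipes inside backtick code spans do not split cells.
test startTableRow {
    const row = startTableRow("| a | `b||c` |").?;
    try expectEqual(2, row.cells_len);
    try std.testing.expectEqualStrings(" a ", row.cells_buffer[0]);
    try std.testing.expectEqualStrings(" `b||c` ", row.cells_buffer[1]);

    // A row must start and end with an unescaped pipe.
    try std.testing.expect(startTableRow("| not closed") == null);
    try std.testing.expect(startTableRow("| escaped \\|") == null);
}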

fn parseTableHeaderDelimiter(
    row_cells: []const []const u8,
    buffer: []Node.TableCellAlignment,
) ?[]Node.TableCellAlignment {
    var alignments: ArrayList(Node.TableCellAlignment) = .initBuffer(buffer);
    for (row_cells) |content| {
        const alignment = parseTableHeaderDelimiterCell(content) orelse return null;
        alignments.appendAssumeCapacity(alignment);
    }
    return alignments.items;
}

fn parseTableHeaderDelimiterCell(content: []const u8) ?Node.TableCellAlignment {
    var state: enum {
        before_rule,
        after_left_anchor,
        in_rule,
        after_right_anchor,
        after_rule,
    } = .before_rule;
    var left_anchor = false;
    var right_anchor = false;
    for (content) |c| {
        switch (state) {
            .before_rule => switch (c) {
                ' ' => {},
                ':' => {
                    left_anchor = true;
                    state = .after_left_anchor;
                },
                '-' => state = .in_rule,
                else => return null,
            },
            .after_left_anchor => switch (c) {
                '-' => state = .in_rule,
                else => return null,
            },
            .in_rule => switch (c) {
                '-' => {},
                ':' => {
                    right_anchor = true;
                    state = .after_right_anchor;
                },
                ' ' => state = .after_rule,
                else => return null,
            },
            .after_right_anchor => switch (c) {
                ' ' => state = .after_rule,
                else => return null,
            },
            .after_rule => switch (c) {
                ' ' => {},
                else => return null,
            },
        }
    }

    switch (state) {
        .before_rule,
        .after_left_anchor,
        => return null,

        .in_rule,
        .after_right_anchor,
        .after_rule,
        => {},
    }

    return if (left_anchor and right_anchor)
        .center
    else if (left_anchor)
        .left
    else if (right_anchor)
        .right
    else
        .unset;
}

test parseTableHeaderDelimiterCell {
    try expectEqual(null, parseTableHeaderDelimiterCell(""));
    try expectEqual(null, parseTableHeaderDelimiterCell(" "));
    try expectEqual(.unset, parseTableHeaderDelimiterCell("-"));
    try expectEqual(.unset, parseTableHeaderDelimiterCell(" - "));
    try expectEqual(.unset, parseTableHeaderDelimiterCell("----"));
    try expectEqual(.unset, parseTableHeaderDelimiterCell(" ---- "));
    try expectEqual(null, parseTableHeaderDelimiterCell(":"));
    try expectEqual(null, parseTableHeaderDelimiterCell("::"));
    try expectEqual(.left, parseTableHeaderDelimiterCell(":-"));
    try expectEqual(.left, parseTableHeaderDelimiterCell(" :----"));
    try expectEqual(.center, parseTableHeaderDelimiterCell(":-:"));
    try expectEqual(.center, parseTableHeaderDelimiterCell(":----:"));
    try expectEqual(.center, parseTableHeaderDelimiterCell(" :----: "));
    try expectEqual(.right, parseTableHeaderDelimiterCell("-:"));
    try expectEqual(.right, parseTableHeaderDelimiterCell("----:"));
    try expectEqual(.right, parseTableHeaderDelimiterCell(" ----: "));
}

const HeadingStart = struct {
    level: u3,
    rest: []const u8,
};

fn startHeading(unindented_line: []const u8) ?HeadingStart {
    var level: u3 = 0;
    return for (unindented_line, 0..) |c, i| {
        switch (c) {
            '#' => {
                if (level == 6) break null;
                level += 1;
            },
            ' ' => {
                // We must have seen at least one # by this point, since
                // unindented_line has no leading spaces.
                assert(level > 0);
                break .{
                    .level = level,
                    .rest = unindented_line[i + 1 ..],
                };
            },
            else => break null,
        }
    } else null;
}
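
// Illustrative cases for the ATX-style heading rules above (a minimal sketch).
test startHeading {
    const heading = startHeading("## Section title").?;
    try expectEqual(2, heading.level);
    try std.testing.expectEqualStrings("Section title", heading.rest);

    // At most six levels are recognized, and a space must follow the `#`s.
    try std.testing.expect(startHeading("####### Too deep") == null);
    try std.testing.expect(startHeading("#NoSpace") == null);
}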

const CodeBlockStart = struct {
    tag: StringIndex,
    fence_len: usize,
};

fn startCodeBlock(p: *Parser, unindented_line: []const u8) !?CodeBlockStart {
    var fence_len: usize = 0;
    const tag_bytes = for (unindented_line, 0..) |c, i| {
        switch (c) {
            '`' => fence_len += 1,
            else => break unindented_line[i..],
        }
    } else "";
    // Code block tags may not contain backticks, since that would create
    // potential confusion with inline code spans.
    if (fence_len < 3 or mem.indexOfScalar(u8, tag_bytes, '`') != null) return null;
    return .{
        .tag = try p.addString(mem.trim(u8, tag_bytes, " ")),
        .fence_len = fence_len,
    };
}

fn startBlockquote(unindented_line: []const u8) ?[]const u8 {
    return if (mem.startsWith(u8, unindented_line, ">"))
        unindented_line[1..]
    else
        null;
}

fn isThematicBreak(line: []const u8) bool {
    var char: ?u8 = null;
    var count: usize = 0;
    for (line) |c| {
        switch (c) {
            ' ' => {},
            '-', '_', '*' => {
                if (char != null and c != char.?) return false;
                char = c;
                count += 1;
            },
            else => return false,
        }
    }
    return count >= 3;
}
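
// Illustrative cases for the rule above: at least three of the same marker
// character, with nothing but spaces otherwise.
test isThematicBreak {
    try std.testing.expect(isThematicBreak("---"));
    try std.testing.expect(isThematicBreak(" * * * "));
    try std.testing.expect(!isThematicBreak("--"));
    try std.testing.expect(!isThematicBreak("-*-"));
    try std.testing.expect(!isThematicBreak("--- text"));
}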

fn closeLastBlock(p: *Parser) !void {
    const b = p.pending_blocks.pop().?;
    const node = switch (b.tag) {
        .list => list: {
            assert(b.string_start == p.scratch_string.items.len);

            // Although tightness is parsed as a property of the list, it is
            // stored at the list item level to make it possible to render each
            // node without any context from its parents.
            const list_items = p.scratch_extra.items[b.extra_start..];
            const node_datas = p.nodes.items(.data);
            if (!b.data.list.tight) {
                for (list_items) |list_item| {
                    node_datas[list_item].list_item.tight = false;
                }
            }

            const children = try p.addExtraChildren(@ptrCast(list_items));
            break :list try p.addNode(.{
                .tag = .list,
                .data = .{ .list = .{
                    .start = switch (b.data.list.marker) {
                        .number_dot, .number_paren => @enumFromInt(b.data.list.start),
                        .@"-", .@"*", .@"+" => .unordered,
                    },
                    .children = children,
                } },
            });
        },
        .list_item => list_item: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :list_item try p.addNode(.{
                .tag = .list_item,
                .data = .{ .list_item = .{
                    .tight = true,
                    .children = children,
                } },
            });
        },
        .table => table: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :table try p.addNode(.{
                .tag = .table,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .table_row => table_row: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :table_row try p.addNode(.{
                .tag = .table_row,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .heading => heading: {
            const children = try p.parseInlines(p.scratch_string.items[b.string_start..]);
            break :heading try p.addNode(.{
                .tag = .heading,
                .data = .{ .heading = .{
                    .level = b.data.heading.level,
                    .children = children,
                } },
            });
        },
        .code_block => code_block: {
            const content = try p.addString(p.scratch_string.items[b.string_start..]);
            break :code_block try p.addNode(.{
                .tag = .code_block,
                .data = .{ .code_block = .{
                    .tag = b.data.code_block.tag,
                    .content = content,
                } },
            });
        },
        .blockquote => blockquote: {
            assert(b.string_start == p.scratch_string.items.len);
            const children = try p.addExtraChildren(@ptrCast(p.scratch_extra.items[b.extra_start..]));
            break :blockquote try p.addNode(.{
                .tag = .blockquote,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .paragraph => paragraph: {
            const children = try p.parseInlines(p.scratch_string.items[b.string_start..]);
            break :paragraph try p.addNode(.{
                .tag = .paragraph,
                .data = .{ .container = .{
                    .children = children,
                } },
            });
        },
        .thematic_break => try p.addNode(.{
            .tag = .thematic_break,
            .data = .{ .none = {} },
        }),
    };
    p.scratch_string.items.len = b.string_start;
    p.scratch_extra.items.len = b.extra_start;
    try p.addScratchExtraNode(node);
}

const InlineParser = struct {
    parent: *Parser,
    content: []const u8,
    pos: usize = 0,
    pending_inlines: ArrayList(PendingInline) = .empty,
    completed_inlines: ArrayList(CompletedInline) = .empty,

    const PendingInline = struct {
        tag: Tag,
        data: Data,
        start: usize,

        const Tag = enum {
            /// Data is `emphasis`.
            emphasis,
            /// Data is `none`.
            link,
            /// Data is `none`.
            image,
        };

        const Data = union {
            none: void,
            emphasis: struct {
                underscore: bool,
                run_len: usize,
            },
        };
    };

    const CompletedInline = struct {
        node: Node.Index,
        start: usize,
        len: usize,
    };

    fn deinit(ip: *InlineParser) void {
        ip.pending_inlines.deinit(ip.parent.allocator);
        ip.completed_inlines.deinit(ip.parent.allocator);
    }

    /// Parses all of `ip.content`, returning the children of the node
    /// containing the inline content.
    fn parse(ip: *InlineParser) Allocator.Error!ExtraIndex {
        while (ip.pos < ip.content.len) : (ip.pos += 1) {
            switch (ip.content[ip.pos]) {
                '\\' => ip.pos += 1,
                '[' => try ip.pending_inlines.append(ip.parent.allocator, .{
                    .tag = .link,
                    .data = .{ .none = {} },
                    .start = ip.pos,
                }),
                '!' => if (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == '[') {
                    try ip.pending_inlines.append(ip.parent.allocator, .{
                        .tag = .image,
                        .data = .{ .none = {} },
                        .start = ip.pos,
                    });
                    ip.pos += 1;
                },
                ']' => try ip.parseLink(),
                '<' => try ip.parseAutolink(),
                '*', '_' => try ip.parseEmphasis(),
                '`' => try ip.parseCodeSpan(),
                'h' => if (ip.pos == 0 or isPreTextAutolink(ip.content[ip.pos - 1])) {
                    try ip.parseTextAutolink();
                },
                else => {},
            }
        }

        const children = try ip.encodeChildren(0, ip.content.len);
        // There may be pending inlines after parsing (e.g. unclosed emphasis
        // runs), but there must not be any completed inlines, since those
        // should all be part of `children`.
        assert(ip.completed_inlines.items.len == 0);
        return children;
    }

    /// Parses a link, starting at the `]` at the end of the link text. `ip.pos`
    /// is left at the closing `)` of the link target or at the closing `]` if
    /// there is none.
    fn parseLink(ip: *InlineParser) !void {
        var i = ip.pending_inlines.items.len;
        while (i > 0) {
            i -= 1;
            if (ip.pending_inlines.items[i].tag == .link or
                ip.pending_inlines.items[i].tag == .image) break;
        } else return;
        const opener = ip.pending_inlines.items[i];
        ip.pending_inlines.shrinkRetainingCapacity(i);
        const text_start = switch (opener.tag) {
            .link => opener.start + 1,
            .image => opener.start + 2,
            else => unreachable,
        };

        if (ip.pos + 1 >= ip.content.len or ip.content[ip.pos + 1] != '(') return;
        const text_end = ip.pos;

        const target_start = text_end + 2;
        var target_end = target_start;
        var nesting_level: usize = 1;
        while (target_end < ip.content.len) : (target_end += 1) {
            switch (ip.content[target_end]) {
                '\\' => target_end += 1,
                '(' => nesting_level += 1,
                ')' => {
                    if (nesting_level == 1) break;
                    nesting_level -= 1;
                },
                else => {},
            }
        } else return;
        ip.pos = target_end;

        const children = try ip.encodeChildren(text_start, text_end);
        const target = try ip.encodeLinkTarget(target_start, target_end);

        const link = try ip.parent.addNode(.{
            .tag = switch (opener.tag) {
                .link => .link,
                .image => .image,
                else => unreachable,
            },
            .data = .{ .link = .{
                .target = target,
                .children = children,
            } },
        });
        try ip.completed_inlines.append(ip.parent.allocator, .{
            .node = link,
            .start = opener.start,
            .len = ip.pos - opener.start + 1,
        });
    }

    fn encodeLinkTarget(ip: *InlineParser, start: usize, end: usize) !StringIndex {
        // For efficiency, we can encode directly into string_bytes rather than
        // creating a temporary string and then encoding it, since this process
        // is entirely linear.
        const string_top = ip.parent.string_bytes.items.len;
        errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top);

        var text_iter: TextIterator = .{ .content = ip.content[start..end] };
        while (text_iter.next()) |content| {
            switch (content) {
                .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c),
                .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s),
                .line_break => try ip.parent.string_bytes.appendSlice(ip.parent.allocator, "\\\n"),
            }
        }
        try ip.parent.string_bytes.append(ip.parent.allocator, 0);
        return @enumFromInt(string_top);
    }

    /// Parses an autolink, starting at the opening `<`. `ip.pos` is left at the
    /// closing `>`, or remains unchanged at the opening `<` if there is none.
    fn parseAutolink(ip: *InlineParser) !void {
        const start = ip.pos;
        ip.pos += 1;
        var state: enum {
            start,
            scheme,
            target,
        } = .start;
        while (ip.pos < ip.content.len) : (ip.pos += 1) {
            switch (state) {
                .start => switch (ip.content[ip.pos]) {
                    'A'...'Z', 'a'...'z' => state = .scheme,
                    else => break,
                },
                .scheme => switch (ip.content[ip.pos]) {
                    'A'...'Z', 'a'...'z', '0'...'9', '+', '.', '-' => {},
                    ':' => state = .target,
                    else => break,
                },
                .target => switch (ip.content[ip.pos]) {
                    '<', ' ', '\t', '\n' => break, // Not allowed in autolinks
                    '>' => {
                        // Backslash escapes are not recognized in autolink targets.
                        const target = try ip.parent.addString(ip.content[start + 1 .. ip.pos]);
                        const node = try ip.parent.addNode(.{
                            .tag = .autolink,
                            .data = .{ .text = .{
                                .content = target,
                            } },
                        });
                        try ip.completed_inlines.append(ip.parent.allocator, .{
                            .node = node,
                            .start = start,
                            .len = ip.pos - start + 1,
                        });
                        return;
                    },
                    else => {},
                },
            }
        }
        ip.pos = start;
    }

    /// Parses a plain text autolink (not delimited by `<>`), starting at the
    /// first character in the link (an `h`). `ip.pos` is left at the last
    /// character of the link, or remains unchanged if there is no valid link.
    fn parseTextAutolink(ip: *InlineParser) !void {
        const start = ip.pos;
        var state: union(enum) {
            /// Inside `http`. Contains the rest of the text to be matched.
            http: []const u8,
            after_http,
            after_https,
            /// Inside `://`. Contains the rest of the text to be matched.
            authority: []const u8,
            /// Inside link content.
            content: struct {
                start: usize,
                paren_nesting: usize,
            },
        } = .{ .http = "http" };

        while (ip.pos < ip.content.len) : (ip.pos += 1) {
            switch (state) {
                .http => |rest| {
                    if (ip.content[ip.pos] != rest[0]) break;
                    if (rest.len > 1) {
                        state = .{ .http = rest[1..] };
                    } else {
                        state = .after_http;
                    }
                },
                .after_http => switch (ip.content[ip.pos]) {
                    's' => state = .after_https,
                    ':' => state = .{ .authority = "//" },
                    else => break,
                },
                .after_https => switch (ip.content[ip.pos]) {
                    ':' => state = .{ .authority = "//" },
                    else => break,
                },
                .authority => |rest| {
                    if (ip.content[ip.pos] != rest[0]) break;
                    if (rest.len > 1) {
                        state = .{ .authority = rest[1..] };
                    } else {
                        state = .{ .content = .{
                            .start = ip.pos + 1,
                            .paren_nesting = 0,
                        } };
                    }
                },
                .content => |*content| switch (ip.content[ip.pos]) {
                    ' ', '\t', '\n' => break,
                    '(' => content.paren_nesting += 1,
                    ')' => if (content.paren_nesting == 0) {
                        break;
                    } else {
                        content.paren_nesting -= 1;
                    },
                    else => {},
                },
            }
        }

        switch (state) {
            .http, .after_http, .after_https, .authority => {
                ip.pos = start;
            },
            .content => |content| {
                while (ip.pos > content.start and isPostTextAutolink(ip.content[ip.pos - 1])) {
                    ip.pos -= 1;
                }
                if (ip.pos == content.start) {
                    ip.pos = start;
                    return;
                }

                const target = try ip.parent.addString(ip.content[start..ip.pos]);
                const node = try ip.parent.addNode(.{
                    .tag = .autolink,
                    .data = .{ .text = .{
                        .content = target,
                    } },
                });
                try ip.completed_inlines.append(ip.parent.allocator, .{
                    .node = node,
                    .start = start,
                    .len = ip.pos - start,
                });
                ip.pos -= 1;
            },
        }
    }

    /// Returns whether `c` may appear before a text autolink is recognized.
    fn isPreTextAutolink(c: u8) bool {
        return switch (c) {
            ' ', '\t', '\n', '*', '_', '(' => true,
            else => false,
        };
    }

    /// Returns whether `c` is punctuation that may appear after a text autolink
    /// and not be considered part of it.
    fn isPostTextAutolink(c: u8) bool {
        return switch (c) {
            '?', '!', '.', ',', ':', '*', '_' => true,
            else => false,
        };
    }

    /// Parses emphasis, starting at the beginning of a run of `*` or `_`
    /// characters. `ip.pos` is left at the last character in the run after
    /// parsing.
    fn parseEmphasis(ip: *InlineParser) !void {
        const char = ip.content[ip.pos];
        var start = ip.pos;
        while (ip.pos + 1 < ip.content.len and ip.content[ip.pos + 1] == char) {
            ip.pos += 1;
        }
        var len = ip.pos - start + 1;
        const underscore = char == '_';
        const space_before = start == 0 or isWhitespace(ip.content[start - 1]);
        const space_after = start + len == ip.content.len or isWhitespace(ip.content[start + len]);
        const punct_before = start == 0 or isPunctuation(ip.content[start - 1]);
        const punct_after = start + len == ip.content.len or isPunctuation(ip.content[start + len]);
        // The rules for when emphasis may be closed or opened are stricter for
        // underscores to avoid inappropriately interpreting snake_case words as
        // containing emphasis markers.
        const can_open = if (underscore)
            !space_after and (space_before or punct_before)
        else
            !space_after;
        const can_close = if (underscore)
            !space_before and (space_after or punct_after)
        else
            !space_before;

        if (can_close and ip.pending_inlines.items.len > 0) {
            var i = ip.pending_inlines.items.len;
            while (i > 0 and len > 0) {
                i -= 1;
                const opener = &ip.pending_inlines.items[i];
                if (opener.tag != .emphasis or
                    opener.data.emphasis.underscore != underscore) continue;

                const close_len = @min(opener.data.emphasis.run_len, len);
                const opener_end = opener.start + opener.data.emphasis.run_len;

                const emphasis = try ip.encodeEmphasis(opener_end, start, close_len);
                const emphasis_start = opener_end - close_len;
                const emphasis_len = start - emphasis_start + close_len;
                try ip.completed_inlines.append(ip.parent.allocator, .{
                    .node = emphasis,
                    .start = emphasis_start,
                    .len = emphasis_len,
                });

                // There may still be other openers further down in the
                // stack to close, or part of this run might serve as an
                // opener itself.
                start += close_len;
                len -= close_len;

                // Remove any pending inlines above this on the stack, since
                // closing this emphasis will prevent them from being closed.
                // Additionally, if this opener is completely consumed by
                // being closed, it can be removed.
                opener.data.emphasis.run_len -= close_len;
                if (opener.data.emphasis.run_len == 0) {
                    ip.pending_inlines.shrinkRetainingCapacity(i);
                } else {
                    ip.pending_inlines.shrinkRetainingCapacity(i + 1);
                }
            }
        }

        if (can_open and len > 0) {
            try ip.pending_inlines.append(ip.parent.allocator, .{
                .tag = .emphasis,
                .data = .{ .emphasis = .{
                    .underscore = underscore,
                    .run_len = len,
                } },
                .start = start,
            });
        }
    }

    /// Encodes emphasis specified by a run of `run_len` emphasis characters,
    /// with `start..end` being the range of content contained within the
    /// emphasis.
    fn encodeEmphasis(ip: *InlineParser, start: usize, end: usize, run_len: usize) !Node.Index {
        const children = try ip.encodeChildren(start, end);
        var inner = switch (run_len % 3) {
            1 => try ip.parent.addNode(.{
                .tag = .emphasis,
                .data = .{ .container = .{
                    .children = children,
                } },
            }),
            2 => try ip.parent.addNode(.{
                .tag = .strong,
                .data = .{ .container = .{
                    .children = children,
                } },
            }),
            0 => strong_emphasis: {
                const strong = try ip.parent.addNode(.{
                    .tag = .strong,
                    .data = .{ .container = .{
                        .children = children,
                    } },
                });
                break :strong_emphasis try ip.parent.addNode(.{
                    .tag = .emphasis,
                    .data = .{ .container = .{
                        .children = try ip.parent.addExtraChildren(&.{strong}),
                    } },
                });
            },
            else => unreachable,
        };

        var run_left = run_len;
        while (run_left > 3) : (run_left -= 3) {
            const strong = try ip.parent.addNode(.{
                .tag = .strong,
                .data = .{ .container = .{
                    .children = try ip.parent.addExtraChildren(&.{inner}),
                } },
            });
            inner = try ip.parent.addNode(.{
                .tag = .emphasis,
                .data = .{ .container = .{
                    .children = try ip.parent.addExtraChildren(&.{strong}),
                } },
            });
        }

        return inner;
    }

    /// Parses a code span, starting at the beginning of the opening backtick
    /// run. `ip.pos` is left at the last character in the closing run after
    /// parsing.
    fn parseCodeSpan(ip: *InlineParser) !void {
        const opener_start = ip.pos;
        ip.pos = mem.indexOfNonePos(u8, ip.content, ip.pos, "`") orelse ip.content.len;
        const opener_len = ip.pos - opener_start;

        const start = ip.pos;
        const end = while (mem.indexOfScalarPos(u8, ip.content, ip.pos, '`')) |closer_start| {
            ip.pos = mem.indexOfNonePos(u8, ip.content, closer_start, "`") orelse ip.content.len;
            const closer_len = ip.pos - closer_start;

            if (closer_len == opener_len) break closer_start;
        } else unterminated: {
            ip.pos = ip.content.len;
            break :unterminated ip.content.len;
        };

        var content = if (start < ip.content.len)
            ip.content[start..end]
        else
            "";
        // This single space removal rule allows code spans to be written which
        // start or end with backticks.
        if (mem.startsWith(u8, content, " `")) content = content[1..];
        if (mem.endsWith(u8, content, "` ")) content = content[0 .. content.len - 1];

        const text = try ip.parent.addNode(.{
            .tag = .code_span,
            .data = .{ .text = .{
                .content = try ip.parent.addString(content),
            } },
        });
        try ip.completed_inlines.append(ip.parent.allocator, .{
            .node = text,
            .start = opener_start,
            .len = ip.pos - opener_start,
        });
        // Ensure ip.pos is pointing at the last character of the
        // closer, not after it.
        ip.pos -= 1;
    }

    /// Encodes children parsed in the content range `start..end`. The children
    /// will be text nodes and any completed inlines within the range.
    fn encodeChildren(ip: *InlineParser, start: usize, end: usize) !ExtraIndex {
        const scratch_extra_top = ip.parent.scratch_extra.items.len;
        defer ip.parent.scratch_extra.shrinkRetainingCapacity(scratch_extra_top);

        var child_index = ip.completed_inlines.items.len;
        while (child_index > 0 and ip.completed_inlines.items[child_index - 1].start >= start) {
            child_index -= 1;
        }
        const start_child_index = child_index;

        var pos = start;
        while (child_index < ip.completed_inlines.items.len) : (child_index += 1) {
            const child_inline = ip.completed_inlines.items[child_index];
            // Completed inlines must be strictly nested within the encodable
            // content.
            assert(child_inline.start >= pos and child_inline.start + child_inline.len <= end);

            if (child_inline.start > pos) {
                try ip.encodeTextNode(pos, child_inline.start);
            }
            try ip.parent.addScratchExtraNode(child_inline.node);

            pos = child_inline.start + child_inline.len;
        }
        ip.completed_inlines.shrinkRetainingCapacity(start_child_index);

        if (pos < end) {
            try ip.encodeTextNode(pos, end);
        }

        const children = ip.parent.scratch_extra.items[scratch_extra_top..];
        return try ip.parent.addExtraChildren(@ptrCast(children));
    }

    /// Encodes textual content `ip.content[start..end]` to `scratch_extra`. The
    /// encoded content may include both `text` and `line_break` nodes.
    fn encodeTextNode(ip: *InlineParser, start: usize, end: usize) !void {
        // For efficiency, we can encode directly into string_bytes rather than
        // creating a temporary string and then encoding it, since this process
        // is entirely linear.
        const string_top = ip.parent.string_bytes.items.len;
        errdefer ip.parent.string_bytes.shrinkRetainingCapacity(string_top);

        var string_start = string_top;
        var text_iter: TextIterator = .{ .content = ip.content[start..end] };
        while (text_iter.next()) |content| {
            switch (content) {
                .char => |c| try ip.parent.string_bytes.append(ip.parent.allocator, c),
                .text => |s| try ip.parent.string_bytes.appendSlice(ip.parent.allocator, s),
                .line_break => {
                    if (ip.parent.string_bytes.items.len > string_start) {
                        try ip.parent.string_bytes.append(ip.parent.allocator, 0);
                        try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
                            .tag = .text,
                            .data = .{ .text = .{
                                .content = @enumFromInt(string_start),
                            } },
                        }));
                        string_start = ip.parent.string_bytes.items.len;
                    }
                    try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
                        .tag = .line_break,
                        .data = .{ .none = {} },
                    }));
                },
            }
        }
        if (ip.parent.string_bytes.items.len > string_start) {
            try ip.parent.string_bytes.append(ip.parent.allocator, 0);
            try ip.parent.addScratchExtraNode(try ip.parent.addNode(.{
                .tag = .text,
                .data = .{ .text = .{
                    .content = @enumFromInt(string_start),
                } },
            }));
        }
    }

    /// An iterator over parts of textual content, handling unescaping of
    /// escaped characters and line breaks.
    const TextIterator = struct {
        content: []const u8,
        pos: usize = 0,

        const Content = union(enum) {
            char: u8,
            text: []const u8,
            line_break,
        };

        const replacement = "\u{FFFD}";

        fn next(iter: *TextIterator) ?Content {
            if (iter.pos >= iter.content.len) return null;
            if (iter.content[iter.pos] == '\\') {
                iter.pos += 1;
                if (iter.pos == iter.content.len) {
                    return .{ .char = '\\' };
                } else if (iter.content[iter.pos] == '\n') {
                    iter.pos += 1;
                    return .line_break;
                } else if (isPunctuation(iter.content[iter.pos])) {
                    const c = iter.content[iter.pos];
                    iter.pos += 1;
                    return .{ .char = c };
                } else {
                    return .{ .char = '\\' };
                }
            }
            return iter.nextCodepoint();
        }

        fn nextCodepoint(iter: *TextIterator) ?Content {
            switch (iter.content[iter.pos]) {
                0 => {
                    iter.pos += 1;
                    return .{ .text = replacement };
                },
                1...127 => |c| {
                    iter.pos += 1;
                    return .{ .char = c };
                },
                else => |b| {
                    const cp_len = std.unicode.utf8ByteSequenceLength(b) catch {
                        iter.pos += 1;
                        return .{ .text = replacement };
                    };
                    const is_valid = iter.pos + cp_len <= iter.content.len and
                        std.unicode.utf8ValidateSlice(iter.content[iter.pos..][0..cp_len]);
                    const cp_encoded = if (is_valid)
                        iter.content[iter.pos..][0..cp_len]
                    else
                        replacement;
                    iter.pos += cp_len;
                    return .{ .text = cp_encoded };
                },
            }
        }
    };
};

fn parseInlines(p: *Parser, content: []const u8) !ExtraIndex {
    var ip: InlineParser = .{
        .parent = p,
        .content = mem.trim(u8, content, " \t\n"),
    };
    defer ip.deinit();
    return try ip.parse();
}

pub fn extraData(p: Parser, comptime T: type, index: ExtraIndex) ExtraData(T) {
    const fields = @typeInfo(T).@"struct".fields;
    var i: usize = @intFromEnum(index);
    var result: T = undefined;
    inline for (fields) |field| {
        @field(result, field.name) = switch (field.type) {
            u32 => p.extra.items[i],
            else => @compileError("bad field type"),
        };
        i += 1;
    }
    return .{ .data = result, .end = i };
}

pub fn extraChildren(p: Parser, index: ExtraIndex) []const Node.Index {
    const children = p.extraData(Node.Children, index);
    return @ptrCast(p.extra.items[children.end..][0..children.data.len]);
}

fn addNode(p: *Parser, node: Node) !Node.Index {
    const index: Node.Index = @enumFromInt(@as(u32, @intCast(p.nodes.len)));
    try p.nodes.append(p.allocator, node);
    return index;
}

fn addString(p: *Parser, s: []const u8) !StringIndex {
    if (s.len == 0) return .empty;

    const index: StringIndex = @enumFromInt(@as(u32, @intCast(p.string_bytes.items.len)));
    try p.string_bytes.ensureUnusedCapacity(p.allocator, s.len + 1);
    p.string_bytes.appendSliceAssumeCapacity(s);
    p.string_bytes.appendAssumeCapacity(0);
    return index;
}

fn addExtraChildren(p: *Parser, nodes: []const Node.Index) !ExtraIndex {
    const index: ExtraIndex = @enumFromInt(@as(u32, @intCast(p.extra.items.len)));
    try p.extra.ensureUnusedCapacity(p.allocator, nodes.len + 1);
    p.extra.appendAssumeCapacity(@intCast(nodes.len));
    p.extra.appendSliceAssumeCapacity(@ptrCast(nodes));
    return index;
}

fn addScratchExtraNode(p: *Parser, node: Node.Index) !void {
    try p.scratch_extra.append(p.allocator, @intFromEnum(node));
}

fn addScratchStringLine(p: *Parser, line: []const u8) !void {
    try p.scratch_string.ensureUnusedCapacity(p.allocator, line.len + 1);
    p.scratch_string.appendSliceAssumeCapacity(line);
    p.scratch_string.appendAssumeCapacity('\n');
}

fn isBlank(line: []const u8) bool {
    return mem.indexOfNone(u8, line, " \t") == null;
}

fn isPunctuation(c: u8) bool {
    return switch (c) {
        '!',
        '"',
        '#',
        '$',
        '%',
        '&',
        '\'',
        '(',
        ')',
        '*',
        '+',
        ',',
        '-',
        '.',
        '/',
        ':',
        ';',
        '<',
        '=',
        '>',
        '?',
        '@',
        '[',
        '\\',
        ']',
        '^',
        '_',
        '`',
        '{',
        '|',
        '}',
        '~',
        => true,
        else => false,
    };
}