Commit 9da3a9733d
Changed files (1)
lib/std/mem.zig
@@ -1910,72 +1910,117 @@ test "byteSwapAllFields" {
}, s);
}
+/// Deprecated: use `tokenizeAny`, `tokenizeFull`, or `tokenizeScalar`
+pub const tokenize = tokenizeAny;
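+// For example, an existing call like `tokenize(u8, " a b ", " ")` keeps
+// compiling and now resolves to `tokenizeAny(u8, " a b ", " ")`.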
+
/// Returns an iterator that iterates over the slices of `buffer` that are not
-/// any of the bytes in `delimiter_bytes`.
+/// any of the items in `delimiters`.
///
-/// `tokenize(u8, " abc def ghi ", " ")` will return slices
+/// `tokenizeAny(u8, " abc|def || ghi ", " |")` will return slices
/// for "abc", "def", "ghi", null, in that order.
///
/// If `buffer` is empty, the iterator will return null.
-/// If `delimiter_bytes` does not exist in buffer,
+/// If none of `delimiters` exist in `buffer`,
+/// the iterator will return `buffer`, null, in that order.
+///
+/// See also: `tokenizeFull`, `tokenizeScalar`,
+/// `splitFull`, `splitAny`, `splitScalar`,
+/// `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
+pub fn tokenizeAny(comptime T: type, buffer: []const T, delimiters: []const T) TokenIterator(T, .any) {
+ return .{
+ .index = 0,
+ .buffer = buffer,
+ .delimiter = delimiters,
+ };
+}
+
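+// An illustrative sketch (hypothetical test name): any single item of
+// `delimiters` splits the buffer, and runs of adjacent delimiters are
+// skipped, so no empty slices are returned.
+test "tokenizeAny sketch" {
+ var it = tokenizeAny(u8, "a, b,,c", ", ");
+ try testing.expectEqualStrings("a", it.next().?);
+ try testing.expectEqualStrings("b", it.next().?);
+ try testing.expectEqualStrings("c", it.next().?);
+ try testing.expect(it.next() == null);
+}
+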
+/// Returns an iterator that iterates over the slices of `buffer` that are not
+/// the sequence in `delimiter`.
+///
+/// `tokenizeFull(u8, "<>abc><def<><>ghi", "<>")` will return slices
+/// for "abc><def", "ghi", null, in that order.
+///
+/// If `buffer` is empty, the iterator will return null.
+/// If `delimiter` does not exist in `buffer`,
/// the iterator will return `buffer`, null, in that order.
+/// The delimiter length must not be zero.
///
-/// See also: `split` and `splitBackwards`.
-pub fn tokenize(comptime T: type, buffer: []const T, delimiter_bytes: []const T) TokenIterator(T) {
+/// See also: `tokenizeAny`, `tokenizeScalar`,
+/// `splitFull`, `splitAny`, `splitScalar`,
+/// `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
+pub fn tokenizeFull(comptime T: type, buffer: []const T, delimiter: []const T) TokenIterator(T, .full) {
+ assert(delimiter.len != 0);
return .{
.index = 0,
.buffer = buffer,
- .delimiter_bytes = delimiter_bytes,
+ .delimiter = delimiter,
};
}
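+
+// An illustrative sketch (hypothetical test name) contrasting the flavors:
+// `tokenizeFull` splits only on the whole sequence "<>", whereas
+// `tokenizeAny` treats '<' and '>' as independent delimiters.
+test "tokenizeFull vs tokenizeAny sketch" {
+ var full = tokenizeFull(u8, "a<>b<c", "<>");
+ try testing.expectEqualStrings("a", full.next().?);
+ try testing.expectEqualStrings("b<c", full.next().?);
+ try testing.expect(full.next() == null);
+
+ var any = tokenizeAny(u8, "a<>b<c", "<>");
+ try testing.expectEqualStrings("a", any.next().?);
+ try testing.expectEqualStrings("b", any.next().?);
+ try testing.expectEqualStrings("c", any.next().?);
+ try testing.expect(any.next() == null);
+}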
-test "tokenize" {
- var it = tokenize(u8, " abc def ghi ", " ");
+/// Returns an iterator that iterates over the slices of `buffer` that are not
+/// `delimiter`.
+///
+/// `tokenizeScalar(u8, " abc def ghi ", ' ')` will return slices
+/// for "abc", "def", "ghi", null, in that order.
+///
+/// If `buffer` is empty, the iterator will return null.
+/// If `delimiter` does not exist in `buffer`,
+/// the iterator will return `buffer`, null, in that order.
+///
+/// See also: `tokenizeAny`, `tokenizeFull`,
+/// `splitFull`, `splitAny`, `splitScalar`,
+/// `splitBackwardsFull`, `splitBackwardsAny`, and `splitBackwardsScalar`
+pub fn tokenizeScalar(comptime T: type, buffer: []const T, delimiter: T) TokenIterator(T, .scalar) {
+ return .{
+ .index = 0,
+ .buffer = buffer,
+ .delimiter = delimiter,
+ };
+}
+
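+// Note: the delimiter is a single item of type `T` (e.g. ' ', not " "), so a
+// former `tokenize(u8, s, " ")` call with a one-item delimiter becomes
+// `tokenizeScalar(u8, s, ' ')`.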
+test "tokenizeScalar" {
+ var it = tokenizeScalar(u8, " abc def ghi ", ' ');
try testing.expect(eql(u8, it.next().?, "abc"));
try testing.expect(eql(u8, it.peek().?, "def"));
try testing.expect(eql(u8, it.next().?, "def"));
try testing.expect(eql(u8, it.next().?, "ghi"));
try testing.expect(it.next() == null);
- it = tokenize(u8, "..\\bob", "\\");
+ it = tokenizeScalar(u8, "..\\bob", '\\');
try testing.expect(eql(u8, it.next().?, ".."));
try testing.expect(eql(u8, "..", "..\\bob"[0..it.index]));
try testing.expect(eql(u8, it.next().?, "bob"));
try testing.expect(it.next() == null);
- it = tokenize(u8, "//a/b", "/");
+ it = tokenizeScalar(u8, "//a/b", '/');
try testing.expect(eql(u8, it.next().?, "a"));
try testing.expect(eql(u8, it.next().?, "b"));
try testing.expect(eql(u8, "//a/b", "//a/b"[0..it.index]));
try testing.expect(it.next() == null);
- it = tokenize(u8, "|", "|");
+ it = tokenizeScalar(u8, "|", '|');
try testing.expect(it.next() == null);
try testing.expect(it.peek() == null);
- it = tokenize(u8, "", "|");
+ it = tokenizeScalar(u8, "", '|');
try testing.expect(it.next() == null);
try testing.expect(it.peek() == null);
- it = tokenize(u8, "hello", "");
- try testing.expect(eql(u8, it.next().?, "hello"));
- try testing.expect(it.next() == null);
-
- it = tokenize(u8, "hello", " ");
+ it = tokenizeScalar(u8, "hello", ' ');
try testing.expect(eql(u8, it.next().?, "hello"));
try testing.expect(it.next() == null);
- var it16 = tokenize(
+ var it16 = tokenizeScalar(
u16,
std.unicode.utf8ToUtf16LeStringLiteral("hello"),
- std.unicode.utf8ToUtf16LeStringLiteral(" "),
+ ' ',
);
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("hello")));
try testing.expect(it16.next() == null);
}
-test "tokenize (multibyte)" {
- var it = tokenize(u8, "a|b,c/d e", " /,|");
+test "tokenizeAny (multibyte)" {
+ var it = tokenizeAny(u8, "a|b,c/d e", " /,|");
try testing.expect(eql(u8, it.next().?, "a"));
try testing.expect(eql(u8, it.peek().?, "b"));
try testing.expect(eql(u8, it.next().?, "b"));
@@ -1985,7 +2030,11 @@ test "tokenize (multibyte)" {
try testing.expect(it.next() == null);
try testing.expect(it.peek() == null);
- var it16 = tokenize(
+ it = tokenizeAny(u8, "hello", "");
+ try testing.expect(eql(u8, it.next().?, "hello"));
+ try testing.expect(it.next() == null);
+
+ var it16 = tokenizeAny(
u16,
std.unicode.utf8ToUtf16LeStringLiteral("a|b,c/d e"),
std.unicode.utf8ToUtf16LeStringLiteral(" /,|"),
@@ -1998,18 +2047,68 @@ test "tokenize (multibyte)" {
try testing.expect(it16.next() == null);
}
+test "tokenizeFull" {
+ var it = tokenizeFull(u8, "a<>b<><>c><>d><", "<>");
+ try testing.expectEqualStrings("a", it.next().?);
+ try testing.expectEqualStrings("b", it.peek().?);
+ try testing.expectEqualStrings("b", it.next().?);
+ try testing.expectEqualStrings("c>", it.next().?);
+ try testing.expectEqualStrings("d><", it.next().?);
+ try testing.expect(it.next() == null);
+ try testing.expect(it.peek() == null);
+
+ var it16 = tokenizeFull(
+ u16,
+ std.unicode.utf8ToUtf16LeStringLiteral("a<>b<><>c><>d><"),
+ std.unicode.utf8ToUtf16LeStringLiteral("<>"),
+ );
+ try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a")));
+ try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b")));
+ try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c>")));
+ try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d><")));
+ try testing.expect(it16.next() == null);
+}
+
test "tokenize (reset)" {
- var it = tokenize(u8, " abc def ghi ", " ");
- try testing.expect(eql(u8, it.next().?, "abc"));
- try testing.expect(eql(u8, it.next().?, "def"));
- try testing.expect(eql(u8, it.next().?, "ghi"));
+ {
+ var it = tokenizeAny(u8, " abc def ghi ", " ");
+ try testing.expect(eql(u8, it.next().?, "abc"));
+ try testing.expect(eql(u8, it.next().?, "def"));
+ try testing.expect(eql(u8, it.next().?, "ghi"));
- it.reset();
+ it.reset();
- try testing.expect(eql(u8, it.next().?, "abc"));
- try testing.expect(eql(u8, it.next().?, "def"));
- try testing.expect(eql(u8, it.next().?, "ghi"));
- try testing.expect(it.next() == null);
+ try testing.expect(eql(u8, it.next().?, "abc"));
+ try testing.expect(eql(u8, it.next().?, "def"));
+ try testing.expect(eql(u8, it.next().?, "ghi"));
+ try testing.expect(it.next() == null);
+ }
+ {
+ var it = tokenizeFull(u8, "<><>abc<>def<><>ghi<>", "<>");
+ try testing.expect(eql(u8, it.next().?, "abc"));
+ try testing.expect(eql(u8, it.next().?, "def"));
+ try testing.expect(eql(u8, it.next().?, "ghi"));
+
+ it.reset();
+
+ try testing.expect(eql(u8, it.next().?, "abc"));
+ try testing.expect(eql(u8, it.next().?, "def"));
+ try testing.expect(eql(u8, it.next().?, "ghi"));
+ try testing.expect(it.next() == null);
+ }
+ {
+ var it = tokenizeScalar(u8, " abc def ghi ", ' ');
+ try testing.expect(eql(u8, it.next().?, "abc"));
+ try testing.expect(eql(u8, it.next().?, "def"));
+ try testing.expect(eql(u8, it.next().?, "ghi"));
+
+ it.reset();
+
+ try testing.expect(eql(u8, it.next().?, "abc"));
+ try testing.expect(eql(u8, it.next().?, "def"));
+ try testing.expect(eql(u8, it.next().?, "ghi"));
+ try testing.expect(it.next() == null);
+ }
}
/// Deprecated: use `splitFull`, `splitAny`, or `splitScalar`
@@ -2026,8 +2125,8 @@ pub const split = splitFull;
/// The delimiter length must not be zero.
///
/// See also: `splitAny`, `splitScalar`, `splitBackwardsFull`,
-/// `splitBackwardsAny`,`splitBackwardsScalar`, and
-/// `tokenize`.
+/// `splitBackwardsAny`, `splitBackwardsScalar`,
+/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
pub fn splitFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitIterator(T, .full) {
assert(delimiter.len != 0);
return .{
@@ -2047,8 +2146,8 @@ pub fn splitFull(comptime T: type, buffer: []const T, delimiter: []const T) Spli
/// the iterator will return `buffer`, null, in that order.
///
/// See also: `splitFull`, `splitScalar`, `splitBackwardsFull`,
-/// `splitBackwardsAny`,`splitBackwardsScalar`, and
-/// `tokenize`.
+/// `splitBackwardsAny`, `splitBackwardsScalar`,
+/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
pub fn splitAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitIterator(T, .any) {
return .{
.index = 0,
@@ -2067,8 +2166,8 @@ pub fn splitAny(comptime T: type, buffer: []const T, delimiters: []const T) Spli
/// the iterator will return `buffer`, null, in that order.
///
/// See also: `splitFull`, `splitAny`, `splitBackwardsFull`,
-/// `splitBackwardsAny`,`splitBackwardsScalar`, and
-/// `tokenize`.
+/// `splitBackwardsAny`, `splitBackwardsScalar`,
+/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
pub fn splitScalar(comptime T: type, buffer: []const T, delimiter: T) SplitIterator(T, .scalar) {
return .{
.index = 0,
@@ -2224,8 +2323,8 @@ pub const splitBackwards = splitBackwardsFull;
/// The delimiter length must not be zero.
///
/// See also: `splitBackwardsAny`, `splitBackwardsScalar`,
-/// `splitFull`, `splitAny`,`splitScalar`, and
-/// `tokenize`.
+/// `splitFull`, `splitAny`, `splitScalar`,
+/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
pub fn splitBackwardsFull(comptime T: type, buffer: []const T, delimiter: []const T) SplitBackwardsIterator(T, .full) {
assert(delimiter.len != 0);
return .{
@@ -2245,8 +2344,8 @@ pub fn splitBackwardsFull(comptime T: type, buffer: []const T, delimiter: []cons
/// the iterator will return `buffer`, null, in that order.
///
/// See also: `splitBackwardsFull`, `splitBackwardsScalar`,
-/// `splitFull`, `splitAny`,`splitScalar`, and
-/// `tokenize`.
+/// `splitFull`, `splitAny`, `splitScalar`,
+/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
pub fn splitBackwardsAny(comptime T: type, buffer: []const T, delimiters: []const T) SplitBackwardsIterator(T, .any) {
return .{
.index = buffer.len,
@@ -2265,8 +2364,8 @@ pub fn splitBackwardsAny(comptime T: type, buffer: []const T, delimiters: []cons
/// the iterator will return `buffer`, null, in that order.
///
/// See also: `splitBackwardsFull`, `splitBackwardsAny`,
-/// `splitFull`, `splitAny`,`splitScalar`, and
-/// `tokenize`.
+/// `splitFull`, `splitAny`, `splitScalar`,
+/// `tokenizeAny`, `tokenizeFull`, and `tokenizeScalar`.
pub fn splitBackwardsScalar(comptime T: type, buffer: []const T, delimiter: T) SplitBackwardsIterator(T, .scalar) {
return .{
.index = buffer.len,
@@ -2596,10 +2695,13 @@ test "endsWith" {
pub const DelimiterType = enum { full, any, scalar };
-pub fn TokenIterator(comptime T: type) type {
+pub fn TokenIterator(comptime T: type, comptime delimiter_type: DelimiterType) type {
return struct {
buffer: []const T,
- delimiter_bytes: []const T,
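+ // The delimiter's type follows the tokenizer flavor: a slice of items for
+ // sequence (.full) and set (.any) matching, a single item for .scalar.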
+ delimiter: switch (delimiter_type) {
+ .full, .any => []const T,
+ .scalar => T,
+ },
index: usize,
const Self = @This();
@@ -2616,7 +2718,10 @@ pub fn TokenIterator(comptime T: type) type {
/// complete. Does not advance to the next token.
pub fn peek(self: *Self) ?[]const T {
// move to beginning of token
- while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
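+ // Skip leading delimiters: a .full match consumes `delimiter.len`
+ // items at a time, while .any and .scalar consume one item.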
+ while (self.index < self.buffer.len and self.isDelimiter(self.index)) : (self.index += switch (delimiter_type) {
+ .full => self.delimiter.len,
+ .any, .scalar => 1,
+ }) {}
const start = self.index;
if (start == self.buffer.len) {
return null;
@@ -2624,7 +2729,7 @@ pub fn TokenIterator(comptime T: type) type {
// move to end of token
var end = start;
- while (end < self.buffer.len and !self.isSplitByte(self.buffer[end])) : (end += 1) {}
+ while (end < self.buffer.len and !self.isDelimiter(end)) : (end += 1) {}
return self.buffer[start..end];
}
@@ -2633,7 +2738,10 @@ pub fn TokenIterator(comptime T: type) type {
pub fn rest(self: Self) []const T {
// move to beginning of token
var index: usize = self.index;
- while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
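+ // Same skipping step as `peek`: advance by the sequence length for
+ // .full, by one item for .any and .scalar.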
+ while (index < self.buffer.len and self.isDelimiter(index)) : (index += switch (delimiter_type) {
+ .full => self.delimiter.len,
+ .any, .scalar => 1,
+ }) {}
return self.buffer[index..];
}
@@ -2642,13 +2750,20 @@ pub fn TokenIterator(comptime T: type) type {
self.index = 0;
}
- fn isSplitByte(self: Self, byte: T) bool {
- for (self.delimiter_bytes) |delimiter_byte| {
- if (byte == delimiter_byte) {
- return true;
- }
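+ // Reports whether a delimiter match starts at `index`: the whole sequence
+ // for .full, any one of the items for .any, an equal item for .scalar.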
+ fn isDelimiter(self: Self, index: usize) bool {
+ switch (delimiter_type) {
+ .full => return startsWith(T, self.buffer[index..], self.delimiter),
+ .any => {
+ const item = self.buffer[index];
+ for (self.delimiter) |delimiter_item| {
+ if (item == delimiter_item) {
+ return true;
+ }
+ }
+ return false;
+ },
+ .scalar => return self.buffer[index] == self.delimiter,
}
- return false;
}
};
}