Commit `67bd45f0cf`

Andrew Kelley <andrew@ziglang.org>

2019-02-04 21:24:06

adjustments to std.mem split / separate

* rename std.mem.split to std.mem.tokenize
* add future deprecation notice to docs
* (unrelated) add note to std.os.path.resolve docs
* std.mem.separate - assert delimiter.len not zero
* fix implementation of std.mem.separate to respect the delimiter
* separate the two iterators to different structs

master

1 parent f44ce78

Changed files (8)

src-self-hosted

libc_installation.zig

std

@@ -57,10 +57,10 @@ pub const LibCInstallation = struct {
         const contents = try std.io.readFileAlloc(allocator, libc_file);
         defer allocator.free(contents);
 
-        var it = std.mem.split(contents, "\n");
+        var it = std.mem.tokenize(contents, "\n");
         while (it.next()) |line| {
             if (line.len == 0 or line[0] == '#') continue;
-            var line_it = std.mem.split(line, "=");
+            var line_it = std.mem.separate(line, "=");
             const name = line_it.next() orelse {
                 try stderr.print("missing equal sign after field name\n");
                 return error.ParseError;
@@ -213,7 +213,7 @@ pub const LibCInstallation = struct {
             },
         }
 
-        var it = std.mem.split(exec_result.stderr, "\n\r");
+        var it = std.mem.tokenize(exec_result.stderr, "\n\r");
         var search_paths = std.ArrayList([]const u8).init(loop.allocator);
         defer search_paths.deinit();
         while (it.next()) |line| {
@@ -410,7 +410,7 @@ async fn ccPrintFileName(loop: *event.Loop, o_file: []const u8, want_dirname: bo
             return error.CCompilerCrashed;
         },
     }
-    var it = std.mem.split(exec_result.stdout, "\n\r");
+    var it = std.mem.tokenize(exec_result.stdout, "\n\r");
     const line = it.next() orelse return error.LibCRuntimeNotFound;
     const dirname = std.os.path.dirname(line) orelse return error.LibCRuntimeNotFound;

@@ -351,7 +351,7 @@ fn buildOutputType(allocator: *Allocator, args: []const []const u8, out_type: Co
     const root_name = if (provided_name) |n| n else blk: {
         if (root_source_file) |file| {
             const basename = os.path.basename(file);
-            var it = mem.split(basename, ".");
+            var it = mem.separate(basename, ".");
             break :blk it.next() orelse basename;
         } else {
             try stderr.write("--name [name] not provided and unable to infer\n");

@@ -595,7 +595,7 @@ pub const ChildProcess = struct {
             const PATH = try os.getEnvVarOwned(self.allocator, "PATH");
             defer self.allocator.free(PATH);
 
-            var it = mem.split(PATH, ";");
+            var it = mem.tokenize(PATH, ";");
             while (it.next()) |search_path| {
                 const joined_path = try os.path.join(self.allocator, search_path, app_name);
                 defer self.allocator.free(joined_path);

@@ -608,7 +608,7 @@ pub fn posixExecve(argv: []const []const u8, env_map: *const BufMap, allocator:
     // +1 for the null terminating byte
     const path_buf = try allocator.alloc(u8, PATH.len + exe_path.len + 2);
     defer allocator.free(path_buf);
-    var it = mem.split(PATH, ":");
+    var it = mem.tokenize(PATH, ":");
     var seen_eacces = false;
     var err: usize = undefined;
     while (it.next()) |search_path| {

@@ -184,7 +184,7 @@ pub fn windowsParsePath(path: []const u8) WindowsPath {
                 return relative_path;
             }
 
-            var it = mem.split(path, []u8{this_sep});
+            var it = mem.tokenize(path, []u8{this_sep});
             _ = (it.next() orelse return relative_path);
             _ = (it.next() orelse return relative_path);
             return WindowsPath{
@@ -202,7 +202,7 @@ pub fn windowsParsePath(path: []const u8) WindowsPath {
                 return relative_path;
             }
 
-            var it = mem.split(path, []u8{this_sep});
+            var it = mem.tokenize(path, []u8{this_sep});
             _ = (it.next() orelse return relative_path);
             _ = (it.next() orelse return relative_path);
             return WindowsPath{
@@ -264,8 +264,8 @@ fn networkShareServersEql(ns1: []const u8, ns2: []const u8) bool {
     const sep1 = ns1[0];
     const sep2 = ns2[0];
 
-    var it1 = mem.split(ns1, []u8{sep1});
-    var it2 = mem.split(ns2, []u8{sep2});
+    var it1 = mem.tokenize(ns1, []u8{sep1});
+    var it2 = mem.tokenize(ns2, []u8{sep2});
 
     // TODO ASCII is wrong, we actually need full unicode support to compare paths.
     return asciiEqlIgnoreCase(it1.next().?, it2.next().?);
@@ -285,8 +285,8 @@ fn compareDiskDesignators(kind: WindowsPath.Kind, p1: []const u8, p2: []const u8
             const sep1 = p1[0];
             const sep2 = p2[0];
 
-            var it1 = mem.split(p1, []u8{sep1});
-            var it2 = mem.split(p2, []u8{sep2});
+            var it1 = mem.tokenize(p1, []u8{sep1});
+            var it2 = mem.tokenize(p2, []u8{sep2});
 
             // TODO ASCII is wrong, we actually need full unicode support to compare paths.
             return asciiEqlIgnoreCase(it1.next().?, it2.next().?) and asciiEqlIgnoreCase(it1.next().?, it2.next().?);
@@ -337,6 +337,8 @@ pub fn resolveSlice(allocator: *Allocator, paths: []const []const u8) ![]u8 {
 /// If all paths are relative it uses the current working directory as a starting point.
 /// Each drive has its own current working directory.
 /// Path separators are canonicalized to '\\' and drives are canonicalized to capital letters.
+/// Note: all usage of this function should be audited due to the existence of symlinks.
+/// Without performing actual syscalls, resolving `..` could be incorrect.
 pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
     if (paths.len == 0) {
         assert(is_windows); // resolveWindows called on non windows can't use getCwd
@@ -416,7 +418,7 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
             },
             WindowsPath.Kind.NetworkShare => {
                 result = try allocator.alloc(u8, max_size);
-                var it = mem.split(paths[first_index], "/\\");
+                var it = mem.tokenize(paths[first_index], "/\\");
                 const server_name = it.next().?;
                 const other_name = it.next().?;
 
@@ -483,7 +485,7 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
         if (!correct_disk_designator) {
             continue;
         }
-        var it = mem.split(p[parsed.disk_designator.len..], "/\\");
+        var it = mem.tokenize(p[parsed.disk_designator.len..], "/\\");
         while (it.next()) |component| {
             if (mem.eql(u8, component, ".")) {
                 continue;
@@ -516,6 +518,8 @@ pub fn resolveWindows(allocator: *Allocator, paths: []const []const u8) ![]u8 {
 /// It resolves "." and "..".
 /// The result does not have a trailing path separator.
 /// If all paths are relative it uses the current working directory as a starting point.
+/// Note: all usage of this function should be audited due to the existence of symlinks.
+/// Without performing actual syscalls, resolving `..` could be incorrect.
 pub fn resolvePosix(allocator: *Allocator, paths: []const []const u8) ![]u8 {
     if (paths.len == 0) {
         assert(!is_windows); // resolvePosix called on windows can't use getCwd
@@ -550,7 +554,7 @@ pub fn resolvePosix(allocator: *Allocator, paths: []const []const u8) ![]u8 {
     errdefer allocator.free(result);
 
     for (paths[first_index..]) |p, i| {
-        var it = mem.split(p, "/");
+        var it = mem.tokenize(p, "/");
         while (it.next()) |component| {
             if (mem.eql(u8, component, ".")) {
                 continue;
@@ -937,8 +941,8 @@ pub fn relativeWindows(allocator: *Allocator, from: []const u8, to: []const u8)
         return resolved_to;
     }
 
-    var from_it = mem.split(resolved_from, "/\\");
-    var to_it = mem.split(resolved_to, "/\\");
+    var from_it = mem.tokenize(resolved_from, "/\\");
+    var to_it = mem.tokenize(resolved_to, "/\\");
     while (true) {
         const from_component = from_it.next() orelse return mem.dupe(allocator, u8, to_it.rest());
         const to_rest = to_it.rest();
@@ -967,14 +971,12 @@ pub fn relativeWindows(allocator: *Allocator, from: []const u8, to: []const u8)
         // shave off the trailing slash
         result_index -= 1;
 
-        if (to_rest.len > 0) {
-            var rest_it = mem.split(to_rest, "/\\");
-            while (rest_it.next()) |to_component| {
-                result[result_index] = '\\';
-                result_index += 1;
-                mem.copy(u8, result[result_index..], to_component);
-                result_index += to_component.len;
-            }
+        var rest_it = mem.tokenize(to_rest, "/\\");
+        while (rest_it.next()) |to_component| {
+            result[result_index] = '\\';
+            result_index += 1;
+            mem.copy(u8, result[result_index..], to_component);
+            result_index += to_component.len;
         }
 
         return result[0..result_index];
@@ -990,8 +992,8 @@ pub fn relativePosix(allocator: *Allocator, from: []const u8, to: []const u8) ![
     const resolved_to = try resolvePosix(allocator, [][]const u8{to});
     defer allocator.free(resolved_to);
 
-    var from_it = mem.split(resolved_from, "/");
-    var to_it = mem.split(resolved_to, "/");
+    var from_it = mem.tokenize(resolved_from, "/");
+    var to_it = mem.tokenize(resolved_to, "/");
     while (true) {
         const from_component = from_it.next() orelse return mem.dupe(allocator, u8, to_it.rest());
         const to_rest = to_it.rest();

@@ -324,7 +324,7 @@ pub const Builder = struct {
 
     fn processNixOSEnvVars(self: *Builder) void {
         if (os.getEnvVarOwned(self.allocator, "NIX_CFLAGS_COMPILE")) |nix_cflags_compile| {
-            var it = mem.split(nix_cflags_compile, " ");
+            var it = mem.tokenize(nix_cflags_compile, " ");
             while (true) {
                 const word = it.next() orelse break;
                 if (mem.eql(u8, word, "-isystem")) {
@@ -342,7 +342,7 @@ pub const Builder = struct {
             assert(err == error.EnvironmentVariableNotFound);
         }
         if (os.getEnvVarOwned(self.allocator, "NIX_LDFLAGS")) |nix_ldflags| {
-            var it = mem.split(nix_ldflags, " ");
+            var it = mem.tokenize(nix_ldflags, " ");
             while (true) {
                 const word = it.next() orelse break;
                 if (mem.eql(u8, word, "-rpath")) {
@@ -689,7 +689,7 @@ pub const Builder = struct {
                 if (os.path.isAbsolute(name)) {
                     return name;
                 }
-                var it = mem.split(PATH, []u8{os.path.delimiter});
+                var it = mem.tokenize(PATH, []u8{os.path.delimiter});
                 while (it.next()) |path| {
                     const full_path = try os.path.join(self.allocator, path, self.fmt("{}{}", name, exe_extension));
                     if (os.path.real(self.allocator, full_path)) |real_path| {

@@ -689,58 +689,57 @@ pub fn eql_slice_u8(a: []const u8, b: []const u8) bool {
 }
 
 /// Returns an iterator that iterates over the slices of `buffer` that are not
-/// any of the bytes in `split_bytes`.
-/// split("   abc def    ghi  ", " ")
+/// any of the bytes in `delimiter_bytes`.
+/// tokenize("   abc def    ghi  ", " ")
 /// Will return slices for "abc", "def", "ghi", null, in that order.
-/// If `split_bytes` does not exist in buffer,
+/// If `buffer` is empty, the iterator will return null.
+/// If `delimiter_bytes` does not exist in buffer,
 /// the iterator will return `buffer`, null, in that order.
-pub fn split(buffer: []const u8, split_bytes: []const u8) SplitIterator {
-    return SplitIterator{
+/// See also the related function `separate`.
+pub fn tokenize(buffer: []const u8, delimiter_bytes: []const u8) TokenIterator {
+    return TokenIterator{
         .index = 0,
         .buffer = buffer,
-        .split_bytes = split_bytes,
-        .glob = true,
-        .spun = false,
+        .delimiter_bytes = delimiter_bytes,
     };
 }
 
-test "mem.split" {
-    var it = split("   abc def   ghi  ", " ");
+test "mem.tokenize" {
+    var it = tokenize("   abc def   ghi  ", " ");
     assert(eql(u8, it.next().?, "abc"));
     assert(eql(u8, it.next().?, "def"));
     assert(eql(u8, it.next().?, "ghi"));
     assert(it.next() == null);
 
-    it = split("..\\bob", "\\");
+    it = tokenize("..\\bob", "\\");
     assert(eql(u8, it.next().?, ".."));
     assert(eql(u8, "..", "..\\bob"[0..it.index]));
     assert(eql(u8, it.next().?, "bob"));
     assert(it.next() == null);
 
-    it = split("//a/b", "/");
+    it = tokenize("//a/b", "/");
     assert(eql(u8, it.next().?, "a"));
     assert(eql(u8, it.next().?, "b"));
     assert(eql(u8, "//a/b", "//a/b"[0..it.index]));
     assert(it.next() == null);
 
-    it = split("|", "|");
+    it = tokenize("|", "|");
     assert(it.next() == null);
 
-    it = split("", "|");
-    assert(eql(u8, it.next().?, ""));
+    it = tokenize("", "|");
     assert(it.next() == null);
 
-    it = split("hello", "");
+    it = tokenize("hello", "");
     assert(eql(u8, it.next().?, "hello"));
     assert(it.next() == null);
 
-    it = split("hello", " ");
+    it = tokenize("hello", " ");
     assert(eql(u8, it.next().?, "hello"));
     assert(it.next() == null);
 }
 
-test "mem.split (multibyte)" {
-    var it = split("a|b,c/d e", " /,|");
+test "mem.tokenize (multibyte)" {
+    var it = tokenize("a|b,c/d e", " /,|");
     assert(eql(u8, it.next().?, "a"));
     assert(eql(u8, it.next().?, "b"));
     assert(eql(u8, it.next().?, "c"));
@@ -750,18 +749,21 @@ test "mem.split (multibyte)" {
 }
 
 /// Returns an iterator that iterates over the slices of `buffer` that
-/// seperates by bytes in `delimiter`.
+/// are separated by bytes in `delimiter`.
 /// separate("abc|def||ghi", "|")
-/// Will return slices for "abc", "def", "", "ghi", null, in that order.
+/// will return slices for "abc", "def", "", "ghi", null, in that order.
 /// If `delimiter` does not exist in buffer,
 /// the iterator will return `buffer`, null, in that order.
+/// The delimiter length must not be zero.
+/// See also the related function `tokenize`.
+/// It is planned to rename this function to `split` before 1.0.0, like this:
+/// pub fn split(buffer: []const u8, delimiter: []const u8) SplitIterator {
 pub fn separate(buffer: []const u8, delimiter: []const u8) SplitIterator {
+    assert(delimiter.len != 0);
     return SplitIterator{
         .index = 0,
         .buffer = buffer,
-        .split_bytes = delimiter,
-        .glob = false,
-        .spun = false,
+        .delimiter = delimiter,
     };
 }
 
@@ -782,19 +784,15 @@ test "mem.separate" {
     assert(eql(u8, it.next().?, ""));
     assert(it.next() == null);
 
-    it = separate("hello", "");
-    assert(eql(u8, it.next().?, "hello"));
-    assert(it.next() == null);
-
     it = separate("hello", " ");
     assert(eql(u8, it.next().?, "hello"));
     assert(it.next() == null);
 }
 
 test "mem.separate (multibyte)" {
-    var it = separate("a|b,c/d e", " /,|");
+    var it = separate("a, b ,, c, d, e", ", ");
     assert(eql(u8, it.next().?, "a"));
-    assert(eql(u8, it.next().?, "b"));
+    assert(eql(u8, it.next().?, "b ,"));
     assert(eql(u8, it.next().?, "c"));
     assert(eql(u8, it.next().?, "d"));
     assert(eql(u8, it.next().?, "e"));
@@ -819,49 +817,38 @@ test "mem.endsWith" {
     assert(!endsWith(u8, "Bob", "Bo"));
 }
 
-pub const SplitIterator = struct {
+pub const TokenIterator = struct {
     buffer: []const u8,
-    split_bytes: []const u8,
+    delimiter_bytes: []const u8,
     index: usize,
-    glob: bool,
-    spun: bool,
 
-    /// Iterates and returns null or optionally a slice the next split segment
-    pub fn next(self: *SplitIterator) ?[]const u8 {
-        if (self.spun) {
-            if (self.index + 1 > self.buffer.len) return null;
-            self.index += 1;
-        }
-
-        self.spun = true;
-
-        if (self.glob) {
-            while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
+    /// Returns a slice of the next token, or null if tokenization is complete.
+    pub fn next(self: *TokenIterator) ?[]const u8 {
+        // move to beginning of token
+        while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
+        const start = self.index;
+        if (start == self.buffer.len) {
+            return null;
         }
 
-        var cursor = self.index;
-        while (cursor < self.buffer.len and !self.isSplitByte(self.buffer[cursor])) : (cursor += 1) {}
-
-        defer self.index = cursor;
+        // move to end of token
+        while (self.index < self.buffer.len and !self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
+        const end = self.index;
 
-        if (cursor == self.buffer.len) {
-            return if (self.glob and self.index == cursor and self.index > 0) null else self.buffer[self.index..];
-        }
-
-        return self.buffer[self.index..cursor];
+        return self.buffer[start..end];
     }
 
     /// Returns a slice of the remaining bytes. Does not affect iterator state.
-    pub fn rest(self: *const SplitIterator) []const u8 {
+    pub fn rest(self: TokenIterator) []const u8 {
         // move to beginning of token
         var index: usize = self.index;
         while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
         return self.buffer[index..];
     }
 
-    fn isSplitByte(self: *const SplitIterator, byte: u8) bool {
-        for (self.split_bytes) |split_byte| {
-            if (byte == split_byte) {
+    fn isSplitByte(self: TokenIterator, byte: u8) bool {
+        for (self.delimiter_bytes) |delimiter_byte| {
+            if (byte == delimiter_byte) {
                 return true;
             }
         }
@@ -869,6 +856,32 @@ pub const SplitIterator = struct {
     }
 };
 
+pub const SplitIterator = struct {
+    buffer: []const u8,
+    index: ?usize,
+    delimiter: []const u8,
+
+    /// Returns a slice of the next field, or null if splitting is complete.
+    pub fn next(self: *SplitIterator) ?[]const u8 {
+        const start = self.index orelse return null;
+        const end = if (indexOfPos(u8, self.buffer, start, self.delimiter)) |delim_start| blk: {
+            self.index = delim_start + self.delimiter.len;
+            break :blk delim_start;
+        } else blk: {
+            self.index = null;
+            break :blk self.buffer.len;
+        };
+        return self.buffer[start..end];
+    }
+
+    /// Returns a slice of the remaining bytes. Does not affect iterator state.
+    pub fn rest(self: SplitIterator) []const u8 {
+        const end = self.buffer.len;
+        const start = self.index orelse end;
+        return self.buffer[start..end];
+    }
+};
+
 /// Naively combines a series of strings with a separator.
 /// Allocates memory for the result, which must be freed by the caller.
 pub fn join(allocator: *Allocator, sep: u8, strings: ...) ![]u8 {

@@ -189,14 +189,14 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
     const prefix_output = try b.exec([][]const u8{ llvm_config_exe, "--prefix" });
 
     var result = LibraryDep{
-        .prefix = mem.split(prefix_output, " \r\n").next().?,
+        .prefix = mem.tokenize(prefix_output, " \r\n").next().?,
         .libs = ArrayList([]const u8).init(b.allocator),
         .system_libs = ArrayList([]const u8).init(b.allocator),
         .includes = ArrayList([]const u8).init(b.allocator),
         .libdirs = ArrayList([]const u8).init(b.allocator),
     };
     {
-        var it = mem.split(libs_output, " \r\n");
+        var it = mem.tokenize(libs_output, " \r\n");
         while (it.next()) |lib_arg| {
             if (mem.startsWith(u8, lib_arg, "-l")) {
                 try result.system_libs.append(lib_arg[2..]);
@@ -210,7 +210,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
         }
     }
     {
-        var it = mem.split(includes_output, " \r\n");
+        var it = mem.tokenize(includes_output, " \r\n");
         while (it.next()) |include_arg| {
             if (mem.startsWith(u8, include_arg, "-I")) {
                 try result.includes.append(include_arg[2..]);
@@ -220,7 +220,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
         }
     }
     {
-        var it = mem.split(libdir_output, " \r\n");
+        var it = mem.tokenize(libdir_output, " \r\n");
         while (it.next()) |libdir| {
             if (mem.startsWith(u8, libdir, "-L")) {
                 try result.libdirs.append(libdir[2..]);
@@ -233,7 +233,7 @@ fn findLLVM(b: *Builder, llvm_config_exe: []const u8) !LibraryDep {
 }
 
 pub fn installStdLib(b: *Builder, stdlib_files: []const u8) void {
-    var it = mem.split(stdlib_files, ";");
+    var it = mem.tokenize(stdlib_files, ";");
     while (it.next()) |stdlib_file| {
         const src_path = os.path.join(b.allocator, "std", stdlib_file) catch unreachable;
         const dest_path = os.path.join(b.allocator, "lib", "zig", "std", stdlib_file) catch unreachable;
@@ -242,7 +242,7 @@ pub fn installStdLib(b: *Builder, stdlib_files: []const u8) void {
 }
 
 pub fn installCHeaders(b: *Builder, c_header_files: []const u8) void {
-    var it = mem.split(c_header_files, ";");
+    var it = mem.tokenize(c_header_files, ";");
     while (it.next()) |c_header_file| {
         const src_path = os.path.join(b.allocator, "c_headers", c_header_file) catch unreachable;
         const dest_path = os.path.join(b.allocator, "lib", "zig", "include", c_header_file) catch unreachable;
@@ -277,7 +277,7 @@ fn configureStage2(b: *Builder, exe: var, ctx: Context) !void {
     addCppLib(b, exe, ctx.cmake_binary_dir, "zig_cpp");
     if (ctx.lld_include_dir.len != 0) {
         exe.addIncludeDir(ctx.lld_include_dir);
-        var it = mem.split(ctx.lld_libraries, ";");
+        var it = mem.tokenize(ctx.lld_libraries, ";");
         while (it.next()) |lib| {
             exe.addObjectFile(lib);
         }
@@ -334,7 +334,7 @@ fn addCxxKnownPath(
         ctx.cxx_compiler,
         b.fmt("-print-file-name={}", objname),
     });
-    const path_unpadded = mem.split(path_padded, "\r\n").next().?;
+    const path_unpadded = mem.tokenize(path_padded, "\r\n").next().?;
     if (mem.eql(u8, path_unpadded, objname)) {
         if (errtxt) |msg| {
             warn("{}", msg);

Commit 67bd45f0cf

Commit `67bd45f0cf`