Commit b76398c993

Shawn Landden <shawn@git.icu>
2019-03-22 03:33:37
std: add ascii with C ASCII character classes
Does NOT look at the locale the way the C functions do. int isalnum(int c); int isalpha(int c); int iscntrl(int c); int isdigit(int c); int isgraph(int c); int islower(int c); int isprint(int c); int ispunct(int c); int isspace(int c); int isupper(int c); int isxdigit(int c); int isascii(int c); int isblank(int c); int toupper(int c); int tolower(int c); Tested to match glibc (when using C locale) with this program: const c = @cImport({ // See https://github.com/ziglang/zig/issues/515 @cDefine("_NO_CRT_STDIO_INLINE", "1"); @cInclude("stdio.h"); @cInclude("string.h"); @cInclude("ctype.h"); }); const std = @import("std"); const ascii = std.ascii; const abort = std.os.abort; export fn main(argc: c_int, argv: **u8) c_int { var i: u8 = undefined; i = 0; while (true) { if (ascii.isAlNum(i) != (c.isalnum(i) > 0)) { abort(); } if (ascii.isAlpha(i) != (c.isalpha(i) > 0)) { abort(); } if (ascii.isCtrl(i) != (c.iscntrl(i) > 0)) { abort(); } if (ascii.isDigit(i) != (c.isdigit(i) > 0)) { abort(); } if (ascii.isGraph(i) != (c.isgraph(i) > 0)) { abort(); } if (ascii.isLower(i) != (c.islower(i) > 0)) { abort(); } if (ascii.isPrint(i) != (c.isprint(i) > 0)) { abort(); } if (ascii.isPunct(i) != (c.ispunct(i) > 0)) { abort(); } if (ascii.isSpace(i) != (c.isspace(i) > 0)) { abort(); } if (ascii.isUpper(i) != (c.isupper(i) > 0)) { abort(); } if (ascii.isXDigit(i) != (c.isxdigit(i) > 0)) { abort(); } if (i == 255) { break; } i += 1; } _ = c.printf(c"Success!\n"); return 0; }
1 parent 6272847
Changed files (3)
std/ascii.zig
@@ -0,0 +1,232 @@
+// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera do.
+// I could have taken only a u7 to make this clear, but it would be slower
+// (and 128 bytes is not much to pay).
+//
+// It is my opinion that encodings other than UTF-8 should not be supported.
+// Also does not handle Unicode character classes.
+//
+// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png
+
+// Bit position of each table-driven character class inside a combinedTable
+// entry: the class with enum value N is stored in bit N of combinedTable[c].
+// The eight members exactly fill the u3 tag, one per bit of a u8 entry.
+// The commented-out names are classes that are not stored in the table; the
+// public functions compute them directly using the rule noted beside each.
+const tIndex = enum(u3) {
+    Alpha,
+    Hex,
+    Space,
+    Digit,
+    Lower,
+    Upper,
+    // Ctrl: c < 0x20 or c == DEL (127)
+    // Print: Graph or c == ' '. NOT '\t' et cetera
+    Punct,
+    Graph,
+    // ASCII: no bits outside 0b01111111 set, i.e. c < 128
+    // Blank: c == ' ' or c == '\x09'
+};
+
+// Lookup table with one byte per possible u8 value, filled in at compile time.
+// Bit @enumToInt(class) of combinedTable[c] is 1 when byte c belongs to that
+// tIndex class. Entries 128..255 are all zero, so no byte outside the ASCII
+// range belongs to any table-driven class.
+const combinedTable = init: {
+    comptime var table: [256]u8 = undefined;
+
+    const std = @import("std");
+    const mem = std.mem;
+
+    // Each array below holds one flag per ASCII code 0..127, written as eight
+    // rows of sixteen codes (row k starts at code 16 * k).
+    // Letters: 'A'-'Z' and 'a'-'z'.
+    const alpha = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+    };
+    // Lowercase letters: 'a'-'z'.
+    const lower = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+    };
+    // Uppercase letters: 'A'-'Z'.
+    const upper = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    // Decimal digits: '0'-'9'.
+    const digit = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    // Hexadecimal digits: '0'-'9', 'A'-'F' and 'a'-'f'.
+    const hex = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
+
+        0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    // Whitespace: codes 0x09-0x0d and the space character 0x20.
+    const space = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    // Punctuation: codes 0x21-0x2f, 0x3a-0x40, 0x5b-0x60 and 0x7b-0x7e.
+    const punct = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+
+        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+    };
+    // Graphic characters: every code from 0x21 through 0x7e (space and DEL excluded).
+    const graph = []u1{
+    //  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+    };
+
+    // Pack the eight one-bit flags for each ASCII code into a single byte,
+    // placing every class at the bit position given by its tIndex value.
+    comptime var i = 0;
+    inline while (i < 128) : (i += 1) {
+        table[i] =
+            u8(alpha[i]) << @enumToInt(tIndex.Alpha) |
+            u8(hex[i])   << @enumToInt(tIndex.Hex) |
+            u8(space[i]) << @enumToInt(tIndex.Space) |
+            u8(digit[i]) << @enumToInt(tIndex.Digit) |
+            u8(lower[i]) << @enumToInt(tIndex.Lower) |
+            u8(upper[i]) << @enumToInt(tIndex.Upper) |
+            u8(punct[i]) << @enumToInt(tIndex.Punct) |
+            u8(graph[i]) << @enumToInt(tIndex.Graph);
+    }
+    // Bytes with the high bit set belong to no class.
+    mem.set(u8, table[128..256], 0);
+    break :init table;
+};
+
+// Reports whether byte `c` has the bit for class `t` set in its
+// combinedTable entry.
+fn inTable(c: u8, t: tIndex) bool {
+    const entry = combinedTable[c];
+    const bit = @enumToInt(t);
+    return ((entry >> bit) & 1) != 0;
+}
+
+/// Returns true when `c` is an ASCII letter or an ASCII decimal digit.
+pub fn isAlNum(c: u8) bool {
+    const alpha_bit = u8(1) << @enumToInt(tIndex.Alpha);
+    const digit_bit = u8(1) << @enumToInt(tIndex.Digit);
+    // A single table read answers both class questions at once.
+    const mask = alpha_bit | digit_bit;
+    return (combinedTable[c] & mask) != 0;
+}
+
+/// Returns true when `c` is an ASCII letter, 'A'-'Z' or 'a'-'z'.
+pub fn isAlpha(c: u8) bool {
+    const class = tIndex.Alpha;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is an ASCII control character: any code below
+/// 0x20, or DEL (0x7f).
+pub fn isCtrl(c: u8) bool {
+    return switch (c) {
+        0x00...0x1f, 0x7f => true,
+        else => false,
+    };
+}
+
+/// Alias for isCtrl, spelled like C's iscntrl(); returns exactly what
+/// isCtrl(c) returns.
+pub fn isCntrl(c: u8) bool {
+    return isCtrl(c);
+}
+
+/// Returns true when `c` is an ASCII decimal digit, '0'-'9'.
+pub fn isDigit(c: u8) bool {
+    const class = tIndex.Digit;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is a graphic ASCII character, i.e. any code from
+/// 0x21 through 0x7e. The space character is not graphic.
+pub fn isGraph(c: u8) bool {
+    const class = tIndex.Graph;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is an ASCII lowercase letter, 'a'-'z'.
+pub fn isLower(c: u8) bool {
+    const class = tIndex.Lower;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is printable: either the space character ' ' or a
+/// graphic character. Other whitespace such as '\t' is not printable.
+pub fn isPrint(c: u8) bool {
+    if (c == ' ') return true;
+    return inTable(c, tIndex.Graph);
+}
+
+/// Returns true when `c` is ASCII punctuation: a graphic character that is
+/// neither a letter nor a decimal digit.
+pub fn isPunct(c: u8) bool {
+    const class = tIndex.Punct;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is ASCII whitespace: the space character ' ' or
+/// one of the codes 0x09 through 0x0d (which include '\t', '\n' and '\r').
+pub fn isSpace(c: u8) bool {
+    const class = tIndex.Space;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is an ASCII uppercase letter, 'A'-'Z'.
+pub fn isUpper(c: u8) bool {
+    const class = tIndex.Upper;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is an ASCII hexadecimal digit: '0'-'9', 'A'-'F'
+/// or 'a'-'f'.
+pub fn isXDigit(c: u8) bool {
+    const class = tIndex.Hex;
+    return inTable(c, class);
+}
+
+/// Returns true when `c` is a 7-bit ASCII code, i.e. its high bit is clear.
+pub fn isASCII(c: u8) bool {
+    return (c & 0x80) == 0;
+}
+
+/// Returns true when `c` is a blank: the space character or a horizontal
+/// tab (0x09). Line-breaking whitespace is not blank.
+pub fn isBlank(c: u8) bool {
+    return switch (c) {
+        ' ', '\t' => true,
+        else => false,
+    };
+}
+
+/// Converts an ASCII lowercase letter to its uppercase form; every other
+/// byte is returned unchanged.
+pub fn toUpper(c: u8) u8 {
+    // Upper- and lowercase ASCII letters sit exactly 0x20 apart.
+    return if (isLower(c)) c - 0x20 else c;
+}
+
+/// Converts an ASCII uppercase letter to its lowercase form; every other
+/// byte is returned unchanged.
+pub fn toLower(c: u8) u8 {
+    // Upper- and lowercase ASCII letters sit exactly 0x20 apart.
+    return if (isUpper(c)) c + 0x20 else c;
+}
+
+// Spot-checks every public predicate and both case converters, focusing on
+// the codes that sit right next to a class boundary, on DEL, and on bytes
+// with the high bit set (which must belong to no class).
+test "ascii character classes" {
+    const std = @import("std");
+    const testing = std.testing;
+
+    // Case conversion changes ASCII letters only.
+    testing.expect('C' == toUpper('c'));
+    testing.expect(':' == toUpper(':'));
+    testing.expect('\xab' == toUpper('\xab'));
+    testing.expect('c' == toLower('C'));
+    testing.expect('C' == toUpper('C'));
+    testing.expect('c' == toLower('c'));
+    testing.expect('Z' == toUpper('z'));
+    testing.expect('a' == toLower('A'));
+    testing.expect('@' == toLower('@')); // code just before 'A'
+    testing.expect('[' == toLower('[')); // code just after 'Z'
+    testing.expect('`' == toUpper('`')); // code just before 'a'
+    testing.expect('{' == toUpper('{')); // code just after 'z'
+    testing.expect('\xab' == toLower('\xab'));
+
+    // Letters and digits.
+    testing.expect(isAlpha('c'));
+    testing.expect(!isAlpha('5'));
+    testing.expect(!isAlpha('@'));
+    testing.expect(!isAlpha('['));
+    testing.expect(isUpper('Z'));
+    testing.expect(!isUpper('z'));
+    testing.expect(isLower('z'));
+    testing.expect(!isLower('Z'));
+    testing.expect(isDigit('0'));
+    testing.expect(isDigit('9'));
+    testing.expect(!isDigit('/'));
+    testing.expect(!isDigit(':'));
+    testing.expect(isAlNum('5'));
+    testing.expect(isAlNum('z'));
+    testing.expect(!isAlNum('_'));
+    testing.expect(isXDigit('9'));
+    testing.expect(isXDigit('f'));
+    testing.expect(isXDigit('F'));
+    testing.expect(!isXDigit('g'));
+    testing.expect(!isXDigit('G'));
+
+    // Whitespace and blanks.
+    testing.expect(isSpace(' '));
+    testing.expect(isSpace('\t'));
+    testing.expect(isSpace('\n'));
+    testing.expect(isSpace('\r'));
+    testing.expect(!isSpace('a'));
+    testing.expect(isBlank(' '));
+    testing.expect(isBlank('\t'));
+    testing.expect(!isBlank('\n'));
+
+    // Control, graphic, printable and punctuation characters.
+    testing.expect(isCtrl(0));
+    testing.expect(isCtrl(0x1f));
+    testing.expect(isCtrl(127));
+    testing.expect(isCntrl(127));
+    testing.expect(!isCtrl(' '));
+    testing.expect(isGraph('!'));
+    testing.expect(isGraph('~'));
+    testing.expect(!isGraph(' '));
+    testing.expect(!isGraph(127));
+    testing.expect(isPrint(' '));
+    testing.expect(isPrint('a'));
+    testing.expect(!isPrint('\t'));
+    testing.expect(!isPrint(127));
+    testing.expect(isPunct('!'));
+    testing.expect(isPunct('~'));
+    testing.expect(!isPunct(' '));
+    testing.expect(!isPunct('a'));
+    testing.expect(!isPunct('0'));
+
+    // ASCII range boundary.
+    testing.expect(isASCII(0));
+    testing.expect(isASCII(127));
+    testing.expect(!isASCII(128));
+
+    // No byte with the high bit set belongs to any character class.
+    var c: u8 = 128;
+    while (true) {
+        testing.expect(!isASCII(c));
+        testing.expect(!isAlNum(c));
+        testing.expect(!isAlpha(c));
+        testing.expect(!isCtrl(c));
+        testing.expect(!isDigit(c));
+        testing.expect(!isGraph(c));
+        testing.expect(!isLower(c));
+        testing.expect(!isPrint(c));
+        testing.expect(!isPunct(c));
+        testing.expect(!isSpace(c));
+        testing.expect(!isUpper(c));
+        testing.expect(!isXDigit(c));
+        testing.expect(!isBlank(c));
+        testing.expect(c == toUpper(c));
+        testing.expect(c == toLower(c));
+        if (c == 255) break;
+        c += 1;
+    }
+}
std/std.zig
@@ -42,6 +42,7 @@ pub const pdb = @import("pdb.zig");
 pub const rand = @import("rand.zig");
 pub const rb = @import("rb.zig");
 pub const sort = @import("sort.zig");
+pub const ascii = @import("ascii.zig");
 pub const testing = @import("testing.zig");
 pub const unicode = @import("unicode.zig");
 pub const valgrind = @import("valgrind.zig");
CMakeLists.txt
@@ -446,6 +446,7 @@ set(ZIG_CPP_SOURCES
 
 set(ZIG_STD_FILES
     "array_list.zig"
+    "ascii.zig"
     "atomic.zig"
     "atomic/int.zig"
     "atomic/queue.zig"