zig/lib/std/ascii.zig at master

  1//! The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard.
  2//!
  3//! This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding.
  4//!
  5//! Even though this module concerns itself with 7-bit ASCII,
  6//! functions use `u8` as the type instead of `u7` for convenience and compatibility.
  7//! Characters outside of the 7-bit range are gracefully handled (e.g. by returning `false`).
  8//!
  9//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set
 10
 11const std = @import("std");
 12
 13pub const lowercase = "abcdefghijklmnopqrstuvwxyz";
 14pub const uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 15pub const letters = lowercase ++ uppercase;
 16
 17/// The C0 control codes of the ASCII encoding.
 18///
 19/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`
 20pub const control_code = struct {
 21    /// Null.
 22    pub const nul = 0x00;
 23    /// Start of Heading.
 24    pub const soh = 0x01;
 25    /// Start of Text.
 26    pub const stx = 0x02;
 27    /// End of Text.
 28    pub const etx = 0x03;
 29    /// End of Transmission.
 30    pub const eot = 0x04;
 31    /// Enquiry.
 32    pub const enq = 0x05;
 33    /// Acknowledge.
 34    pub const ack = 0x06;
 35    /// Bell, Alert.
 36    pub const bel = 0x07;
 37    /// Backspace.
 38    pub const bs = 0x08;
 39    /// Horizontal Tab, Tab ('\t').
 40    pub const ht = 0x09;
 41    /// Line Feed, Newline ('\n').
 42    pub const lf = 0x0A;
 43    /// Vertical Tab.
 44    pub const vt = 0x0B;
 45    /// Form Feed.
 46    pub const ff = 0x0C;
 47    /// Carriage Return ('\r').
 48    pub const cr = 0x0D;
 49    /// Shift Out.
 50    pub const so = 0x0E;
 51    /// Shift In.
 52    pub const si = 0x0F;
 53    /// Data Link Escape.
 54    pub const dle = 0x10;
 55    /// Device Control One (XON).
 56    pub const dc1 = 0x11;
 57    /// Device Control Two.
 58    pub const dc2 = 0x12;
 59    /// Device Control Three (XOFF).
 60    pub const dc3 = 0x13;
 61    /// Device Control Four.
 62    pub const dc4 = 0x14;
 63    /// Negative Acknowledge.
 64    pub const nak = 0x15;
 65    /// Synchronous Idle.
 66    pub const syn = 0x16;
 67    /// End of Transmission Block
 68    pub const etb = 0x17;
 69    /// Cancel.
 70    pub const can = 0x18;
 71    /// End of Medium.
 72    pub const em = 0x19;
 73    /// Substitute.
 74    pub const sub = 0x1A;
 75    /// Escape.
 76    pub const esc = 0x1B;
 77    /// File Separator.
 78    pub const fs = 0x1C;
 79    /// Group Separator.
 80    pub const gs = 0x1D;
 81    /// Record Separator.
 82    pub const rs = 0x1E;
 83    /// Unit Separator.
 84    pub const us = 0x1F;
 85
 86    /// Delete.
 87    pub const del = 0x7F;
 88
 89    /// An alias to `dc1`.
 90    pub const xon = dc1;
 91    /// An alias to `dc3`.
 92    pub const xoff = dc3;
 93};
 94
 95/// Returns whether the character is alphanumeric: A-Z, a-z, or 0-9.
 96pub fn isAlphanumeric(c: u8) bool {
 97    return switch (c) {
 98        '0'...'9', 'A'...'Z', 'a'...'z' => true,
 99        else => false,
100    };
101}
102
/// Reports whether `c` is an ASCII letter: A-Z or a-z.
/// Any other byte, including bytes >= 0x80, yields `false`.
pub fn isAlphabetic(c: u8) bool {
    // Upper- and lowercase ASCII letters differ only in bit 5 (0x20), so
    // setting that bit maps 'A'...'Z' onto 'a'...'z'. No other byte lands in
    // that range, which leaves a single range test.
    const folded = c | 0x20;
    return folded >= 'a' and folded <= 'z';
}
110
/// Reports whether `c` is an ASCII control character, i.e. one of the C0
/// codes (`control_code.nul` through `control_code.us`) or `control_code.del`.
///
/// See also: `control_code`
pub fn isControl(c: u8) bool {
    return switch (c) {
        control_code.nul...control_code.us, control_code.del => true,
        else => false,
    };
}
117
/// Reports whether `c` is an ASCII decimal digit, '0' through '9'.
pub fn isDigit(c: u8) bool {
    return c >= '0' and c <= '9';
}
125
/// Reports whether `c` is a lowercase ASCII letter, 'a' through 'z'.
pub fn isLower(c: u8) bool {
    return 'a' <= c and c <= 'z';
}
133
/// Reports whether `c` is a printable ASCII character: any 7-bit character
/// that is not a control character. The space character counts as printable.
pub fn isPrint(c: u8) bool {
    // ASCII without the C0 codes (0x00-0x1F) and DEL (0x7F) is exactly the
    // contiguous range from space (0x20) to tilde (0x7E).
    return switch (c) {
        ' '...'~' => true,
        else => false,
    };
}
139
/// Reports whether `c` is one of the characters listed in `whitespace`:
/// space, tab, line feed, vertical tab, form feed, or carriage return.
pub fn isWhitespace(c: u8) bool {
    return switch (c) {
        ' ', // space
        '\t', // horizontal tab (0x09)
        '\n', // line feed (0x0A)
        0x0B, // vertical tab
        0x0C, // form feed
        '\r', // carriage return (0x0D)
        => true,
        else => false,
    };
}
147
/// The set of ASCII whitespace characters, for general use:
/// space, tab, line feed, carriage return, vertical tab and form feed.
/// Suitable as the strip set for e.g. `std.mem.trim`.
///
/// See also: `isWhitespace`
pub const whitespace: [6]u8 = .{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff };
153
test whitespace {
    // Every byte listed in `whitespace` must be classified as whitespace...
    for (whitespace) |listed| {
        try std.testing.expect(isWhitespace(listed));
    }

    // ...and every ASCII byte classified as whitespace must be listed.
    var c: u8 = 0;
    while (isAscii(c)) : (c += 1) {
        if (!isWhitespace(c)) continue;
        try std.testing.expect(std.mem.indexOfScalar(u8, &whitespace, c) != null);
    }
}
162
/// Reports whether `c` is an uppercase ASCII letter, 'A' through 'Z'.
pub fn isUpper(c: u8) bool {
    return 'A' <= c and c <= 'Z';
}
170
/// Reports whether `c` is an ASCII hexadecimal digit: 0-9, A-F, or a-f.
pub fn isHex(c: u8) bool {
    if (c >= '0' and c <= '9') return true;
    // Setting bit 5 (0x20) maps 'A'...'F' onto 'a'...'f'. No other byte lands
    // in that range, so one comparison covers both letter cases.
    const folded = c | 0x20;
    return folded >= 'a' and folded <= 'f';
}
178
/// Reports whether `c` lies in the 7-bit ASCII range, 0x00 through 0x7F.
pub fn isAscii(c: u8) bool {
    return c <= 0x7F;
}
183
/// Converts a lowercase ASCII letter to uppercase.
/// Every other byte (uppercase letters, non-letters, bytes >= 0x80) is returned unchanged.
pub fn toUpper(c: u8) u8 {
    // Branchless: `is_lower` is 1 only for 'a'...'z'. Shifted into bit 5
    // (0x20) it becomes the case bit, which the XOR clears for lowercase
    // letters and leaves alone (XOR with 0) for everything else.
    const is_lower: u8 = @intFromBool(isLower(c));
    return c ^ (is_lower << 5);
}
189
/// Converts an uppercase ASCII letter to lowercase.
/// Every other byte (lowercase letters, non-letters, bytes >= 0x80) is returned unchanged.
pub fn toLower(c: u8) u8 {
    // Branchless: `is_upper` is 1 only for 'A'...'Z'. Shifted into bit 5
    // (0x20) it becomes the case bit, which the OR sets for uppercase letters
    // and leaves alone (OR with 0) for everything else.
    const is_upper: u8 = @intFromBool(isUpper(c));
    return c | (is_upper << 5);
}
195
test "ASCII character classes" {
    const expect = std.testing.expect;
    const nul = control_code.nul;
    const ff = control_code.ff;
    const us = control_code.us;
    const del = control_code.del;
    const esc = control_code.esc;

    // isControl
    for ([_]u8{ 'a', 'z', ' ', 0x80, 0xff }) |c| try expect(!isControl(c));
    for ([_]u8{ nul, ff, us, del }) |c| try expect(isControl(c));

    // toUpper / isUpper
    try expect('C' == toUpper('c'));
    try expect(':' == toUpper(':'));
    try expect('\xab' == toUpper('\xab'));
    for ([_]u8{ 'z', 0x80, 0xff }) |c| try expect(!isUpper(c));

    // toLower / isLower
    try expect('c' == toLower('C'));
    try expect(':' == toLower(':'));
    try expect('\xab' == toLower('\xab'));
    for ([_]u8{ 'Z', 0x80, 0xff }) |c| try expect(!isLower(c));

    // isAlphanumeric
    for ([_]u8{ 'Z', 'z', '5', 'a' }) |c| try expect(isAlphanumeric(c));
    for ([_]u8{ '!', 0x80, 0xff }) |c| try expect(!isAlphanumeric(c));

    // isAlphabetic
    for ([_]u8{ 'c', 'Z' }) |c| try expect(isAlphabetic(c));
    for ([_]u8{ '5', '@', 0x80, 0xff }) |c| try expect(!isAlphabetic(c));

    // isWhitespace
    for ([_]u8{ ' ', '\t', '\r', '\n', ff }) |c| try expect(isWhitespace(c));
    for ([_]u8{ '.', us, 0x80, 0xff }) |c| try expect(!isWhitespace(c));

    // isHex
    for ([_]u8{ 'b', 'F', '9' }) |c| try expect(isHex(c));
    for ([_]u8{ 'g', 0x80, 0xff }) |c| try expect(!isHex(c));

    // isDigit
    for ([_]u8{ '0', '9' }) |c| try expect(isDigit(c));
    for ([_]u8{ '~', 0x80, 0xff }) |c| try expect(!isDigit(c));

    // isPrint
    for ([_]u8{ ' ', '@', '~' }) |c| try expect(isPrint(c));
    for ([_]u8{ esc, 0x80, 0xff }) |c| try expect(!isPrint(c));
}
268
/// Lowercases `ascii_string` into the front of `output` and returns the
/// written prefix, `output[0..ascii_string.len]`.
/// Bytes that are not uppercase ASCII letters are copied unchanged.
/// Asserts `output.len >= ascii_string.len`.
pub fn lowerString(output: []u8, ascii_string: []const u8) []u8 {
    std.debug.assert(output.len >= ascii_string.len);
    const written = output[0..ascii_string.len];
    for (written, ascii_string) |*dst, src| {
        dst.* = toLower(src);
    }
    return written;
}
278
test lowerString {
    // The multi-byte emoji must pass through untouched.
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    var scratch: [1024]u8 = undefined;
    try std.testing.expectEqualStrings("abcdefghijklmnopqrst0234+💩!", lowerString(&scratch, input));
}
284
/// Returns a newly allocated, lowercased copy of `ascii_string`.
/// The copy has the same length as the input. The caller owns it and must
/// free it with `allocator`. Fails only if the allocation fails.
pub fn allocLowerString(allocator: std.mem.Allocator, ascii_string: []const u8) ![]u8 {
    const copy = try allocator.alloc(u8, ascii_string.len);
    // `copy` is exactly `ascii_string.len` bytes, so the prefix that
    // `lowerString` returns is the whole allocation.
    return lowerString(copy, ascii_string);
}
291
test allocLowerString {
    const gpa = std.testing.allocator;
    const lowered = try allocLowerString(gpa, "aBcDeFgHiJkLmNOPqrst0234+💩!");
    defer gpa.free(lowered);
    try std.testing.expectEqualStrings("abcdefghijklmnopqrst0234+💩!", lowered);
}
297
/// Uppercases `ascii_string` into the front of `output` and returns the
/// written prefix, `output[0..ascii_string.len]`.
/// Bytes that are not lowercase ASCII letters are copied unchanged.
/// Asserts `output.len >= ascii_string.len`.
pub fn upperString(output: []u8, ascii_string: []const u8) []u8 {
    std.debug.assert(output.len >= ascii_string.len);
    const written = output[0..ascii_string.len];
    for (written, ascii_string) |*dst, src| {
        dst.* = toUpper(src);
    }
    return written;
}
307
test upperString {
    // The multi-byte emoji must pass through untouched.
    const input = "aBcDeFgHiJkLmNOPqrst0234+💩!";
    var scratch: [1024]u8 = undefined;
    try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", upperString(&scratch, input));
}
313
/// Returns a newly allocated, uppercased copy of `ascii_string`.
/// The copy has the same length as the input. The caller owns it and must
/// free it with `allocator`. Fails only if the allocation fails.
pub fn allocUpperString(allocator: std.mem.Allocator, ascii_string: []const u8) ![]u8 {
    const copy = try allocator.alloc(u8, ascii_string.len);
    // `copy` is exactly `ascii_string.len` bytes, so the prefix that
    // `upperString` returns is the whole allocation.
    return upperString(copy, ascii_string);
}
320
test allocUpperString {
    const gpa = std.testing.allocator;
    const uppered = try allocUpperString(gpa, "aBcDeFgHiJkLmNOPqrst0234+💩!");
    defer gpa.free(uppered);
    try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", uppered);
}
326
/// Reports whether `a` and `b` are equal when ASCII letter case is ignored.
/// Slices of different lengths are never equal. Bytes that are not ASCII
/// letters must match exactly.
pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
    if (a.len != b.len) return false;
    // The lengths are equal here, as the parallel `for` requires.
    for (a, b) |a_c, b_c| {
        if (toLower(a_c) != toLower(b_c)) return false;
    }
    return true;
}
335
test eqlIgnoreCase {
    const expect = std.testing.expect;
    // Same text in different case, with a non-ASCII sequence in the middle.
    try expect(eqlIgnoreCase("HEl💩Lo!", "hel💩lo!"));
    // Different lengths.
    try expect(!eqlIgnoreCase("hElLo!", "hello! "));
    // Same length, one differing letter.
    try expect(!eqlIgnoreCase("hElLo!", "helro!"));
}
341
/// Reports whether `haystack` begins with `needle`, ignoring ASCII letter case.
/// A `needle` longer than `haystack` never matches.
pub fn startsWithIgnoreCase(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    const prefix = haystack[0..needle.len];
    return eqlIgnoreCase(prefix, needle);
}
345
test startsWithIgnoreCase {
    const expect = std.testing.expect;
    try expect(startsWithIgnoreCase("boB", "Bo"));
    // The needle occurs, but not at the start.
    try expect(!startsWithIgnoreCase("Needle in hAyStAcK", "haystack"));
}
350
/// Reports whether `haystack` ends with `needle`, ignoring ASCII letter case.
/// A `needle` longer than `haystack` never matches.
pub fn endsWithIgnoreCase(haystack: []const u8, needle: []const u8) bool {
    if (needle.len > haystack.len) return false;
    const suffix = haystack[haystack.len - needle.len ..];
    return eqlIgnoreCase(suffix, needle);
}
354
test endsWithIgnoreCase {
    const expect = std.testing.expect;
    try expect(endsWithIgnoreCase("Needle in HaYsTaCk", "haystack"));
    // The needle occurs, but not at the end.
    try expect(!endsWithIgnoreCase("BoB", "Bo"));
}
359
/// Searches all of `haystack` for `needle`, ignoring ASCII letter case.
/// Returns the index of the first match, or `null` if there is none.
/// Equivalent to `indexOfIgnoreCasePos` with a start index of 0.
pub fn indexOfIgnoreCase(haystack: []const u8, needle: []const u8) ?usize {
    const search_from: usize = 0;
    return indexOfIgnoreCasePos(haystack, search_from, needle);
}
364
/// Searches `haystack` for `needle`, ignoring ASCII letter case, considering
/// only matches that begin at or after `start_index`.
/// Returns the index of the first such match, or `null` if there is none.
/// An empty `needle` matches immediately at `start_index`.
///
/// Large inputs are searched with the Boyer-Moore-Horspool algorithm; small
/// ones (haystack shorter than 52 bytes, or needle of at most 4 bytes) fall
/// back to `indexOfIgnoreCasePosLinear`.
pub fn indexOfIgnoreCasePos(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
    if (needle.len > haystack.len) return null;
    if (needle.len == 0) return start_index;

    const small_input = haystack.len < 52 or needle.len <= 4;
    if (small_input) return indexOfIgnoreCasePosLinear(haystack, start_index, needle);

    var skip_table: [256]usize = undefined;
    boyerMooreHorspoolPreprocessIgnoreCase(needle, &skip_table);

    // Last position at which a full-length window still fits in `haystack`.
    const last_start = haystack.len - needle.len;
    var pos = start_index;
    while (pos <= last_start) {
        const window = haystack[pos..][0..needle.len];
        if (eqlIgnoreCase(window, needle)) return pos;
        // Advance by the skip recorded for the (lowercased) byte currently
        // aligned with the needle's last position.
        pos += skip_table[toLower(window[needle.len - 1])];
    }

    return null;
}
385
/// Searches `haystack` for `needle`, ignoring ASCII letter case, by trying
/// every position from `start_index` onwards in order.
/// Returns the index of the first match, or `null` if there is none; this
/// includes the case where `needle` is longer than `haystack`.
///
/// Consider using `indexOfIgnoreCasePos` instead of this, which will automatically use a
/// more sophisticated algorithm on larger inputs.
pub fn indexOfIgnoreCasePosLinear(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
    // Without this guard the subtraction below would overflow when this
    // function is called directly with a needle longer than the haystack.
    if (needle.len > haystack.len) return null;

    // Last position at which a full-length window still fits in `haystack`.
    const end = haystack.len - needle.len;
    var i: usize = start_index;
    while (i <= end) : (i += 1) {
        if (eqlIgnoreCase(haystack[i..][0..needle.len], needle)) return i;
    }
    return null;
}
396
/// Fills `table` with the Boyer-Moore-Horspool skip distances for `pattern`,
/// indexed by lowercased byte value.
/// `pattern` must be non-empty, because `pattern.len - 1` is computed
/// unconditionally.
fn boyerMooreHorspoolPreprocessIgnoreCase(pattern: []const u8, table: *[256]usize) void {
    // Default: a byte that does not occur in the pattern allows skipping the
    // pattern's whole length.
    @memset(table, pattern.len);

    // The last item is intentionally ignored and the skip size will be pattern.len.
    // This is the standard way Boyer-Moore-Horspool is implemented.
    const last = pattern.len - 1;
    for (pattern[0..last], 0..) |c, i| {
        // Later occurrences overwrite earlier ones, so the smallest skip
        // (the rightmost occurrence) is what remains.
        table[toLower(c)] = last - i;
    }
}
409
test indexOfIgnoreCase {
    const expectEqual = std.testing.expectEqual;

    // Short inputs.
    try expectEqual(@as(?usize, 14), indexOfIgnoreCase("one Two Three Four", "foUr"));
    try expectEqual(@as(?usize, null), indexOfIgnoreCase("one two three FouR", "gOur"));
    try expectEqual(@as(?usize, 0), indexOfIgnoreCase("foO", "Foo"));
    try expectEqual(@as(?usize, null), indexOfIgnoreCase("foo", "fool"));
    try expectEqual(@as(?usize, 0), indexOfIgnoreCase("FOO foo", "fOo"));

    // Longer inputs.
    const long_haystack = "one two three four five six seven eight nine ten eleven";
    try expectEqual(@as(?usize, 8), indexOfIgnoreCase(long_haystack, "ThReE fOUr"));
    try expectEqual(@as(?usize, null), indexOfIgnoreCase(long_haystack, "Two tWo"));
}
420
/// Returns the lexicographical order of `lhs` relative to `rhs`, ignoring
/// ASCII letter case (bytes are compared after lowercasing).
/// If one slice is a case-insensitive prefix of the other, the shorter one
/// orders first. O(n).
pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
    // Slices that start at the same address share their common prefix, so the
    // byte-wise comparison is skipped and only the lengths decide.
    if (lhs.ptr != rhs.ptr) {
        const common = @min(lhs.len, rhs.len);
        for (lhs[0..common], rhs[0..common]) |l, r| {
            const byte_order = std.math.order(toLower(l), toLower(r));
            if (byte_order != .eq) return byte_order;
        }
    }
    return std.math.order(lhs.len, rhs.len);
}
436
/// Reports whether `lhs` orders strictly before `rhs` according to `orderIgnoreCase`.
pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool {
    return switch (orderIgnoreCase(lhs, rhs)) {
        .lt => true,
        .eq, .gt => false,
    };
}
441
/// Formatter state for `hexEscape`: a byte string plus the digit set used
/// when a byte has to be written as a `\xNN` escape.
pub const HexEscape = struct {
    /// The bytes to write. Borrowed, not copied.
    bytes: []const u8,
    /// The 16 hex digits, indexed by nibble value.
    charset: *const [16]u8,

    pub const upper_charset = "0123456789ABCDEF";
    pub const lower_charset = "0123456789abcdef";

    /// Writes `se.bytes` to `w`. Bytes for which `std.ascii.isPrint` is true
    /// are written as-is; every other byte is written as a backslash, `x`,
    /// and two hex digits taken from `se.charset`.
    pub fn format(se: HexEscape, w: *std.Io.Writer) std.Io.Writer.Error!void {
        for (se.bytes) |c| {
            if (std.ascii.isPrint(c)) {
                try w.writeByte(c);
                continue;
            }
            // High nibble first, then low nibble.
            const escape = [4]u8{ '\\', 'x', se.charset[c >> 4], se.charset[c & 0x0F] };
            try w.writeAll(&escape);
        }
    }
};
467
/// Returns a formatter (for use with the `{f}` specifier) that writes `bytes`
/// with every non-printable byte replaced by a `\xNN` hex escape.
///
/// "Non-printable" means any byte for which `isPrint` is false: ASCII control
/// characters (including DEL) as well as all non-ASCII bytes (0x80-0xFF).
/// Printable ASCII, including space and the backslash itself, is written
/// unchanged.
///
/// `case` selects lowercase or uppercase hex digits in the escapes.
/// The returned value borrows `bytes`; it does not copy them.
pub fn hexEscape(bytes: []const u8, case: std.fmt.Case) std.fmt.Alt(HexEscape, HexEscape.format) {
    const charset = switch (case) {
        .lower => HexEscape.lower_charset,
        .upper => HexEscape.upper_charset,
    };
    return .{ .data = .{ .bytes = bytes, .charset = charset } };
}
475
test hexEscape {
    const expectFmt = std.testing.expectFmt;

    // Lowercase hex digits.
    try expectFmt("abc 123", "{f}", .{hexEscape("abc 123", .lower)});
    try expectFmt("ab\\xffc", "{f}", .{hexEscape("ab\xffc", .lower)});

    // Uppercase hex digits.
    try expectFmt("abc 123", "{f}", .{hexEscape("abc 123", .upper)});
    try expectFmt("ab\\xFFc", "{f}", .{hexEscape("ab\xffc", .upper)});
}