/* memchr: scalar (word-at-a-time) search with an optional WebAssembly SIMD128 fast path. */
1#include <string.h>
2#include <stdint.h>
3#include <limits.h>
4
5#ifdef __wasm_simd128__
6#include <wasm_simd128.h>
7#endif
8
/* Helpers for the word-at-a-time scan below (musl idiom). */
#define SS (sizeof(size_t))                  /* bytes per machine word */
#define ALIGN (sizeof(size_t)-1)             /* mask of low bits for word alignment */
#define ONES ((size_t)-1/UCHAR_MAX)          /* 0x0101...01: a 1 in every byte */
#define HIGHS (ONES * (UCHAR_MAX/2+1))       /* 0x8080...80: high bit of every byte */
/* Nonzero iff some byte of x is zero: subtracting 1 from a zero byte
 * borrows into its high bit, and `& ~(x)` rejects bytes that already
 * had their high bit set. */
#define HASZERO(x) ((x)-ONES & ~(x) & HIGHS)
14
/* Locate the first occurrence of (unsigned char)c in the initial n bytes
 * of the object pointed to by src.  Returns a pointer to the matching
 * byte, or NULL if it does not occur (standard C memchr contract; n == 0
 * finds nothing).
 *
 * Two implementations coexist below:
 *   1. A WebAssembly SIMD128 fast path that scans 16 aligned bytes per
 *      iteration (compiled only when wasm SIMD is available and enabled);
 *      when compiled, it returns from inside its loop and the scalar code
 *      after it is unreachable.
 *   2. A portable musl-style path: byte-wise until word alignment, then
 *      word-at-a-time via the HASZERO bit trick, then a byte-wise tail.
 */
void *memchr(const void *src, int c, size_t n)
{
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
	// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
	// which results in an ICE when inline assembly is used with a vector result.
#if __clang_major__ != 19 && __clang_major__ != 20
	// When n is zero, a function that locates a character finds no occurrence.
	// Otherwise, decrement n to ensure sub_overflow overflows
	// when n would go equal-to-or-below zero.
	if (!n--) {
		return NULL;
	}

	// Round src down to the previous 16-byte boundary; `align` is how many
	// bytes before src the first (aligned) vector load begins.
	//
	// Note that reading before/after the allocation of a pointer is UB in
	// C, so inline assembly is used to generate the exact machine
	// instruction we want with opaque semantics to the compiler to avoid
	// the UB.
	uintptr_t align = (uintptr_t)src % sizeof(v128_t);
	uintptr_t addr = (uintptr_t)src - align;
	v128_t vc = wasm_i8x16_splat(c);

	for (;;) {
		// Aligned 16-byte load via inline asm (see UB note above).
		v128_t v;
		__asm__ (
			"local.get %1\n"
			"v128.load 0\n"
			"local.set %0\n"
			: "=r"(v)
			: "r"(addr)
			: "memory");
		v128_t cmp = wasm_i8x16_eq(v, vc);
		// Bitmask is slow on AArch64, any_true is much faster.
		if (wasm_v128_any_true(cmp)) {
			// Clear the bits corresponding to align (little-endian)
			// so we can count trailing zeros.
			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
			// At least one bit will be set, unless align cleared them.
			// Knowing this helps the compiler if it unrolls the loop.
			__builtin_assume(mask || align);
			// If the mask became zero because of align,
			// it's as if we didn't find anything.
			if (mask) {
				// Find the offset of the first one bit (little-endian).
				// That's a match, unless it is beyond the end of the object.
				// Recall that we decremented n, so less-than-or-equal-to is correct.
				size_t ctz = __builtin_ctz(mask);
				return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
				                        : NULL;
			}
		}
		// Decrement n; if it overflows we're done.
		// (The first iteration consumes only 16 - align bytes of src;
		// after it, align is zeroed and full 16-byte blocks are consumed.)
		if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
			return NULL;
		}
		align = 0;
		addr += sizeof(v128_t);
	}
#endif
#endif

	// Portable path (musl): advance byte-wise until s is word-aligned,
	// or until n runs out or a match is found.
	const unsigned char *s = src;
	c = (unsigned char)c;
#ifdef __GNUC__
	for (; ((uintptr_t)s & ALIGN) && n && *s != c; s++, n--);
	if (n && *s != c) {
		// Word-at-a-time scan: XOR each word with `k` (the byte c
		// replicated into every byte), so a matching byte becomes zero
		// and HASZERO detects it.  The __may_alias__ typedef makes the
		// word-sized access legal under strict aliasing.
		typedef size_t __attribute__((__may_alias__)) word;
		const word *w;
		size_t k = ONES * c;
		for (w = (const void *)s; n>=SS && !HASZERO(*w^k); w++, n-=SS);
		s = (const void *)w;
	}
#endif
	// Byte-wise tail: pinpoint the match inside the last word scanned
	// (or perform the entire scan when the word path is unavailable).
	for (; n && *s != c; s++, n--);
	return n ? (void *)s : 0;
}