1#include <string.h>
 2#include <stdint.h>
 3#include <limits.h>
 4
 5#ifdef __wasm_simd128__
 6#include <wasm_simd128.h>
 7#endif
 8
/* Word size in bytes for the word-at-a-time scalar scan. */
#define SS (sizeof(size_t))
/* Low-bit mask used to test whether a pointer is word-aligned. */
#define ALIGN (sizeof(size_t)-1)
/* A size_t with 0x01 in every byte (e.g. 0x0101...01). */
#define ONES ((size_t)-1/UCHAR_MAX)
/* A size_t with 0x80 in every byte (the per-byte sign bits). */
#define HIGHS (ONES * (UCHAR_MAX/2+1))
/* Classic bit trick: nonzero iff any byte of x is zero.
 * (x)-ONES borrows into a byte's high bit only when that byte
 * underflows; ~(x) & HIGHS keeps the result only for bytes whose
 * own high bit was clear, i.e. bytes that were actually zero. */
#define HASZERO(x) ((x)-ONES & ~(x) & HIGHS)
14
/* memchr: locate the first occurrence of byte (unsigned char)c in the
 * first n bytes of src. Returns a pointer to the matching byte, or NULL
 * if it does not occur. Only the first n bytes are examined; src need
 * not be NUL-terminated.
 *
 * Three strategies, fastest applicable wins:
 *   1. wasm SIMD128: 16-byte vector compares via inline asm loads.
 *   2. GNU C: word-at-a-time scan using the HASZERO bit trick.
 *   3. Portable byte-by-byte loop (also finishes the tail for #2).
 */
void *memchr(const void *src, int c, size_t n)
{
#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
	// Skip Clang 19 and Clang 20 which have a bug (llvm/llvm-project#146574)
	// which results in an ICE when inline assembly is used with a vector result.
#if __clang_major__ != 19 && __clang_major__ != 20
	// When n is zero, a function that locates a character finds no occurrence.
	// Otherwise, decrement n to ensure sub_overflow overflows
	// when n would go equal-to-or-below zero.
	if (!n--) {
		return NULL;
	}

	// Note that reading before/after the allocation of a pointer is UB in
	// C, so inline assembly is used to generate the exact machine
	// instruction we want with opaque semantics to the compiler to avoid
	// the UB.
	//
	// Round the start address down to a 16-byte boundary; the first
	// iteration may therefore read up to `align` bytes before src, and
	// any matches in that prefix are masked off below.
	uintptr_t align = (uintptr_t)src % sizeof(v128_t);
	uintptr_t addr = (uintptr_t)src - align;
	v128_t vc = wasm_i8x16_splat(c);

	for (;;) {
		// Aligned 16-byte load, opaque to the compiler (see above).
		v128_t v;
		__asm__ (
			"local.get %1\n"
			"v128.load 0\n"
			"local.set %0\n"
			: "=r"(v)
			: "r"(addr)
			: "memory");
		// Per-lane equality: 0xFF in lanes that match c.
		v128_t cmp = wasm_i8x16_eq(v, vc);
		// Bitmask is slow on AArch64, any_true is much faster.
		if (wasm_v128_any_true(cmp)) {
			// Clear the bits corresponding to align (little-endian)
			// so we can count trailing zeros.
			int mask = wasm_i8x16_bitmask(cmp) >> align << align;
			// At least one bit will be set, unless align cleared them.
			// Knowing this helps the compiler if it unrolls the loop.
			__builtin_assume(mask || align);
			// If the mask became zero because of align,
			// it's as if we didn't find anything.
			if (mask) {
				// Find the offset of the first one bit (little-endian).
				// That's a match, unless it is beyond the end of the object.
				// Recall that we decremented n, so less-than-or-equal-to is correct.
				size_t ctz = __builtin_ctz(mask);
				return ctz - align <= n ? (char *)src + (addr + ctz - (uintptr_t)src)
				                        : NULL;
			}
		}
		// Decrement n; if it overflows we're done.
		// The first iteration only covered 16 - align bytes of src,
		// so align is credited back here; it is zeroed afterwards.
		if (__builtin_sub_overflow(n, sizeof(v128_t) - align, &n)) {
			return NULL;
		}
		align = 0;
		addr += sizeof(v128_t);
	}
#endif
#endif

	const unsigned char *s = src;
	c = (unsigned char)c;
#ifdef __GNUC__
	// Align s to a word boundary byte-by-byte, stopping early on a match
	// or when n runs out.
	for (; ((uintptr_t)s & ALIGN) && n && *s != c; s++, n--);
	if (n && *s != c) {
		// may_alias permits reading the bytes through size_t words
		// without violating strict aliasing.
		typedef size_t __attribute__((__may_alias__)) word;
		const word *w;
		// k has c replicated into every byte, so *w ^ k has a zero
		// byte exactly where *w has a byte equal to c.
		size_t k = ONES * c;
		for (w = (const void *)s; n>=SS && !HASZERO(*w^k); w++, n-=SS);
		s = (const void *)w;
	}
#endif
	// Byte-by-byte scan: the whole job without GNU C, or just the final
	// word's worth of bytes (plus the sub-word tail) after the fast loop.
	for (; n && *s != c; s++, n--);
	return n ? (void *)s : 0;
}