Commit aae5560712

Yefeng Li <yefeng@emma-app.com>
2025-08-25 19:07:07
Remove memcmp and memset from bundled musl and wasi
1 parent 42eb132
Changed files (10)
lib/libc/musl/src/string/aarch64/memset.S
@@ -1,115 +0,0 @@
-/*
- * memset - fill memory with a constant byte
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#define dstin   x0
-#define val     x1
-#define valw    w1
-#define count   x2
-#define dst     x3
-#define dstend  x4
-#define zva_val x5
-
-.global memset
-.type memset,%function
-memset:
-
-	dup     v0.16B, valw
-	add     dstend, dstin, count
-
-	cmp     count, 96
-	b.hi    .Lset_long
-	cmp     count, 16
-	b.hs    .Lset_medium
-	mov     val, v0.D[0]
-
-	/* Set 0..15 bytes.  */
-	tbz     count, 3, 1f
-	str     val, [dstin]
-	str     val, [dstend, -8]
-	ret
-	nop
-1:      tbz     count, 2, 2f
-	str     valw, [dstin]
-	str     valw, [dstend, -4]
-	ret
-2:      cbz     count, 3f
-	strb    valw, [dstin]
-	tbz     count, 1, 3f
-	strh    valw, [dstend, -2]
-3:      ret
-
-	/* Set 17..96 bytes.  */
-.Lset_medium:
-	str     q0, [dstin]
-	tbnz    count, 6, .Lset96
-	str     q0, [dstend, -16]
-	tbz     count, 5, 1f
-	str     q0, [dstin, 16]
-	str     q0, [dstend, -32]
-1:      ret
-
-	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-.Lset96:
-	str     q0, [dstin, 16]
-	stp     q0, q0, [dstin, 32]
-	stp     q0, q0, [dstend, -32]
-	ret
-
-	.p2align 4
-.Lset_long:
-	and     valw, valw, 255
-	bic     dst, dstin, 15
-	str     q0, [dstin]
-	cmp     count, 160
-	ccmp    valw, 0, 0, hs
-	b.ne    .Lno_zva
-
-#ifndef SKIP_ZVA_CHECK
-	mrs     zva_val, dczid_el0
-	and     zva_val, zva_val, 31
-	cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
-	b.ne    .Lno_zva
-#endif
-	str     q0, [dst, 16]
-	stp     q0, q0, [dst, 32]
-	bic     dst, dst, 63
-	sub     count, dstend, dst      /* Count is now 64 too large.  */
-	sub     count, count, 128       /* Adjust count and bias for loop.  */
-
-	.p2align 4
-.Lzva_loop:
-	add     dst, dst, 64
-	dc      zva, dst
-	subs    count, count, 64
-	b.hi    .Lzva_loop
-	stp     q0, q0, [dstend, -64]
-	stp     q0, q0, [dstend, -32]
-	ret
-
-.Lno_zva:
-	sub     count, dstend, dst      /* Count is 16 too large.  */
-	sub     dst, dst, 16            /* Dst is biased by -32.  */
-	sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-.Lno_zva_loop:
-	stp     q0, q0, [dst, 32]
-	stp     q0, q0, [dst, 64]!
-	subs    count, count, 64
-	b.hi    .Lno_zva_loop
-	stp     q0, q0, [dstend, -64]
-	stp     q0, q0, [dstend, -32]
-	ret
-
-.size memset,.-memset
-
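For reference, the 0..15-byte path above leans on overlapping stores from both ends of the buffer rather than a branch per size. A minimal C model of the 8-byte case, with a hypothetical helper name, assuming unaligned 8-byte stores are cheap as they are on AArch64:

#include <stdint.h>
#include <string.h>

/* Hypothetical model of "str val, [dstin]; str val, [dstend, -8]":
 * two 8-byte stores that overlap whenever n < 16, so every byte of
 * dst[0..n) is written in one pass. Requires 8 <= n <= 16. */
static void set_8_to_16(unsigned char *dst, uint64_t val8, size_t n)
{
	memcpy(dst, &val8, 8);          /* first 8 bytes */
	memcpy(dst + n - 8, &val8, 8);  /* last 8 bytes, may overlap */
}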
lib/libc/musl/src/string/arm/__aeabi_memset.s
@@ -1,31 +0,0 @@
-.syntax unified
-
-.global __aeabi_memclr8
-.global __aeabi_memclr4
-.global __aeabi_memclr
-.global __aeabi_memset8
-.global __aeabi_memset4
-.global __aeabi_memset
-
-.type __aeabi_memclr8,%function
-.type __aeabi_memclr4,%function
-.type __aeabi_memclr,%function
-.type __aeabi_memset8,%function
-.type __aeabi_memset4,%function
-.type __aeabi_memset,%function
-
-__aeabi_memclr8:
-__aeabi_memclr4:
-__aeabi_memclr:
-	movs  r2, #0
-__aeabi_memset8:
-__aeabi_memset4:
-__aeabi_memset:
-	cmp   r1, #0
-	beq   2f
-	adds  r1, r0, r1
-1:	strb  r2, [r0]
-	adds  r0, r0, #1
-	cmp   r1, r0
-	bne   1b
-2:	bx    lr
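Note that the __aeabi_* entry points above use the Arm RTABI argument order, not the ISO memset order: the size comes second and the fill value last, and the memclr variants take no value at all. Equivalent C prototypes, per the RTABI:

#include <stddef.h>

void __aeabi_memset(void *dest, size_t n, int c);
void __aeabi_memset4(void *dest, size_t n, int c); /* dest 4-byte aligned */
void __aeabi_memset8(void *dest, size_t n, int c); /* dest 8-byte aligned */
void __aeabi_memclr(void *dest, size_t n);
void __aeabi_memclr4(void *dest, size_t n);
void __aeabi_memclr8(void *dest, size_t n);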
lib/libc/musl/src/string/i386/memset.s
@@ -1,76 +0,0 @@
-.global memset
-.type memset,@function
-memset:
-	mov 12(%esp),%ecx
-	cmp $62,%ecx
-	ja 2f
-
-	mov 8(%esp),%dl
-	mov 4(%esp),%eax
-	test %ecx,%ecx
-	jz 1f
-
-	mov %dl,%dh
-
-	mov %dl,(%eax)
-	mov %dl,-1(%eax,%ecx)
-	cmp $2,%ecx
-	jbe 1f
-
-	mov %dx,1(%eax)
-	mov %dx,(-1-2)(%eax,%ecx)
-	cmp $6,%ecx
-	jbe 1f
-
-	shl $16,%edx
-	mov 8(%esp),%dl
-	mov 8(%esp),%dh
-
-	mov %edx,(1+2)(%eax)
-	mov %edx,(-1-2-4)(%eax,%ecx)
-	cmp $14,%ecx
-	jbe 1f
-
-	mov %edx,(1+2+4)(%eax)
-	mov %edx,(1+2+4+4)(%eax)
-	mov %edx,(-1-2-4-8)(%eax,%ecx)
-	mov %edx,(-1-2-4-4)(%eax,%ecx)
-	cmp $30,%ecx
-	jbe 1f
-
-	mov %edx,(1+2+4+8)(%eax)
-	mov %edx,(1+2+4+8+4)(%eax)
-	mov %edx,(1+2+4+8+8)(%eax)
-	mov %edx,(1+2+4+8+12)(%eax)
-	mov %edx,(-1-2-4-8-16)(%eax,%ecx)
-	mov %edx,(-1-2-4-8-12)(%eax,%ecx)
-	mov %edx,(-1-2-4-8-8)(%eax,%ecx)
-	mov %edx,(-1-2-4-8-4)(%eax,%ecx)
-
-1:	ret 	
-
-2:	movzbl 8(%esp),%eax
-	mov %edi,12(%esp)
-	imul $0x1010101,%eax
-	mov 4(%esp),%edi
-	test $15,%edi
-	mov %eax,-4(%edi,%ecx)
-	jnz 2f
-
-1:	shr $2, %ecx
-	rep
-	stosl
-	mov 4(%esp),%eax
-	mov 12(%esp),%edi
-	ret
-	
-2:	xor %edx,%edx
-	sub %edi,%edx
-	and $15,%edx
-	mov %eax,(%edi)
-	mov %eax,4(%edi)
-	mov %eax,8(%edi)
-	mov %eax,12(%edi)
-	sub %edx,%ecx
-	add %edx,%edi
-	jmp 1b
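The imul $0x1010101 in the rep-stos path above is the standard byte-broadcast trick: multiplying a zero-extended byte by 0x01010101 replicates it into all four byte lanes. A standalone check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t rep = 0xABu * 0x01010101u;
	assert(rep == 0xABABABABu); /* one byte broadcast to four lanes */
	return 0;
}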
lib/libc/musl/src/string/x86_64/memset.s
@@ -1,72 +0,0 @@
-.global memset
-.type memset,@function
-memset:
-	movzbq %sil,%rax
-	mov $0x101010101010101,%r8
-	imul %r8,%rax
-
-	cmp $126,%rdx
-	ja 2f
-
-	test %edx,%edx
-	jz 1f
-
-	mov %sil,(%rdi)
-	mov %sil,-1(%rdi,%rdx)
-	cmp $2,%edx
-	jbe 1f
-
-	mov %ax,1(%rdi)
-	mov %ax,(-1-2)(%rdi,%rdx)
-	cmp $6,%edx
-	jbe 1f
-
-	mov %eax,(1+2)(%rdi)
-	mov %eax,(-1-2-4)(%rdi,%rdx)
-	cmp $14,%edx
-	jbe 1f
-
-	mov %rax,(1+2+4)(%rdi)
-	mov %rax,(-1-2-4-8)(%rdi,%rdx)
-	cmp $30,%edx
-	jbe 1f
-
-	mov %rax,(1+2+4+8)(%rdi)
-	mov %rax,(1+2+4+8+8)(%rdi)
-	mov %rax,(-1-2-4-8-16)(%rdi,%rdx)
-	mov %rax,(-1-2-4-8-8)(%rdi,%rdx)
-	cmp $62,%edx
-	jbe 1f
-
-	mov %rax,(1+2+4+8+16)(%rdi)
-	mov %rax,(1+2+4+8+16+8)(%rdi)
-	mov %rax,(1+2+4+8+16+16)(%rdi)
-	mov %rax,(1+2+4+8+16+24)(%rdi)
-	mov %rax,(-1-2-4-8-16-32)(%rdi,%rdx)
-	mov %rax,(-1-2-4-8-16-24)(%rdi,%rdx)
-	mov %rax,(-1-2-4-8-16-16)(%rdi,%rdx)
-	mov %rax,(-1-2-4-8-16-8)(%rdi,%rdx)
-
-1:	mov %rdi,%rax
-	ret
-
-2:	test $15,%edi
-	mov %rdi,%r8
-	mov %rax,-8(%rdi,%rdx)
-	mov %rdx,%rcx
-	jnz 2f
-
-1:	shr $3,%rcx
-	rep
-	stosq
-	mov %r8,%rax
-	ret
-
-2:	xor %edx,%edx
-	sub %edi,%edx
-	and $15,%edx
-	mov %rax,(%rdi)
-	mov %rax,8(%rdi)
-	sub %rdx,%rcx
-	add %rdx,%rdi
-	jmp 1b
lib/libc/musl/src/string/memcmp.c
@@ -1,8 +0,0 @@
-#include <string.h>
-
-int memcmp(const void *vl, const void *vr, size_t n)
-{
-	const unsigned char *l=vl, *r=vr;
-	for (; n && *l == *r; n--, l++, r++);
-	return n ? *l-*r : 0;
-}
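The return expression *l-*r compares the first differing bytes after promotion from unsigned char, matching the ISO C requirement that memcmp interpret both objects as arrays of unsigned char. A quick check of the sign conventions:

#include <assert.h>
#include <string.h>

int main(void)
{
	assert(memcmp("abc", "abd", 3) < 0);   /* 'c' < 'd' */
	assert(memcmp("\xff", "\x01", 1) > 0); /* bytes compare as unsigned */
	assert(memcmp("same", "same", 4) == 0);
	return 0;
}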
lib/libc/musl/src/string/memset.c
@@ -1,90 +0,0 @@
-#include <string.h>
-#include <stdint.h>
-
-void *memset(void *dest, int c, size_t n)
-{
-	unsigned char *s = dest;
-	size_t k;
-
-	/* Fill head and tail with minimal branching. Each
-	 * conditional ensures that all the subsequently used
-	 * offsets are well-defined and in the dest region. */
-
-	if (!n) return dest;
-	s[0] = c;
-	s[n-1] = c;
-	if (n <= 2) return dest;
-	s[1] = c;
-	s[2] = c;
-	s[n-2] = c;
-	s[n-3] = c;
-	if (n <= 6) return dest;
-	s[3] = c;
-	s[n-4] = c;
-	if (n <= 8) return dest;
-
-	/* Advance pointer to align it at a 4-byte boundary,
-	 * and truncate n to a multiple of 4. The previous code
-	 * already took care of any head/tail that get cut off
-	 * by the alignment. */
-
-	k = -(uintptr_t)s & 3;
-	s += k;
-	n -= k;
-	n &= -4;
-
-#ifdef __GNUC__
-	typedef uint32_t __attribute__((__may_alias__)) u32;
-	typedef uint64_t __attribute__((__may_alias__)) u64;
-
-	u32 c32 = ((u32)-1)/255 * (unsigned char)c;
-
-	/* In preparation to copy 32 bytes at a time, aligned on
- * an 8-byte boundary, fill head/tail up to 28 bytes each.
-	 * As in the initial byte-based head/tail fill, each
-	 * conditional below ensures that the subsequent offsets
-	 * are valid (e.g. !(n<=24) implies n>=28). */
-
-	*(u32 *)(s+0) = c32;
-	*(u32 *)(s+n-4) = c32;
-	if (n <= 8) return dest;
-	*(u32 *)(s+4) = c32;
-	*(u32 *)(s+8) = c32;
-	*(u32 *)(s+n-12) = c32;
-	*(u32 *)(s+n-8) = c32;
-	if (n <= 24) return dest;
-	*(u32 *)(s+12) = c32;
-	*(u32 *)(s+16) = c32;
-	*(u32 *)(s+20) = c32;
-	*(u32 *)(s+24) = c32;
-	*(u32 *)(s+n-28) = c32;
-	*(u32 *)(s+n-24) = c32;
-	*(u32 *)(s+n-20) = c32;
-	*(u32 *)(s+n-16) = c32;
-
-	/* Align to a multiple of 8 so we can fill 64 bits at a time,
-	 * and avoid writing the same bytes twice as much as is
-	 * practical without introducing additional branching. */
-
-	k = 24 + ((uintptr_t)s & 4);
-	s += k;
-	n -= k;
-
-	/* If this loop is reached, 28 tail bytes have already been
-	 * filled, so any remainder when n drops below 32 can be
-	 * safely ignored. */
-
-	u64 c64 = c32 | ((u64)c32 << 32);
-	for (; n >= 32; n-=32, s+=32) {
-		*(u64 *)(s+0) = c64;
-		*(u64 *)(s+8) = c64;
-		*(u64 *)(s+16) = c64;
-		*(u64 *)(s+24) = c64;
-	}
-#else
-	/* Pure C fallback with no aliasing violations. */
-	for (; n; n--, s++) *s = c;
-#endif
-
-	return dest;
-}
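Two small identities carry the word-at-a-time section above: ((u32)-1)/255 equals 0x01010101, the same byte-broadcast multiplier the assembly versions form with imul, and -(uintptr_t)s & 3 counts the bytes needed to reach the next 4-byte boundary. A standalone check with an assumed example address:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	assert(((uint32_t)-1) / 255 == 0x01010101u);
	assert(((uint32_t)-1) / 255 * 0xCCu == 0xCCCCCCCCu);

	uintptr_t s = 0x1001;   /* hypothetical misaligned address */
	assert((-s & 3) == 3);  /* 3 bytes advance s to 0x1004 */
	return 0;
}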
lib/libc/wasi/libc-top-half/musl/src/string/memcmp.c
@@ -1,43 +0,0 @@
-#include <string.h>
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#endif
-
-int memcmp(const void *vl, const void *vr, size_t n)
-{
-#if defined(__wasm_simd128__) && defined(__wasilibc_simd_string)
-	if (n >= sizeof(v128_t)) {
-		// memcmp is allowed to read up to n bytes from each object.
-		// Find the first different character in the objects.
-		// Unaligned loads handle the case where the objects
-		// have mismatching alignments.
-		const v128_t *v1 = (v128_t *)vl;
-		const v128_t *v2 = (v128_t *)vr;
-		while (n) {
-			const v128_t cmp = wasm_i8x16_eq(wasm_v128_load(v1), wasm_v128_load(v2));
-			// Bitmask is slow on AArch64, all_true is much faster.
-			if (!wasm_i8x16_all_true(cmp)) {
-				// Find the offset of the first zero bit (little-endian).
-				size_t ctz = __builtin_ctz(~wasm_i8x16_bitmask(cmp));
-				const unsigned char *u1 = (unsigned char *)v1 + ctz;
-				const unsigned char *u2 = (unsigned char *)v2 + ctz;
-				// This may help the compiler if the function is inlined.
-				__builtin_assume(*u1 - *u2 != 0);
-				return *u1 - *u2;
-			}
-			// This makes n a multiple of sizeof(v128_t)
-			// for every iteration except the first.
-			size_t align = (n - 1) % sizeof(v128_t) + 1;
-			v1 = (v128_t *)((char *)v1 + align);
-			v2 = (v128_t *)((char *)v2 + align);
-			n -= align;
-		}
-		return 0;
-	}
-#endif
-
-	const unsigned char *l=vl, *r=vr;
-	for (; n && *l == *r; n--, l++, r++);
-	return n ? *l-*r : 0;
-}
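The bitmask/ctz step above can be modeled in scalar C: build the little-endian equality mask that wasm_i8x16_bitmask returns, invert it, and count trailing zeros to index the first mismatching byte. A hypothetical helper, assuming GCC/Clang's __builtin_ctz:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of one 16-byte SIMD comparison from the loop above. */
static int first_diff_16(const unsigned char *a, const unsigned char *b)
{
	uint32_t eq = 0;
	for (size_t i = 0; i < 16; i++)
		eq |= (uint32_t)(a[i] == b[i]) << i; /* bit i set when bytes match */
	if (eq == 0xFFFFu)
		return -1;                 /* all 16 bytes equal */
	return __builtin_ctz(~eq);         /* offset of first mismatch */
}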
lib/libc/wasi/libc-top-half/musl/src/string/memset.c
@@ -1,94 +0,0 @@
-#include <string.h>
-#include <stdint.h>
-
-void *memset(void *dest, int c, size_t n)
-{
-#if defined(__wasm_bulk_memory__)
-	if (n > BULK_MEMORY_THRESHOLD)
-		return __builtin_memset(dest, c, n);
-#endif
-	unsigned char *s = dest;
-	size_t k;
-
-	/* Fill head and tail with minimal branching. Each
-	 * conditional ensures that all the subsequently used
-	 * offsets are well-defined and in the dest region. */
-
-	if (!n) return dest;
-	s[0] = c;
-	s[n-1] = c;
-	if (n <= 2) return dest;
-	s[1] = c;
-	s[2] = c;
-	s[n-2] = c;
-	s[n-3] = c;
-	if (n <= 6) return dest;
-	s[3] = c;
-	s[n-4] = c;
-	if (n <= 8) return dest;
-
-	/* Advance pointer to align it at a 4-byte boundary,
-	 * and truncate n to a multiple of 4. The previous code
-	 * already took care of any head/tail that get cut off
-	 * by the alignment. */
-
-	k = -(uintptr_t)s & 3;
-	s += k;
-	n -= k;
-	n &= -4;
-
-#ifdef __GNUC__
-	typedef uint32_t __attribute__((__may_alias__)) u32;
-	typedef uint64_t __attribute__((__may_alias__)) u64;
-
-	u32 c32 = ((u32)-1)/255 * (unsigned char)c;
-
-	/* In preparation to copy 32 bytes at a time, aligned on
- * an 8-byte boundary, fill head/tail up to 28 bytes each.
-	 * As in the initial byte-based head/tail fill, each
-	 * conditional below ensures that the subsequent offsets
-	 * are valid (e.g. !(n<=24) implies n>=28). */
-
-	*(u32 *)(s+0) = c32;
-	*(u32 *)(s+n-4) = c32;
-	if (n <= 8) return dest;
-	*(u32 *)(s+4) = c32;
-	*(u32 *)(s+8) = c32;
-	*(u32 *)(s+n-12) = c32;
-	*(u32 *)(s+n-8) = c32;
-	if (n <= 24) return dest;
-	*(u32 *)(s+12) = c32;
-	*(u32 *)(s+16) = c32;
-	*(u32 *)(s+20) = c32;
-	*(u32 *)(s+24) = c32;
-	*(u32 *)(s+n-28) = c32;
-	*(u32 *)(s+n-24) = c32;
-	*(u32 *)(s+n-20) = c32;
-	*(u32 *)(s+n-16) = c32;
-
-	/* Align to a multiple of 8 so we can fill 64 bits at a time,
-	 * and avoid writing the same bytes twice as much as is
-	 * practical without introducing additional branching. */
-
-	k = 24 + ((uintptr_t)s & 4);
-	s += k;
-	n -= k;
-
-	/* If this loop is reached, 28 tail bytes have already been
-	 * filled, so any remainder when n drops below 32 can be
-	 * safely ignored. */
-
-	u64 c64 = c32 | ((u64)c32 << 32);
-	for (; n >= 32; n-=32, s+=32) {
-		*(u64 *)(s+0) = c64;
-		*(u64 *)(s+8) = c64;
-		*(u64 *)(s+16) = c64;
-		*(u64 *)(s+24) = c64;
-	}
-#else
-	/* Pure C fallback with no aliasing violations. */
-	for (; n; n--, s++) *s = c;
-#endif
-
-	return dest;
-}
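The __wasm_bulk_memory__ branch above exists because, with bulk memory enabled, the compiler lowers __builtin_memset to the single wasm memory.fill instruction, so the scalar code only has to win below the threshold. BULK_MEMORY_THRESHOLD itself is supplied by the wasi-libc build, so the sketch below substitutes a placeholder:

#include <stddef.h>

#ifndef BULK_MEMORY_THRESHOLD
#define BULK_MEMORY_THRESHOLD 32 /* hypothetical stand-in value */
#endif

/* Hypothetical dispatch for a wasm32 target built with -mbulk-memory. */
void *fill(void *dest, int c, size_t n)
{
	if (n > BULK_MEMORY_THRESHOLD)
		return __builtin_memset(dest, c, n); /* becomes memory.fill */
	unsigned char *s = dest;
	while (n--)
		*s++ = c;
	return dest;
}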
src/libs/musl.zig
@@ -1786,20 +1786,15 @@ const src_files = [_][]const u8{
     "musl/src/stdlib/strtol.c",
     "musl/src/stdlib/wcstod.c",
     "musl/src/stdlib/wcstol.c",
-    "musl/src/string/aarch64/memset.S",
-    "musl/src/string/arm/__aeabi_memset.s",
     "musl/src/string/bcmp.c",
     "musl/src/string/bcopy.c",
     "musl/src/string/explicit_bzero.c",
-    "musl/src/string/i386/memset.s",
     "musl/src/string/index.c",
     "musl/src/string/memccpy.c",
     "musl/src/string/memchr.c",
-    "musl/src/string/memcmp.c",
     "musl/src/string/memmem.c",
     "musl/src/string/mempcpy.c",
     "musl/src/string/memrchr.c",
-    "musl/src/string/memset.c",
     "musl/src/string/rindex.c",
     "musl/src/string/stpcpy.c",
     "musl/src/string/stpncpy.c",
@@ -1855,7 +1850,6 @@ const src_files = [_][]const u8{
     "musl/src/string/wmemcpy.c",
     "musl/src/string/wmemmove.c",
     "musl/src/string/wmemset.c",
-    "musl/src/string/x86_64/memset.s",
     "musl/src/temp/mkdtemp.c",
     "musl/src/temp/mkostemp.c",
     "musl/src/temp/mkostemps.c",
src/libs/wasi_libc.zig
@@ -1221,9 +1221,7 @@ const libc_top_half_src_files = [_][]const u8{
     "wasi/libc-top-half/musl/src/stdlib/wcstod.c",
     "wasi/libc-top-half/musl/src/stdlib/wcstol.c",
     "wasi/libc-top-half/musl/src/string/memchr.c",
-    "wasi/libc-top-half/musl/src/string/memcmp.c",
     "wasi/libc-top-half/musl/src/string/memrchr.c",
-    "wasi/libc-top-half/musl/src/string/memset.c",
     "wasi/libc-top-half/musl/src/string/strchrnul.c",
     "wasi/libc-top-half/musl/src/thread/pthread_attr_get.c",
     "wasi/libc-top-half/musl/src/thread/pthread_attr_setguardsize.c",