x86: Only align destination to 1x VEC_SIZE in memset 4x loop

Current code aligns to 2x VEC_SIZE. Aligning to 2x has no effect on performance other than potentially resulting in an additional iteration of the loop. Aligning to 1x maintains aligned stores (the only reason to align in this case) and doesn't incur any unnecessary loop iterations.

Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
(cherry picked from commit 9469261cf1924d350feeec64d2c80cafbbdcdd4d)
author: Noah Goldstein <goldstein.w.n@gmail.com> 2023-11-01 15:30:26 -0500
committer: Sunil K Pandey <skpgkp2@gmail.com> 2025-01-09 21:25:24 -0800
commit: f0c2fcce5f3b2776a6da7b1274463b60d2b9a655 (patch)
tree: 2b993548edef1ff425059076cea39dc1ecb57e89
parent: 0c6f7cd550a1852a8828d773623fc6256c6afcc4 (diff)
1 file changed, 1 insertion, 1 deletion
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f37be6218a..09fd08ebc0 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -293,7 +293,7 @@ L(more_2x_vec):
 	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
 #endif
 	/* Align dst for loop.  */
-	andq	$(VEC_SIZE * -2), %LOOP_REG
+	andq	$(VEC_SIZE * -1), %LOOP_REG
 	.p2align 4
 L(loop):
 	VMOVA	%VMM(0), LOOP_4X_OFFSET(%LOOP_REG)
author	Noah Goldstein <goldstein.w.n@gmail.com>	2023-11-01 15:30:26 -0500
committer	Sunil K Pandey <skpgkp2@gmail.com>	2025-01-09 21:25:24 -0800
commit	f0c2fcce5f3b2776a6da7b1274463b60d2b9a655 (patch)
tree	2b993548edef1ff425059076cea39dc1ecb57e89
parent	0c6f7cd550a1852a8828d773623fc6256c6afcc4 (diff)