From 42f07a44ef88e0c9ff1bd5343786013272467414 Mon Sep 17 00:00:00 2001
From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date: Thu, 13 Nov 2025 09:58:19 -0300
Subject: math: Remove ldbl-96 fma implementation

It is worse than the ldbl-64 version on recent x86 hardware.  With
Zen3 and gcc-15:

ldbl-96 removal
reciprocal-throughput        master        patched   improvement
x86_64                    1176.2200       289.4640         4.06x
i686                      1476.0600       636.8660         2.32x

latency                      master        patched   improvement
x86_64                    1176.2200        293.7360        4.00x
i686                      1480.0700        658.4160        2.25x

Checked on x86_64-linux-gnu and i686-linux-gnu.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
---
 sysdeps/i386/i686/multiarch/s_fma.c |   2 +-
 sysdeps/ieee754/ldbl-96/s_fma.c     | 106 ------------------------------------
 2 files changed, 1 insertion(+), 107 deletions(-)
 delete mode 100644 sysdeps/ieee754/ldbl-96/s_fma.c

diff --git a/sysdeps/i386/i686/multiarch/s_fma.c b/sysdeps/i386/i686/multiarch/s_fma.c
index 70d0afe529..6dadb9439f 100644
--- a/sysdeps/i386/i686/multiarch/s_fma.c
+++ b/sysdeps/i386/i686/multiarch/s_fma.c
@@ -38,4 +38,4 @@ libm_alias_double_narrow (__fma, fma)
 
 #define __fma __fma_ia32
 
-#include <sysdeps/ieee754/ldbl-96/s_fma.c>
+#include <sysdeps/ieee754/dbl-64/s_fma.c>
diff --git a/sysdeps/ieee754/ldbl-96/s_fma.c b/sysdeps/ieee754/ldbl-96/s_fma.c
deleted file mode 100644
index a9abf8f1de..0000000000
--- a/sysdeps/ieee754/ldbl-96/s_fma.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Compute x * y + z as ternary operation.
-   Copyright (C) 2010-2025 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define NO_MATH_REDIRECT
-#include <float.h>
-#define dfmal __hide_dfmal
-#define f32xfmaf64 __hide_f32xfmaf64
-#include <math.h>
-#undef dfmal
-#undef f32xfmaf64
-#include <fenv.h>
-#include <ieee754.h>
-#include <math-barriers.h>
-#include <libm-alias-double.h>
-#include <math-narrow-alias.h>
-
-/* This implementation uses rounding to odd to avoid problems with
-   double rounding.  See a paper by Boldo and Melquiond:
-   http://www.lri.fr/~melquion/doc/08-tc.pdf  */
-
-double
-__fma (double x, double y, double z)
-{
-  if (__glibc_unlikely (!isfinite (x) || !isfinite (y)))
-    return x * y + z;
-  else if (__glibc_unlikely (!isfinite (z)))
-    /* If z is Inf, but x and y are finite, the result should be z
-       rather than NaN.  */
-    return (z + x) + y;
-
-  /* Ensure correct sign of exact 0 + 0.  */
-  if (__glibc_unlikely ((x == 0 || y == 0) && z == 0))
-    {
-      x = math_opt_barrier (x);
-      return x * y + z;
-    }
-
-  fenv_t env;
-  __feholdexcept (&env);
-  __fesetround (FE_TONEAREST);
-
-  /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
-#define C ((1ULL << (LDBL_MANT_DIG + 1) / 2) + 1)
-  long double x1 = (long double) x * C;
-  long double y1 = (long double) y * C;
-  long double m1 = (long double) x * y;
-  x1 = (x - x1) + x1;
-  y1 = (y - y1) + y1;
-  long double x2 = x - x1;
-  long double y2 = y - y1;
-  long double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
-
-  /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
-  long double a1 = z + m1;
-  long double t1 = a1 - z;
-  long double t2 = a1 - t1;
-  t1 = m1 - t1;
-  t2 = z - t2;
-  long double a2 = t1 + t2;
-  /* Ensure the arithmetic is not scheduled after feclearexcept call.  */
-  math_force_eval (m2);
-  math_force_eval (a2);
-  __feclearexcept (FE_INEXACT);
-
-  /* If the result is an exact zero, ensure it has the correct sign.  */
-  if (a1 == 0 && m2 == 0)
-    {
-      __feupdateenv (&env);
-      /* Ensure that round-to-nearest value of z + m1 is not reused.  */
-      z = math_opt_barrier (z);
-      return z + m1;
-    }
-
-  __fesetround (FE_TOWARDZERO);
-  /* Perform m2 + a2 addition with round to odd.  */
-  a2 = a2 + m2;
-
-  /* Add that to a1 again using rounding to odd.  */
-  union ieee854_long_double u;
-  u.d = a1 + a2;
-  if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7fff)
-    u.ieee.mantissa1 |= __fetestexcept (FE_INEXACT) != 0;
-  __feupdateenv (&env);
-
-  /* Add finally round to double precision.  */
-  return u.d;
-}
-#ifndef __fma
-libm_alias_double (__fma, fma)
-libm_alias_double_narrow (__fma, fma)
-#endif
-- 
cgit v1.2.3