3 files changed, 194 insertions, 13 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 6bf9008c3d67..3b5d4ad11b6d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17993,7 +17993,8 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
 
   // (fsub A, 0) -> A
   if (N1CFP && N1CFP->isZero()) {
-    if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) {
+    if (!N1CFP->isNegative() || Flags.hasNoSignedZeros() ||
+        DAG.isKnownNeverZeroFloat(N0)) {
       return N0;
     }
   }
@@ -18022,13 +18023,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
     }
   }
 
-  if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() &&
-      N1.getOpcode() == ISD::FADD) {
+  // -Y flips a +0.0 result to -0.0 when Y is zero: need nsz or non-zero Y.
+  if (Flags.hasAllowReassociation() && N1.getOpcode() == ISD::FADD) {
+    bool NSZ = Flags.hasNoSignedZeros();
     // X - (X + Y) -> -Y
-    if (N0 == N1->getOperand(0))
+    if (N0 == N1->getOperand(0) &&
+        (NSZ || DAG.isKnownNeverZeroFloat(N1->getOperand(1))))
       return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1));
     // X - (Y + X) -> -Y
-    if (N0 == N1->getOperand(1))
+    if (N0 == N1->getOperand(1) &&
+        (NSZ || DAG.isKnownNeverZeroFloat(N1->getOperand(0))))
       return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0));
   }
 
@@ -18337,8 +18341,9 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
   }
 
   if (N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs()) {
-    if (N->getFlags().hasNoSignedZeros() ||
-        (N2CFP && !N2CFP->isExactlyValue(-0.0))) {
+    // 0 * X + N2 == N2 unless N2 may be -0.0 (a +0.0 product would flip it).
+    if (N->getFlags().hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(N2) ||
+        (N2CFP && !N2CFP->isExactlyValue(-0.0))) {
       if (N0CFP && N0CFP->isZero())
         return N2;
       if (N1CFP && N1CFP->isZero())
@@ -18870,6 +18875,35 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
 
   return SDValue();
 }
+
+/// Returns true when \p Use, which consumes a floating-point value as operand
+/// \p OperandNo, yields the same result whether that value is +0.0 or -0.0.
+/// Lets foldFPToIntToFP form ftrunc without NoSignedZerosFPMath.
+static bool isSignInsensitiveUse(SDNode *Use, unsigned OperandNo,
+                                 SelectionDAG &DAG) {
+  const unsigned Opc = Use->getOpcode();
+
+  // Comparisons treat +0.0 and -0.0 as equal; fabs maps both to +0.0.
+  if (Opc == ISD::SETCC || Opc == ISD::FABS)
+    return true;
+
+  // Adding or subtracting a value that is never zero gives a result that does
+  // not depend on the sign of a zero operand.
+  if (Opc == ISD::FADD || Opc == ISD::FSUB)
+    return DAG.isKnownNeverZeroFloat(Use->getOperand(1 - OperandNo));
+
+  // Conservatively treat every other user as sensitive to the sign of zero.
+  return false;
+}
+
+/// Returns true if no user of \p V's node can observe the sign of a zero.
+/// Trivially true when the node has no uses.
+static bool allUsesSignInsensitive(SDValue V, SelectionDAG &DAG) {
+  for (SDUse &U : V->uses())
+    if (!isSignInsensitiveUse(U.getUser(), U.getOperandNo(), DAG))
+      return false;
+  return true;
+}
 
 static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
@@ -18885,18 +18919,24 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
     return SDValue();
 
+  // Signed zeros may be ignored globally, or when no user of N can observe the
+  // sign of a zero. Do not return early when they cannot be ignored: the
+  // unsigned case below still has a strict-math fold that uses FABS.
+  bool CanIgnoreSignedZeros = DAG.getTarget().Options.NoSignedZerosFPMath ||
+                              allUsesSignInsensitive(SDValue(N, 0), DAG);
+
   // fptosi/fptoui round towards zero, so converting from FP to integer and
   // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
   SDValue N0 = N->getOperand(0);
   if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
       N0.getOperand(0).getValueType() == VT) {
-    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+    if (CanIgnoreSignedZeros)
       return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
   }
 
   if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
       N0.getOperand(0).getValueType() == VT) {
-    if (DAG.getTarget().Options.NoSignedZerosFPMath)
+    if (CanIgnoreSignedZeros)
       return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
 
     // Strict math: use FABS to handle negative inputs correctly.
@@ -19333,10 +19373,18 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
   // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
   // know it was called from a context with a nsz flag if the input fsub does
   // not.
-  if (N0.getOpcode() == ISD::FSUB && N->getFlags().hasNoSignedZeros() &&
-      N0.hasOneUse()) {
-    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
-                       N0.getOperand(0));
+  if (N0.getOpcode() == ISD::FSUB && N0.hasOneUse()) {
+    SDValue X = N0.getOperand(0);
+    SDValue Y = N0.getOperand(1);
+
+    // -(X - Y) and (Y - X) differ when X == Y: the former is -0.0 and the
+    // latter +0.0. Knowing that X or Y is never zero does not rule out
+    // X == Y (e.g. both are 2.0), so the operands may only be swapped when
+    // the sign of zero can be ignored.
+    // TODO: Also allow this when X and Y are provably unequal.
+    if (N->getFlags().hasNoSignedZeros()) {
+      return DAG.getNode(ISD::FSUB, SDLoc(N), VT, Y, X);
+    }
   }
 
   if (SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/test/CodeGen/AArch64/dagcombine-nsz-relaxations.ll b/llvm/test/CodeGen/AArch64/dagcombine-nsz-relaxations.ll
new file mode 100644
index 000000000000..3ccfb2b6e756
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dagcombine-nsz-relaxations.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+; Test DAGCombiner fsub/fneg folds that interact with signed zeros: which ones
+; are valid without NoSignedZerosFPMath and which still require nsz.
+
+; ===== Test 1: fsub A, 0 -> A =====
+; A - (+0.0) is A for every A, so the subtraction is removed without nsz
+
+define double @fsub_nonzero_minus_zero(double %x) {
+; CHECK-LABEL: fsub_nonzero_minus_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, #1.00000000
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
+  %add = fadd double %x, 1.0
+  %sub = fsub double %add, 0.0
+  ret double %sub
+}
+
+; ===== Test 2: fneg nsz (fsub A, B) -> fsub B, A =====
+; nsz is required: if A == B the input is -0.0 but fsub B, A is +0.0
+
+define double @fneg_nsz_fsub(double %x) {
+; CHECK-LABEL: fneg_nsz_fsub:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d1, #1.00000000
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    fmov d1, #2.00000000
+; CHECK-NEXT:    fsub d0, d1, d0
+; CHECK-NEXT:    ret
+  %add = fadd double %x, 1.0
+  %sub = fsub double %add, 2.0
+  %neg = fneg nsz double %sub
+  ret double %neg
+}
+
+; Negative test: without nsz the operands must not be swapped
+define double @fneg_fsub_maybe_equal(double %x, double %y) {
+; CHECK-LABEL: fneg_fsub_maybe_equal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fsub d0, d0, d1
+; CHECK-NEXT:    fneg d0, d0
+; CHECK-NEXT:    ret
+  %sub = fsub double %x, %y
+  %neg = fneg double %sub
+  ret double %neg
+}
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sitofp-to-ftrunc.ll b/llvm/test/CodeGen/AArch64/fptosi-sitofp-to-ftrunc.ll
new file mode 100644
index 000000000000..cb4d3aaf8e5a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fptosi-sitofp-to-ftrunc.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
+
+; Test that (sitofp (fptosi x)) is folded to ftrunc (frintz) without nsz only
+; when every user ignores the sign of zero: fcmp, fabs, or fadd/fsub with an
+; operand that is never zero. For x in (-1.0, 0.0) the conversions yield +0.0
+; but ftrunc yields -0.0, so fmul, a plain return, and fadd 0.0 must not fold.
+
+define double @fptosi_sitofp_never_zero_add(double %x) {
+; CHECK-LABEL: fptosi_sitofp_never_zero_add:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz d0, d0
+; CHECK-NEXT:    fmov d1, #1.00000000
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
+  %conv1 = fptosi double %x to i32
+  %conv2 = sitofp i32 %conv1 to double
+  %add = fadd double %conv2, 1.0
+  ret double %add
+}
+
+define i1 @fptosi_sitofp_compare(double %x) {
+; CHECK-LABEL: fptosi_sitofp_compare:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz d0, d0
+; CHECK-NEXT:    fcmp d0, #0.0
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %conv1 = fptosi double %x to i32
+  %conv2 = sitofp i32 %conv1 to double
+  %cmp = fcmp oeq double %conv2, 0.0
+  ret i1 %cmp
+}
+
+define double @fptosi_sitofp_fabs(double %x) {
+; CHECK-LABEL: fptosi_sitofp_fabs:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    frintz d0, d0
+; CHECK-NEXT:    fabs d0, d0
+; CHECK-NEXT:    ret
+  %conv1 = fptosi double %x to i32
+  %conv2 = sitofp i32 %conv1 to double
+  %abs = call double @llvm.fabs.f64(double %conv2)
+  ret double %abs
+}
+
+define double @fptosi_sitofp_mul(double %x) {
+; CHECK-LABEL: fptosi_sitofp_mul:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    scvtf d0, w8
+; CHECK-NEXT:    fadd d0, d0, d0
+; CHECK-NEXT:    ret
+  %conv1 = fptosi double %x to i32
+  %conv2 = sitofp i32 %conv1 to double
+  %mul = fmul double %conv2, 2.0
+  ret double %mul
+}
+
+define double @fptosi_sitofp_could_be_zero(double %x) {
+; CHECK-LABEL: fptosi_sitofp_could_be_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    scvtf d0, w8
+; CHECK-NEXT:    ret
+  %conv1 = fptosi double %x to i32
+  %conv2 = sitofp i32 %conv1 to double
+  ret double %conv2
+}
+
+define double @fptosi_sitofp_add_zero(double %x) {
+; CHECK-LABEL: fptosi_sitofp_add_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtzs w8, d0
+; CHECK-NEXT:    movi d1, #0000000000000000
+; CHECK-NEXT:    scvtf d0, w8
+; CHECK-NEXT:    fadd d0, d0, d1
+; CHECK-NEXT:    ret
+  %conv1 = fptosi double %x to i32
+  %conv2 = sitofp i32 %conv1 to double
+  %add = fadd double %conv2, 0.0
+  ret double %add
+}
+
+declare double @llvm.fabs.f64(double)