summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp36
1 files changed, 30 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index c3f751c1a988..b35f9faf024b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4903,16 +4903,40 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
LLT S16 = LLT::scalar(16);
LLT S32 = LLT::scalar(32);
+ // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+ // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+ // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+ // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+ // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+ // q16.u = opx(V_CVT_F16_F32, q32.u);
+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
-
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto NegRHSExt = B.buildFNeg(S32, RHSExt);
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
.addUse(RHSExt.getReg(0))
.setMIFlags(Flags);
-
- auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
- auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
-
+ auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
+ MachineInstrBuilder Err;
+ if (ST.hasMadMacF32Insts()) {
+ Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+ } else {
+ Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
+ }
+ auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
+ Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
+ Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
+ auto RDst = B.buildFPTrunc(S16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
.addUse(RDst.getReg(0))
.addUse(RHS)