diff options
| author | Hassnaa Hamdi <hassnaa.hamdi@arm.com> | 2025-11-18 13:15:47 +0000 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-11-18 13:15:47 +0000 |
| commit | 3d5d32c6058807008e579dd5ea2faced33a7943b (patch) | |
| tree | 724113316b3c3d34ca00c54f242329524548193d | |
| parent | 52f4c360e382e6926dccb315d4402af6211e25f0 (diff) | |
[CGP]: Optimize mul.overflow. (#148343)
- Detect cases where LHS & RHS values will not cause overflow
(when the high halves are zero).
| -rw-r--r-- | llvm/include/llvm/CodeGen/TargetLowering.h | 7 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/CodeGenPrepare.cpp | 182 | ||||
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 | ||||
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.h | 5 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/i128-math.ll | 189 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/i128_with_overflow.ll | 93 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/mul-i128-overflow.ll | 261 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll | 97 |
8 files changed, 699 insertions, 144 deletions
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index cec7d09f494d..4c932c523e42 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3492,6 +3492,13 @@ public: return MathUsed && (VT.isSimple() || !isOperationExpand(Opcode, VT)); } + // Return true if the target wants to optimize the mul overflow intrinsic + // for the given \p VT. + virtual bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context, + EVT VT) const { + return false; + } + // Return true if it is profitable to use a scalar input to a BUILD_VECTOR // even if the vector itself has multiple uses. virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const { diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index b6dd174f9be8..587c1372b19c 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -431,6 +431,8 @@ private: bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy, unsigned AddrSpace); bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr); + bool optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT); bool optimizeInlineAsmInst(CallInst *CS); bool optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT); bool optimizeExt(Instruction *&I); @@ -2797,6 +2799,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { } } return false; + case Intrinsic::umul_with_overflow: + return optimizeMulWithOverflow(II, /*IsSigned=*/false, ModifiedDT); + case Intrinsic::smul_with_overflow: + return optimizeMulWithOverflow(II, /*IsSigned=*/true, ModifiedDT); } SmallVector<Value *, 2> PtrOps; @@ -6391,6 +6397,182 @@ bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst, return true; } +// This is a helper for CodeGenPrepare::optimizeMulWithOverflow. 
+// Check the pattern we are interested in where there are maximum 2 uses +// of the intrinsic which are the extract instructions. +static bool matchOverflowPattern(Instruction *&I, ExtractValueInst *&MulExtract, + ExtractValueInst *&OverflowExtract) { + // Bail out if it's more than 2 users: + if (I->hasNUsesOrMore(3)) + return false; + + for (User *U : I->users()) { + auto *Extract = dyn_cast<ExtractValueInst>(U); + if (!Extract || Extract->getNumIndices() != 1) + return false; + + unsigned Index = Extract->getIndices()[0]; + if (Index == 0) + MulExtract = Extract; + else if (Index == 1) + OverflowExtract = Extract; + else + return false; + } + return true; +} + +// Rewrite the mul_with_overflow intrinsic by checking if both of the +// operands' value ranges are within the legal type. If so, we can optimize the +// multiplication algorithm. This code is supposed to be written during the step +// of type legalization, but given that we need to reconstruct the IR which is +// not doable there, we do it here. +// The IR after the optimization will look like: +// entry: +// if signed: +// ( (lhs_lo>>BW-1) ^ lhs_hi) || ( (rhs_lo>>BW-1) ^ rhs_hi) ? overflow, +// overflow_no +// else: +// (lhs_hi != 0) || (rhs_hi != 0) ? overflow, overflow_no +// overflow_no: +// overflow: +// overflow.res: +// \returns true if optimization was applied +// TODO: This optimization can be further improved to optimize branching on +// overflow where the 'overflow_no' BB can branch directly to the false +// successor of overflow, but that would add additional complexity so we leave +// it for future work. +bool CodeGenPrepare::optimizeMulWithOverflow(Instruction *I, bool IsSigned, + ModifyDT &ModifiedDT) { + // Check if target supports this optimization. 
+ if (!TLI->shouldOptimizeMulOverflowWithZeroHighBits( + I->getContext(), + TLI->getValueType(*DL, I->getType()->getContainedType(0)))) + return false; + + ExtractValueInst *MulExtract = nullptr, *OverflowExtract = nullptr; + if (!matchOverflowPattern(I, MulExtract, OverflowExtract)) + return false; + + // Keep track of the instruction to stop reoptimizing it again. + InsertedInsts.insert(I); + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + Type *Ty = LHS->getType(); + unsigned VTHalfBitWidth = Ty->getScalarSizeInBits() / 2; + Type *LegalTy = Ty->getWithNewBitWidth(VTHalfBitWidth); + + // New BBs: + BasicBlock *OverflowEntryBB = + I->getParent()->splitBasicBlock(I, "", /*Before*/ true); + OverflowEntryBB->takeName(I->getParent()); + // Keep the 'br' instruction that is generated as a result of the split to be + // erased/replaced later. + Instruction *OldTerminator = OverflowEntryBB->getTerminator(); + BasicBlock *NoOverflowBB = + BasicBlock::Create(I->getContext(), "overflow.no", I->getFunction()); + NoOverflowBB->moveAfter(OverflowEntryBB); + BasicBlock *OverflowBB = + BasicBlock::Create(I->getContext(), "overflow", I->getFunction()); + OverflowBB->moveAfter(NoOverflowBB); + + // BB overflow.entry: + IRBuilder<> Builder(OverflowEntryBB); + // Extract low and high halves of LHS: + Value *LoLHS = Builder.CreateTrunc(LHS, LegalTy, "lo.lhs"); + Value *HiLHS = Builder.CreateLShr(LHS, VTHalfBitWidth, "lhs.lsr"); + HiLHS = Builder.CreateTrunc(HiLHS, LegalTy, "hi.lhs"); + + // Extract low and high halves of RHS: + Value *LoRHS = Builder.CreateTrunc(RHS, LegalTy, "lo.rhs"); + Value *HiRHS = Builder.CreateLShr(RHS, VTHalfBitWidth, "rhs.lsr"); + HiRHS = Builder.CreateTrunc(HiRHS, LegalTy, "hi.rhs"); + + Value *IsAnyBitTrue; + if (IsSigned) { + Value *SignLoLHS = + Builder.CreateAShr(LoLHS, VTHalfBitWidth - 1, "sign.lo.lhs"); + Value *SignLoRHS = + Builder.CreateAShr(LoRHS, VTHalfBitWidth - 1, "sign.lo.rhs"); + Value *XorLHS = Builder.CreateXor(HiLHS, 
SignLoLHS); + Value *XorRHS = Builder.CreateXor(HiRHS, SignLoRHS); + Value *Or = Builder.CreateOr(XorLHS, XorRHS, "or.lhs.rhs"); + IsAnyBitTrue = Builder.CreateCmp(ICmpInst::ICMP_NE, Or, + ConstantInt::getNullValue(Or->getType())); + } else { + Value *CmpLHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiLHS, + ConstantInt::getNullValue(LegalTy)); + Value *CmpRHS = Builder.CreateCmp(ICmpInst::ICMP_NE, HiRHS, + ConstantInt::getNullValue(LegalTy)); + IsAnyBitTrue = Builder.CreateOr(CmpLHS, CmpRHS, "or.lhs.rhs"); + } + Builder.CreateCondBr(IsAnyBitTrue, OverflowBB, NoOverflowBB); + + // BB overflow.no: + Builder.SetInsertPoint(NoOverflowBB); + Value *ExtLoLHS, *ExtLoRHS; + if (IsSigned) { + ExtLoLHS = Builder.CreateSExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateSExt(LoRHS, Ty, "lo.rhs.ext"); + } else { + ExtLoLHS = Builder.CreateZExt(LoLHS, Ty, "lo.lhs.ext"); + ExtLoRHS = Builder.CreateZExt(LoRHS, Ty, "lo.rhs.ext"); + } + + Value *Mul = Builder.CreateMul(ExtLoLHS, ExtLoRHS, "mul.overflow.no"); + + // Create the 'overflow.res' BB to merge the results of + // the two paths: + BasicBlock *OverflowResBB = I->getParent(); + OverflowResBB->setName("overflow.res"); + + // BB overflow.no: jump to overflow.res BB + Builder.CreateBr(OverflowResBB); + // No we don't need the old terminator in overflow.entry BB, erase it: + OldTerminator->eraseFromParent(); + + // BB overflow.res: + Builder.SetInsertPoint(OverflowResBB, OverflowResBB->getFirstInsertionPt()); + // Create PHI nodes to merge results from no.overflow BB and overflow BB to + // replace the extract instructions. + PHINode *OverflowResPHI = Builder.CreatePHI(Ty, 2), + *OverflowFlagPHI = + Builder.CreatePHI(IntegerType::getInt1Ty(I->getContext()), 2); + + // Add the incoming values from no.overflow BB and later from overflow BB. 
+ OverflowResPHI->addIncoming(Mul, NoOverflowBB); + OverflowFlagPHI->addIncoming(ConstantInt::getFalse(I->getContext()), + NoOverflowBB); + + // Replace all users of MulExtract and OverflowExtract to use the PHI nodes. + if (MulExtract) { + MulExtract->replaceAllUsesWith(OverflowResPHI); + MulExtract->eraseFromParent(); + } + if (OverflowExtract) { + OverflowExtract->replaceAllUsesWith(OverflowFlagPHI); + OverflowExtract->eraseFromParent(); + } + + // Remove the intrinsic from parent (overflow.res BB) as it will be part of + // overflow BB + I->removeFromParent(); + // BB overflow: + I->insertInto(OverflowBB, OverflowBB->end()); + Builder.SetInsertPoint(OverflowBB, OverflowBB->end()); + Value *MulOverflow = Builder.CreateExtractValue(I, {0}, "mul.overflow"); + Value *OverflowFlag = Builder.CreateExtractValue(I, {1}, "overflow.flag"); + Builder.CreateBr(OverflowResBB); + + // Add The Extracted values to the PHINodes in the overflow.res BB. + OverflowResPHI->addIncoming(MulOverflow, OverflowBB); + OverflowFlagPHI->addIncoming(OverflowFlag, OverflowBB); + + ModifiedDT = ModifyDT::ModifyBBDT; + return true; +} + /// If there are any memory operands, use OptimizeMemoryInst to sink their /// address computing into the block when possible / profitable. 
bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 42567883b259..d21e19b2ecd4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -18851,6 +18851,15 @@ bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, return (Index == 0 || Index == ResVT.getVectorMinNumElements()); } +bool AArch64TargetLowering::shouldOptimizeMulOverflowWithZeroHighBits( + LLVMContext &Context, EVT VT) const { + if (getTypeAction(Context, VT) != TypeExpandInteger) + return false; + + EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2); + return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal; +} + /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 70bfae717fb7..be198e54cbcb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -333,6 +333,11 @@ public: return TargetLowering::shouldFormOverflowOp(Opcode, VT, true); } + // Return true if the target wants to optimize the mul overflow intrinsic + // for the given \p VT. 
+ bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context, + EVT VT) const override; + Value *emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, diff --git a/llvm/test/CodeGen/AArch64/i128-math.ll b/llvm/test/CodeGen/AArch64/i128-math.ll index 9e1c0c1b115a..12ae241dda4b 100644 --- a/llvm/test/CodeGen/AArch64/i128-math.ll +++ b/llvm/test/CodeGen/AArch64/i128-math.ll @@ -262,21 +262,29 @@ define i128 @u128_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_checked_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB17_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo ; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB17_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 %3 = extractvalue { i128, i1 } %1, 1 @@ -290,20 +298,28 @@ define { i128, i8 } @u128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_overflowing_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB18_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul 
x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq ; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w2, w8, wzr, lo ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB18_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr +; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 %3 = extractvalue { i128, i1 } %1, 1 @@ -316,19 +332,28 @@ define { i128, i8 } @u128_overflowing_mul(i128 %x, i128 %y) { define i128 @u128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: u128_saturating_mul: ; CHECK: // %bb.0: -; CHECK-NEXT: mul x9, x3, x0 +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB19_2 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 -; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq ; CHECK-NEXT: mul x8, x0, x2 ; CHECK-NEXT: cset w10, ne -; CHECK-NEXT: adds x9, x11, x9 +; CHECK-NEXT: adds x9, x12, x11 ; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: b .LBB19_3 +; CHECK-NEXT: .LBB19_2: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB19_3: // %overflow.res ; 
CHECK-NEXT: cmp w10, #0 ; CHECK-NEXT: csinv x0, x8, xzr, eq ; CHECK-NEXT: csinv x1, x9, xzr, eq @@ -355,6 +380,11 @@ define i128 @i128_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_checked_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB21_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -364,24 +394,30 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq -; CHECK-NEXT: cset w2, eq +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: eor w2, w8, #0x1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB21_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: eor w2, w8, #0x1 ; CHECK-NEXT: ret %1 = tail call { i128, i1 } 
@llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 @@ -396,6 +432,11 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) { define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_overflowing_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB22_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -405,25 +446,30 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x9, x8, x9 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x8, x14, x10 -; CHECK-NEXT: mul x15, x1, x3 -; CHECK-NEXT: smulh x10, x1, x3 -; CHECK-NEXT: mov x1, x8 -; CHECK-NEXT: adc x11, x12, x13 -; CHECK-NEXT: asr x12, x9, #63 -; CHECK-NEXT: asr x13, x11, #63 -; CHECK-NEXT: adds x9, x9, x11 ; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: adc x12, x12, x13 -; CHECK-NEXT: adds x9, x15, x9 -; CHECK-NEXT: adc x10, x10, x12 -; CHECK-NEXT: cmp x9, x11 -; CHECK-NEXT: ccmp x10, x11, #0, eq +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq ; CHECK-NEXT: cset w2, ne ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB22_2: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: mov w2, wzr +; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { 
i128, i1 } %1, 0 %3 = extractvalue { i128, i1 } %1, 1 @@ -436,6 +482,11 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) { define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-LABEL: i128_saturating_mul: ; CHECK: // %bb.0: +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB23_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: asr x9, x1, #63 ; CHECK-NEXT: umulh x10, x0, x2 ; CHECK-NEXT: asr x13, x3, #63 @@ -445,29 +496,35 @@ define i128 @i128_saturating_mul(i128 %x, i128 %y) { ; CHECK-NEXT: adds x10, x11, x10 ; CHECK-NEXT: mul x14, x0, x3 ; CHECK-NEXT: umulh x12, x0, x3 -; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: adc x9, x8, x9 ; CHECK-NEXT: mul x13, x0, x13 -; CHECK-NEXT: adds x9, x14, x10 -; CHECK-NEXT: mul x11, x1, x3 -; CHECK-NEXT: adc x10, x12, x13 -; CHECK-NEXT: smulh x12, x1, x3 -; CHECK-NEXT: asr x13, x8, #63 -; CHECK-NEXT: asr x14, x10, #63 -; CHECK-NEXT: adds x8, x8, x10 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: adds x8, x11, x8 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: mul x13, x0, x2 -; CHECK-NEXT: adc x10, x12, x10 -; CHECK-NEXT: eor x12, x3, x1 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x10, x10, x11 -; CHECK-NEXT: asr x11, x12, #63 -; CHECK-NEXT: orr x8, x8, x10 -; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csinv x0, x13, x11, eq -; CHECK-NEXT: csel x1, x10, x9, ne +; CHECK-NEXT: adds x8, x14, x10 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: asr x14, x8, #63 +; CHECK-NEXT: smulh x10, x1, x3 +; CHECK-NEXT: adc x11, x12, x13 +; CHECK-NEXT: asr x12, x9, #63 +; CHECK-NEXT: asr x13, x11, #63 +; CHECK-NEXT: adds x11, x9, x11 +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: adc x12, x12, x13 +; CHECK-NEXT: adds x11, x15, x11 +; CHECK-NEXT: adc x10, x10, x12 +; CHECK-NEXT: cmp x11, x14 +; CHECK-NEXT: ccmp x10, x14, #0, eq +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: b .LBB23_3 +; CHECK-NEXT: 
.LBB23_2: // %overflow.no +; CHECK-NEXT: smulh x8, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x9, x0, x2 +; CHECK-NEXT: .LBB23_3: // %overflow.res +; CHECK-NEXT: eor x11, x3, x1 +; CHECK-NEXT: cmp w10, #0 +; CHECK-NEXT: asr x11, x11, #63 +; CHECK-NEXT: eor x12, x11, #0x7fffffffffffffff +; CHECK-NEXT: csinv x0, x9, x11, eq +; CHECK-NEXT: csel x1, x12, x8, ne ; CHECK-NEXT: ret %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %2 = extractvalue { i128, i1 } %1, 0 diff --git a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll index 9924b7c63f76..3d90e094a574 100644 --- a/llvm/test/CodeGen/AArch64/i128_with_overflow.ll +++ b/llvm/test/CodeGen/AArch64/i128_with_overflow.ll @@ -224,21 +224,29 @@ cleanup: define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_umul_i128: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %overflow ; CHECK-NEXT: mul x9, x3, x0 ; CHECK-NEXT: cmp x1, #0 ; CHECK-NEXT: ccmp x3, #0, #4, ne -; CHECK-NEXT: umulh x8, x1, x2 -; CHECK-NEXT: umulh x10, x3, x0 +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x8, x3, x0 ; CHECK-NEXT: madd x9, x1, x2, x9 -; CHECK-NEXT: ccmp xzr, x8, #0, eq -; CHECK-NEXT: umulh x11, x0, x2 ; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x11, x0, x2 +; CHECK-NEXT: ccmp xzr, x8, #0, eq +; CHECK-NEXT: mul x0, x0, x2 ; CHECK-NEXT: cset w8, ne ; CHECK-NEXT: adds x1, x11, x9 ; CHECK-NEXT: csinc w8, w8, wzr, lo -; CHECK-NEXT: cmp w8, #1 -; CHECK-NEXT: b.ne .LBB4_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: cbnz w8, .LBB4_3 +; CHECK-NEXT: b .LBB4_4 +; CHECK-NEXT: .LBB4_2: // %overflow.no +; CHECK-NEXT: umulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: cbz w8, .LBB4_4 +; CHECK-NEXT: .LBB4_3: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -247,9 +255,7 @@ define i128 @test_umul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_2: // %if.end -; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: .LBB4_4: // %cleanup ; CHECK-NEXT: ret entry: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) @@ -273,34 +279,40 @@ cleanup: define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-LABEL: test_smul_i128: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: asr x10, x1, #63 -; CHECK-NEXT: umulh x11, x0, x2 -; CHECK-NEXT: asr x14, x3, #63 -; CHECK-NEXT: mov x8, x1 -; CHECK-NEXT: mul x12, x1, x2 -; CHECK-NEXT: umulh x9, x1, x2 -; CHECK-NEXT: mul x10, x10, x2 -; CHECK-NEXT: adds x11, x12, x11 -; CHECK-NEXT: mul x15, x0, x3 -; CHECK-NEXT: umulh x13, x0, x3 -; CHECK-NEXT: adc x9, x9, x10 -; CHECK-NEXT: mul x14, x0, x14 -; CHECK-NEXT: mul x16, x1, x3 -; CHECK-NEXT: adds x1, x15, x11 -; CHECK-NEXT: asr x11, x9, #63 -; CHECK-NEXT: smulh x8, x8, x3 -; CHECK-NEXT: adc x10, x13, x14 -; CHECK-NEXT: asr x12, x10, #63 -; CHECK-NEXT: adds x9, x9, x10 -; CHECK-NEXT: adc x10, x11, x12 -; CHECK-NEXT: adds x9, x16, x9 -; CHECK-NEXT: asr x11, x1, #63 -; CHECK-NEXT: adc x8, x8, x10 -; CHECK-NEXT: eor x8, x8, x11 -; CHECK-NEXT: eor x9, x9, x11 +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 ; CHECK-NEXT: orr x8, x9, x8 -; CHECK-NEXT: cbz x8, .LBB5_2 -; CHECK-NEXT: // %bb.1: // %if.then +; CHECK-NEXT: cbz x8, .LBB5_4 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; 
CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cbz w8, .LBB5_3 +; CHECK-NEXT: .LBB5_2: // %if.then ; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 @@ -309,10 +321,13 @@ define i128 @test_smul_i128(i128 noundef %x, i128 noundef %y) { ; CHECK-NEXT: sxtw x0, w0 ; CHECK-NEXT: asr x1, x0, #63 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB5_3: // %cleanup ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_2: // %if.end +; CHECK-NEXT: .LBB5_4: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 ; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: ret +; CHECK-NEXT: cbnz w8, .LBB5_2 +; CHECK-NEXT: b .LBB5_3 entry: %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) %1 = extractvalue { i128, i1 } %0, 1 diff --git a/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll new file mode 100644 index 000000000000..7b60f81539aa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/mul-i128-overflow.ll @@ -0,0 +1,261 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -o - %s | FileCheck %s + + +declare i32 @error() + +define i128 @test1(i128 noundef %x, i128 noundef %y) { +; CHECK-LABEL: test1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; 
CHECK-NEXT: cbz x8, .LBB0_4 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: cbz w8, .LBB0_3 +; CHECK-NEXT: .LBB0_2: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB0_3: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: cbnz w8, .LBB0_2 +; CHECK-NEXT: b .LBB0_3 +entry: + %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 1 + br i1 %1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +if.end: + %2 = extractvalue { i128, i1 } %0, 0 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %2, %if.end ] + ret i128 %retval.0 +} + +define i128 @test2(i128 noundef %x, i128 noundef %y, ptr %out) { +; CHECK-LABEL: test2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: eor x8, x3, x2, asr #63 +; CHECK-NEXT: eor x9, x1, x0, asr #63 +; CHECK-NEXT: orr x8, x9, x8 +; CHECK-NEXT: cbz x8, .LBB1_4 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: asr x9, x1, #63 +; CHECK-NEXT: umulh x10, x0, x2 +; CHECK-NEXT: asr x13, x3, #63 +; CHECK-NEXT: mul x11, x1, x2 +; CHECK-NEXT: umulh x8, x1, x2 +; CHECK-NEXT: mul x9, x9, x2 +; CHECK-NEXT: adds x10, x11, x10 +; CHECK-NEXT: mul x14, x0, x3 +; CHECK-NEXT: umulh x12, x0, x3 +; CHECK-NEXT: adc x8, x8, x9 +; CHECK-NEXT: mov x9, x1 +; CHECK-NEXT: mul x13, x0, x13 +; CHECK-NEXT: asr x11, x8, #63 +; CHECK-NEXT: mul x15, x1, x3 +; CHECK-NEXT: adds x1, x14, x10 +; CHECK-NEXT: smulh x9, x9, x3 +; CHECK-NEXT: adc x10, x12, x13 +; CHECK-NEXT: asr x12, x10, #63 +; CHECK-NEXT: adds x8, x8, x10 +; CHECK-NEXT: asr x10, x1, #63 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: adc x11, x11, x12 +; CHECK-NEXT: adds x8, x15, x8 +; CHECK-NEXT: adc x9, x9, x11 +; 
CHECK-NEXT: cmp x8, x10 +; CHECK-NEXT: ccmp x9, x10, #0, eq +; CHECK-NEXT: cset w8, ne +; CHECK-NEXT: stp x0, x1, [x4] +; CHECK-NEXT: cbz w8, .LBB1_3 +; CHECK-NEXT: .LBB1_2: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB1_3: // %cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_4: // %overflow.no +; CHECK-NEXT: smulh x1, x0, x2 +; CHECK-NEXT: mul x0, x0, x2 +; CHECK-NEXT: stp x0, x1, [x4] +; CHECK-NEXT: cbnz w8, .LBB1_2 +; CHECK-NEXT: b .LBB1_3 +entry: + %0 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 0 + store i128 %1, ptr %out + %2 = extractvalue { i128, i1 } %0, 1 + br i1 %2, label %if.then, label %cleanup + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %1, %entry ] + ret i128 %retval.0 +} + +define i128 @test3(i128 noundef %x, i128 noundef %y, ptr %out) { +; CHECK-LABEL: test3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB2_3 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: ccmp x3, #0, #4, ne +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 +; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: adds x9, x12, x11 +; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: stp x8, x9, [x4] +; CHECK-NEXT: cbnz w10, .LBB2_4 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: mov x1, xzr +; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: 
ret +; CHECK-NEXT: .LBB2_3: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: stp x8, x9, [x4] +; CHECK-NEXT: cbz w10, .LBB2_2 +; CHECK-NEXT: .LBB2_4: // %if.then +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 0 + store i128 %1, ptr %out + %2 = extractvalue { i128, i1 } %0, 1 + br i1 %2, label %if.then, label %cleanup + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ 1, %entry ] + ret i128 %retval.0 +} + +define i128 @test4(i128 noundef %x, i128 noundef %y, i128 %out) { +; CHECK-LABEL: test4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: orr x8, x1, x3 +; CHECK-NEXT: cbz x8, .LBB3_2 +; CHECK-NEXT: // %bb.1: // %overflow +; CHECK-NEXT: mul x8, x3, x0 +; CHECK-NEXT: cmp x1, #0 +; CHECK-NEXT: ccmp x3, #0, #4, ne +; CHECK-NEXT: umulh x10, x1, x2 +; CHECK-NEXT: umulh x9, x3, x0 +; CHECK-NEXT: madd x11, x1, x2, x8 +; CHECK-NEXT: ccmp xzr, x10, #0, eq +; CHECK-NEXT: umulh x12, x0, x2 +; CHECK-NEXT: ccmp xzr, x9, #0, eq +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: cset w10, ne +; CHECK-NEXT: adds x9, x12, x11 +; CHECK-NEXT: csinc w10, w10, wzr, lo +; CHECK-NEXT: b .LBB3_3 +; CHECK-NEXT: .LBB3_2: // %overflow.no +; CHECK-NEXT: umulh x9, x0, x2 +; CHECK-NEXT: mov w10, wzr +; CHECK-NEXT: mul x8, x0, x2 +; CHECK-NEXT: .LBB3_3: // %overflow.res +; CHECK-NEXT: adds x0, x8, x4 +; CHECK-NEXT: adc x1, x9, x5 +; CHECK-NEXT: cbz w10, .LBB3_5 +; CHECK-NEXT: // %bb.4: // %if.then +; CHECK-NEXT: str 
x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl error +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x0, w0 +; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .LBB3_5: // %cleanup +; CHECK-NEXT: ret +entry: + %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %x, i128 %y) + %1 = extractvalue { i128, i1 } %0, 0 + %res = add i128 %1, %out + %2 = extractvalue { i128, i1 } %0, 1 + br i1 %2, label %if.then, label %cleanup + +if.then: + %call = tail call i32 @error() + %conv1 = sext i32 %call to i128 + br label %cleanup + +cleanup: + %retval.0 = phi i128 [ %conv1, %if.then ], [ %res, %entry ] + ret i128 %retval.0 +} diff --git a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll index edfd80b4f270..ace0c83e63c7 100644 --- a/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll @@ -4,20 +4,28 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; AARCH-LABEL: muloti_test: ; AARCH: // %bb.0: // %start +; AARCH-NEXT: orr x8, x1, x3 +; AARCH-NEXT: cbz x8, .LBB0_2 +; AARCH-NEXT: // %bb.1: // %overflow ; AARCH-NEXT: mul x9, x3, x0 ; AARCH-NEXT: cmp x1, #0 ; AARCH-NEXT: ccmp x3, #0, #4, ne -; AARCH-NEXT: umulh x8, x1, x2 -; AARCH-NEXT: umulh x10, x3, x0 +; AARCH-NEXT: umulh x10, x1, x2 +; AARCH-NEXT: umulh x8, x3, x0 ; AARCH-NEXT: madd x9, x1, x2, x9 -; AARCH-NEXT: ccmp xzr, x8, #0, eq -; AARCH-NEXT: umulh x11, x0, x2 ; AARCH-NEXT: ccmp xzr, x10, #0, eq +; AARCH-NEXT: umulh x11, x0, x2 +; AARCH-NEXT: ccmp xzr, x8, #0, eq ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: cset w8, ne ; AARCH-NEXT: adds x1, x11, x9 ; AARCH-NEXT: csinc w2, w8, wzr, lo ; AARCH-NEXT: ret +; AARCH-NEXT: .LBB0_2: // %overflow.no +; AARCH-NEXT: umulh x1, x0, x2 +; AARCH-NEXT: 
mul x0, x0, x2 +; AARCH-NEXT: mov w2, wzr +; AARCH-NEXT: ret start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 %1 = extractvalue { i128, i1 } %0, 0 @@ -35,45 +43,56 @@ start: define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 { ; AARCH-LABEL: __muloti4: ; AARCH: // %bb.0: // %Entry -; AARCH-NEXT: asr x11, x1, #63 -; AARCH-NEXT: asr x9, x3, #63 -; AARCH-NEXT: umulh x12, x0, x2 -; AARCH-NEXT: mov x8, x1 +; AARCH-NEXT: eor x8, x3, x2, asr #63 +; AARCH-NEXT: eor x9, x1, x0, asr #63 ; AARCH-NEXT: str wzr, [x4] -; AARCH-NEXT: mul x13, x1, x2 -; AARCH-NEXT: umulh x10, x1, x2 -; AARCH-NEXT: mul x11, x11, x2 -; AARCH-NEXT: adds x12, x13, x12 -; AARCH-NEXT: mul x15, x0, x3 -; AARCH-NEXT: umulh x14, x0, x3 -; AARCH-NEXT: adc x10, x10, x11 -; AARCH-NEXT: mul x9, x0, x9 -; AARCH-NEXT: mul x16, x1, x3 -; AARCH-NEXT: adds x1, x15, x12 -; AARCH-NEXT: asr x12, x10, #63 -; AARCH-NEXT: smulh x11, x8, x3 -; AARCH-NEXT: adc x9, x14, x9 -; AARCH-NEXT: asr x13, x9, #63 -; AARCH-NEXT: adds x9, x10, x9 -; AARCH-NEXT: asr x10, x1, #63 +; AARCH-NEXT: orr x8, x9, x8 +; AARCH-NEXT: cbz x8, .LBB1_2 +; AARCH-NEXT: // %bb.1: // %overflow +; AARCH-NEXT: asr x9, x1, #63 +; AARCH-NEXT: umulh x10, x0, x2 +; AARCH-NEXT: asr x13, x3, #63 +; AARCH-NEXT: mul x11, x1, x2 +; AARCH-NEXT: umulh x8, x1, x2 +; AARCH-NEXT: mul x9, x9, x2 +; AARCH-NEXT: adds x10, x11, x10 +; AARCH-NEXT: mul x14, x0, x3 +; AARCH-NEXT: umulh x12, x0, x3 +; AARCH-NEXT: adc x9, x8, x9 +; AARCH-NEXT: mul x13, x0, x13 +; AARCH-NEXT: adds x8, x14, x10 +; AARCH-NEXT: mul x15, x1, x3 +; AARCH-NEXT: smulh x10, x1, x3 +; AARCH-NEXT: adc x11, x12, x13 +; AARCH-NEXT: asr x12, x9, #63 +; AARCH-NEXT: asr x13, x11, #63 +; AARCH-NEXT: adds x9, x9, x11 +; AARCH-NEXT: asr x11, x8, #63 ; AARCH-NEXT: mul x0, x0, x2 ; AARCH-NEXT: adc x12, x12, x13 -; AARCH-NEXT: adds x9, x16, x9 -; AARCH-NEXT: adc x11, x11, x12 -; AARCH-NEXT: cmp x9, x10 -; AARCH-NEXT: ccmp x11, x10, #0, eq +; 
AARCH-NEXT: adds x9, x15, x9 +; AARCH-NEXT: adc x10, x10, x12 +; AARCH-NEXT: cmp x9, x11 +; AARCH-NEXT: ccmp x10, x11, #0, eq ; AARCH-NEXT: cset w9, ne -; AARCH-NEXT: tbz x8, #63, .LBB1_2 -; AARCH-NEXT: // %bb.1: // %Entry -; AARCH-NEXT: eor x8, x3, #0x8000000000000000 -; AARCH-NEXT: orr x8, x2, x8 -; AARCH-NEXT: cbz x8, .LBB1_3 -; AARCH-NEXT: .LBB1_2: // %Else2 -; AARCH-NEXT: cbz w9, .LBB1_4 -; AARCH-NEXT: .LBB1_3: // %Then7 -; AARCH-NEXT: mov w8, #1 // =0x1 -; AARCH-NEXT: str w8, [x4] -; AARCH-NEXT: .LBB1_4: // %Block9 +; AARCH-NEXT: tbnz x1, #63, .LBB1_3 +; AARCH-NEXT: b .LBB1_4 +; AARCH-NEXT: .LBB1_2: // %overflow.no +; AARCH-NEXT: smulh x8, x0, x2 +; AARCH-NEXT: mov w9, wzr +; AARCH-NEXT: mul x0, x0, x2 +; AARCH-NEXT: tbz x1, #63, .LBB1_4 +; AARCH-NEXT: .LBB1_3: // %overflow.res +; AARCH-NEXT: eor x10, x3, #0x8000000000000000 +; AARCH-NEXT: orr x10, x2, x10 +; AARCH-NEXT: cbz x10, .LBB1_5 +; AARCH-NEXT: .LBB1_4: // %Else2 +; AARCH-NEXT: cbz w9, .LBB1_6 +; AARCH-NEXT: .LBB1_5: // %Then7 +; AARCH-NEXT: mov w9, #1 // =0x1 +; AARCH-NEXT: str w9, [x4] +; AARCH-NEXT: .LBB1_6: // %Block9 +; AARCH-NEXT: mov x1, x8 ; AARCH-NEXT: ret Entry: store i32 0, ptr %2, align 4 |
