diff options
Diffstat (limited to 'llvm/lib/Target')
242 files changed, 5952 insertions, 3282 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 36f3a670808d..12fc976a70ea 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( llvm_unreachable("Unsupported ElementSize"); } + // Preserve undef state until DOP's reg is defined. + unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0; + // // Create the destructive operation (if required) // @@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; + DOPRegState = 0; // Create the additional LSL to zero the lanes when the DstReg is not // unique. 
Zeros the lanes in z0 that aren't active in p0 with sequence @@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( assert(DOPRegIsUnique && "The destructive operand should be unique"); PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); DOPIdx = 0; + DOPRegState = 0; } // @@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( // DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + DOPRegState = DOPRegState | RegState::Kill; switch (DType) { case AArch64::DestructiveUnaryPassthru: - DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(PredIdx)) .add(MI.getOperand(SrcIdx)); break; @@ -659,20 +665,20 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) - .add(MI.getOperand(SrcIdx)); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)); break; case AArch64::DestructiveTernaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; } if (PRFX) { - finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); transferImpOps(MI, PRFX, DOP); + finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); } else transferImpOps(MI, DOP, DOP); @@ -1591,18 +1597,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, "Non-writeback variants of STGloop / STZGloop should not " "survive past PrologEpilogInserter."); case AArch64::STR_ZZZZXI: + case 
AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4); case AArch64::STR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3); case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2); case AArch64::STR_PPXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2); case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4); case AArch64::LDR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); case AArch64::LDR_PPXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2); diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index 83804b4b09bc..21756177fc74 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -812,7 +812,7 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { if (skipFunction(Fn.getFunction())) return false; - TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); MachineLoopInfo &LI = getAnalysis<MachineLoopInfoWrapperPass>().getLI(); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 9973df865ea1..c1c1f0a1024d 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl", "HasDisableFastIncVL", "true", "Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">; +// On most processors we want to avoid moving from WZR to vector registers +// (relying on materializing 0 to a FPR and moving from there instead), +// but on some 
(in-order) cores it's preferable to avoid the extra instruction instead. +def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move", + "UseWzrToVecMove", "true", + "Move from WZR to insert 0 into vector registers">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 666ff8bbab42..885f2a94f85f 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -731,8 +731,7 @@ void AArch64FrameLowering::resetCFIToInitialState( MachineFunction &MF = *MBB.getParent(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - const auto &TRI = - static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo()); + const auto &TRI = *Subtarget.getRegisterInfo(); const auto &MFI = *MF.getInfo<AArch64FunctionInfo>(); CFIInstBuilder CFIBuilder(MBB, MBB.begin(), MachineInstr::NoFlags); @@ -1746,7 +1745,7 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL) { + const DebugLoc &DL, bool NeedsWinCFI) { // Shadow call stack epilog: ldr x30, [x18, #-8]! 
BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre)) .addReg(AArch64::X18, RegState::Define) @@ -1755,6 +1754,10 @@ static void emitShadowCallStackEpilogue(const TargetInstrInfo &TII, .addImm(-8) .setMIFlag(MachineInstr::FrameDestroy); + if (NeedsWinCFI) + BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop)) + .setMIFlag(MachineInstr::FrameDestroy); + if (MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF)) CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) .buildRestore(AArch64::X18); @@ -1899,13 +1902,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE)) .setMIFlag(MachineInstr::FrameSetup); } - if (NeedsWinCFI) - HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR + // AArch64PointerAuth pass will insert SEH_PACSignLR + HasWinCFI |= NeedsWinCFI; } - if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) + if (MFnI.needsShadowCallStackPrologueEpilogue(MF)) { emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI, MFnI.needsDwarfUnwindInfo(MF)); + HasWinCFI |= NeedsWinCFI; + } if (EmitCFI && MFnI.isMTETagged()) { BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED)) @@ -1990,8 +1995,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); - if (!NumBytes) + if (!NumBytes) { + if (NeedsWinCFI && HasWinCFI) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd)) + .setMIFlag(MachineInstr::FrameSetup); + } return; + } // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. 
if (canUseRedZone(MF)) { @@ -2460,8 +2470,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator EpilogStartI = MBB.end(); auto FinishingTouches = make_scope_exit([&]() { - if (AFI->needsShadowCallStackPrologueEpilogue(MF)) - emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL); + if (AFI->needsShadowCallStackPrologueEpilogue(MF)) { + emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL, + NeedsWinCFI); + HasWinCFI |= NeedsWinCFI; + } if (EmitCFI) emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator()); if (AFI->shouldSignReturnAddress(MF)) { @@ -2472,8 +2485,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, TII->get(AArch64::PAUTH_EPILOGUE)) .setMIFlag(MachineInstr::FrameDestroy); } - if (NeedsWinCFI) - HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR + // AArch64PointerAuth pass will insert SEH_PACSignLR + HasWinCFI |= NeedsWinCFI; } if (HasWinCFI) { BuildMI(MBB, MBB.getFirstTerminator(), DL, @@ -3030,9 +3043,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(), ObjectOffset); if (FPAfterSVECalleeSaves) { - assert(-ObjectOffset > (int64_t)AFI->getSVECalleeSavedStackSize() && - "Math isn't correct for CSRs with FPAfterSVECalleeSaves"); FPOffset += StackOffset::getScalable(AFI->getSVECalleeSavedStackSize()); + if (-ObjectOffset <= (int64_t)AFI->getSVECalleeSavedStackSize()) { + FPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize()); + SPOffset += StackOffset::getFixed(AFI->getCalleeSavedStackSize()); + } } // Always use the FP for SVE spills if available and beneficial. 
if (hasFP(MF) && (SPOffset.getFixed() || diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f7de61f044a7..f026726c3f48 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1143,6 +1143,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR}); + setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::LOAD); @@ -2392,6 +2393,15 @@ static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { return false; } +bool isVectorizedBinOp(unsigned Opcode) { + switch (Opcode) { + case AArch64ISD::SQDMULH: + return true; + default: + return false; + } +} + // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has a immediate integer right operand. // If so Imm will receive the value. @@ -2600,6 +2610,12 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( APInt(Known.getBitWidth(), Op->getConstantOperandVal(0))); break; } + case AArch64ISD::MOVIshift: { + Known = KnownBits::makeConstant( + APInt(Known.getBitWidth(), Op->getConstantOperandVal(0) + << Op->getConstantOperandVal(1))); + break; + } case AArch64ISD::LOADgot: case AArch64ISD::ADDlow: { if (!Subtarget->isTargetILP32()) @@ -5512,7 +5528,8 @@ static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG, unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0; unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 
2 : 1; - if (!isa<ConstantSDNode>(N->getOperand(Op1))) + if (!N->getValueType(0).isScalableVector() || + !isa<ConstantSDNode>(N->getOperand(Op1))) return SDValue(); SDLoc DL(N); @@ -6422,7 +6439,9 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { } } - return true; + EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType(); + return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 || + PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64; } unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { @@ -17138,7 +17157,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17146,6 +17165,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); @@ -17469,16 +17493,18 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef<Value *> DeinterleavedValues) const { - unsigned Factor = DeinterleavedValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 
patterns failed\n"); return false; } + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load\n"); - Value *FirstActive = *llvm::find_if(DeinterleavedValues, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast<VectorType>(FirstActive->getType()); + VectorType *VTy = getDeinterleavedVectorType(DI); const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; @@ -17506,6 +17532,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue()); Value *BaseAddr = LI->getPointerOperand(); + Value *Result = nullptr; if (NumLoads > 1) { // Create multiple legal small ldN. SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy)); @@ -17526,35 +17553,35 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( } LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump()); } - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned J = 0; J < Factor; ++J) { - if (DeinterleavedValues[J]) - DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); - } + + // Merge the values from different factors. 
+ Result = PoisonValue::get(DI->getType()); + for (unsigned J = 0; J < Factor; ++J) + Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J); } else { - Value *Result; if (UseScalable) Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); else Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned I = 0; I < Factor; I++) { - if (DeinterleavedValues[I]) { - Value *NewExtract = Builder.CreateExtractValue(Result, I); - DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); - } - } } + + // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 + DI->replaceAllUsesWith(Result); return true; } bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleavedValues) const { + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleavedValues) const { unsigned Factor = InterleavedValues.size(); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); return false; } + StoreInst *SI = dyn_cast<StoreInst>(Store); + if (!SI) + return false; + assert(!Mask && "Unexpected mask on plain store"); VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType()); const DataLayout &DL = SI->getModule()->getDataLayout(); @@ -20119,8 +20146,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, // size, combine into an binop of two contacts of the source vectors. 
eg: // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d)) if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() && - DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() && - N1->hasOneUse()) { + (DAG.getTargetLoweringInfo().isBinOp(N0Opc) || + isVectorizedBinOp(N0Opc)) && + N0->hasOneUse() && N1->hasOneUse()) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -20979,6 +21007,98 @@ static SDValue performBuildVectorCombine(SDNode *N, return SDValue(); } +// A special combine for the sqdmulh family of instructions. +// smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ), +// SATURATING_VAL ) can be reduced to sqdmulh(...) +static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) { + + if (N->getOpcode() != ISD::SMIN) + return SDValue(); + + EVT DestVT = N->getValueType(0); + + if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 || + DestVT.isScalableVector()) + return SDValue(); + + ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1)); + + if (!Clamp) + return SDValue(); + + MVT ScalarType; + unsigned ShiftAmt = 0; + switch (Clamp->getSExtValue()) { + case (1ULL << 15) - 1: + ScalarType = MVT::i16; + ShiftAmt = 16; + break; + case (1ULL << 31) - 1: + ScalarType = MVT::i32; + ShiftAmt = 32; + break; + default: + return SDValue(); + } + + SDValue Sra = N->getOperand(0); + if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse()) + return SDValue(); + + ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1)); + if (!RightShiftVec) + return SDValue(); + unsigned SExtValue = RightShiftVec->getSExtValue(); + + if (SExtValue != (ShiftAmt - 1)) + return SDValue(); + + SDValue Mul = Sra.getOperand(0); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + + SDValue SExt0 = Mul.getOperand(0); + SDValue SExt1 = Mul.getOperand(1); + + if (SExt0.getOpcode() != ISD::SIGN_EXTEND || + SExt1.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + EVT 
SExt0Type = SExt0.getOperand(0).getValueType(); + EVT SExt1Type = SExt1.getOperand(0).getValueType(); + + if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType || + SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() || + SExt0Type.getVectorNumElements() == 1) + return SDValue(); + + SDLoc DL(N); + SDValue V0 = SExt0.getOperand(0); + SDValue V1 = SExt1.getOperand(0); + + // Ensure input vectors are extended to legal types + if (SExt0Type.getFixedSizeInBits() < 64) { + unsigned VecNumElements = SExt0Type.getVectorNumElements(); + EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements), + VecNumElements); + V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0); + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1); + } + + SDValue SQDMULH = + DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1); + + return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH); +} + +static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) { + if (SDValue V = trySQDMULHCombine(N, DAG)) { + return V; + } + + return SDValue(); +} + static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); @@ -26730,6 +26850,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performAddSubCombine(N, DCI); case ISD::BUILD_VECTOR: return performBuildVectorCombine(N, DCI, DAG); + case ISD::SMIN: + return performSMINCombine(N, DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DAG, DCI); case AArch64ISD::ANDS: @@ -30286,6 +30408,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { return Op.getOpcode() == AArch64ISD::DUP || Op.getOpcode() == AArch64ISD::MOVI || + Op.getOpcode() == AArch64ISD::MOVIshift || (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Op.getOperand(0).getOpcode() == AArch64ISD::DUP) || TargetLowering::isTargetCanonicalConstantNode(Op); diff --git 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 65fe08e92c23..713793ec77da 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -211,18 +211,19 @@ public: unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalAddScalableImmediate(int64_t) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c1474773faa7..bc57537ad5df 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ 
-2482,8 +2484,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::LDR_PXI: case AArch64::LDR_ZXI: case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZZXI: case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::LDRBBui: case AArch64::LDRBui: case AArch64::LDRDui: @@ -2525,8 +2529,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STR_PXI: case AArch64::STR_ZXI: case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::STR_ZZZXI: case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::STRBBui: case AArch64::STRBui: case AArch64::STRDui: @@ -4318,7 +4324,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, break; // SVE case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: Scale = TypeSize::getScalable(16); Width = TypeSize::getScalable(16 * 4); MinOffset = -256; @@ -4332,7 +4340,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MaxOffset = 253; break; case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: Scale = TypeSize::getScalable(16); Width = TypeSize::getScalable(16 * 2); MinOffset = -256; @@ -5559,8 +5569,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || - AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register store without SVE store instructions"); + Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS; + StackID = 
TargetStackID::ScalableVector; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZXI; @@ -5584,8 +5598,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || - AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register store without SVE store instructions"); + Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZZXI; @@ -5736,8 +5754,12 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || - AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register load without SVE load instructions"); + Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZXI; @@ -5761,8 +5783,12 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; - } else if 
(AArch64::ZPR4RegClass.hasSubClassEq(RC) || - AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register load without SVE load instructions"); + Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZZXI; @@ -6264,13 +6290,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // LDRWui %0:sub_32<def,read-undef>, %stack.0 // if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { - const TargetRegisterClass *FillRC; + const TargetRegisterClass *FillRC = nullptr; switch (DstMO.getSubReg()) { default: - FillRC = nullptr; break; case AArch64::sub_32: - FillRC = &AArch64::GPR32RegClass; + if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg))) + FillRC = &AArch64::GPR32RegClass; break; case AArch64::ssub: FillRC = &AArch64::FPR32RegClass; @@ -7327,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7367,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +static bool getGatherPattern(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. 
+ if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single offset register. + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq<unsigned>(1, NumLanes - 1); + SmallSet<unsigned, 4> RemainingLanes(Range.begin(), Range.end()); + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. 
+ if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns where we use LD1 instructions to load into +/// separate lanes of an 128 bit Neon register. We can increase Memory Level +/// Parallelism by loading into 2 Neon registers instead. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl<unsigned> &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +static void +generateGatherPattern(MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + SmallVectorImpl<MachineInstr *> &DelInstrs, + DenseMap<Register, unsigned> &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern + SmallVector<MachineInstr *, 16> LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. 
+ auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. + auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. + auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. 
+ auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7401,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8656,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case 
AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion @@ -9561,10 +9850,15 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, }; auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { // At least one unsafe register is not dead. We do not want to outline at - // this point. If it is long enough to outline from, save the range - // [RangeBegin, RangeEnd). - if (RangeLen > 1) - Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); + // this point. If it is long enough to outline from and does not cross a + // bundle boundary, save the range [RangeBegin, RangeEnd). + if (RangeLen <= 1) + return; + if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred()) + return; + if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred()) + return; + Ranges.emplace_back(RangeBegin, RangeEnd); }; // Find the first point where all unsafe registers are dead. 
// FIND: <safe instr> <-- end of first potential range diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4..02734866e712 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ddc685fae5e9..6c46b18d506c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; +def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -1022,6 +1024,7 @@ def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull, [SDNPCommutative]>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull, [SDNPCommutative]>; +def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull>; // Reciprocal estimates and steps. 
def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; @@ -7376,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; +let Predicates = [UseWzrToVecMove] in { def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), @@ -7386,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)) (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>; def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)), (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; +} def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), @@ -9439,6 +9444,15 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), (EXTRACT_SUBREG V128:$Rm, dsub)), (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i16 (AArch64sqdmulh (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (SQDMULHv4i16 V64:$Rn, V64:$Rm)>; +def : Pat<(v2i32 (AArch64sqdmulh (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (SQDMULHv2i32 V64:$Rn, V64:$Rm)>; +def : Pat<(v8i16 (AArch64sqdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm))), + (SQDMULHv8i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))), + (SQDMULHv4i32 V128:$Rn, V128:$Rm)>; + // Conversions within AdvSIMD types in the same register size are free. // But because we need a consistent lane ordering, in big endian many // conversions require one or more REV instructions. 
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e6b22695761e..782d62a7e5e1 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1666,7 +1666,7 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, "Given Opc should be a Load or Store with an immediate"); // OpcA will be the first instruction in the pair. if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { - Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); + Flags.setSExtIdx(NonSExtOpc == OpcA ? 1 : 0); return true; } @@ -3078,7 +3078,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { return false; Subtarget = &Fn.getSubtarget<AArch64Subtarget>(); - TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); + TII = Subtarget->getInstrInfo(); TRI = Subtarget->getRegisterInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 5379305bc7a7..adc984ad795a 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", "Cortex-A320 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureBalanceFPOps, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", "Cortex-A55 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureFuseAddress]>; + FeatureFuseAddress, + 
FeatureUseWzrToVecMove]>; def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", "Cortex-A510 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler + FeaturePostRAScheduler, + FeatureUseWzrToVecMove ]>; def TuneA520 : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", "Cortex-A520 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [ diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index dd23bf51a98c..77dfab83a834 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -1370,3 +1370,8 @@ bool AArch64RegisterInfo::shouldAnalyzePhysregInMachineLoopInfo( MCRegister R) const { return R == AArch64::VG; } + +bool AArch64RegisterInfo::isIgnoredCVReg(MCRegister LLVMReg) const { + return (LLVMReg >= AArch64::Z0 && LLVMReg <= AArch64::Z31) || + (LLVMReg >= AArch64::P0 && LLVMReg <= AArch64::P15); +} diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index cc94be611a2e..1ed8e959fdd2 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -154,6 +154,8 @@ public: SmallVectorImpl<uint64_t> &Ops) const override; bool shouldAnalyzePhysregInMachineLoopInfo(MCRegister R) const override; + + virtual bool isIgnoredCVReg(MCRegister LLVMReg) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 
eddb96979f7b..0c4b4f4c3ed8 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in { // These get expanded to individual LDR_ZXI/STR_ZXI instructions in // AArch64ExpandPseudoInsts. let mayLoad = 1, hasSideEffects = 0 in { - def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + + def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let mayStore = 1, hasSideEffects = 0 in { - def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + + def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def STR_ZZZZXI : Pseudo<(outs), (ins 
ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let AddedComplexity = 1 in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index 8d3a4553d4b7..b2c3da03b4b8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -157,6 +157,7 @@ def V2Write_20c_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 20; def V2Write_2c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 2; } def V2Write_2c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 2; } def V2Write_3c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 3; } +def V2Write_3c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 3; } def V2Write_4c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 4; } def V2Write_4c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 4; } def V2Write_6c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 6; } @@ -256,8 +257,8 @@ def V2Write_4c_1L01_1V01 : SchedWriteRes<[V2UnitL01, V2UnitV01]> { let NumMicroOps = 2; } -def V2Write_4c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> { - let Latency = 4; +def V2Write_5c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> { + let Latency = 5; let NumMicroOps = 2; } @@ -376,8 +377,8 @@ def V2Write_6c_1L_1S : SchedWriteRes<[V2UnitL, V2UnitS]> { let NumMicroOps = 2; } -def V2Write_4c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> { - let Latency = 4; +def V2Write_6c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> { + let Latency = 6; let NumMicroOps = 2; } @@ -1468,14 +1469,14 @@ def : SchedAlias<WriteVq, V2Write_2c_1V>; def : InstRW<[V2Wr_VA, V2Rd_VA], (instregex "^[SU]ABAL?v")>; // ASIMD arith, reduce, 4H/4S -def : 
InstRW<[V2Write_2c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>; +def : InstRW<[V2Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>; // ASIMD arith, reduce, 8B/8H -def : InstRW<[V2Write_4c_1V13_1V], +def : InstRW<[V2Write_5c_1V13_1V], (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>; // ASIMD arith, reduce, 16B -def : InstRW<[V2Write_4c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>; +def : InstRW<[V2Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>; // ASIMD dot product // ASIMD dot product using signed and unsigned integers @@ -1486,15 +1487,15 @@ def : InstRW<[V2Wr_VDOT, V2Rd_VDOT], def : InstRW<[V2Wr_VMMA, V2Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; // ASIMD max/min, reduce, 4H/4S -def : InstRW<[V2Write_2c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$", +def : InstRW<[V2Write_3c_1V13], (instregex "^[SU](MAX|MIN)Vv4i16v$", "^[SU](MAX|MIN)Vv4i32v$")>; // ASIMD max/min, reduce, 8B/8H -def : InstRW<[V2Write_4c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$", +def : InstRW<[V2Write_5c_1V13_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$", "^[SU](MAX|MIN)Vv8i16v$")>; // ASIMD max/min, reduce, 16B -def : InstRW<[V2Write_4c_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>; +def : InstRW<[V2Write_6c_2V13], (instregex "[SU](MAX|MIN)Vv16i8v$")>; // ASIMD multiply def : InstRW<[V2Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>; diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index c9e729025c70..dd775da97112 100644 --- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -133,7 +133,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { if (!ST.enableStorePairSuppress()) return false; - TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp 
b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 095682334679..2409cc862f21 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -270,6 +270,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case NeoverseV2: case NeoverseV3: + CacheLineSize = 64; EpilogueVectorizationMinVF = 8; MaxInterleaveFactor = 4; ScatterOverhead = 13; diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 1f3d619f6dd8..1b0e90b0e0dc 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -2387,6 +2387,9 @@ def : RWSysReg<"TRBSR_EL3", 0b11, 0b110, 0b1001, 0b1011, 0b011>; // v9.6 FEAT_PoPS // let Requires = [{ {AArch64::FeaturePoPS} }] in { -def : DC<"CIGDVAPS", 0b000, 0b0111, 0b1111, 0b101>; def : DC<"CIVAPS", 0b000, 0b0111, 0b1111, 0b001>; } + +let Requires = [{ {AArch64::FeaturePoPS, AArch64::FeatureMTE} }] in { +def : DC<"CIGDVAPS", 0b000, 0b0111, 0b1111, 0b101>; +} diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 20e772655811..90d3d92d6bbf 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2674,14 +2674,14 @@ static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC, static std::optional<Instruction *> instCombineDMB(InstCombiner &IC, IntrinsicInst &II) { // If this barrier is post-dominated by identical one we can remove it - auto *NI = II.getNextNonDebugInstruction(); + auto *NI = II.getNextNode(); unsigned LookaheadThreshold = DMBLookaheadThreshold; auto CanSkipOver = [](Instruction *I) { return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); }; while (LookaheadThreshold-- && CanSkipOver(NI)) { auto *NIBB = NI->getParent(); - NI = NI->getNextNonDebugInstruction(); + NI = NI->getNextNode(); if 
(!NI) { if (auto *SuccBB = NIBB->getUniqueSuccessor()) NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime(); @@ -2723,6 +2723,16 @@ static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC, return std::nullopt; } +static std::optional<Instruction *> +instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II) { + SMEAttrs FnSMEAttrs(*II.getFunction()); + bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody(); + if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface()) + return IC.replaceInstUsesWith( + II, ConstantInt::getBool(II.getType(), IsStreaming)); + return std::nullopt; +} + std::optional<Instruction *> AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -2828,6 +2838,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVEUxt(IC, II, 16); case Intrinsic::aarch64_sve_uxtw: return instCombineSVEUxt(IC, II, 32); + case Intrinsic::aarch64_sme_in_streaming_mode: + return instCombineInStreamingMode(IC, II); } return std::nullopt; @@ -3712,7 +3724,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, - bool HasRealUse, const Instruction *I, Value *Scalar, + const Instruction *I, Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { assert(Val->isVectorTy() && "This must be a vector type"); @@ -3732,12 +3744,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( } // The element at index zero is already inside the vector. - // - For a physical (HasRealUse==true) insert-element or extract-element + // - For a insert-element or extract-element // instruction that extracts integers, an explicit FPR -> GPR move is // needed. So it has non-zero cost. - // - For the rest of cases (virtual instruction or element type is float), - // consider the instruction free. 
- if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) + if (Index == 0 && !Val->getScalarType()->isIntegerTy()) return 0; // This is recognising a LD1 single-element structure to one lane of one @@ -3887,25 +3897,28 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, const Value *Op0, const Value *Op1) const { - bool HasRealUse = - Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0); - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); + // Treat insert at lane 0 into a poison vector as having zero cost. This + // ensures vector broadcasts via an insert + shuffle (and will be lowered to a + // single dup) are treated as cheap. + if (Opcode == Instruction::InsertElement && Index == 0 && Op0 && + isa<PoisonValue>(Op0)) + return 0; + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index); } InstructionCost AArch64TTIImpl::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr, - Scalar, ScalarUserAndIdx); + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar, + ScalarUserAndIdx); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const { - return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, - true /* HasRealUse */, &I); + return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I); } InstructionCost AArch64TTIImpl::getScalarizationOverhead( @@ -4114,10 +4127,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) { // SDIV/UDIV operations are lowered using SVE, then we can have less // costs. 
- if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty) - ->getPrimitiveSizeInBits() - .getFixedValue() < 128) { - EVT VT = TLI->getValueType(DL, Ty); + if (VT.isSimple() && isa<FixedVectorType>(Ty) && + Ty->getPrimitiveSizeInBits().getFixedValue() < 128) { static const CostTblEntry DivTbl[]{ {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, @@ -4894,15 +4905,14 @@ void AArch64TTIImpl::getUnrollingPreferences( // Disable partial & runtime unrolling on -Os. UP.PartialOptSizeThreshold = 0; + // No need to unroll auto-vectorized loops + if (findStringMetadataForLoop(L, "llvm.loop.isvectorized")) + return; + // Scan the loop: don't unroll loops with calls as this could prevent - // inlining. Don't unroll vector loops either, as they don't benefit much from - // unrolling. + // inlining. for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { - // Don't unroll vectorised loop. - if (I.getType()->isVectorTy()) - return; - if (isa<CallBase>(I)) { if (isa<CallInst>(I) || isa<InvokeInst>(I)) if (const Function *F = cast<CallBase>(I).getCalledFunction()) @@ -5201,33 +5211,34 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll // AND: llvm/test/CodeGen/AArch64/reduce-and.ll static const CostTblEntry CostTblNoPairwise[]{ - {ISD::ADD, MVT::v8i8, 2}, - {ISD::ADD, MVT::v16i8, 2}, - {ISD::ADD, MVT::v4i16, 2}, - {ISD::ADD, MVT::v8i16, 2}, - {ISD::ADD, MVT::v4i32, 2}, - {ISD::ADD, MVT::v2i64, 2}, - {ISD::OR, MVT::v8i8, 15}, - {ISD::OR, MVT::v16i8, 17}, - {ISD::OR, MVT::v4i16, 7}, - {ISD::OR, MVT::v8i16, 9}, - {ISD::OR, MVT::v2i32, 3}, - {ISD::OR, MVT::v4i32, 5}, - {ISD::OR, MVT::v2i64, 3}, - {ISD::XOR, MVT::v8i8, 15}, - {ISD::XOR, MVT::v16i8, 17}, - {ISD::XOR, MVT::v4i16, 7}, - {ISD::XOR, MVT::v8i16, 9}, - {ISD::XOR, MVT::v2i32, 3}, - {ISD::XOR, MVT::v4i32, 5}, - {ISD::XOR, MVT::v2i64, 3}, - {ISD::AND, MVT::v8i8, 15}, - {ISD::AND, MVT::v16i8, 
17}, - {ISD::AND, MVT::v4i16, 7}, - {ISD::AND, MVT::v8i16, 9}, - {ISD::AND, MVT::v2i32, 3}, - {ISD::AND, MVT::v4i32, 5}, - {ISD::AND, MVT::v2i64, 3}, + {ISD::ADD, MVT::v8i8, 2}, + {ISD::ADD, MVT::v16i8, 2}, + {ISD::ADD, MVT::v4i16, 2}, + {ISD::ADD, MVT::v8i16, 2}, + {ISD::ADD, MVT::v2i32, 2}, + {ISD::ADD, MVT::v4i32, 2}, + {ISD::ADD, MVT::v2i64, 2}, + {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr + {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8 + {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr + {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16 + {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr + {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32 + {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov + {ISD::XOR, MVT::v8i8, 5}, // Same as above for or... + {ISD::XOR, MVT::v16i8, 7}, + {ISD::XOR, MVT::v4i16, 4}, + {ISD::XOR, MVT::v8i16, 6}, + {ISD::XOR, MVT::v2i32, 3}, + {ISD::XOR, MVT::v4i32, 5}, + {ISD::XOR, MVT::v2i64, 3}, + {ISD::AND, MVT::v8i8, 5}, // Same as above for or... + {ISD::AND, MVT::v16i8, 7}, + {ISD::AND, MVT::v4i16, 4}, + {ISD::AND, MVT::v8i16, 6}, + {ISD::AND, MVT::v2i32, 3}, + {ISD::AND, MVT::v4i32, 5}, + {ISD::AND, MVT::v2i64, 3}, }; switch (ISD) { default: diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index ff0ab68a16a8..b27eb2ef7a39 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { // A helper function called by 'getVectorInstrCost'. // - // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse' - // indicates whether the vector instruction is available in the input IR or - // just imaginary in vectorizer passes. 
- /// \param ScalarUserAndIdx encodes the information about extracts from a + // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; + // \param ScalarUserAndIdx encodes the information about extracts from a /// vector with 'Scalar' being the value being extracted,'User' being the user /// of the extract(nullptr if user is not known before vectorization) and /// 'Idx' being the extract lane. InstructionCost getVectorInstrCostHelper( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, - bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr, + const Instruction *I = nullptr, Value *Scalar = nullptr, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const; public: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 84884d98e6f9..b9d3e1bf835b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -142,7 +142,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, uint64_t Value, MCContext &Ctx, const Triple &TheTriple, bool IsResolved) { int64_t SignedValue = static_cast<int64_t>(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: @@ -417,7 +417,7 @@ static bool shouldForceRelocation(const MCFixup &Fixup) { // same page as the ADRP and the instruction should encode 0x0. Assuming the // section isn't 0x1000-aligned, we therefore need to delegate this decision // to the linker -- a relocation! 
- return Fixup.getTargetKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21; + return Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21; } void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, @@ -431,7 +431,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, if (mc::isRelocation(Kind)) return; - if (Fixup.getTargetKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) { + if (Fixup.getKind() == FK_Data_8 && TheTriple.isOSBinFormatELF()) { auto RefKind = static_cast<AArch64::Specifier>(Target.getSpecifier()); AArch64::Specifier SymLoc = AArch64::getSymbolLoc(RefKind); if (SymLoc == AArch64::S_AUTH || SymLoc == AArch64::S_AUTHADDR) { @@ -488,7 +488,7 @@ void AArch64AsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, AArch64::Specifier RefKind = static_cast<AArch64::Specifier>(Target.getSpecifier()); if (AArch64::getSymbolLoc(RefKind) == AArch64::S_SABS || - (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) { + (!RefKind && Fixup.getKind() == AArch64::fixup_aarch64_movw)) { // If the immediate is negative, generate MOVN else MOVZ. // (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ. 
if (SignedValue < 0) diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index c3881fc79ba6..7618a5769186 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -57,7 +57,7 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsILP32) // assumes IsILP32 is true bool AArch64ELFObjectWriter::isNonILP32reloc(const MCFixup &Fixup, AArch64::Specifier RefKind) const { - if (Fixup.getTargetKind() != AArch64::fixup_aarch64_movw) + if (Fixup.getKind() != AArch64::fixup_aarch64_movw) return false; switch (RefKind) { case AArch64::S_ABS_G3: @@ -84,7 +84,7 @@ bool AArch64ELFObjectWriter::isNonILP32reloc(const MCFixup &Fixup, unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); AArch64::Specifier RefKind = static_cast<AArch64::Specifier>(Target.getSpecifier()); AArch64::Specifier SymLoc = AArch64::getSymbolLoc(RefKind); @@ -212,7 +212,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(const MCFixup &Fixup, } else { if (IsILP32 && isNonILP32reloc(Fixup, RefKind)) return ELF::R_AARCH64_NONE; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case FK_Data_1: reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_AARCH64_NONE; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index f2144375fd95..08f547a85073 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -529,11 +529,9 @@ void AArch64TargetELFStreamer::finish() { static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); bool Empty = true; for (auto &F : *Text) { - if 
(auto *DF = dyn_cast<MCDataFragment>(&F)) { - if (!DF->getContents().empty()) { - Empty = false; - break; - } + if (F.getSize()) { + Empty = false; + break; } } if (Empty) @@ -561,8 +559,7 @@ void AArch64TargetELFStreamer::finish() { if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); - (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(), - *Ctx.getSubtargetInfo()); + S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE); } } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index f918e3cbc7b8..5c8f57664a2c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -356,7 +356,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, else if (TheTriple.isOSBinFormatCOFF()) MAI = new AArch64MCAsmInfoGNUCOFF(); else - llvm_unreachable("Invalid target"); // FIXME: This is not unreachable + reportFatalUsageError("unsupported object format"); // Initial state of the frame pointer is SP. 
unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true); diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 61458d7c24be..1ac340a1b58a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -53,7 +53,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED); Log2Size = ~0U; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23f106a9c1d4..007b481f8496 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -153,6 +153,9 @@ private: const TargetMachine &TM; }; +void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); +extern char &AMDGPUPrepareAGPRAllocLegacyID; + void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &); extern char &AMDGPUReserveWWMRegsLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 31420caca089..0e0e83b7a6b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* flat memory instructions to access scratch" >; +def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", + "FlatGVSMode", + "true", + "Have GVS addressing mode with flat_* instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -541,6 +547,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16", "Use true 16-bit registers" >; +def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts", + "HasBF16TransInsts", + "true", + "Has bf16 transcendental instructions" +>; + def FeatureBF16ConversionInsts : 
SubtargetFeature<"bf16-cvt-insts", "HasBF16ConversionInsts", "true", @@ -1106,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1948,6 +1966,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, + FeatureFlatGVSMode, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, @@ -1966,7 +1985,9 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, + FeatureBF16TransInsts, FeatureBF16ConversionInsts, FeatureCvtPkF16F32Inst, FeatureMinimum3Maximum3PKF16, @@ -2374,6 +2395,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; +def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, + AssemblerPredicate<(all_of FeatureFlatGVSMode)>; + def HasGFX10_AEncoding : Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; @@ -2442,6 +2466,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() && // FIXME When we default to RealTrue16 instead of Fake, change the line as follows. 
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>; +def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">, + AssemblerPredicate<(all_of FeatureBF16TransInsts)>; + def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">, AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>; @@ -2657,6 +2684,9 @@ def HasDefaultComponentBroadcast def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">, AssemblerPredicate<(all_of FeatureDsSrc2Insts)>; +def HasAddPC64Inst : Predicate<"Subtarget->hasAddPC64Inst()">, + AssemblerPredicate<(any_of FeatureGFX1250Insts)>; + def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">; def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">; @@ -2680,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 79cf49f88d6d..dedee46a4423 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -13,11 +13,9 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/Analysis/CycleAnalysis.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/IR/IntrinsicsR600.h" -#include "llvm/InitializePasses.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO/Attributor.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 
22b921fb2084..5f1983791cfa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -45,12 +45,6 @@ static cl::opt<bool> WidenLoads( cl::ReallyHidden, cl::init(false)); -static cl::opt<bool> Widen16BitOps( - "amdgpu-codegenprepare-widen-16-bit-ops", - cl::desc( - "Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"), - cl::ReallyHidden, cl::init(false)); - static cl::opt<bool> BreakLargePHIs("amdgpu-codegenprepare-break-large-phis", cl::desc("Break large PHI nodes for DAGISel"), @@ -150,18 +144,6 @@ public: bool canBreakPHINode(const PHINode &I); - /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to - /// binary operation \p V. - /// - /// \returns Binary operation \p V. - /// \returns \p T's base element bit width. - unsigned getBaseElementBitWidth(const Type *T) const; - - /// \returns Equivalent 32 bit integer type for given type \p T. For example, - /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> - /// is returned. - Type *getI32Ty(IRBuilder<> &B, const Type *T) const; - /// \returns True if binary operation \p I is a signed binary operation, false /// otherwise. bool isSigned(const BinaryOperator &I) const; @@ -170,10 +152,6 @@ public: /// signed 'icmp' operation, false otherwise. bool isSigned(const SelectInst &I) const; - /// \returns True if type \p T needs to be promoted to 32 bit integer type, - /// false otherwise. - bool needsPromotionToI32(const Type *T) const; - /// Return true if \p T is a legal scalar floating point type. bool isLegalFloatingTy(const Type *T) const; @@ -188,52 +166,6 @@ public: computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal(); } - /// Promotes uniform binary operation \p I to equivalent 32 bit binary - /// operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. 
Promotion is done by sign or zero extending operands to - /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and - /// truncating the result of 32 bit binary operation back to \p I's original - /// type. Division operation is not promoted. - /// - /// \returns True if \p I is promoted to equivalent 32 bit binary operation, - /// false otherwise. - bool promoteUniformOpToI32(BinaryOperator &I) const; - - /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. - /// - /// \returns True. - bool promoteUniformOpToI32(ICmpInst &I) const; - - /// Promotes uniform 'select' operation \p I to 32 bit 'select' - /// operation. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by sign or zero extending operands to - /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the - /// result of 32 bit 'select' operation back to \p I's original type. - /// - /// \returns True. - bool promoteUniformOpToI32(SelectInst &I) const; - - /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' - /// intrinsic. - /// - /// \details \p I's base element bit width must be greater than 1 and less - /// than or equal 16. Promotion is done by zero extending the operand to 32 - /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the - /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the - /// shift amount is 32 minus \p I's base element bit width), and truncating - /// the result of the shift operation back to \p I's original type. - /// - /// \returns True. 
- bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; - /// \returns The minimum number of bits needed to store the value of \Op as an /// unsigned integer. Truncating to this size and then zero-extending to /// the original will not change the value. @@ -320,13 +252,11 @@ public: bool visitInstruction(Instruction &I) { return false; } bool visitBinaryOperator(BinaryOperator &I); bool visitLoadInst(LoadInst &I); - bool visitICmpInst(ICmpInst &I); bool visitSelectInst(SelectInst &I); bool visitPHINode(PHINode &I); bool visitAddrSpaceCastInst(AddrSpaceCastInst &I); bool visitIntrinsicInst(IntrinsicInst &I); - bool visitBitreverseIntrinsicInst(IntrinsicInst &I); bool visitFMinLike(IntrinsicInst &I); bool visitSqrt(IntrinsicInst &I); bool run(); @@ -380,22 +310,6 @@ bool AMDGPUCodeGenPrepareImpl::run() { return MadeChange; } -unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const { - assert(needsPromotionToI32(T) && "T does not need promotion to i32"); - - if (T->isIntegerTy()) - return T->getIntegerBitWidth(); - return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); -} - -Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const { - assert(needsPromotionToI32(T) && "T does not need promotion to i32"); - - if (T->isIntegerTy()) - return B.getInt32Ty(); - return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T)); -} - bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const { return I.getOpcode() == Instruction::AShr || I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; @@ -406,59 +320,11 @@ bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const { cast<ICmpInst>(I.getOperand(0))->isSigned(); } -bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const { - if (!Widen16BitOps) - return false; - - const IntegerType *IntTy = dyn_cast<IntegerType>(T); - if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) - return true; - 
- if (const VectorType *VT = dyn_cast<VectorType>(T)) { - // TODO: The set of packed operations is more limited, so may want to - // promote some anyway. - if (ST.hasVOP3PInsts()) - return false; - - return needsPromotionToI32(VT->getElementType()); - } - - return false; -} - bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const { return Ty->isFloatTy() || Ty->isDoubleTy() || (Ty->isHalfTy() && ST.has16BitInsts()); } -// Return true if the op promoted to i32 should have nsw set. -static bool promotedOpIsNSW(const Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Shl: - case Instruction::Add: - case Instruction::Sub: - return true; - case Instruction::Mul: - return I.hasNoUnsignedWrap(); - default: - return false; - } -} - -// Return true if the op promoted to i32 should have nuw set. -static bool promotedOpIsNUW(const Instruction &I) { - switch (I.getOpcode()) { - case Instruction::Shl: - case Instruction::Add: - case Instruction::Mul: - return true; - case Instruction::Sub: - return I.hasNoUnsignedWrap(); - default: - return false; - } -} - bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { Type *Ty = I.getType(); int TySize = DL.getTypeSizeInBits(Ty); @@ -467,134 +333,6 @@ bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const { return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I); } -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const { - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - if (I.getOpcode() == Instruction::SDiv || - I.getOpcode() == Instruction::UDiv || - I.getOpcode() == Instruction::SRem || - I.getOpcode() == Instruction::URem) - return false; - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp0 = nullptr; - Value *ExtOp1 = nullptr; - Value *ExtRes = nullptr; - Value *TruncRes = nullptr; - - if 
(isSigned(I)) { - ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - } else { - ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - } - - ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); - if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { - if (promotedOpIsNSW(cast<Instruction>(I))) - Inst->setHasNoSignedWrap(); - - if (promotedOpIsNUW(cast<Instruction>(I))) - Inst->setHasNoUnsignedWrap(); - - if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) - Inst->setIsExact(ExactOp->isExact()); - } - - TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const { - assert(needsPromotionToI32(I.getOperand(0)->getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); - Value *ExtOp0 = nullptr; - Value *ExtOp1 = nullptr; - Value *NewICmp = nullptr; - - if (I.isSigned()) { - ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - } else { - ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - } - NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); - - I.replaceAllUsesWith(NewICmp); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const { - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp1 = nullptr; - Value *ExtOp2 = nullptr; - Value *ExtRes = nullptr; - Value *TruncRes = nullptr; - - if 
(isSigned(I)) { - ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); - ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); - } else { - ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); - ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); - } - ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); - TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - -bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( - IntrinsicInst &I) const { - assert(I.getIntrinsicID() == Intrinsic::bitreverse && - "I must be bitreverse intrinsic"); - assert(needsPromotionToI32(I.getType()) && - "I does not need promotion to i32"); - - IRBuilder<> Builder(&I); - Builder.SetCurrentDebugLocation(I.getDebugLoc()); - - Type *I32Ty = getI32Ty(Builder, I.getType()); - Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); - Value *ExtRes = - Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp}); - Value *LShrOp = - Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); - Value *TruncRes = - Builder.CreateTrunc(LShrOp, I.getType()); - - I.replaceAllUsesWith(TruncRes); - I.eraseFromParent(); - - return true; -} - unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const { return computeKnownBits(Op, DL, AC).countMaxActiveBits(); } @@ -1635,10 +1373,6 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { if (foldBinOpIntoSelect(I)) return true; - if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && - UA.isUniform(&I) && promoteUniformOpToI32(I)) - return true; - if (UseMul24Intrin && replaceMulWithMul24(I)) return true; if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), @@ -1770,16 +1504,6 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { return false; } -bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) { - bool Changed = false; - - if (ST.has16BitInsts() && 
needsPromotionToI32(I.getOperand(0)->getType()) && - UA.isUniform(&I)) - Changed |= promoteUniformOpToI32(I); - - return Changed; -} - bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *Cond = I.getCondition(); Value *TrueVal = I.getTrueValue(); @@ -1787,12 +1511,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { Value *CmpVal; CmpPredicate Pred; - if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) { - if (UA.isUniform(&I)) - return promoteUniformOpToI32(I); - return false; - } - // Match fract pattern with nan check. if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN()))) return false; @@ -2196,8 +1914,6 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { - case Intrinsic::bitreverse: - return visitBitreverseIntrinsicInst(I); case Intrinsic::minnum: case Intrinsic::minimumnum: case Intrinsic::minimum: @@ -2209,16 +1925,6 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) { } } -bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) { - bool Changed = false; - - if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) && - UA.isUniform(&I)) - Changed |= promoteUniformBitreverseToI32(I); - - return Changed; -} - /// Match non-nan fract pattern. 
/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0)) /// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 1b909568fc55..7b5d4077e85f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -55,6 +55,14 @@ def gi_vop3pmodsneg : GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">, GIComplexPatternEquiv<VOP3PModsNeg>; +def gi_vop3pmodsnegs : + GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">, + GIComplexPatternEquiv<VOP3PModsNegs>; + +def gi_dotiuvop3pmodsnegabs : + GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">, + GIComplexPatternEquiv<VOP3PModsNegAbs>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; @@ -83,6 +91,10 @@ def gi_swmmacindex16 : GIComplexOperandMatcher<s32, "selectSWMMACIndex16">, GIComplexPatternEquiv<SWMMACIndex16>; +def gi_swmmacindex32 : + GIComplexOperandMatcher<s64, "selectSWMMACIndex32">, + GIComplexPatternEquiv<SWMMACIndex32>; + def gi_vop3opselmods : GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, GIComplexPatternEquiv<VOP3OpSelMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 202693b31612..25672a52345c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -447,6 +447,35 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { return; } + bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN(); + if (IsGCN && Subtarget->has64BitLiterals() && VT.getSizeInBits() == 64 && + CurDAG->isConstantValueOfAnyType(SDValue(N, 0))) { + uint64_t C = 0; + bool AllConst = true; + unsigned EltSize = EltVT.getSizeInBits(); + for (unsigned I = 0; I < NumVectorElts; ++I) { + SDValue Op = N->getOperand(I); + if (Op.isUndef()) { + AllConst = false; + break; + } + uint64_t 
Val; + if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Op)) { + Val = CF->getValueAPF().bitcastToAPInt().getZExtValue(); + } else + Val = cast<ConstantSDNode>(Op)->getZExtValue(); + C |= Val << (EltSize * I); + } + if (AllConst) { + SDValue CV = CurDAG->getTargetConstant(C, DL, MVT::i64); + MachineSDNode *Copy = + CurDAG->getMachineNode(AMDGPU::S_MOV_B64_IMM_PSEUDO, DL, VT, CV); + CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, VT, SDValue(Copy, 0), + RegClass); + return; + } + } + assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not " "supported yet"); // 32 = Max Num Vector Elements @@ -454,7 +483,6 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { // 1 = Vector Register Class SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1); - bool IsGCN = CurDAG->getSubtarget().getTargetTriple().isAMDGCN(); RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32); bool IsRegSeq = true; unsigned NOps = N->getNumOperands(); @@ -676,7 +704,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { case ISD::Constant: case ISD::ConstantFP: { - if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N)) + if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N) || + Subtarget->has64BitLiterals()) break; uint64_t Imm; @@ -1632,8 +1661,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset) const { - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); const SIInstrInfo *TII = Subtarget->getInstrInfo(); MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); @@ -3245,6 +3273,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +// Select neg_lo from the i1 immediate operand. 
bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -3260,6 +3289,47 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { return true; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // 1 promotes packed values to signed, 0 treats them as unsigned. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getZExtValue(); + if (SrcSign == 1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operand. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcMod = C->getZExtValue(); + switch (SrcMod) { + default: // Any other value will be silently ignored (considered as 0). 
+ break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); @@ -3611,6 +3681,41 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + SDValue InI32; + + if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) { + const SDValue &ExtendSrc = In.getOperand(0); + if (ExtendSrc.getValueSizeInBits() == 32) + InI32 = ExtendSrc; + } else if (In->getOpcode() == ISD::BITCAST) { + const SDValue &CastSrc = In.getOperand(0); + if (CastSrc.getOpcode() == ISD::BUILD_VECTOR && + CastSrc.getOperand(0).getValueSizeInBits() == 32) { + ConstantSDNode *Zero = dyn_cast<ConstantSDNode>(CastSrc.getOperand(1)); + if (Zero && Zero->getZExtValue() == 0) + InI32 = CastSrc.getOperand(0); + } + } + + if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + const SDValue &ExtractVecEltSrc = InI32.getOperand(0); + ConstantSDNode *EltIdx = dyn_cast<ConstantSDNode>(InI32.getOperand(1)); + if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx && + EltIdx->getZExtValue() == 1) { + Key = 1; + Src = ExtractVecEltSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; @@ -3885,10 +3990,8 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const { bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { assert(CurDAG->getTarget().getTargetTriple().isAMDGCN()); - const SIRegisterInfo *SIRI = - static_cast<const SIRegisterInfo 
*>(Subtarget->getRegisterInfo()); - const SIInstrInfo * SII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); + const SIInstrInfo *SII = Subtarget->getInstrInfo(); unsigned Limit = 0; bool AllUsesAcceptSReg = true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index f3b9364fdb92..9967f46e085e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -222,6 +222,8 @@ private: bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, @@ -233,6 +235,7 @@ private: bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex32(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e64d2162441a..3d040fb705a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4006,7 +4006,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine( case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: - case Intrinsic::amdgcn_rsq_clamp: { + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_tanh: { // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(1); return Src.isUndef() ? 
Src : SDValue(); @@ -4842,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Detect when CMP and SELECT use the same constant and fold them to avoid +// loading the constant twice. Specifically handles patterns like: +// %cmp = icmp eq i32 %val, 4242 +// %sel = select i1 %cmp, i32 4242, i32 %other +// It can be optimized to reuse %val instead of 4242 in select. +static SDValue +foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AMDGPUSubtarget *ST) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) || + (isInteger && isa<ConstantSDNode>(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) || + (isInteger && isa<ConstantSDNode>(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Check if constant should not be optimized - early return if not. 
+ if (isFloatingPoint) { + const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF(); + const GCNSubtarget *GCNST = static_cast<const GCNSubtarget *>(ST); + + // Only optimize normal floating-point values (finite, non-zero, and + // non-subnormal as per IEEE 754), skip optimization for inlinable + // floating-point constants. + if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + int64_t IntVal = cast<ConstantSDNode>(ConstVal)->getSExtValue(); + + // Skip optimization for inlinable integer immediates. + // Inlinable immediates include: -16 to 64 (inclusive). + if (IntVal >= -16 && IntVal <= 64) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). 
+ if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); @@ -5733,6 +5817,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(PC_ADD_REL_OFFSET) + NODE_NAME_CASE(PC_ADD_REL_OFFSET64) NODE_NAME_CASE(LDS) NODE_NAME_CASE(DUMMY_CHAIN) NODE_NAME_CASE(LOAD_D16_HI) @@ -6196,7 +6281,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode( case Intrinsic::amdgcn_rsq: case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: - case Intrinsic::amdgcn_rsq_clamp: { + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_tanh: { if (SNaN) return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 0dd2183b72b2..4e8c6c7ea3b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -545,6 +545,7 @@ enum NodeType : unsigned { /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, PC_ADD_REL_OFFSET, + PC_ADD_REL_OFFSET64, LDS, DUMMY_CHAIN, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 44eaebffb70d..9a90787963d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -25,6 +25,7 @@ namespace { class AMDGPUInsertDelayAlu { public: + const GCNSubtarget *ST; const SIInstrInfo *SII; const TargetRegisterInfo *TRI; @@ -65,13 +66,16 @@ public: // Types of delay that can be encoded in an s_delay_alu instruction. enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for a MachineInstr. 
+ DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // WMMA XDL ops are treated the same as TRANS. + if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -368,7 +372,7 @@ public: continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForSGPRWrites(MI)) { auto It = State.find(LastSGPRFromVALU); @@ -456,12 +460,12 @@ public: LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() << "\n"); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasDelayAlu()) + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!ST->hasDelayAlu()) return false; - SII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + SII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SchedModel = &SII->getSchedModel(); // Calculate the delay state for each basic block, iterating until we reach diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index b8996fb97f1c..e2c2e8912c71 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { break; } case Intrinsic::amdgcn_sqrt: - case Intrinsic::amdgcn_rsq: { + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_tanh: { Value *Src = II.getArgOperand(0); if (isa<PoisonValue>(Src)) return IC.replaceInstUsesWith(II, Src); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ea79c57080fa..1a63c48e3666 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3513,6 +3513,25 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { return Register(); } +Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { + Register AnyExtSrc; + if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) + return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? AnyExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef())) + return Def->getOperand(1).getReg(); + + return Register(); +} + bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (!Subtarget->hasVMemToLDSLoad()) return false; @@ -4904,6 +4923,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +// Select neg_lo from the i1 immediate operand. InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -4919,6 +4939,50 @@ AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { }}; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to matrix's even k elements, and neg_hi applies to matrix's odd k elements. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. 
+ // 1(-1) promotes packed values to signed, 0 treats them as unsigned. + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operand. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const { + + assert(Root.isImm() && "Modifier for C must be an immediate"); + + unsigned Mods = SISrcMods::OP_SEL_1; + switch (Root.getImm()) { + default: // Any other value will be silently ignored (considered as 0). + break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { @@ -5150,6 +5214,35 @@ AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { } InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register S32 = matchZeroExtendFromS32(*MRI, Src); + if (!S32) + S32 = matchAnyExtendFromS32(Src); + + if (S32) { + const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI); + if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { + assert(Def->getNumOperands() == 3); + Register DstReg1 = Def->getOperand(1).getReg(); + if (mi_match(S32, *MRI, + m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) { + Src = Def->getOperand(2).getReg(); + Key = 1; + } + } + } + + return {{ + [=](MachineInstrBuilder
&MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; unsigned Mods; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 8e9e573147a8..2cb7904d27cc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -201,6 +201,10 @@ private: InstructionSelector::ComplexRendererFns selectVOP3PModsNeg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegAbs(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; @@ -217,6 +221,8 @@ private: selectSWMMACIndex8(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; @@ -411,6 +417,9 @@ private: // shift amount operand's `ShAmtBits` bits is unneeded. bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match an any extend from a 32-bit value to 64-bit. 
+ Register matchAnyExtendFromS32(Register Reg) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index aa678df675fb..e7bf88d2ee5b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2932,14 +2932,22 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : B.getMRI()->createGenericVirtualRegister(ConstPtrTy); - MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) - .addDef(PCReg); + if (ST.has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); - MIB.addGlobalAddress(GV, Offset, GAFlags); - if (GAFlags == SIInstrInfo::MO_NONE) - MIB.addImm(0); - else - MIB.addGlobalAddress(GV, Offset, GAFlags + 1); + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg); + MIB.addGlobalAddress(GV, Offset, GAFlags + 2); + } else { + MachineInstrBuilder MIB = + B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg); + + MIB.addGlobalAddress(GV, Offset, GAFlags); + if (GAFlags == SIInstrInfo::MO_NONE) + MIB.addImm(0); + else + MIB.addGlobalAddress(GV, Offset, GAFlags + 1); + } if (!B.getMRI()->getRegClassOrNull(PCReg)) B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); @@ -2955,6 +2963,15 @@ void AMDGPULegalizerInfo::buildAbsGlobalAddress( MachineRegisterInfo &MRI) const { bool RequiresHighHalf = PtrTy.getSizeInBits() != 32; + if (RequiresHighHalf && ST.has64BitLiterals()) { + if (!MRI.getRegClassOrNull(DstReg)) + MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass); + B.buildInstr(AMDGPU::S_MOV_B64) + .addDef(DstReg) + .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64); + return; + } + LLT S32 = LLT::scalar(32); // Use the destination directly, if and only if we store the lower address @@ -7622,6 +7639,20 @@ bool 
AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S64 = LLT::scalar(64); + if (MRI.getType(Index) != S64) + MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: @@ -7636,15 +7667,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { Register Index = MI.getOperand(7).getReg(); - LLT S32 = LLT::scalar(32); - if (MRI.getType(Index) != S32) - MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? 
LLT::scalar(64) + : LLT::scalar(32); + if (MRI.getType(Index) != IdxTy) + MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2dec16de940d..c84a0f6e3138 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -50,6 +50,7 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { default: return AMDGPUMCExpr::S_None; case SIInstrInfo::MO_GOTPCREL: + case SIInstrInfo::MO_GOTPCREL64: return AMDGPUMCExpr::S_GOTPCREL; case SIInstrInfo::MO_GOTPCREL32_LO: return AMDGPUMCExpr::S_GOTPCREL32_LO; @@ -59,10 +60,14 @@ static AMDGPUMCExpr::Specifier getSpecifier(unsigned MOFlags) { return AMDGPUMCExpr::S_REL32_LO; case SIInstrInfo::MO_REL32_HI: return AMDGPUMCExpr::S_REL32_HI; + case SIInstrInfo::MO_REL64: + return AMDGPUMCExpr::S_REL64; case SIInstrInfo::MO_ABS32_LO: return AMDGPUMCExpr::S_ABS32_LO; case SIInstrInfo::MO_ABS32_HI: return AMDGPUMCExpr::S_ABS32_HI; + case SIInstrInfo::MO_ABS64: + return AMDGPUMCExpr::S_ABS64; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 5d298304c27f..b6c6d927d0e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -114,7 +114,9 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) +MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) 
+MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp new file mode 100644 index 000000000000..3b06e9b00ac6 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -0,0 +1,108 @@ +//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Make simple transformations to relax register constraints for cases which can +// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into +// AGPR or VGPR with a pseudo with an AV_* class register constraint. This +// allows later passes to inflate the register class if necessary. The register +// allocator does not know to replace instructions to relax constraints. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUPrepareAGPRAlloc.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" + +namespace { + +class AMDGPUPrepareAGPRAllocImpl { +private: + const SIInstrInfo &TII; + MachineRegisterInfo &MRI; + +public: + AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI) + : TII(*ST.getInstrInfo()), MRI(MRI) {} + bool run(MachineFunction &MF); +}; + +class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { + initializeAMDGPUPrepareAGPRAllocLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) +INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) + +char AMDGPUPrepareAGPRAllocLegacy::ID = 0; + +char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID; + +bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); +} + +PreservedAnalyses +AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); + return PreservedAnalyses::all(); +} + +bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { + if (MRI.isReserved(AMDGPU::AGPR0)) + return false; + + const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 && + TII.isInlineConstant(MI, 1)) || + (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOperand(1).isImm())) { + MI.setDesc(AVImmPseudo); + Changed = true; + } + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h new file mode 100644 index 000000000000..dc598c98f241 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h @@ -0,0 +1,23 @@ +//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class AMDGPUPrepareAGPRAllocPass + : public PassInfoMixin<AMDGPUPrepareAGPRAllocPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp index 7a2a7fc250e2..f5e14c71b02d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp @@ -88,7 +88,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers( // are %p and %s, which use to know if we // are either storing a literal string or a // pointer to the printf buffer. 
- static const char ConvSpecifiers[] = "cdieEfgGaosuxXp"; + static const char ConvSpecifiers[] = "cdieEfFgGaAosuxXp"; size_t CurFmtSpecifierIdx = 0; size_t PrevFmtSpecifierIdx = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 6a59a28b1d32..411159c8aa33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" #include "llvm/IR/IntrinsicsAMDGPU.h" -#include "llvm/Support/AMDGPUAddrSpace.h" #define DEBUG_TYPE "amdgpu-regbanklegalize" diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 1483d97d23fc..bf2f37bddb9e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_rcp_legacy: case Intrinsic::amdgcn_rsq_legacy: case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_tanh: case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_fma_legacy: case Intrinsic::amdgcn_frexp_mant: @@ -4557,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_u16: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_sat_pk4_i4_i8: + case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: @@ -4688,6 +4691,44 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x4_f32: + case 
Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x32_f16: + case Intrinsic::amdgcn_wmma_f16_16x16x32_f16: + case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: + case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case 
Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 46027b889023..8101c6898624 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); + Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass, + /*IncludeCalls=*/false); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass, + /*IncludeCalls=*/false); // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); - Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); - if (ST.hasMAIInsts()) - Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); + Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, + /*IncludeCalls=*/false); return Info; } int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? 
- for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; Register Reg = MO.getReg(); switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::M0_LO16: - case AMDGPU::M0_HI16: - case AMDGPU::SRC_SHARED_BASE_LO: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT_LO: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE_LO: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT_LO: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - case AMDGPU::MODE: - continue; - case AMDGPU::NoRegister: assert(MI.isDebugInstr() && "Instruction uses invalid noreg register"); continue; - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case AMDGPU::XNACK_MASK_HI: @@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( break; } - if (AMDGPU::SGPR_32RegClass.contains(Reg) || - AMDGPU::SGPR_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { - IsSGPR = true; - Width = 2; - } else if 
(AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { - IsSGPR = false; - Width = 7; - } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { - IsSGPR = true; - Width = 7; - } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 7; - } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { - IsSGPR = false; - Width = 9; - } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { - IsSGPR = true; - Width = 9; - } else if 
(AMDGPU::AReg_288RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 9; - } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { - IsSGPR = false; - Width = 10; - } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { - IsSGPR = true; - Width = 10; - } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 10; - } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { - IsSGPR = false; - Width = 11; - } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { - IsSGPR = true; - Width = 11; - } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 11; - } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { - IsSGPR = false; - Width = 12; - } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { - IsSGPR = true; - Width = 12; - } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 12; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - // We only expect TTMP registers or registers that do not belong to - // any RC. 
- assert((AMDGPU::TTMP_32RegClass.contains(Reg) || - AMDGPU::TTMP_64RegClass.contains(Reg) || - AMDGPU::TTMP_128RegClass.contains(Reg) || - AMDGPU::TTMP_256RegClass.contains(Reg) || - AMDGPU::TTMP_512RegClass.contains(Reg) || - !TRI.getPhysRegBaseClass(Reg)) && - "Unknown register class"); - } + const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg); + assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) || + TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) || + AMDGPU::TTMP_64RegClass.contains(Reg) || + AMDGPU::TTMP_128RegClass.contains(Reg) || + AMDGPU::TTMP_256RegClass.contains(Reg) || + AMDGPU::TTMP_512RegClass.contains(Reg)) && + "Unknown register class"); + + if (!RC || !TRI.isVGPRClass(RC)) + continue; + + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; - } + MaxVGPR = std::max(MaxUsed, MaxVGPR); } if (MI.isCall()) { @@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( } } - Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; return Info; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 1f6002a3c6a2..dfe0cbf18c47 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -341,6 +341,10 @@ foreach intr = AMDGPUWMMAIntrinsicsGFX11 in def : SourceOfDivergence<intr>; foreach intr = AMDGPUWMMAIntrinsicsGFX12 in def : SourceOfDivergence<intr>; +foreach intr = AMDGPUWMMAIntrinsicsGFX1250 in +def : SourceOfDivergence<intr>; +foreach intr = AMDGPUSWMMACIntrinsicsGFX1250 in +def : SourceOfDivergence<intr>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>; def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 7c24f428d78e..1e44be8e4720 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -59,6 +59,7 @@ protected: bool HasCvtPkF16F32Inst = false; bool HasF32ToF16BF16ConversionSRInsts = false; bool EnableRealTrue16Insts = false; + bool HasBF16TransInsts = false; bool HasBF16ConversionInsts = false; bool HasMadMixInsts = false; bool HasMadMacF32Insts = false; @@ -202,6 +203,8 @@ public: // supported and the support for fake True16 instructions is removed. 
bool useRealTrue16Insts() const; + bool hasBF16TransInsts() const { return HasBF16TransInsts; } + bool hasBF16ConversionInsts() const { return HasBF16ConversionInsts; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f4dc4a483181..c865082a1dce 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -25,6 +25,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUPreloadKernArgProlog.h" +#include "AMDGPUPrepareAGPRAlloc.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUReserveWWMRegs.h" #include "AMDGPUResourceUsageAnalysis.h" @@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGlobalISel(*PR); initializeAMDGPUAsmPrinterPass(*PR); initializeAMDGPUDAGToDAGISelLegacyPass(*PR); + initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR); initializeGCNDPPCombineLegacyPass(*PR); initializeSILowerI1CopiesLegacyPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); @@ -1196,6 +1198,7 @@ public: bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; + void addPreRegAlloc() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() { TargetPassConfig::addFastRegAlloc(); } +void GCNPassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(&AMDGPUPrepareAGPRAllocLegacyID); +} + void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); @@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( Base::addOptimizedRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { + if (getOptLevel() != CodeGenOptLevel::None) + 
addPass(AMDGPUPrepareAGPRAllocPass()); +} + Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( AddMachinePass &addPass) const { // TODO: Check --regalloc-npm option @@ -2284,6 +2297,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { Base::addPostRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIShrinkInstructionsPass()); + addPass(SIPostRABundlerPass()); +} + void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3c62cd19c6e5..e0f1296ddded 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -181,8 +181,11 @@ public: void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; void addPreEmitPass(AddMachinePass &) const; + void addPreEmitRegAlloc(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; + void addPreRegAlloc(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; + void addPreSched2(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. 
Otherwise its default will be used diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6439230b8769..43d4e8db791b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -157,6 +157,7 @@ public: ImmTyNegHi, ImmTyIndexKey8bit, ImmTyIndexKey16bit, + ImmTyIndexKey32bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -174,8 +175,10 @@ public: ImmTyWaitEXP, ImmTyWaitVAVDst, ImmTyWaitVMVSrc, - ImmTyByteSel, ImmTyBitOp3, + ImmTyMatrixAReuse, + ImmTyMatrixBReuse, + ImmTyByteSel, }; // Immediate operand kind. @@ -419,6 +422,9 @@ public: bool isCPol() const { return isImmTy(ImmTyCPol); } bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } + bool isIndexKey32bit() const { return isImmTy(ImmTyIndexKey32bit); } + bool isMatrixAReuse() const { return isImmTy(ImmTyMatrixAReuse); } + bool isMatrixBReuse() const { return isImmTy(ImmTyMatrixBReuse); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppFI() const { return isImmTy(ImmTyDppFI); } @@ -747,6 +753,10 @@ public: return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f64); } + bool isVISrc_512_f64() const { + return isRegOrInlineNoMods(AMDGPU::VReg_512RegClassID, MVT::f64); + } + bool isVISrc_128B16() const { return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::i16); } @@ -1116,6 +1126,7 @@ public: case ImmTyCPol: OS << "CPol"; break; case ImmTyIndexKey8bit: OS << "index_key"; break; case ImmTyIndexKey16bit: OS << "index_key"; break; + case ImmTyIndexKey32bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1162,8 +1173,10 @@ public: case ImmTyWaitEXP: OS << "WaitEXP"; break; case ImmTyWaitVAVDst: OS << 
"WaitVAVDst"; break; case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break; - case ImmTyByteSel: OS << "ByteSel" ; break; case ImmTyBitOp3: OS << "BitOp3"; break; + case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break; + case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break; + case ImmTyByteSel: OS << "ByteSel" ; break; } // clang-format on } @@ -1700,6 +1713,7 @@ public: AMDGPUOperand::ImmTy ImmTy); ParseStatus parseIndexKey8bit(OperandVector &Operands); ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseIndexKey32bit(OperandVector &Operands); ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); @@ -3981,8 +3995,8 @@ bool AMDGPUAsmParser::validateVOPD(const MCInst &Inst, bool AsVOPD3 = MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3; if (AsVOPD3) { - for (unsigned I = 0, E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + for (const std::unique_ptr<MCParsedAsmOperand> &Operand : Operands) { + AMDGPUOperand &Op = (AMDGPUOperand &)*Operand; if ((Op.isRegKind() || Op.isImmTy(AMDGPUOperand::ImmTyNone)) && (Op.getModifiers().getFPModifiersOperand() & SISrcMods::ABS)) Error(Op.getStartLoc(), "ABS not allowed in VOPD3 instructions"); @@ -7153,7 +7167,9 @@ ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, if (!Res.isSuccess()) return Res; - if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + if ((ImmTy == AMDGPUOperand::ImmTyIndexKey16bit || + ImmTy == AMDGPUOperand::ImmTyIndexKey32bit) && + (ImmVal < 0 || ImmVal > 1)) return Error(Loc, Twine("out of range ", StringRef(Pref))); if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) @@ -7171,6 +7187,10 @@ ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); } +ParseStatus AMDGPUAsmParser::parseIndexKey32bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, 
AMDGPUOperand::ImmTyIndexKey32bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -9272,6 +9292,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, DefaultVal); } + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_a_reuse)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixAReuse, 0); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::matrix_b_reuse)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyMatrixBReuse, 0); + int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); @@ -9378,6 +9406,10 @@ void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyIndexKey16bit); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_32bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey32bit); + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClamp); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e3519f192137..42edec0d0149 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp + AMDGPUPrepareAGPRAlloc.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3625db9a4791..c8a4e22ed1da 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -200,6 +200,7 @@ class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{95-72} = !if(ps.has_offset, offset, ?); } +// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode. class GlobalSaddrTable <bit is_saddr, string Name = ""> { bit IsSaddr = is_saddr; string SaddrOp = Name; @@ -237,10 +238,18 @@ class FLAT_Load_Pseudo< let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Load_Pseudo_t16<string opName> { - def "" : FLAT_Load_Pseudo<opName, VGPR_32, 1>; +multiclass FLAT_Flat_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { + def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Load_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Load_Pseudo<opName, VGPR_32, 1>; let True16Predicate = UseRealTrue16Insts in - def _t16 : FLAT_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; + defm _t16 : FLAT_Flat_Load_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_HI", NAME>; } class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, @@ -260,10 +269,26 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass, let enabled_saddr = EnableSaddr; } -multiclass FLAT_Store_Pseudo_t16<string opName> { - def "" : FLAT_Store_Pseudo<opName, VGPR_32>; - let OtherPredicates = [HasTrue16BitInsts] in - def _t16 : FLAT_Store_Pseudo<opName#"_t16", VGPR_16>, True16D16Table<NAME#"_D16_HI", NAME>; +multiclass FLAT_Flat_Store_Pseudo<string opName, RegisterClass regClass> { + def "" : FLAT_Store_Pseudo<opName, regClass>, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, + GlobalSaddrTable<1, opName>; +} + 
+multiclass FLAT_Flat_Store_Pseudo_t16<string opName> { + defm "" : FLAT_Flat_Store_Pseudo<opName, VGPR_32>; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { + def _t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1>, + GlobalSaddrTable<0, Name16>, + True16D16Table<NAME#"_D16_HI", NAME>; + def _SADDR_t16 : FLAT_Store_Pseudo<Name16, VGPR_16, 1, 1>, + GlobalSaddrTable<1, Name16>, + True16D16Table<NAME#"_D16_HI_SADDR", NAME#"_SADDR">; + } } multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> { @@ -657,6 +682,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR : FLAT_AtomicNoRet_Pseudo <opName, + (outs), + (ins VGPR_32:$vaddr, data_op:$vdata, SReg_64:$saddr, flat_offset:$offset, CPol_0:$cpol), + " $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo_RTN< @@ -665,15 +702,29 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> { + RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret, + RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> { def _RTN : FLAT_AtomicRet_Pseudo <opName, - (outs getLdStRegisterOperand<vdst_rc>.ret:$vdst), + (outs vdst_op:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName, + (outs vdst_op:$vdst), + (ins VGPR_32:$vaddr, data_op:$vdata, 
SReg_64:$saddr, flat_offset:$offset, CPol_GLC1:$cpol), + " $vdst, $vaddr, $vdata, $saddr$offset$cpol">, + GlobalSaddrTable<1, opName#"_rtn"> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo< @@ -762,36 +813,36 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", VGPR_32>; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; 
-def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_sbyte_d16">; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } -defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; -defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; +defm FLAT_STORE_BYTE 
: FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -1200,6 +1251,16 @@ class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueTyp (inst $saddr, $voffset, $offset, 0, $in) >; +class FlatLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), + (inst $saddr, $voffset, $offset, (i32 0), $in) +>; + +class FlatLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16_t16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, (i32 0)) @@ -1210,13 +1271,13 @@ class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> (inst $vaddr, $offset) >; -class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < +class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, - ValueType vt> : GCNPat < +class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, + ValueType vt> : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset) >; @@ -1394,7 +1455,7 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp let AddedComplexity = 
10; } - def : GlobalLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1404,7 +1465,7 @@ multiclass GlobalFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1425,7 +1486,7 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1435,7 +1496,7 @@ multiclass GlobalFLATStorePats_D16_t16<string inst, SDPatternOperator node, Valu let AddedComplexity = 10; } - def : GlobalStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1568,80 +1629,129 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu } } +multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat <inst, node, vt>; + + def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatLoadPat_D16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + 
def : FlatLoadPat_D16_t16 <inst, node, vt>; + + def : FlatLoadSaddrPat_D16_t16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <inst, node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> { + def : FlatStorePat <!cast<FLAT_Pseudo>(!cast<string>(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, 
atomic_load_zext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_zext_16_flat, i16>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, extloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_USHORT, zextloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_SSHORT, atomic_load_sext_16_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX3, load_flat, v3i32>; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let True16Predicate = p in { - def : FlatLoadPat <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, extloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, zextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, sextloadi8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, load_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_nonext_16_flat, 
i16>; + defm : FlatLoadPats <FLAT_LOAD_SBYTE, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i16>; + defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i16>; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; - def : FlatLoadPat_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; - def : FlatStorePat <FLAT_STORE_BYTE_t16, truncstorei8_flat, i16>; - def : FlatStorePat <FLAT_STORE_SHORT_t16, store_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, extloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, zextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, sextloadi8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, load_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_aext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_UBYTE_D16_t16, atomic_load_zext_8_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SHORT_D16_t16, atomic_load_nonext_16_flat, i16>; + defm : FlatLoadPats_D16_t16<FLAT_LOAD_SBYTE_D16_t16, atomic_load_sext_8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_BYTE, truncstorei8_flat, i16>; + defm : FlatStorePats_t16 <FLAT_STORE_SHORT, store_flat, 
i16>; def : FlatStorePat <FLAT_STORE_BYTE_t16, atomic_store_8_flat, i16>; def : FlatStorePat <FLAT_STORE_SHORT_t16, atomic_store_16_flat, i16>; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts -def : FlatLoadPat <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, atomic_load_nonext_32_flat, i32>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, i64>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, atomic_load_nonext_64_flat, v2i32>; -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, truncstorei16_flat, i32>; foreach vt = Reg32Types.types in { -def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORD, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORD, store_flat, vt>; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX2, load_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX3, store_flat, v3i32>; foreach vt = VReg_128.RegTypes in { -def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>; -def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>; +defm : FlatLoadPats <FLAT_LOAD_DWORDX4, load_flat, vt>; +defm : FlatStorePats <FLAT_STORE_DWORDX4, store_flat, vt>; } -def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; -def : FlatStorePat <FLAT_STORE_BYTE, 
atomic_store_8_flat, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORD, atomic_store_32_flat, i32>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>; +defm : FlatStorePats <FLAT_STORE_DWORDX2, atomic_store_64_flat, v2i32>; +defm : FlatStorePats <FLAT_STORE_BYTE, atomic_store_8_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT, atomic_store_16_flat, i32>; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1684,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } // end foreach as +defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>; +defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>; + let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; @@ -1692,25 +1805,25 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; -def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>; +defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>; } let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Handle atomic loads -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, 
az_extloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16_HI, load_d16_hi_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; -def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_UBYTE_D16, az_extloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SBYTE_D16, sextloadi8_d16_lo_flat, v2f16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>; +defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>; } } // End OtherPredicates = [HasFlatAddressSpace] @@ -1782,6 +1895,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX4, store_global, vt>; // appropriate waits. 
defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORD, atomic_load_nonext_32_global, i32>; defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, i64>; +defm : GlobalFLATLoadPats <GLOBAL_LOAD_DWORDX2, atomic_load_nonext_64_global, v2i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, truncstorei8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, truncstorei16_global, i32>; @@ -1821,6 +1935,7 @@ defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>; defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>; +defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, v2i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; @@ -2832,14 +2947,7 @@ multiclass VFLAT_Real_Base_gfx12<bits<8> op, VFLAT_Aliases_gfx12<name, alias>, VFLAT_Real_gfx12<op, name>; -multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, - string name = get_FLAT_ps<NAME>.Mnemonic, - string alias = name> : - VFLAT_Real_Base_gfx12<op, name, alias> { - defm _RTN : VFLAT_Real_gfx12<op, name>; -} - -multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : VFLAT_Real_Base_gfx12<op, name, alias> { @@ -2853,7 +2961,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> { } } -multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, +multiclass VFLAT_Real_AllAddr_gfx12_w64<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> : VFLAT_Aliases_gfx12<name> { let DecoderNamespace = "GFX12W64" in { @@ -2862,10 +2970,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op, } } -multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, +multiclass 
VFLAT_Real_Atomics_gfx12<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic, string alias = name> : - VGLOBAL_Real_AllAddr_gfx12<op, name, alias> { + VFLAT_Real_AllAddr_gfx12<op, name, alias> { defm _RTN : VFLAT_Real_gfx12<op, name>; defm _SADDR_RTN : VFLAT_Real_gfx12<op, name>; } @@ -2879,28 +2987,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, } // ENC_VFLAT. -defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">; -defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">; -defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">; -defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">; -defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">; -defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">; -defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">; -defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">; -defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">; -defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">; -defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">; -defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">; -defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">; -defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">; -defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">; -defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">; -defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">; -defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">; -defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">; -defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">; -defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">; -defm FLAT_STORE_SHORT_D16_HI : 
VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">; +defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">; +defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">; +defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">; +defm FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">; +defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">; +defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">; +defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">; +defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">; +defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">; +defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">; +defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">; +defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">; +defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">; +defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">; +defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">; +defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">; +defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">; +defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">; +defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">; +defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">; +defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">; +defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">; defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">; defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">; defm FLAT_ATOMIC_ADD : 
VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">; @@ -2936,74 +3044,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. -defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">; -defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">; -defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">; -defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">; -defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">; -defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">; -defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">; -defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">; -defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">; -defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">; -defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">; -defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">; -defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">; -defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">; -defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; -defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; -defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; -defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; -defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, 
"global_store_d16_hi_b8">; -defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; -defm GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; -defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; -defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; - -defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; -defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; -defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; -defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; -defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; -defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; -defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; -defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; -defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; -defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; -defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; -defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; -defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; -defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; -defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; -defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; -defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; -defm GLOBAL_ATOMIC_SUB_X2 : 
VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; -defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; -defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; -defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; -defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; -defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; -defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; -defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; -defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; -defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; -defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>; -defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; -defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; +defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">; +defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">; +defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">; +defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">; +defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">; +defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">; +defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">; +defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">; +defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">; +defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, 
"global_store_b16">; +defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">; +defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">; +defm GLOBAL_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">; +defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">; +defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; +defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; +defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; +defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; +defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; +defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; +defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>; + +defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; +defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; +defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; +defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; +defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; +defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, 
"global_atomic_min_u32">; +defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; +defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; +defm GLOBAL_ATOMIC_AND : VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; +defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; +defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; +defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; +defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; +defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; +defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; +defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; +defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; +defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; +defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; +defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; +defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; +defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; +defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; +defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; +defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>; +defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, 
"global_atomic_max_num_f32", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>; defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>; -defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>; -defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>; +defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>; +defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0976fccf78d8..bbed828b4fed 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { } fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); + fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); @@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { return true; } +bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) { + if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled. 
+ !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isTRANS(I)) + return false; + + // RAW: Trans(I) writes, VALU(MI) reads. + Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + for (const MachineOperand &ValuUse : MI->explicit_uses()) { + if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg())) + return true; + } + + auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst); + if (!ValuDst || !ValuDst->isReg()) + return false; + + // WAR: Trans(I) reads, VALU(MI) writes. + Register ValuDef = ValuDst->getReg(); + for (const MachineOperand &TransUse : I.explicit_uses()) { + if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg())) + return true; + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + const int HasVALU = std::numeric_limits<int>::max(); + if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + return true; +} + bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index bbc55851bf96..ef6ddd874f58 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -104,6 +104,7 @@ private: bool fixLdsDirectVMEMHazard(MachineInstr *MI); bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool 
fixVALUMaskWriteHazard(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d4596..a6553083d722 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, + &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index e6dd98a10420..268162bcada4 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -214,6 +214,7 @@ protected: bool FlatInstOffsets = false; bool FlatGlobalInsts = false; bool FlatScratchInsts = false; + bool FlatGVSMode = false; bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; @@ -233,6 +234,7 @@ protected: bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1156,10 +1158,12 @@ public: bool hasMadF16() const; - bool hasMovB64() const { return GFX940Insts; } + bool hasMovB64() const { return GFX940Insts || GFX1250Insts; } bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + bool enableSIScheduler() const { return EnableSIScheduler; } @@ -1377,6 +1381,10 @@ public: return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + + bool hasAddPC64Inst() const { return GFX1250Insts; } + bool hasMinimum3Maximum3PKF16() const { return HasMinimum3Maximum3PKF16; } diff 
--git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index e7d0e1838fa6..2a920f6feb1c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -108,7 +108,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext *Ctx) { int64_t SignedValue = static_cast<int64_t>(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case AMDGPU::fixup_si_sopp_br: { int64_t BrImm = (SignedValue - 4) / 4; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 22ae5f4e7191..0d5a8be6220d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -64,6 +64,8 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup, return ELF::R_AMDGPU_ABS32_LO; case AMDGPUMCExpr::S_ABS32_HI: return ELF::R_AMDGPU_ABS32_HI; + case AMDGPUMCExpr::S_ABS64: + return ELF::R_AMDGPU_ABS64; } MCFixupKind Kind = Fixup.getKind(); @@ -76,7 +78,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(const MCFixup &Fixup, return IsPCRel ? 
ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64; } - if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) { + if (Fixup.getKind() == AMDGPU::fixup_si_sopp_br) { const auto *SymA = Target.getAddSym(); assert(SymA); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index cb6319ed627c..ec9248b972ec 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1332,6 +1332,16 @@ void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, O << " index_key:" << Imm; } +void AMDGPUInstPrinter::printIndexKey32bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index fb803b1f8134..e3299a618e88 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -132,6 +132,8 @@ private: const MCSubtargetInfo &STI, raw_ostream &O); void printIndexKey16bit(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey32bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 31dd373e54fb..ffdac8b8ce32 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ 
b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -25,6 +25,7 @@ const MCAsmInfo::AtSpecifier atSpecifiers[] = { {AMDGPUMCExpr::S_REL64, "rel64"}, {AMDGPUMCExpr::S_ABS32_LO, "abs32@lo"}, {AMDGPUMCExpr::S_ABS32_HI, "abs32@hi"}, + {AMDGPUMCExpr::S_ABS64, "abs64"}, }; AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index 4bb3942936f0..f48739fe0181 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -381,9 +381,11 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions. // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel. - if ((Desc.TSFlags & SIInstrFlags::VOP3P) || - Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || - Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) { + if (((Desc.TSFlags & SIInstrFlags::VOP3P) || + Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi || + Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) && + // Matrix B reuse operand reuses op_sel_hi. 
+ !AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::matrix_b_reuse)) { Encoding |= getImplicitOpSelHiEncoding(Opcode); } @@ -562,7 +564,8 @@ static bool needsPCRel(const MCExpr *Expr) { case MCExpr::SymbolRef: { auto *SE = cast<MCSymbolRefExpr>(Expr); auto Spec = AMDGPU::getSpecifier(SE); - return Spec != AMDGPUMCExpr::S_ABS32_LO && Spec != AMDGPUMCExpr::S_ABS32_HI; + return Spec != AMDGPUMCExpr::S_ABS32_LO && + Spec != AMDGPUMCExpr::S_ABS32_HI && Spec != AMDGPUMCExpr::S_ABS64; } case MCExpr::Binary: { auto *BE = cast<MCBinaryExpr>(Expr); @@ -685,7 +688,12 @@ void AMDGPUMCCodeEmitter::getMachineOpValueCommon( const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); uint32_t Offset = Desc.getSize(); assert(Offset == 4 || Offset == 8); - addFixup(Fixups, Offset, MO.getExpr(), FK_Data_4, PCRel); + auto OpType = Desc.operands()[OpNo].OperandType; + MCFixupKind Kind = (STI.hasFeature(AMDGPU::Feature64BitLiterals) && + OpType == AMDGPU::OPERAND_REG_IMM_INT64) + ? FK_Data_8 + : FK_Data_4; + addFixup(Fixups, Offset, MO.getExpr(), Kind, PCRel); } const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index e1b9720cdbfc..bc6fdf7f2e4c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -50,6 +50,7 @@ public: S_REL64, // symbol@rel64 S_ABS32_LO, // symbol@abs32@lo S_ABS32_HI, // symbol@abs32@hi + S_ABS64, // symbol@abs64 }; private: diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9b5a46395695..f018f77bc83e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, default: return false; case AMDGPU::V_MOV_B32_e32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: 
@@ -946,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { // Copies and REG_SEQUENCE do not contribute to the final assembly // So, skip them but take care of the SGPR to VGPR copies bookkeeping. - if (Inst->isCopy() || Inst->isRegSequence()) { - if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { - if (!Inst->isCopy() || - !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { - Info.NumSVCopies++; - continue; - } + if (Inst->isRegSequence() && + TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + Info.NumSVCopies++; + continue; + } + if (Inst->isCopy()) { + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI); + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) && + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + Info.NumSVCopies++; + continue; } } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0ed06c37507a..e172c0b63189 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); + SetVector<MachineInstr *> ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, << static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI); - if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI); - Changed = true; - } + if (Fold.isImm()) + ConstantFoldCandidates.insert(Fold.UseMI); } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. 
TII->commuteInstruction(*Fold.UseMI, false); } } + + for (MachineInstr *MI : ConstantFoldCandidates) { + if (tryConstantFoldOp(MI)) { + LLVM_DEBUG(dbgs() << "Constant folded " << *MI); + Changed = true; + } + } return true; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e2a10be4c2c7..0c76ff2ec5ea 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include <optional> using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -938,6 +940,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal); } + if (Subtarget->hasBF16TransInsts()) { + setOperationAction({ISD::FEXP2, ISD::FLOG2, ISD::FSQRT}, MVT::bf16, Legal); + } + if (Subtarget->hasCvtPkF16F32Inst()) { setOperationAction(ISD::FP_ROUND, {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16}, @@ -3893,7 +3899,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); - auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); + auto *TRI = Subtarget->getRegisterInfo(); // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass @@ -8162,6 +8168,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. + if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) { + assert(GAFlags != SIInstrInfo::MO_NONE); + + SDValue Ptr = + DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2); + return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr); + } + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) @@ -8211,6 +8225,13 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, } if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { + if (Subtarget->has64BitLiterals()) { + SDValue Addr = DAG.getTargetGlobalAddress( + GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64); + return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr), + 0); + } + SDValue AddrLo = DAG.getTargetGlobalAddress( GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; @@ -9289,7 +9310,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast<Module *>(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast<GlobalVariable>( @@ -9315,6 +9336,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), 
Op.getOperand(3), IndexKeyi32); } + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i64) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi64, Op.getOperand(5), + Op.getOperand(6)}); + } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: { + EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? 
MVT::i64 + : MVT::i32; + if (Op.getOperand(6).getValueType() == IndexKeyTy) + return SDValue(); + + SDLoc SL(Op); + auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), + Op.getOperand(8)}); // No clamp operand + } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { @@ -11074,7 +11133,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); @@ -12155,6 +12214,11 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( if ((bitOpWithConstantIsReducible(Opc, ValLo) || bitOpWithConstantIsReducible(Opc, ValHi)) || (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { + // We have 64-bit scalar and/or/xor, but do not have vector forms. + if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() && + !CRHS->user_begin()->isDivergent()) + return SDValue(); + // If we need to materialize a 64-bit immediate, it will be split up later // anyway. Avoid creating the harder to understand 64-bit immediate // materialization. @@ -13660,6 +13724,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF, case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_fdot2: case Intrinsic::amdgcn_trig_preop: + case Intrinsic::amdgcn_tanh: return true; default: break; @@ -14498,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. 
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14531,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14643,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -15118,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. - // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). 
- // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. - SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { - // If both additions in the original were NUW, the new ones are as well. - SDNodeFlags Flags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; - if (YIsConstant) - std::swap(Y, Z); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDNodeFlags ShlFlags = N1->getFlags(); + // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0, + // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be + // preserved. + SDNodeFlags NewShlFlags = + ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap() + ? SDNodeFlags::NoSignedWrap + : SDNodeFlags(); + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. 
+ if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } + + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast<GlobalAddressSDNode>(GAValue)) { + if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) { + // If both additions in the original were NUW, reassociation preserves + // that. + SDNodeFlags Flags = + (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); } } } + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. 
+ // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). + // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + // If both additions in the original were NUW, reassociation preserves that. + SDNodeFlags ReassocFlags = + (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + + if (ZIsConstant != YIsConstant) { + if (YIsConstant) + std::swap(Y, Z); + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. 
+ if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } @@ -16847,12 +16996,63 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST, Known.Zero.setHighBits(llvm::countl_zero(MaxValue)); } +static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, + KnownBits &Known, const APInt &DemandedElts, + unsigned BFEWidth, bool SExt, unsigned Depth) { + const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo(); + const MachineOperand &Src1 = MI.getOperand(2); + + unsigned Src1Cst = 0; + if (Src1.isImm()) { + Src1Cst = Src1.getImm(); + } else if (Src1.isReg()) { + auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI); + if (!Cst) + return; + Src1Cst = Cst->Value.getZExtValue(); + } else { + return; + } + + // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit. + // Width is always [22:16]. + const unsigned Offset = + Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6); + const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6); + + if (Width >= BFEWidth) // Ill-formed. 
+ return; + + VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts, + Depth + 1); + + Known = Known.extractBits(Width, Offset); + + if (SExt) + Known = Known.sext(BFEWidth); + else + Known = Known.zext(BFEWidth); +} + void SITargetLowering::computeKnownBitsForTargetInstr( GISelValueTracking &VT, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth) const { + Known.resetAll(); const MachineInstr *MI = MRI.getVRegDef(R); switch (MI->getOpcode()) { + case AMDGPU::S_BFE_I32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U32: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32, + /*SExt=*/false, Depth); + case AMDGPU::S_BFE_I64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/true, Depth); + case AMDGPU::S_BFE_U64: + return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64, + /*SExt=*/false, Depth); case AMDGPU::G_INTRINSIC: case AMDGPU::G_INTRINSIC_CONVERGENT: { Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID(); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da..2af0a575a888 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. 
-class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ? 
1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. 
- return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef<const MachineInstr *> getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - 
assert(T < NUM_INST_CNTS); - ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; - // Store representative LDS DMA operations. The only useful info here is - // alias info. One store is kept per unique AAInfo. 
- SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; -}; +class WaitcntBrackets; // This abstracts the logic for generating and updating S_WAIT* instructions // away from the analysis that determines where they are needed. This was @@ -640,8 +407,13 @@ public: }; class SIInsertWaitcnts { +public: + const GCNSubtarget *ST; + InstCounterType SmemAccessCounter; + InstCounterType MaxCounter; + const unsigned *WaitEventMaskForInst; + private: - const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -657,8 +429,6 @@ private: bool Dirty = true; }; - InstCounterType SmemAccessCounter; - MapVector<MachineBasicBlock *, BlockInfo> BlockInfos; bool ForceEmitWaitcnt[NUM_INST_CNTS]; @@ -675,7 +445,7 @@ private: // message. DenseSet<MachineInstr *> ReleaseVGPRInsts; - InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -686,6 +456,30 @@ public: (void)ForceVMCounter; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets); @@ -791,6 +585,211 @@ public: WaitcntBrackets &ScoreBrackets); }; +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. 
+// +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class WaitcntBrackets { +public: + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} + + bool isSmemCounter(InstCounterType T) const { + return T == Context->SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ? 1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, 
const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. + return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. 
+ bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, + getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef<const MachineInstr *> getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const SIInsertWaitcnts *Context; + + 
unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. + SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores; +}; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; @@ -827,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -885,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). 
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -911,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1080,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1195,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1209,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. 
unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1237,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } } @@ -1262,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2386,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2746,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2807,8 +2808,7 @@ bool 
SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2839,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique<WaitcntBrackets>( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique<WaitcntBrackets>(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space. Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ca3af3b48a60..c8935f0cb603 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!SafeToPropagate) break; - DefOp.setIsKill(false); + for (auto I = Def; I != MI; ++I) + I->clearRegisterKills(DefOp.getReg(), &RI); } MachineInstrBuilder Builder = @@ -1625,41 +1626,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { } } -static unsigned getAGPRSpillSaveOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_SAVE; - case 8: - return AMDGPU::SI_SPILL_A64_SAVE; - case 12: - return AMDGPU::SI_SPILL_A96_SAVE; - case 16: - return AMDGPU::SI_SPILL_A128_SAVE; - case 20: - return 
AMDGPU::SI_SPILL_A160_SAVE; - case 24: - return AMDGPU::SI_SPILL_A192_SAVE; - case 28: - return AMDGPU::SI_SPILL_A224_SAVE; - case 32: - return AMDGPU::SI_SPILL_A256_SAVE; - case 36: - return AMDGPU::SI_SPILL_A288_SAVE; - case 40: - return AMDGPU::SI_SPILL_A320_SAVE; - case 44: - return AMDGPU::SI_SPILL_A352_SAVE; - case 48: - return AMDGPU::SI_SPILL_A384_SAVE; - case 64: - return AMDGPU::SI_SPILL_A512_SAVE; - case 128: - return AMDGPU::SI_SPILL_A1024_SAVE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -1707,22 +1673,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size, return AMDGPU::SI_SPILL_WWM_V32_SAVE; } -static unsigned getVectorRegSpillSaveOpcode(Register Reg, - const TargetRegisterClass *RC, - unsigned Size, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillSaveOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) - : getVGPRSpillSaveOpcode(Size); + return getVGPRSpillSaveOpcode(Size); } void SIInstrInfo::storeRegToStackSlot( @@ -1770,8 +1734,8 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, - SpillSize, RI, *MFI); + unsigned Opcode = + getVectorRegSpillSaveOpcode(VReg ? 
VReg : SrcReg, RC, SpillSize, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1854,41 +1818,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { } } -static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_RESTORE; - case 8: - return AMDGPU::SI_SPILL_A64_RESTORE; - case 12: - return AMDGPU::SI_SPILL_A96_RESTORE; - case 16: - return AMDGPU::SI_SPILL_A128_RESTORE; - case 20: - return AMDGPU::SI_SPILL_A160_RESTORE; - case 24: - return AMDGPU::SI_SPILL_A192_RESTORE; - case 28: - return AMDGPU::SI_SPILL_A224_RESTORE; - case 32: - return AMDGPU::SI_SPILL_A256_RESTORE; - case 36: - return AMDGPU::SI_SPILL_A288_RESTORE; - case 40: - return AMDGPU::SI_SPILL_A320_RESTORE; - case 44: - return AMDGPU::SI_SPILL_A352_RESTORE; - case 48: - return AMDGPU::SI_SPILL_A384_RESTORE; - case 64: - return AMDGPU::SI_SPILL_A512_RESTORE; - case 128: - return AMDGPU::SI_SPILL_A1024_RESTORE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: @@ -1930,27 +1859,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, if (Size != 4) llvm_unreachable("unknown wwm register spill size"); - if (IsVectorSuperClass) + if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } -static unsigned -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, - unsigned Size, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if restoring a WWM register. 
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillRestoreOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) - : getVGPRSpillRestoreOpcode(Size); + assert(!RI.isAGPRClass(RC)); + return getVGPRSpillRestoreOpcode(Size); } void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, @@ -1998,7 +1927,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, - SpillSize, RI, *MFI); + SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset @@ -2214,7 +2143,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { if (ST.hasMovB64()) { MI.setDesc(get(AMDGPU::V_MOV_B64_e32)); if (SrcOp.isReg() || isInlineConstant(MI, 1) || - isUInt<32>(SrcOp.getImm())) + isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals()) break; } if (SrcOp.isImm()) { @@ -2273,6 +2202,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case AMDGPU::S_MOV_B64_IMM_PSEUDO: { const MachineOperand &SrcOp = MI.getOperand(1); assert(!SrcOp.isFPImm()); + + if (ST.has64BitLiterals()) { + MI.setDesc(get(AMDGPU::S_MOV_B64)); + break; + } + APInt Imm(64, SrcOp.getImm()); if (Imm.isIntN(32) || isInlineConstant(Imm)) { MI.setDesc(get(AMDGPU::S_MOV_B64)); @@ -2492,6 +2427,25 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } + case AMDGPU::SI_PC_ADD_REL_OFFSET64: { + MachineFunction &MF = *MBB.getParent(); + Register Reg = MI.getOperand(0).getReg(); + MachineOperand Op = MI.getOperand(1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. 
+ MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + if (Op.isGlobal()) + Op.setOffset(Op.getOffset() + 4); + Bundler.append( + BuildMI(MF, DL, get(AMDGPU::S_ADD_U64), Reg).addReg(Reg).add(Op)); + + finalizeBundle(MBB, Bundler.begin()); + + MI.eraseFromParent(); + break; + } case AMDGPU::ENTER_STRICT_WWM: { // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when // Whole Wave Mode is entered. @@ -2807,12 +2761,14 @@ bool SIInstrInfo::isLegalToSwap(const MachineInstr &MI, unsigned OpIdx0, if ((int)OpIdx1 != Src0Idx && MO0->isReg()) { if (!DefinedRC1) return OpInfo1.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx1, *MO0); + return isLegalRegOperand(MI, OpIdx1, *MO0) && + (!MO1->isReg() || isLegalRegOperand(MI, OpIdx0, *MO1)); } if ((int)OpIdx0 != Src0Idx && MO1->isReg()) { if (!DefinedRC0) return OpInfo0.OperandType == MCOI::OPERAND_UNKNOWN; - return isLegalRegOperand(MI, OpIdx0, *MO1); + return (!MO0->isReg() || isLegalRegOperand(MI, OpIdx1, *MO0)) && + isLegalRegOperand(MI, OpIdx0, *MO1); } // No need to check 64-bit literals since swapping does not bring new @@ -2903,9 +2859,9 @@ bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc, bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, int64_t BrOffset) const { - // BranchRelaxation should never have to check s_setpc_b64 because its dest - // block is unanalyzable. - assert(BranchOp != AMDGPU::S_SETPC_B64); + // BranchRelaxation should never have to check s_setpc_b64 or s_add_pc_i64 + // because its dest block is unanalyzable. + assert(isSOPP(BranchOp) || isSOPK(BranchOp)); // Convert to dwords. 
BrOffset /= 4; @@ -2946,13 +2902,30 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + auto I = MBB.end(); + auto &MCCtx = MF->getContext(); + + if (ST.hasAddPC64Inst()) { + MCSymbol *Offset = + MCCtx.createTempSymbol("offset", /*AlwaysAddSuffix=*/true); + auto AddPC = BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_PC_I64)) + .addSym(Offset, MO_FAR_BRANCH_OFFSET); + MCSymbol *PostAddPCLabel = + MCCtx.createTempSymbol("post_addpc", /*AlwaysAddSuffix=*/true); + AddPC->setPostInstrSymbol(*MF, PostAddPCLabel); + auto *OffsetExpr = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), + MCSymbolRefExpr::create(PostAddPCLabel, MCCtx), MCCtx); + Offset->setVariableValue(OffsetExpr); + return; + } + + assert(RS && "RegScavenger required for long branching"); // FIXME: Virtual register workaround for RegScavenger not working with empty // blocks. Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - auto I = MBB.end(); - // Note: as this is used after hazard recognizer we need to apply some hazard // workarounds directly. const bool FlushSGPRWrites = (ST.isWave64() && ST.hasVALUMaskWriteHazard()) || @@ -2968,7 +2941,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); ApplyHazardWorkarounds(); - auto &MCCtx = MF->getContext(); MCSymbol *PostGetPCLabel = MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); @@ -3507,6 +3479,10 @@ static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { ? 
AMDGPU::V_FMAAK_F16_t16 : AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + case AMDGPU::V_FMA_F64_e64: + return AMDGPU::V_FMAAK_F64; default: llvm_unreachable("invalid instruction"); } @@ -3535,6 +3511,10 @@ static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { ? AMDGPU::V_FMAMK_F16_t16 : AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16; + case AMDGPU::V_FMAC_F64_e32: + case AMDGPU::V_FMAC_F64_e64: + case AMDGPU::V_FMA_F64_e64: + return AMDGPU::V_FMAMK_F64; default: llvm_unreachable("invalid instruction"); } @@ -3613,7 +3593,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64) { + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMA_F64_e64 || + Opc == AMDGPU::V_FMAC_F64_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. 
if (hasAnyModifiersSet(UseMI)) @@ -3685,7 +3666,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || + Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3753,7 +3735,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || + Opc == AMDGPU::V_FMAC_F16_e64 || Opc == AMDGPU::V_FMAC_F64_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -4074,8 +4057,8 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod); const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel); - if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 && - !IsLegacy && + if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsLegacy && + (!IsF64 || ST.hasFmaakFmamkF64Insts()) && // If we have an SGPR input, we will violate the constant bus restriction. 
(ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { @@ -6099,14 +6082,18 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; if (Is64BitOp && !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { - if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) + if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp) && + (!ST.has64BitLiterals() || InstDesc.getSize() != 4)) return false; // FIXME: We can use sign extended 64-bit literals, but only for signed // operands. At the moment we do not know if an operand is signed. // Such operand will be encoded as its low 32 bits and then either // correctly sign extended or incorrectly zero extended by HW. - if (!Is64BitFPOp && (int32_t)Imm < 0) + // If 64-bit literals are supported and the literal will be encoded + // as full 64 bit we still can use it. + if (!Is64BitFPOp && (int32_t)Imm < 0 && + (!ST.has64BitLiterals() || AMDGPU::isValid32BitLiteral(Imm, false))) return false; } } @@ -6402,7 +6389,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldSAddrIdx < 0) return false; - assert(isSegmentSpecificFLAT(Inst)); + assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode())); int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); if (NewOpc < 0) @@ -6426,7 +6413,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldVAddrIdx >= 0) { MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); - if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + if (!VAddrDef || !VAddrDef->isMoveImmediate() || !VAddrDef->getOperand(1).isImm() || VAddrDef->getOperand(1).getImm() != 0) return false; @@ -6479,7 +6466,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { // FIXME: Remove this when SelectionDAG is obsoleted. 
void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { - if (!isSegmentSpecificFLAT(MI)) + if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode()) return; // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence @@ -9178,15 +9165,30 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (isDPP(MI)) return DescSize; bool HasLiteral = false; + unsigned LiteralSize = 4; for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) { const MachineOperand &Op = MI.getOperand(I); const MCOperandInfo &OpInfo = Desc.operands()[I]; if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) { HasLiteral = true; + if (ST.has64BitLiterals()) { + switch (OpInfo.OperandType) { + default: + break; + case AMDGPU::OPERAND_REG_IMM_FP64: + if (!AMDGPU::isValid32BitLiteral(Op.getImm(), true)) + LiteralSize = 8; + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + if (!Op.isImm() || !AMDGPU::isValid32BitLiteral(Op.getImm(), false)) + LiteralSize = 8; + break; + } + } break; } } - return HasLiteral ? DescSize + 4 : DescSize; + return HasLiteral ? DescSize + LiteralSize : DescSize; } // Check whether we have extra NSA words. 
@@ -9277,13 +9279,16 @@ SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { ArrayRef<std::pair<unsigned, const char *>> SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { - { MO_GOTPCREL, "amdgpu-gotprel" }, - { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, - { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, - { MO_REL32_LO, "amdgpu-rel32-lo" }, - { MO_REL32_HI, "amdgpu-rel32-hi" }, - { MO_ABS32_LO, "amdgpu-abs32-lo" }, - { MO_ABS32_HI, "amdgpu-abs32-hi" }, + {MO_GOTPCREL, "amdgpu-gotprel"}, + {MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo"}, + {MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi"}, + {MO_GOTPCREL64, "amdgpu-gotprel64"}, + {MO_REL32_LO, "amdgpu-rel32-lo"}, + {MO_REL32_HI, "amdgpu-rel32-hi"}, + {MO_REL64, "amdgpu-rel64"}, + {MO_ABS32_LO, "amdgpu-abs32-lo"}, + {MO_ABS32_HI, "amdgpu-abs32-hi"}, + {MO_ABS64, "amdgpu-abs64"}, }; return ArrayRef(TargetFlags); @@ -10390,10 +10395,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { return TargetInstrInfo::isGlobalMemoryObject(MI); } +bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { + if (!isWMMA(MI) && !isSWMMAC(MI)) + return false; + + if (AMDGPU::isGFX1250(ST)) + return AMDGPU::getWMMAIsXDL(MI.getOpcode()); + + return true; +} + bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || + if (AMDGPU::isGFX12Plus(ST)) + return isDOT(MI) || isXDLWMMA(MI); + + if (!isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 9e84822bfc27..5e92921f3ea2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -33,6 +33,7 @@ class LiveVariables; class MachineDominatorTree; class MachineRegisterInfo; class RegScavenger; 
+class SIMachineFunctionInfo; class TargetRegisterClass; class ScheduleHazardRecognizer; @@ -214,16 +215,20 @@ public: MO_GOTPCREL32_LO = 2, // MO_GOTPCREL32_HI -> symbol@gotpcrel32@hi -> R_AMDGPU_GOTPCREL32_HI. MO_GOTPCREL32_HI = 3, + // MO_GOTPCREL64 -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. + MO_GOTPCREL64 = 4, // MO_REL32_LO -> symbol@rel32@lo -> R_AMDGPU_REL32_LO. - MO_REL32 = 4, - MO_REL32_LO = 4, + MO_REL32 = 5, + MO_REL32_LO = 5, // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. - MO_REL32_HI = 5, + MO_REL32_HI = 6, + MO_REL64 = 7, - MO_FAR_BRANCH_OFFSET = 6, + MO_FAR_BRANCH_OFFSET = 8, - MO_ABS32_LO = 8, - MO_ABS32_HI = 9, + MO_ABS32_LO = 9, + MO_ABS32_HI = 10, + MO_ABS64 = 11, }; explicit SIInstrInfo(const GCNSubtarget &ST); @@ -283,6 +288,15 @@ public: bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + unsigned + getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, @@ -863,6 +877,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + bool isXDLWMMA(const MachineInstr &MI) const; + bool isXDL(const MachineInstr &MI) const; static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); } @@ -1097,7 +1113,6 @@ public: // that will not require an additional 4-bytes; this function assumes that it // will. 
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); if (!MO.isImm()) return false; return isInlineConstant(MO.getImm(), OperandType); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 5e41f875d980..9e1951e2946c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -268,6 +268,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET", SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]> >; +def SIpc_add_rel_offset64 : SDNode<"AMDGPUISD::PC_ADD_REL_OFFSET64", + SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> +>; + def SIlds : SDNode<"AMDGPUISD::LDS", SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]> >; @@ -1247,6 +1251,7 @@ def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey32bit : CustomOperand<i32, 1>; def IndexKey16bit : CustomOperand<i32, 1>; def IndexKey8bit : CustomOperand<i32, 1>; @@ -1302,6 +1307,9 @@ let PrintMethod = "printBitOp3" in def BitOp3 : NamedIntOperand<"bitop3">; def bitop3_0 : DefaultOperand<BitOp3, 0>; +def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">; +def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">; + class KImmFPOperand<ValueType vt> : ImmOperand<vt> { let OperandNamespace = "AMDGPU"; let OperandType = "OPERAND_KIMM"#vt.Size; @@ -1633,6 +1641,8 @@ def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">; def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">; def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">; +def VOP3PModsNegs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegs">; // chfang: not use complex pattern? 
+def VOP3PModsNegAbs : ComplexPattern<untyped, 1, "SelectVOP3PModsNegAbs">; def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">; def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">; @@ -1641,6 +1651,7 @@ def WMMAModsF16NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF16NegAbs">; def WMMAVISrc : ComplexPattern<untyped, 1, "SelectWMMAVISrc">; def SWMMACIndex8 : ComplexPattern<untyped, 2, "SelectSWMMACIndex8">; def SWMMACIndex16 : ComplexPattern<untyped, 2, "SelectSWMMACIndex16">; +def SWMMACIndex32 : ComplexPattern<untyped, 2, "SelectSWMMACIndex32">; def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">; @@ -2654,6 +2665,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> { isModifierType<Src2VT>.ret, HasOMod); field bit HasNeg = HasModifiers; + field bit HasMatrixReuse = 0; field bit HasSrc0Mods = HasModifiers; field bit HasSrc1Mods = !if(HasModifiers, !or(HasSrc1FloatMods, HasSrc1IntMods), 0); @@ -2837,6 +2849,8 @@ def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>; def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; +def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; +def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4419ce00b473..991d9f83e92e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1144,6 +1144,14 @@ def : GCNPat < (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) >; +def SI_PC_ADD_REL_OFFSET64 : SPseudoInstSI < + (outs SReg_64:$dst), + (ins si_ga:$ptr), + [(set SReg_64:$dst, + (i64 (SIpc_add_rel_offset64 tglobaladdr:$ptr)))]> { + let 
SubtargetPredicate = Has64BitLiterals; +} + def : GCNPat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) @@ -2465,7 +2473,6 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { -let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern <V_ALIGNBIT_B32_e64>; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2475,35 +2482,6 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; -} // isNotGFX9Plus - -let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - -foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), - (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in -def : GCNPat<pat, - (V_ALIGNBIT_B32_opsel_e64 0, /* src0_modifiers */ - (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), - 0, /* src1_modifiers */ - (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), - 0, /* src2_modifiers */ - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - -def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src1, - /* src2_modifiers */ 0, - $src2, /* clamp */ 0, /* op_sel */ 0) ->; -} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3104,8 +3082,6 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; -// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped -// to V_PERM_B32. 
let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (i32 (bswap i32:$a)), @@ -3451,30 +3427,32 @@ def : GCNPat < (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))), (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) >; - def : GCNPat < - (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), - (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) + (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; def : GCNPat < - (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), - (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) + (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; +} def : GCNPat < - (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), - (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; foreach vecTy = [v2i16, v2f16, v2bf16] in { @@ -3581,20 +3559,15 @@ def : GCNPat < // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) -let True16Predicate = NotHasTrue16BitInsts in { -defvar BuildVectorToAlignBitPat = +let True16Predicate = NotHasTrue16BitInsts in +def : GCNPat < (vecTy (DivergentBinFrag<build_vector> (Ty !if(!eq(Ty, i16), (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), - (Ty VGPR_32:$b))); - -let SubtargetPredicate = isNotGFX9Plus in -def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16))>; - -let SubtargetPredicate = isGFX9GFX10 in -def : GCNPat<BuildVectorToAlignBitPat, (V_ALIGNBIT_B32_opsel_e64 0, VGPR_32:$b, 0, VGPR_32:$a, 0, (i32 16), 0, 0)>; -} //True16Predicate = NotHasTrue16BitInsts + (Ty VGPR_32:$b))), + (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b0d6fd95cd27..5097ac03954d 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = MI.getIterator(); ++MBBI; - const SITargetLowering *TLI = - static_cast<const SITargetLowering *>(STM->getTargetLowering()); + const SITargetLowering *TLI = STM->getTargetLowering(); for ( ; MBBI != E; ++MBBI) { MachineInstr &MINext = *MBBI; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 67ad28661da4..75ce67c00228 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -42,7 +42,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PrivateSegmentWaveByteOffset(false), WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false), GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) { - const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI); + const GCNSubtarget &ST = *STI; FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); WavesPerEU = ST.getWavesPerEU(F); MaxNumWorkGroups = ST.getMaxNumWorkGroups(F); diff --git 
a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9173041a7bcc..fa2b8db6ba55 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, return 0; } -unsigned -SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const { +unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC, + bool IncludeCalls) const { for (MCPhysReg Reg : reverse(RC.getRegisters())) - if (MRI.isPhysRegUsed(Reg)) + if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls)) return getHWRegIndex(Reg) + 1; return 0; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 06a7a17b0246..0008e5f8cf3b 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -486,9 +486,11 @@ public: unsigned SubReg) const; // \returns a number of registers of a given \p RC used in a function. - // Does not go inside function calls. + // Does not go inside function calls. If \p IncludeCalls is true, it will + // include registers that may be clobbered by calls. unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const; + const TargetRegisterClass &RC, + bool IncludeCalls = true) const; std::optional<uint8_t> getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? 
AMDGPU::VirtRegFlag::WWM_REG diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index d24c301fc1e5..c194e5c255d4 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1294,6 +1294,7 @@ def VISrc_256_f32 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP32">; def VISrc_256_f64 : SrcRegOrImm9 <VReg_256, "OPERAND_REG_INLINE_C_FP64">; def VISrc_512_b32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_INT32">; def VISrc_512_f32 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP32">; +def VISrc_512_f64 : SrcRegOrImm9 <VReg_512, "OPERAND_REG_INLINE_C_FP64">; def VISrc_1024_b32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_INT32">; def VISrc_1024_f32 : SrcRegOrImm9 <VReg_1024, "OPERAND_REG_INLINE_C_FP32">; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 1679cee32006..ef8faffa5f55 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -66,6 +66,13 @@ def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; def Write16PassDGEMM : SchedWrite; +// WMMA/SWMMA instructions +def WriteXDL2PassWMMA : SchedWrite; +def WriteXDL4PassWMMA : SchedWrite; +def Write4PassWMMA : SchedWrite; +def Write8PassWMMA : SchedWrite; +def Write16PassWMMA : SchedWrite; + // Scalar float instructions def WriteSFPU : SchedWrite; @@ -459,6 +466,15 @@ def : InstRW<[WriteCopy], (instrs COPY)>; multiclass GFX125xCommonWriteRes { +let ReleaseAtCycles = [8] in +def : HWWriteRes<WriteXDL2PassWMMA, [HWXDL], 8>; +let ReleaseAtCycles = [16] in +def : HWWriteRes<WriteXDL4PassWMMA, [HWXDL], 16>; + +def : HWWriteRes<Write4PassWMMA, [HWVALU], 16>; +def : HWWriteRes<Write8PassWMMA, [HWVALU], 32>; +def : HWWriteRes<Write16PassWMMA, [HWVALU], 64>; + def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>; def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 7>; @@ -476,6 +492,11 @@ 
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>; def : HWWriteRes<WriteBarrier, [HWBranch], 2000>; def : InstRW<[WriteCopy], (instrs COPY)>; + +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>; +def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>; +def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>; +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>; } // End GFX125xCommonWriteRes let SchedModel = GFX1250SpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index fd39b8a1350c..7a519117f248 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -463,6 +463,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = AMDGPU::V_FMAAK_F16_fake16; break; + case AMDGPU::V_FMA_F64_e64: + if (ST->hasFmaakFmamkF64Insts()) + NewOpcode = AMDGPU::V_FMAAK_F64; + break; } } @@ -497,6 +501,10 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { case AMDGPU::V_FMA_F16_gfx9_fake16_e64: NewOpcode = AMDGPU::V_FMAMK_F16_fake16; break; + case AMDGPU::V_FMA_F64_e64: + if (ST->hasFmaakFmamkF64Insts()) + NewOpcode = AMDGPU::V_FMAMK_F64; + break; } } @@ -961,7 +969,9 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || - MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64 || + (MI.getOpcode() == AMDGPU::V_FMA_F64_e64 && + ST->hasFmaakFmamkF64Insts())) { shrinkMadFma(MI); continue; } @@ -1058,7 +1068,11 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { // fold an immediate into the shrunk instruction as a literal operand. 
In // GFX10 VOP3 instructions can take a literal operand anyway, so there is // no advantage to doing this. - if (ST->hasVOP3Literal() && !IsPostRA) + // However, if 64-bit literals are allowed we still need to shrink it + // for such literal to be able to fold. + if (ST->hasVOP3Literal() && + (!ST->has64BitLiterals() || AMDGPU::isTrue16Inst(MI.getOpcode())) && + !IsPostRA) continue; if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2472b76fcf02..e103ccc2f00e 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -154,6 +154,10 @@ class SOP1_1 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < let has_sdst = 0; } +class SOP1_1_REGIMM64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo < + opName, (outs), (ins SSrc_b64:$src0), "$src0", pattern> { + let has_sdst = 0; +} class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < (ops node:$src0), @@ -317,6 +321,9 @@ let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { let isBranch = 1, isIndirectBranch = 1 in { def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">; + +let SubtargetPredicate = HasAddPC64Inst in +def S_ADD_PC_I64 : SOP1_1_REGIMM64 <"s_add_pc_i64">; } // End isBranch = 1, isIndirectBranch = 1 let isReturn = 1 in { @@ -2130,6 +2137,9 @@ defm S_GET_BARRIER_STATE_IMM : SOP1_IMM_Real_gfx12<0x050>; defm S_ALLOC_VGPR : SOP1_Real_gfx12<0x053>; defm S_SLEEP_VAR : SOP1_IMM_Real_gfx12<0x058>; +// GFX1250 +defm S_ADD_PC_I64 : SOP1_Real_gfx12<0x04b>; + //===----------------------------------------------------------------------===// // SOP1 - GFX1150, GFX12 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index a32078cc403e..77258810dd68 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp 
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #define GET_MAIInstInfoTable_IMPL +#define GET_WMMAInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info && Info->is_gfx940_xdl; } +bool getWMMAIsXDL(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info ? Info->is_wmma_xdl : false; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: @@ -639,6 +645,7 @@ bool isMAC(unsigned Opc) { Opc == AMDGPU::V_MAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_MAC_F16_e64_vi || Opc == AMDGPU::V_FMAC_F64_e64_gfx90a || + Opc == AMDGPU::V_FMAC_F64_e64_gfx12 || Opc == AMDGPU::V_FMAC_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F32_e64_gfx12 || diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6708e0a3f454..c9d2c286bf23 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -119,6 +119,11 @@ struct True16D16Info { unsigned LoOp; }; +struct WMMAInstInfo { + uint16_t Opcode; + bool is_wmma_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -129,6 +134,7 @@ struct True16D16Info { #define GET_isMFMA_F8F6F4Table_DECL #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #define GET_True16D16Table_DECL +#define GET_WMMAInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +LLVM_READONLY +bool getWMMAIsXDL(unsigned Opc); + // Get an equivalent BitOp3 for a 
binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 211112e5262a..f621f8581f77 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -526,6 +529,21 @@ defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>; defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in { +defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; +} + +let SubtargetPredicate = HasBF16TransInsts in { +defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; +defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; +defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; +} } // End TRANS = 1, SchedRW = [WriteTrans32] defm 
V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16", @@ -785,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in { def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_fp8, V_CVT_F16_FP8_fake16_e64, 1>; def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f16_bf8, V_CVT_F16_BF8_fake16_e64, 1>; } + + defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>; + defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>; } // End SubtargetPredicate = isGFX1250Plus let SubtargetPredicate = isGFX10Plus in { @@ -1062,6 +1083,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>; } +multiclass VOP1_Real_OpSelIsDPP_gfx1250<bits<9> op> : VOP1_Real_e32<GFX1250Gen, op> { + defvar ps = !cast<VOP_Pseudo>(NAME#"_e64"); + def _e64_gfx1250 : + VOP3_Real_Gen<ps, GFX1250Gen>, + VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; +} + defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">; defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">; @@ -1127,11 +1155,25 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>; +defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; +defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; +defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; +defm V_PRNG_B32 : VOP1_Real_FULL<GFX1250Gen, 0x04b>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; +defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm 
V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; +defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; +defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; +defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 25c6cbc3e1ab..030a6e1e978c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -175,10 +175,14 @@ multiclass VOP2Inst_e64<string opName, def _e64 : VOP3InstBase <opName, P, node, 1>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)>; - let SubtargetPredicate = isGFX11Plus in { - if P.HasExtVOP3DPP then - def _e64_dpp : VOP3_DPP_Pseudo <opName, P>; - } // End SubtargetPredicate = isGFX11Plus + if P.HasExtVOP3DPP then + def _e64_dpp : VOP3_DPP_Pseudo <opName, P> { + let SubtargetPredicate = isGFX11Plus; + } + else if P.HasExt64BitDPP then + def _e64_dpp : VOP3_DPP_Pseudo <opName, P> { + let OtherPredicates = [HasDPALU_DPP]; + } } multiclass VOP2Inst_e64_VOPD<string opName, @@ -1492,7 +1496,9 @@ class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, VOP2_DPP<op, ps, opName, p, 1> { let AssemblerPredicate = HasDPP16; let SubtargetPredicate = ps.SubtargetPredicate; - let OtherPredicates = ps.OtherPredicates; + let OtherPredicates = !listconcat(ps.OtherPredicates, + !if(p.HasExt64BitDPP, [HasDPALU_DPP], []), + 
!if(ps.Pfl.IsRealTrue16, [UseRealTrue16Insts], [])); } class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget, @@ -1832,6 +1838,9 @@ let SubtargetPredicate = isGFX12Plus in { V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">; } // End SubtargetPredicate = isGFX12Plus +let SubtargetPredicate = HasFmacF64Inst in +defm V_FMAC_F64 : VOP2_Real_FULL<GFX12Gen, 0x17>; + defm V_FMAMK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x23>; defm V_FMAAK_F64 : VOP2Only_Real_MADK64<GFX1250Gen, 0x24>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 75c531913ded..2e7f25b67fb6 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -224,12 +224,6 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", fshr, null_frag>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>; - -// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32. -// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored. 
-defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>; -defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>; - let True16Predicate = UseRealTrue16Insts in defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>; let True16Predicate = UseFakeTrue16Insts in @@ -1960,9 +1954,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { @@ -2113,8 +2104,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; @@ -2257,17 +2248,6 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0> } } -// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi. -// The following is created to support that. 
-multiclass VOP3OpSel_Real_gfx9_with_name<bits<10> op, string opName, string AsmName> { - defvar psName = opName#"_e64"; - def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI - VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> { - VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName); - let AsmString = AsmName # ps.AsmOperands; - } -} - } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; @@ -2287,10 +2267,8 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; -let SubtargetPredicate = isGFX8Only in { defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; -} defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; @@ -2335,9 +2313,6 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16" defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 31997f803dfc..e51e9574f8de 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1223,6 +1223,8 @@ class WMMAOpcodeMapping<Instruction TwoAddr, Instruction ThreeAddr> { Instruction Opcode2Addr = TwoAddr; 
Instruction Opcode3Addr = ThreeAddr; Predicate WaveSizePredicate; + Predicate SubtargetPredicate; + field bit is_wmma_xdl; } def WMMAOpcode : GenericEnum { @@ -1315,28 +1317,39 @@ let WaveSizePredicate = isWave64 in { } class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, - bit _IsIU, bit _IsFP8BF8> + bit _IsIU, bit _IsFP8BF8XF32, bit _Has_ImodOp = 0, + bit _HasMatrixReuse = 0, bit _IsF4 = 0> : VOP3P_Profile<VOPProfile<ArgTy>> { bit IsIU = _IsIU; - bit IsFP8BF8 = _IsFP8BF8; - bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + bit NoABMods = !or(_IsFP8BF8XF32, _IsF4); // No IMOD support for A and B + bit IsXF32 = !and(_IsFP8BF8XF32, !eq(ArgTy[1], v8f32)); int IndexType = _IndexType; + let HasMatrixReuse = _HasMatrixReuse; + bit HasIModOp = _Has_ImodOp; + let HasClamp = !and(IsIU, !not(HasIModOp)); let IsPacked = 1; let IsWMMA = !not(_IsSWMMAC); let IsSWMMAC = _IsSWMMAC; - bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); - bit IsAB_BF16 = !and(IsF16BF16, isIntType<ArgTy[1]>.ret); + bit IsAB_F64 = !or(!eq(ArgTy[1], v2f64), !eq(ArgTy[1], v4f64)); + bit IsAB_F32 = !eq(ArgTy[1], v2f32); + bit IsAB_F16 = !or(!eq(ArgTy[1], v16f16), !eq(ArgTy[1], v8f16), !eq(ArgTy[1], v4f16)); + bit IsAB_BF16 = !or(!eq(ArgTy[1], v16i16), !eq(ArgTy[1], v8i16), !eq(ArgTy[1], v4i16), + !eq(ArgTy[1], v16bf16), !eq(ArgTy[1], v8bf16), !eq(ArgTy[1], v4bf16)); + bit IsF16BF16 = !or(IsAB_F16, IsAB_BF16); + + bit IsC_F64 = !eq(ArgTy[3], v8f64); bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); - bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16), + !eq(ArgTy[3], v8bf16), !eq(ArgTy[3], v4bf16)); bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); - bit NegLo01 = !or(IsF16BF16, IsIU); - bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); - bit NegHi01 = IsF16BF16; - bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLo01 = !not(NoABMods); + bit NegLo2 = !and(!not(IsIU), 
!not(IsXF32), IsWMMA); + bit NegHi01 = IsF16BF16; // Only F16BF16 can have neg_hi[0:1] + bit NegHi2 = !and(!not(IsIU), !not(IsXF32), IsWMMA); bit NegLoAny = !or(NegLo01, NegLo2); bit NegHiAny = !or(NegHi01, NegHi2); @@ -1345,19 +1358,29 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, let Src1RC64 = !cast<RegisterOperand>("VRegSrc_"#ArgTy[2].Size); let Src2RC64 = !if(IsSWMMAC, DstRC, !cast<RegisterOperand>("VISrc_"#ArgTy[3].Size# - !cond(IsC_F32: "_f32", - IsC_F16: "_f16", + !cond(IsC_F64: "_f64", + IsC_F32: "_f32", + IsC_F16: "_f16", IsC_BF16: "_bf16", 1: "_b32"))); // For f16 and bf16 matrices A and B, each element can be modified by - // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // fneg(neg_lo,neg_hi = 1). For f32 and f64, neg_lo[0:1] is allowed, but + // neg_hi[0:1] is ignored. For iu4 and iu8 matrices A and B neg_lo is // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) - // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each - // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16, f32 and f64 matrix C + // each element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). 
// Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index // --------------------------------------------------------------------------- + // wmma f64_f64 | neg_lo for neg A/B | neg_lo = 1 neg C(f64) + // | neg_hi ignored | neg_hi = 1 abs C(f64) + // --------------------------------------------------------------------------- + // wmma f32_f32 | neg_lo for neg A/B | neg_lo = 1 neg C(f32) + // | neg_hi ignored | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f32_xf32 | not allowed for xf32 | not allowed + // --------------------------------------------------------------------------- // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) // --------------------------------------------------------------------------- @@ -1368,7 +1391,10 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // | neg_lo = 1 i4/i8(sext) | i32 matrices // --------------------------------------------------------------------------- // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) - // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // | fp8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f16) + // | fp8 and bf8 matrices | neg_hi = 1 abs C(f16) // --------------------------------------------------------------------------- // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst @@ -1380,103 +1406,153 @@ class VOP3PWMMA_Profile<list<ValueType> ArgTy, bit _IsSWMMAC, int _IndexType, // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst // --------------------------------------------------------------------------- // swmmac f32_fp8/bf8 | not allowed for | not allowed for 
sparse matrix - // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + // swmmac f16_fp8/bf8 | f8 and bf8 matrices | A Index - matrix C is in dst + // --------------------------------------------------------------------------- // pseudo - // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // fp8bf8 and xf32 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 // use neg_lo and neg_hi. iu wmmas (C is i32) don't use src 2 modifiers, // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. - - dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); - dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); - dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag Src0Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(NoABMods, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(!or(IsIU, IsXF32, IsSWMMAC), (ins), (ins PackedF16InputMods:$src2_modifiers)); dag IndexKey = !cond(!eq(IndexType, 0) : (ins), !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), - !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); - dag Clamp = !if(IsIU, (ins Clamp0:$clamp), (ins)); + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit), + !eq(IndexType, 32): (ins IndexKey32bit:$index_key_32bit)); + + dag MatrixReuse = !if(HasMatrixReuse, (ins MatrixAReuse:$matrix_a_reuse, MatrixBReuse:$matrix_b_reuse), (ins)); + dag Clamp = !if(HasClamp, (ins Clamp0:$clamp), (ins)); dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), - IsSWMMAC : 
!con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), - Clamp, Neg); + IsSWMMAC : !con((ins DstRC:$srcTiedDef), + !if(!eq(IndexType, 32), + (ins VRegSrc_64:$src2), + (ins VRegSrc_32:$src2)), + IndexKey)), + MatrixReuse, Clamp, Neg); // asm string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", !eq(IndexType, 8) : "$index_key_8bit", - !eq(IndexType, 16) : "$index_key_16bit"); - string ClampAsm = !if(IsIU, "$clamp", ""); + !eq(IndexType, 16) : "$index_key_16bit", + !eq(IndexType, 32) : "$index_key_32bit"); + string MatrixReuseAsm = !if(HasMatrixReuse, "$matrix_a_reuse$matrix_b_reuse", ""); + string ClampAsm = !if(HasClamp, "$clamp", ""); string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", !and(!not(NegLoAny), !not(NegHiAny)) : ""); - let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#MatrixReuseAsm#NegAsm#ClampAsm; // isel patterns + bit IsAB_BF16_IMod0 = !and(IsAB_BF16, !not(HasIModOp)); + bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp)); + bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp); + bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp); + dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16_IMod0 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + NoABMods : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + NoABMods : (ins Src0VT:$src0)); + dag 
Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16_IMod0 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + NoABMods : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + NoABMods : (ins Src1VT:$src1)); + bit IsC_IMod1 = !and(HasIModOp, IsWMMA, !not(IsIU), !not(IsXF32)); + bit IsC_F32_IMod0 = !and(IsC_F32, !not(HasIModOp)); + bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); + bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); + bit IsIUXF32 = !or(IsIU, IsXF32); + dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2), + IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16_IMod0 : (ins Src2VT:$src2), + IsIUXF32 : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), + IsIUXF32 : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(HasClamp, (ins i1:$clamp), (ins)); - dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), - IsAB_BF16 : (ins Src0VT:$src0), - IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), - IsFP8BF8 : (ins Src0VT:$src0)); - dag Src0OutPat = 
!cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), - IsAB_BF16 : (ins (i32 8), Src0VT:$src0), - IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), - IsFP8BF8 : (ins Src0VT:$src0)); - dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), - IsAB_BF16 : (ins Src1VT:$src1), - IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), - IsFP8BF8 : (ins Src1VT:$src1)); - dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), - IsAB_BF16 : (ins (i32 8), Src1VT:$src1), - IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), - IsFP8BF8 : (ins Src1VT:$src1)); - dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), - IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), - IsC_BF16 : (ins Src2VT:$src2), - IsIU : (ins Src2VT:$src2), - IsSWMMAC : (ins)); - dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), - IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), - IsC_BF16 : (ins (i32 8), Src2VT:$src2), - IsIU : (ins Src2VT:$src2), - IsSWMMAC : (ins)); - dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), - !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit))), + !eq(IndexType, 32): (ins (i64 (SWMMACIndex32 i64:$src2, i32:$index_key_32bit)))); dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), - !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); - dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); - dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), + !eq(IndexType, 32): (ins i64:$src2, 
i32:$index_key_32bit)); + dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); + dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat); - dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); - dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. 
- dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat); } -multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix> { +def WMMAInstInfoTable : GenericTable { + let FilterClass = "WMMAInstInfo"; + let CppTypeName = "WMMAInstInfo"; + let Fields = ["Opcode", "is_wmma_xdl"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getWMMAInstInfoHelper"; +} + +class WMMAInstInfo { + Instruction Opcode = !cast<Instruction>(NAME); + bit is_wmma_xdl = 0; +} + +multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string PseudoInstrSuffix, bit DiffVdstSrc2 = 0> { + + defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); + defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { - let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in - def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ + let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in + def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; } - let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in - def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ + let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in + def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; } @@ -1486,7 +1562,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse } multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, 
string PseudoInstrSuffix> { - def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>{ + def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let Mnemonic = Instr; let PseudoInstr = Instr#PseudoInstrSuffix; let mayRaiseFPException = 0; @@ -1556,6 +1632,76 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, // *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored // for matrix A, index is i16; Matrix B uses all lanes +def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>; +def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>; +def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>; +def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>; +def BF16F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; +def F32_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v8i32, v8f32], 0, 0, 0, 1, 1, 1>; +def F32_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16i32, v16i32, v8f32], 0, 0, 0, 1, 1, 1>; +def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16], 0, 0, 0, 1, 1, 1>; +def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 1>; +def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 1>; +def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 1>; +def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 1>; +def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 1>; +def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, 
v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 1>; +def BF16_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v32bf16, v8bf16], 1, 16, 0, 0, 1, 1>; +def F32_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i32, v16i32, v8f32], 1, 32, 0, 1, 1, 1>; +def F16_FP8BF8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v16i32, v8f16], 1, 32, 0, 1, 1, 1>; +def I32_IU8X128_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v16i32, v8i32], 1, 32, 1, 0, 1, 1>; + +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { +defm V_WMMA_F32_16X16X4_F32_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x4_f32", F32_F32_WMMA_w32, "_w32">; + +let is_wmma_xdl = 1 in { +defm V_WMMA_F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_bf16", F32_BF16X32_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x32_bf16", BF16_BF16X32_WMMA_w32, "_w32">; +defm V_WMMA_BF16F32_16X16X32_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16f32_16x16x32_bf16", BF16F32_BF16_WMMA_w32, "_w32", 1>; +defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_fp8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_fp8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x64_bf8_bf8", F32_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_fp8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_fp8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x64_bf8_bf8", F16_FP8BF8X64_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X64_IU8_w32 : 
WMMAInstGFX12<"v_wmma_i32_16x16x64_iu8", I32_IU8X64_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x32_f16", F32_F16X32_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X32_F16_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x32_f16", F16_F16X32_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_WMMA_w32, "_w32">; +defm V_WMMA_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_f32_32x16x128_f4", F32_32X16X128_F4_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x64_bf16", BF16_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16f32_16x16x64_bf16", F32_BF16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_fp8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm 
V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_fp8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x128_bf8_bf8", F32_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_fp8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_fp8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x128_bf8_bf8", F16_FP8BF8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X128_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x128_iu8", I32_IU8X128_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x64_f16", F32_F16X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64_f16", F16_F16X64_SWMMAC_w32, "_w32">; + +} // End is_wmma_xdl = 1. 
+ +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; @@ -1628,7 +1774,7 @@ class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile let WaveSizePredicate = isWave64; } -let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { +let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12PlusNot12_50 in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; @@ -1655,7 +1801,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>; } -let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; @@ -1681,6 +1827,49 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>; } +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { + defm : WMMAPat<"V_WMMA_F32_16X16X4_F32_w32", int_amdgcn_wmma_f32_16x16x4_f32, 
F32_F32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_BF16_w32", int_amdgcn_wmma_f32_16x16x32_bf16, F32_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16_16X16X32_BF16_w32", int_amdgcn_wmma_bf16_16x16x32_bf16, BF16_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16F32_16X16X32_BF16_w32", int_amdgcn_wmma_bf16f32_16x16x32_bf16, BF16F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_I32_16X16X64_IU8_w32", int_amdgcn_wmma_i32_16x16x64_iu8, I32_IU8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_F16_w32", int_amdgcn_wmma_f32_16x16x32_f16, F32_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X32_F16_w32", int_amdgcn_wmma_f16_16x16x32_f16, F16_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_BF8_w32", 
int_amdgcn_wmma_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + + def : SWMMACPat<V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x64_bf16, BF16_BF16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16f32_16x16x64_bf16, F32_BF16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr, 
int_amdgcn_swmmac_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x128_iu8, I32_IU8X128_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F32_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x64_f16, F32_F16X64_SWMMAC_w32>; + def : SWMMACPat<V_SWMMAC_F16_16X16X64_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x64_f16, F16_F16X64_SWMMAC_w32>; +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 //===----------------------------------------------------------------------===// // Begin Real Encodings @@ -1726,13 +1915,14 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP> // opsel let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0, !eq(WMMAP.IndexType, 8) : index_key_8bit{0}, - !eq(WMMAP.IndexType, 16) : index_key_16bit{0}); + !eq(WMMAP.IndexType, 16) : index_key_16bit{0}, + !eq(WMMAP.IndexType, 32) : index_key_32bit{0}); let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0); - let Inst{13} = 0; + let Inst{13} = !if(WMMAP.HasMatrixReuse, matrix_a_reuse, 0); // opsel_hi let Inst{59} = 1; let Inst{60} = 1; - let Inst{14} = 1; + let Inst{14} = !if(WMMAP.HasMatrixReuse, matrix_b_reuse, 1); // neg_lo let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0); let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0); @@ -1742,7 +1932,7 @@ class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP> let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0); let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0); // clamp - let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0); + let Inst{15} = !if(WMMAP.HasClamp, clamp{0}, 0); } multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP, @@ -1765,6 +1955,12 @@ multiclass VOP3P_Real_WMMA_gfx12w64 <bits<8> op, VOP3PWMMA_Profile WMMAP> { } } +multiclass VOP3P_Real_WMMA_gfx1250 <bits<8> op, VOP3PWMMA_Profile WMMAP> { + let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in 
{ + defm _twoaddr : VOP3P_WMMA_Real_Base <GFX1250Gen, op, WMMAP>; + } +} + defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x040, F32_F16_WMMA_w32>; defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; @@ -1814,6 +2010,46 @@ defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; +defm V_WMMA_F32_16X16X4_F32_w32 : VOP3P_Real_WMMA_gfx1250 <0x05d, F32_F32_WMMA_w32>; +defm V_WMMA_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x062, F32_BF16X32_WMMA_w32>; +defm V_WMMA_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x060, F32_F16X32_WMMA_w32>; +defm V_WMMA_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x061, F16_F16X32_WMMA_w32>; +defm V_WMMA_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x063, BF16_BF16X32_WMMA_w32>; +defm V_WMMA_BF16F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x064, BF16F32_BF16_WMMA_w32>; +defm V_WMMA_F32_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06a, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06b, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06c, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06d, F32_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06e, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x06f, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x070, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_F16_16X16X64_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x071, F16_FP8BF8X64_WMMA_w32>; +defm V_WMMA_I32_16X16X64_IU8_w32 : 
VOP3P_Real_WMMA_gfx1250 <0x072, I32_IU8X64_WMMA_w32>; +defm V_WMMA_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x080, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x081, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x082, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x083, F32_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x084, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x085, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x086, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x087, F16_FP8BF8X128_WMMA_w32>; +defm V_WMMA_F32_32X16X128_F4_w32 : VOP3P_Real_WMMA_gfx1250 <0x088, F32_32X16X128_F4_WMMA_w32>; + +defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x068, BF16_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_BF16F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x069, F32_BF16X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x073, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x074, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x075, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x076, F32_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x077, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_FP8_BF8_w32 : 
VOP3P_Real_WMMA_gfx1250 <0x078, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx1250 <0x079, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X128_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07a, F16_FP8BF8X128_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X128_IU8_w32 : VOP3P_Real_WMMA_gfx1250 <0x07b, I32_IU8X128_SWMMAC_w32>; + multiclass VOP3P_Real_with_name<GFXGen Gen, bits<8> op, string backing_ps_name = NAME, string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index df215d23f7f4..a25ebdf3e5f6 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { // Special case for v_permlane16_swap_b32/v_permlane32_swap_b32 // op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands. -class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> { +class VOP3OpSelIsDPP_base { bits<1> fi; bits<1> bound_ctrl; +} + +class VOP3OpSelIsDPP_gfx9 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi <op, P> { + // OPSEL[0] specifies FI + let Inst{11} = fi; + // OPSEL[1] specifies BOUND_CTRL + let Inst{12} = bound_ctrl; +} +class VOP3OpSelIsDPP_gfx12 <bits<10> op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 <op, P> { // OPSEL[0] specifies FI let Inst{11} = fi; // OPSEL[1] specifies BOUND_CTRL @@ -432,7 +441,7 @@ class VOP3be <VOPProfile P> : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } -class VOP3Pe <VOPProfile P> : Enc64 { +class VOP3Pe_Base { bits<8> vdst; bits<4> src0_modifiers; bits<9> src0; @@ -443,7 +452,12 @@ class VOP3Pe <VOPProfile P> : Enc64 { bits<1> clamp; bits<2> index_key_8bit; bits<1> index_key_16bit; + bits<1> index_key_32bit; + bits<1> matrix_a_reuse; + bits<1> matrix_b_reuse; +} +class VOP3Pe <VOPProfile P> : Enc64, VOP3Pe_Base { let 
Inst{7-0} = !if(P.HasDst, vdst, 0); let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 @@ -451,9 +465,13 @@ class VOP3Pe <VOPProfile P> : Enc64 { let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) - let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) + let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, + !if(P.HasMatrixReuse, matrix_a_reuse, 0)); // op_sel(2) - let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2) + let Inst{14} = !cond(!and(P.HasSrc2, P.HasOpSel) : src2_modifiers{3}, + P.IsDOT : 1, + P.HasMatrixReuse : matrix_b_reuse, + 1: ?); // op_sel_hi(2) let Inst{15} = !if(P.HasClamp, clamp{0}, 0); diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 850b00406f09..1c42f44765ab 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -2041,12 +2041,6 @@ void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) { } break; } - case ARM::TRAPNaCl: { - uint32_t Val = 0xe7fedef0UL; - OutStreamer->AddComment("trap"); - ATS.emitInst(Val); - return; - } case ARM::tTRAP: { // Non-Darwin binutils don't yet support the "trap" mnemonic. // FIXME: Remove this special case when they do. 
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 52302241fe36..57141ab69223 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -2542,9 +2542,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::Int_eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); - const ARMBaseInstrInfo *AII = - static_cast<const ARMBaseInstrInfo*>(TII); - const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); + const ARMBaseRegisterInfo &RI = TII->getRegisterInfo(); // For functions using a base pointer, we rematerialize it (via the frame // pointer) here since eh.sjlj.setjmp and eh.sjlj.longjmp don't do it // for us. Otherwise, expand to nothing. diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 06499a3945ee..7ba2487d2390 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -2562,8 +2562,7 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass : &ARM::GPRRegClass; - const ARMBaseRegisterInfo *RegInfo = - static_cast<const ARMBaseRegisterInfo *>(Subtarget->getRegisterInfo()); + const ARMBaseRegisterInfo *RegInfo = Subtarget->getRegisterInfo(); Register FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF)); Register SrcReg = FramePtr; @@ -2636,12 +2635,8 @@ bool ARMFastISel::SelectIntrinsicCall(const IntrinsicInst &I) { return SelectCall(&I, "memset"); } case Intrinsic::trap: { - unsigned Opcode; - if (Subtarget->isThumb()) - Opcode = ARM::tTRAP; - else - Opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opcode)); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + TII.get(Subtarget->isThumb() ? 
ARM::tTRAP : ARM::TRAP)); return true; } } diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index bb437698296c..9b1fa5d7b99d 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -451,12 +451,6 @@ def FeatureVirtualization : SubtargetFeature<"virtualization", "Supports Virtualization extension", [FeatureHWDivThumb, FeatureHWDivARM]>; -// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. -// See ARMInstrInfo.td for details. -// True if NaCl TRAP instruction is generated instead of the regular TRAP. -def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", - "NaCl trap">; - // True if the subtarget disallows unaligned memory // accesses for some types. For details, see // ARMTargetLowering::allowsMisalignedMemoryAccesses(). diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 50d8eee8644c..a8da70eadea5 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1747,9 +1747,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, RetOpcode == ARM::TCRETURNrinotr12); isInterrupt = RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; - isTrap = - RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || - RetOpcode == ARM::tTRAP; + isTrap = RetOpcode == ARM::TRAP || RetOpcode == ARM::tTRAP; isCmseEntry = (RetOpcode == ARM::tBXNS || RetOpcode == ARM::tBXNS_RET); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index fb72bab03e75..fd3b0525c105 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>(); auto T = const_cast<Type*>(CP->getType()); auto C = const_cast<Constant*>(CP->getConstVal()); - auto M = 
const_cast<Module*>(DAG.getMachineFunction(). - getFunction().getParent()); + auto M = DAG.getMachineFunction().getFunction().getParent(); auto GV = new GlobalVariable( *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + @@ -11040,13 +11039,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); - unsigned trap_opcode; - if (Subtarget->isThumb()) - trap_opcode = ARM::tTRAP; - else - trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; - BuildMI(TrapBB, dl, TII->get(trap_opcode)); + BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); @@ -21590,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21598,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 5f4aef55b22c..9159f3d2c3ed 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ 
b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -681,7 +681,7 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index 1f5ba998970f..934ec52c6f1e 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -486,11 +486,6 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{ return hasNoVMLxHazardUse(N); }]>; -// An 'fadd' node which can be contracted into a fma -def fadd_contract : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{ - return N->getFlags().hasAllowContract(); -}]>; - def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>; def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>; @@ -2387,29 +2382,13 @@ def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary, /* * A5.4 Permanently UNDEFINED instructions. * - * For most targets use UDF #65006, for which the OS will generate SIGTRAP. - * Other UDF encodings generate SIGILL. + * Targets use UDF #65006, for which the OS will generate SIGTRAP. * - * NaCl's OS instead chooses an ARM UDF encoding that's also a UDF in Thumb. - * Encoding A1: - * 1110 0111 1111 iiii iiii iiii 1111 iiii - * Encoding T1: - * 1101 1110 iiii iiii - * It uses the following encoding: - * 1110 0111 1111 1110 1101 1110 1111 0000 - * - In ARM: UDF #60896; - * - In Thumb: UDF #254 followed by a branch-to-self. 
*/ let isTrap = 1 in -def TRAPNaCl : AXI<(outs), (ins), MiscFrm, NoItinerary, - "trap", [(trap)]>, - Requires<[IsARM,UseNaClTrap]> { - let Inst = 0xe7fedef0; -} -let isTrap = 1 in def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, "trap", [(trap)]>, - Requires<[IsARM,DontUseNaClTrap]> { + Requires<[IsARM]> { let Inst = 0xe7ffdefe; } diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 06f362b26744..b84f685f214c 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1293,7 +1293,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { RDA = &getAnalysis<ReachingDefAnalysis>(); MF->getProperties().setTracksLiveness(); MRI = &MF->getRegInfo(); - TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); + TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); BBUtils = std::make_unique<ARMBasicBlockUtils>(*MF); BBUtils->computeAllBlockSizes(); diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td index ddc5ad8754ee..c638e96a355d 100644 --- a/llvm/lib/Target/ARM/ARMPredicates.td +++ b/llvm/lib/Target/ARM/ARMPredicates.td @@ -167,16 +167,12 @@ def IsARM : Predicate<"!Subtarget->isThumb()">, AssemblerPredicate<(all_of (not ModeThumb)), "arm-mode">; def IsMachO : Predicate<"Subtarget->isTargetMachO()">; def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">; -def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">; def IsReadTPTPIDRURW : Predicate<"Subtarget->isReadTPTPIDRURW()">; def IsReadTPTPIDRURO : Predicate<"Subtarget->isReadTPTPIDRURO()">; def IsReadTPTPIDRPRW : Predicate<"Subtarget->isReadTPTPIDRPRW()">; def IsReadTPSoft : Predicate<"Subtarget->isReadTPSoft()">; -def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">, - AssemblerPredicate<(all_of FeatureNaClTrap), "NaCl">; -def 
DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">; def UseNegativeImmediates : Predicate<"false">, diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 13185a7d797a..9f600e0c685a 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -189,7 +189,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (TM.isAAPCS_ABI()) stackAlignment = Align(8); - if (isTargetNaCl() || TM.isAAPCS16_ABI()) + if (TM.isAAPCS16_ABI()) stackAlignment = Align(16); // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: @@ -407,10 +407,9 @@ bool ARMSubtarget::useFastISel() const { if (!hasV6Ops()) return false; - // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl. - return TM.Options.EnableFastISel && - ((isTargetMachO() && !isThumb1Only()) || - (isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb())); + // Thumb2 support on iOS; ARM support on iOS and Linux. + return TM.Options.EnableFastISel && ((isTargetMachO() && !isThumb1Only()) || + (isTargetLinux() && !isThumb())); } unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index beb1ff644714..637eb4560e0f 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -338,7 +338,6 @@ public: bool isTargetWatchABI() const { return TargetTriple.isWatchABI(); } bool isTargetDriverKit() const { return TargetTriple.isDriverKit(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } - bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index c66232ef4dc7..e8d0d3508077 100644 --- 
a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -166,9 +166,8 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // Integer registers are 32 bits. Ret += "-n32"; - // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit - // aligned everywhere else. - if (TT.isOSNaCl() || ABI == ARM::ARM_ABI_AAPCS16) + // The stack is 64 bit aligned on AAPCS and 32 bit aligned everywhere else. + if (ABI == ARM::ARM_ABI_AAPCS16) Ret += "-S128"; else if (ABI == ARM::ARM_ABI_AAPCS) Ret += "-S64"; diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp index cf84f1043cc6..3692eeeaaa64 100644 --- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -16,7 +16,6 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/SectionKind.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 05d4069a686a..6f37eca2b00a 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1330,8 +1330,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy); if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE, LT.second)) - return LT.first * Entry->Cost * - ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput); + return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind); } if (!Mask.empty()) { @@ -1340,7 +1339,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Mask.size() <= LT.second.getVectorNumElements() && (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) || isVREVMask(Mask, LT.second, 64))) - return 
ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) * LT.first; + return ST->getMVEVectorCostFactor(CostKind) * LT.first; } } @@ -1348,7 +1347,7 @@ InstructionCost ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, if (IsExtractSubvector) Kind = TTI::SK_ExtractSubvector; int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy() - ? ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput) + ? ST->getMVEVectorCostFactor(CostKind) : 1; return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp); diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index ca06b9e3cb66..522c235a90a8 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -91,9 +91,9 @@ class ARMTTIImpl final : public BasicTTIImplBase<ARMTTIImpl> { ARM::FeatureAvoidMOVsShOp, ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor, ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, - ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, - ARM::FeatureLongCalls, ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, - ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates + ARM::FeatureAClass, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, + ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, + ARM::FeatureNoNegativeImmediates }; const ARMSubtarget *getST() const { return ST; } diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 5f930fb0c807..2e47ceeca96b 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -877,8 +877,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value, /// is an address into a section with 'C' string literals. 
static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value, const MCDisassembler *Decoder) { - const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); - Dis->tryAddingPcLoadReferenceComment(Value, Address); + Decoder->tryAddingPcLoadReferenceComment(Value, Address); } // Thumb1 instructions don't have explicit S bits. Rather, they @@ -1482,7 +1481,7 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, DecodeStatus S = MCDisassembler::Success; const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15) S = MCDisassembler::SoftFail; @@ -1535,7 +1534,7 @@ static bool PermitsD32(const MCInst &Inst, const MCDisassembler *Decoder) { if (Inst.getOpcode() == ARM::VSCCLRMD || Inst.getOpcode() == ARM::VSCCLRMS) return true; const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); return featureBits[ARM::FeatureD32]; } @@ -1879,7 +1878,7 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn, unsigned Rn = fieldFromInstruction(Insn, 16, 4); unsigned U = fieldFromInstruction(Insn, 23, 1); const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); switch (Inst.getOpcode()) { case ARM::LDC_OFFSET: @@ -2553,8 +2552,8 @@ static DecodeStatus DecodeHINTInstruction(MCInst &Inst, unsigned Insn, const MCDisassembler *Decoder) { unsigned pred = fieldFromInstruction(Insn, 28, 4); unsigned imm8 = fieldFromInstruction(Insn, 0, 8); - const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); - const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits(); + const FeatureBitset &FeatureBits = + Decoder->getSubtargetInfo().getFeatureBits(); 
DecodeStatus S = MCDisassembler::Success; @@ -2798,8 +2797,8 @@ static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, unsigned Imm = fieldFromInstruction(Insn, 9, 1); - const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder); - const FeatureBitset &FeatureBits = Dis->getSubtargetInfo().getFeatureBits(); + const FeatureBitset &FeatureBits = + Decoder->getSubtargetInfo().getFeatureBits(); if (!FeatureBits[ARM::HasV8_1aOps] || !FeatureBits[ARM::HasV8Ops]) @@ -4081,7 +4080,7 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, unsigned Rn = fieldFromInstruction(Insn, 16, 4); const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool hasMP = featureBits[ARM::FeatureMP]; bool hasV7Ops = featureBits[ARM::HasV7Ops]; @@ -4170,7 +4169,7 @@ static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, unsigned add = fieldFromInstruction(Insn, 9, 1); const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool hasMP = featureBits[ARM::FeatureMP]; bool hasV7Ops = featureBits[ARM::HasV7Ops]; @@ -4252,7 +4251,7 @@ static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn, imm |= (Rn << 13); const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool hasMP = featureBits[ARM::FeatureMP]; bool hasV7Ops = featureBits[ARM::HasV7Ops]; @@ -4371,7 +4370,7 @@ static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn, int imm = fieldFromInstruction(Insn, 0, 12); const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool hasV7Ops = featureBits[ARM::HasV7Ops]; @@ -4826,7 +4825,7 @@ static DecodeStatus 
DecodeCoprocessor(MCInst &Inst, unsigned Val, return MCDisassembler::Fail; const FeatureBitset &featureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); if (!isValidCoprocessorNumber(Val, featureBits)) return MCDisassembler::Fail; @@ -4839,7 +4838,7 @@ static DecodeStatus DecodeThumbTableBranch(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { const FeatureBitset &FeatureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); DecodeStatus S = MCDisassembler::Success; unsigned Rn = fieldFromInstruction(Insn, 16, 4); @@ -4984,7 +4983,7 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { DecodeStatus S = MCDisassembler::Success; const FeatureBitset &FeatureBits = - ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); if (FeatureBits[ARM::FeatureMClass]) { unsigned ValLow = Val & 0xff; @@ -6019,7 +6018,7 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { const FeatureBitset &featureBits = - ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); @@ -6078,7 +6077,7 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { const FeatureBitset &featureBits = - ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); bool hasFullFP16 = 
featureBits[ARM::FeatureFullFP16]; unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); @@ -6244,7 +6243,7 @@ static DecodeStatus DecodeForVMRSandVMSR(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { const FeatureBitset &featureBits = - ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); DecodeStatus S = MCDisassembler::Success; // Add explicit operand for the destination sysreg, for cases where @@ -6717,7 +6716,7 @@ static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val, case ARM::VLDR_FPSCR_post: case ARM::VLDR_FPSCR_NZCVQC_post: const FeatureBitset &featureBits = - ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + Decoder->getSubtargetInfo().getFeatureBits(); if (!featureBits[ARM::HasMVEIntegerOps] && !featureBits[ARM::FeatureVFP2]) return MCDisassembler::Fail; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 376bddb120d5..146fc6704c6d 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -215,7 +215,7 @@ static const char *checkPCRelOffset(uint64_t Value, int64_t Min, int64_t Max) { const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the // low bit being an implied zero. 
There's an implied +4 offset for the @@ -311,12 +311,13 @@ static bool needsInterworking(const MCAssembler &Asm, const MCSymbol *Sym, return false; } -bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool ARMAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &Target, uint64_t Value, bool Resolved) const { const MCSymbol *Sym = Target.getAddSym(); - if (needsInterworking(*Asm, Sym, Fixup.getTargetKind())) + if (needsInterworking(*Asm, Sym, Fixup.getKind())) return true; if (!Resolved) @@ -947,7 +948,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCFixup &Fixup, } // Create relocations for unconditional branches to function symbols with // different execution mode in ELF binaries. - if (needsInterworking(*Asm, Sym, Fixup.getTargetKind())) + if (needsInterworking(*Asm, Sym, Fixup.getKind())) return true; // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination @@ -1093,7 +1094,7 @@ std::optional<bool> ARMAsmBackend::evaluateFixup(const MCFragment &F, // For a few PC-relative fixups in Thumb mode, offsets need to be aligned // down. We compensate here because the default handler's `Value` decrement // doesn't account for this alignment. 
- switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case ARM::fixup_t2_ldst_pcrel_12: case ARM::fixup_t2_pcrel_10: case ARM::fixup_t2_pcrel_9: diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 877e3afdb1d5..07d2cf784c44 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -51,7 +51,8 @@ public: const char *reasonForFixupRelaxation(const MCFixup &Fixup, uint64_t Value) const; - bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t, + bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &, + const MCValue &, uint64_t, bool) const override; void relaxInstruction(MCInst &Inst, diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index b0ebb74424c7..50e9ca1d3759 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -75,7 +75,7 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCValue &V, unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); uint8_t Specifier = Target.getSpecifier(); auto CheckFDPIC = [&](uint32_t Type) { if (getOSABI() != ELF::ELFOSABI_ARM_FDPIC) @@ -105,7 +105,7 @@ unsigned ARMELFObjectWriter::getRelocType(const MCFixup &Fixup, } if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: reportError(Fixup.getLoc(), "unsupported relocation type"); return ELF::R_ARM_NONE; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index c61e405bd3a0..eaba6fe5bfcb 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -638,7 +638,7 
@@ private: Offset = 0; } bool hasInfo() { return F != nullptr; } - MCDataFragment *F = nullptr; + MCFragment *F = nullptr; uint64_t Offset = 0; ElfMappingSymbol State = EMS_None; }; @@ -650,11 +650,11 @@ private: // This is a tentative symbol, it won't really be emitted until it's // actually needed. ElfMappingSymbolInfo *EMS = LastEMSInfo.get(); - auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); - if (!DF) + auto *DF = getCurrentFragment(); + if (DF->getKind() != MCFragment::FT_Data) return; EMS->F = DF; - EMS->Offset = DF->getContents().size(); + EMS->Offset = DF->getFixedSize(); LastEMSInfo->State = EMS_Data; return; } @@ -686,7 +686,7 @@ private: Symbol->setBinding(ELF::STB_LOCAL); } - void emitMappingSymbol(StringRef Name, MCDataFragment &F, uint64_t Offset) { + void emitMappingSymbol(StringRef Name, MCFragment &F, uint64_t Offset) { auto *Symbol = cast<MCSymbolELF>(getContext().createLocalSymbol(Name)); emitLabelAtPos(Symbol, SMLoc(), F, Offset); Symbol->setType(ELF::STT_NOTYPE); @@ -1145,9 +1145,8 @@ void ARMTargetELFStreamer::finish() { auto *Text = static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection()); for (auto &F : *Text) - if (auto *DF = dyn_cast<MCDataFragment>(&F)) - if (!DF->getContents().empty()) - return; + if (F.getSize()) + return; Text->setFlags(Text->getFlags() | ELF::SHF_ARM_PURECODE); } } @@ -1208,7 +1207,7 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { } void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { - MCDataFragment *Frag = getOrCreateDataFragment(); + MCFragment *Frag = getOrCreateDataFragment(); Frag->addFixup(MCFixup::create(Frag->getContents().size(), Expr, Kind)); } @@ -1296,7 +1295,7 @@ void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) { MCSymbolRefExpr::create(PersonalitySym, ARM::S_ARM_NONE, getContext()); visitUsedExpr(*PersonalityRef); - MCDataFragment *DF = getOrCreateDataFragment(); + MCFragment *DF = getOrCreateDataFragment(); 
DF->addFixup( MCFixup::create(DF->getContents().size(), PersonalityRef, FK_Data_4)); } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 2d22b27ceb13..e84aaaad3750 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -152,12 +152,6 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { ARMArchFeature += "+thumb-mode,+v4t"; } - if (TT.isOSNaCl()) { - if (!ARMArchFeature.empty()) - ARMArchFeature += ","; - ARMArchFeature += "+nacl-trap"; - } - if (TT.isOSWindows()) { if (!ARMArchFeature.empty()) ARMArchFeature += ","; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index c0c40ade5810..354de8fd7b4b 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -192,7 +192,7 @@ void ARMMachObjectWriter::recordARMScatteredHalfRelocation( // relocation entry in the low 16 bits of r_address field. unsigned ThumbBit = 0; unsigned MovtBit = 0; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: break; case ARM::fixup_arm_movt_hi16: MovtBit = 1; @@ -465,7 +465,7 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, // PAIR. I.e. it's correct that we insert the high bits of the addend in the // MOVW case here. relocation entries. 
uint32_t Value = 0; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: break; case ARM::fixup_arm_movw_lo16: case ARM::fixup_t2_movw_lo16: diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp index 440d852fa4bc..90505aa82aa4 100644 --- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp @@ -2531,27 +2531,47 @@ bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) { unsigned Flags = MI.getFlags(); TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg); - buildMI(MBB, MBBI, AVR::INRdA) - .addReg(STI.getTmpRegister(), RegState::Define) - .addImm(STI.getIORegSREG()) - .setMIFlags(Flags); - - buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags); - - buildMI(MBB, MBBI, AVR::OUTARr) - .addImm(0x3e) - .addReg(SrcHiReg, getKillRegState(SrcIsKill)) - .setMIFlags(Flags); + // From the XMEGA series manual: + // To prevent corruption when updating the stack pointer from software, + // a write to SPL will automatically disable interrupts + // for up to four instructions or until the next I/O memory write. 
+ if (STI.getELFArch() >= 102) { // An XMEGA device + + buildMI(MBB, MBBI, AVR::OUTARr) + .addImm(STI.getIORegSPL()) + .addReg(SrcLoReg, getKillRegState(SrcIsKill)) + .setMIFlags(Flags); + + buildMI(MBB, MBBI, AVR::OUTARr) + .addImm(STI.getIORegSPH()) + .addReg(SrcHiReg, getKillRegState(SrcIsKill)) + .setMIFlags(Flags); + + } else { // Disable interrupts for older devices (3 extra instructions) + + buildMI(MBB, MBBI, AVR::INRdA) + .addReg(STI.getTmpRegister(), RegState::Define) + .addImm(STI.getIORegSREG()) + .setMIFlags(Flags); + + buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags); + + if (STI.getIORegSPH() != -1) + buildMI(MBB, MBBI, AVR::OUTARr) + .addImm(STI.getIORegSPH()) + .addReg(SrcHiReg, getKillRegState(SrcIsKill)) + .setMIFlags(Flags); - buildMI(MBB, MBBI, AVR::OUTARr) - .addImm(STI.getIORegSREG()) - .addReg(STI.getTmpRegister(), RegState::Kill) - .setMIFlags(Flags); + buildMI(MBB, MBBI, AVR::OUTARr) + .addImm(STI.getIORegSREG()) + .addReg(STI.getTmpRegister(), RegState::Kill) + .setMIFlags(Flags); - buildMI(MBB, MBBI, AVR::OUTARr) - .addImm(0x3d) - .addReg(SrcLoReg, getKillRegState(SrcIsKill)) - .setMIFlags(Flags); + buildMI(MBB, MBBI, AVR::OUTARr) + .addImm(STI.getIORegSPL()) + .addReg(SrcLoReg, getKillRegState(SrcIsKill)) + .setMIFlags(Flags); + } MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AVR/README.md b/llvm/lib/Target/AVR/README.md index bd8b453aa81e..2bcf63cf7581 100644 --- a/llvm/lib/Target/AVR/README.md +++ b/llvm/lib/Target/AVR/README.md @@ -4,5 +4,5 @@ This experimental backend is for the 8-bit Atmel [AVR](https://en.wikipedia.org/ ## Useful links -* [Unresolved bugs](https://llvm.org/bugs/buglist.cgi?product=libraries&component=Backend%3A%20AVR&resolution=---&list_id=109466) +* [Unresolved bugs](https://github.com/llvm/llvm-project/labels/backend%3AAVR) * [Architecture notes](https://github.com/avr-llvm/architecture) diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp 
b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 958790d49d08..dda87537809c 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -90,7 +90,7 @@ void BPFAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, Data[Fixup.getOffset() + 1] = 0x1; support::endian::write32be(&Data[Fixup.getOffset() + 4], Value); } - } else if (Fixup.getTargetKind() == BPF::FK_BPF_PCRel_4) { + } else if (Fixup.getKind() == BPF::FK_BPF_PCRel_4) { // The input Value represents the number of bytes. Value = (uint32_t)((Value - 8) / 8); support::endian::write<uint32_t>(&Data[Fixup.getOffset() + 4], Value, diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp index ce1da6e58b9c..694d9eab9694 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.cpp @@ -71,7 +71,7 @@ MCFixupKindInfo CSKYAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind!"); case CSKY::fixup_csky_got32: @@ -157,7 +157,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } } -bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &, uint64_t Value, bool Resolved) const { @@ -166,7 +167,7 @@ bool CSKYAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, return true; int64_t Offset = int64_t(Value); - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: return false; case CSKY::fixup_csky_pcrel_imm10_scale2: @@ -186,7 +187,7 @@ std::optional<bool> CSKYAsmBackend::evaluateFixup(const MCFragment &F, // For a few PC-relative fixups, offsets 
need to be aligned down. We // compensate here because the default handler's `Value` decrement doesn't // account for this alignment. - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case CSKY::fixup_csky_pcrel_uimm16_scale4: case CSKY::fixup_csky_pcrel_uimm8_scale4: case CSKY::fixup_csky_pcrel_uimm7_scale4: @@ -264,7 +265,7 @@ bool CSKYAsmBackend::shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target /*STI*/) { if (Target.getSpecifier()) return true; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: break; case CSKY::fixup_csky_doffset_imm18: diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h index 1d3a22c2bbbb..1c8516fbf53a 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYAsmBackend.h @@ -38,7 +38,8 @@ public: void relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const override; - bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t, + bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &, + const MCValue &, uint64_t, bool) const override; bool writeNopData(raw_ostream &OS, uint64_t Count, diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp index 1de82e6cc6ce..d042d26e6ef2 100644 --- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp +++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYELFObjectWriter.cpp @@ -39,7 +39,7 @@ unsigned CSKYELFObjectWriter::getRelocType(const MCFixup &Fixup, bool IsPCRel) const { const MCExpr *Expr = Fixup.getValue(); // Determine the type of the relocation - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); uint8_t Modifier = Target.getSpecifier(); switch (Target.getSpecifier()) { diff --git a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp index 
c97c604fdbf7..d9d9b36d0b73 100644 --- a/llvm/lib/Target/DirectX/DXILDataScalarization.cpp +++ b/llvm/lib/Target/DirectX/DXILDataScalarization.cpp @@ -202,7 +202,7 @@ DataScalarizerVisitor::createArrayFromVector(IRBuilder<> &Builder, Value *Vec, // original vector's defining instruction if available, else immediately after // the alloca if (auto *Instr = dyn_cast<Instruction>(Vec)) - Builder.SetInsertPoint(Instr->getNextNonDebugInstruction()); + Builder.SetInsertPoint(Instr->getNextNode()); SmallVector<Value *, 4> GEPs(ArrNumElems); for (unsigned I = 0; I < ArrNumElems; ++I) { Value *EE = Builder.CreateExtractElement(Vec, I, Name + ".extract"); @@ -302,7 +302,7 @@ bool DataScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { Value *PtrOperand = GEPI.getPointerOperand(); - Type *OrigGEPType = GEPI.getPointerOperandType(); + Type *OrigGEPType = GEPI.getSourceElementType(); Type *NewGEPType = OrigGEPType; bool NeedsTransform = false; @@ -319,6 +319,11 @@ bool DataScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { } } + // Scalar geps should remain scalars geps. 
The dxil-flatten-arrays pass will + // convert these scalar geps into flattened array geps + if (!isa<ArrayType>(OrigGEPType)) + NewGEPType = OrigGEPType; + // Note: We bail if this isn't a gep touched via alloca or global // transformations if (!NeedsTransform) diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index 0b7cf2f97017..f0e2e786dfaf 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/InstVisitor.h" #include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Transforms/Utils/Local.h" #include <cassert> #include <cstddef> @@ -40,18 +41,19 @@ public: static char ID; // Pass identification. }; -struct GEPData { - ArrayType *ParentArrayType; - Value *ParentOperand; - SmallVector<Value *> Indices; - SmallVector<uint64_t> Dims; - bool AllIndicesAreConstInt; +struct GEPInfo { + ArrayType *RootFlattenedArrayType; + Value *RootPointerOperand; + SmallMapVector<Value *, APInt, 4> VariableOffsets; + APInt ConstantOffset; }; class DXILFlattenArraysVisitor : public InstVisitor<DXILFlattenArraysVisitor, bool> { public: - DXILFlattenArraysVisitor() {} + DXILFlattenArraysVisitor( + SmallDenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap) + : GlobalMap(GlobalMap) {} bool visit(Function &F); // InstVisitor methods. They return true if the instruction was scalarized, // false if nothing changed. 
@@ -78,7 +80,8 @@ public: private: SmallVector<WeakTrackingVH> PotentiallyDeadInstrs; - DenseMap<GetElementPtrInst *, GEPData> GEPChainMap; + SmallDenseMap<GEPOperator *, GEPInfo> GEPChainInfoMap; + SmallDenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap; bool finish(); ConstantInt *genConstFlattenIndices(ArrayRef<Value *> Indices, ArrayRef<uint64_t> Dims, @@ -86,27 +89,11 @@ private: Value *genInstructionFlattenIndices(ArrayRef<Value *> Indices, ArrayRef<uint64_t> Dims, IRBuilder<> &Builder); - - // Helper function to collect indices and dimensions from a GEP instruction - void collectIndicesAndDimsFromGEP(GetElementPtrInst &GEP, - SmallVectorImpl<Value *> &Indices, - SmallVectorImpl<uint64_t> &Dims, - bool &AllIndicesAreConstInt); - - void - recursivelyCollectGEPs(GetElementPtrInst &CurrGEP, - ArrayType *FlattenedArrayType, Value *PtrOperand, - unsigned &GEPChainUseCount, - SmallVector<Value *> Indices = SmallVector<Value *>(), - SmallVector<uint64_t> Dims = SmallVector<uint64_t>(), - bool AllIndicesAreConstInt = true); - bool visitGetElementPtrInstInGEPChain(GetElementPtrInst &GEP); - bool visitGetElementPtrInstInGEPChainBase(GEPData &GEPInfo, - GetElementPtrInst &GEP); }; } // namespace bool DXILFlattenArraysVisitor::finish() { + GEPChainInfoMap.clear(); RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs); return true; } @@ -225,131 +212,159 @@ bool DXILFlattenArraysVisitor::visitAllocaInst(AllocaInst &AI) { return true; } -void DXILFlattenArraysVisitor::collectIndicesAndDimsFromGEP( - GetElementPtrInst &GEP, SmallVectorImpl<Value *> &Indices, - SmallVectorImpl<uint64_t> &Dims, bool &AllIndicesAreConstInt) { - - Type *CurrentType = GEP.getSourceElementType(); - - // Note index 0 is the ptr index. 
- for (Value *Index : llvm::drop_begin(GEP.indices(), 1)) { - Indices.push_back(Index); - AllIndicesAreConstInt &= isa<ConstantInt>(Index); +bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { + // Do not visit GEPs more than once + if (GEPChainInfoMap.contains(cast<GEPOperator>(&GEP))) + return false; - if (auto *ArrayTy = dyn_cast<ArrayType>(CurrentType)) { - Dims.push_back(ArrayTy->getNumElements()); - CurrentType = ArrayTy->getElementType(); - } else { - assert(false && "Expected array type in GEP chain"); - } + Value *PtrOperand = GEP.getPointerOperand(); + // It shouldn't(?) be possible for the pointer operand of a GEP to be a PHI + // node unless HLSL has pointers. If this assumption is incorrect or HLSL gets + // pointer types, then the handling of this case can be implemented later. + assert(!isa<PHINode>(PtrOperand) && + "Pointer operand of GEP should not be a PHI Node"); + + // Replace a GEP ConstantExpr pointer operand with a GEP instruction so that + // it can be visited + if (auto *PtrOpGEPCE = dyn_cast<ConstantExpr>(PtrOperand); + PtrOpGEPCE && PtrOpGEPCE->getOpcode() == Instruction::GetElementPtr) { + GetElementPtrInst *OldGEPI = + cast<GetElementPtrInst>(PtrOpGEPCE->getAsInstruction()); + OldGEPI->insertBefore(GEP.getIterator()); + + IRBuilder<> Builder(&GEP); + SmallVector<Value *> Indices(GEP.indices()); + Value *NewGEP = + Builder.CreateGEP(GEP.getSourceElementType(), OldGEPI, Indices, + GEP.getName(), GEP.getNoWrapFlags()); + assert(isa<GetElementPtrInst>(NewGEP) && + "Expected newly-created GEP to be an instruction"); + GetElementPtrInst *NewGEPI = cast<GetElementPtrInst>(NewGEP); + + GEP.replaceAllUsesWith(NewGEPI); + GEP.eraseFromParent(); + visitGetElementPtrInst(*OldGEPI); + visitGetElementPtrInst(*NewGEPI); + return true; } -} - -void DXILFlattenArraysVisitor::recursivelyCollectGEPs( - GetElementPtrInst &CurrGEP, ArrayType *FlattenedArrayType, - Value *PtrOperand, unsigned &GEPChainUseCount, SmallVector<Value 
*> Indices, - SmallVector<uint64_t> Dims, bool AllIndicesAreConstInt) { - // Check if this GEP is already in the map to avoid circular references - if (GEPChainMap.count(&CurrGEP) > 0) - return; - // Collect indices and dimensions from the current GEP - collectIndicesAndDimsFromGEP(CurrGEP, Indices, Dims, AllIndicesAreConstInt); - bool IsMultiDimArr = isMultiDimensionalArray(CurrGEP.getSourceElementType()); - if (!IsMultiDimArr) { - assert(GEPChainUseCount < FlattenedArrayType->getNumElements()); - GEPChainMap.insert( - {&CurrGEP, - {std::move(FlattenedArrayType), PtrOperand, std::move(Indices), - std::move(Dims), AllIndicesAreConstInt}}); - return; - } - bool GepUses = false; - for (auto *User : CurrGEP.users()) { - if (GetElementPtrInst *NestedGEP = dyn_cast<GetElementPtrInst>(User)) { - recursivelyCollectGEPs(*NestedGEP, FlattenedArrayType, PtrOperand, - ++GEPChainUseCount, Indices, Dims, - AllIndicesAreConstInt); - GepUses = true; - } - } - // This case is just incase the gep chain doesn't end with a 1d array. - if (IsMultiDimArr && GEPChainUseCount > 0 && !GepUses) { - GEPChainMap.insert( - {&CurrGEP, - {std::move(FlattenedArrayType), PtrOperand, std::move(Indices), - std::move(Dims), AllIndicesAreConstInt}}); + // Construct GEPInfo for this GEP + GEPInfo Info; + + // Obtain the variable and constant byte offsets computed by this GEP + const DataLayout &DL = GEP.getDataLayout(); + unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP.getType()); + Info.ConstantOffset = {BitWidth, 0}; + [[maybe_unused]] bool Success = GEP.collectOffset( + DL, BitWidth, Info.VariableOffsets, Info.ConstantOffset); + assert(Success && "Failed to collect offsets for GEP"); + + // If there is a parent GEP, inherit the root array type and pointer, and + // merge the byte offsets. 
Otherwise, this GEP is itself the root of a GEP + // chain and we need to deterine the root array type + if (auto *PtrOpGEP = dyn_cast<GEPOperator>(PtrOperand)) { + assert(GEPChainInfoMap.contains(PtrOpGEP) && + "Expected parent GEP to be visited before this GEP"); + GEPInfo &PGEPInfo = GEPChainInfoMap[PtrOpGEP]; + Info.RootFlattenedArrayType = PGEPInfo.RootFlattenedArrayType; + Info.RootPointerOperand = PGEPInfo.RootPointerOperand; + for (auto &VariableOffset : PGEPInfo.VariableOffsets) + Info.VariableOffsets.insert(VariableOffset); + Info.ConstantOffset += PGEPInfo.ConstantOffset; + } else { + Info.RootPointerOperand = PtrOperand; + + // We should try to determine the type of the root from the pointer rather + // than the GEP's source element type because this could be a scalar GEP + // into an array-typed pointer from an Alloca or Global Variable. + Type *RootTy = GEP.getSourceElementType(); + if (auto *GlobalVar = dyn_cast<GlobalVariable>(PtrOperand)) { + if (GlobalMap.contains(GlobalVar)) + GlobalVar = GlobalMap[GlobalVar]; + Info.RootPointerOperand = GlobalVar; + RootTy = GlobalVar->getValueType(); + } else if (auto *Alloca = dyn_cast<AllocaInst>(PtrOperand)) + RootTy = Alloca->getAllocatedType(); + assert(!isMultiDimensionalArray(RootTy) && + "Expected root array type to be flattened"); + + // If the root type is not an array, we don't need to do any flattening + if (!isa<ArrayType>(RootTy)) + return false; + + Info.RootFlattenedArrayType = cast<ArrayType>(RootTy); } -} -bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChain( - GetElementPtrInst &GEP) { - GEPData GEPInfo = GEPChainMap.at(&GEP); - return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP); -} -bool DXILFlattenArraysVisitor::visitGetElementPtrInstInGEPChainBase( - GEPData &GEPInfo, GetElementPtrInst &GEP) { - IRBuilder<> Builder(&GEP); - Value *FlatIndex; - if (GEPInfo.AllIndicesAreConstInt) - FlatIndex = genConstFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder); - else - 
FlatIndex = - genInstructionFlattenIndices(GEPInfo.Indices, GEPInfo.Dims, Builder); - - ArrayType *FlattenedArrayType = GEPInfo.ParentArrayType; - - // Don't append '.flat' to an empty string. If the SSA name isn't available - // it could conflict with the ParentOperand's name. - std::string FlatName = GEP.hasName() ? GEP.getName().str() + ".flat" : ""; - - Value *FlatGEP = Builder.CreateGEP(FlattenedArrayType, GEPInfo.ParentOperand, - {Builder.getInt32(0), FlatIndex}, FlatName, - GEP.getNoWrapFlags()); - - // Note: Old gep will become an invalid instruction after replaceAllUsesWith. - // Erase the old GEP in the map before to avoid invalid instructions - // and circular references. - GEPChainMap.erase(&GEP); - - GEP.replaceAllUsesWith(FlatGEP); - GEP.eraseFromParent(); - return true; -} - -bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { - auto It = GEPChainMap.find(&GEP); - if (It != GEPChainMap.end()) - return visitGetElementPtrInstInGEPChain(GEP); - if (!isMultiDimensionalArray(GEP.getSourceElementType())) - return false; - - ArrayType *ArrType = cast<ArrayType>(GEP.getSourceElementType()); - IRBuilder<> Builder(&GEP); - auto [TotalElements, BaseType] = getElementCountAndType(ArrType); - ArrayType *FlattenedArrayType = ArrayType::get(BaseType, TotalElements); - - Value *PtrOperand = GEP.getPointerOperand(); + // GEPs without users or GEPs with non-GEP users should be replaced such that + // the chain of GEPs they are a part of are collapsed to a single GEP into a + // flattened array. 
+ bool ReplaceThisGEP = GEP.users().empty(); + for (Value *User : GEP.users()) + if (!isa<GetElementPtrInst>(User)) + ReplaceThisGEP = true; + + if (ReplaceThisGEP) { + unsigned BytesPerElem = + DL.getTypeAllocSize(Info.RootFlattenedArrayType->getArrayElementType()); + assert(isPowerOf2_32(BytesPerElem) && + "Bytes per element should be a power of 2"); + + // Compute the 32-bit index for this flattened GEP from the constant and + // variable byte offsets in the GEPInfo + IRBuilder<> Builder(&GEP); + Value *ZeroIndex = Builder.getInt32(0); + uint64_t ConstantOffset = + Info.ConstantOffset.udiv(BytesPerElem).getZExtValue(); + assert(ConstantOffset < UINT32_MAX && + "Constant byte offset for flat GEP index must fit within 32 bits"); + Value *FlattenedIndex = Builder.getInt32(ConstantOffset); + for (auto [VarIndex, Multiplier] : Info.VariableOffsets) { + assert(Multiplier.getActiveBits() <= 32 && + "The multiplier for a flat GEP index must fit within 32 bits"); + assert(VarIndex->getType()->isIntegerTy(32) && + "Expected i32-typed GEP indices"); + Value *VI; + if (Multiplier.getZExtValue() % BytesPerElem != 0) { + // This can happen, e.g., with i8 GEPs. To handle this we just divide + // by BytesPerElem using an instruction after multiplying VarIndex by + // Multiplier. + VI = Builder.CreateMul(VarIndex, + Builder.getInt32(Multiplier.getZExtValue())); + VI = Builder.CreateLShr(VI, Builder.getInt32(Log2_32(BytesPerElem))); + } else + VI = Builder.CreateMul( + VarIndex, + Builder.getInt32(Multiplier.getZExtValue() / BytesPerElem)); + FlattenedIndex = Builder.CreateAdd(FlattenedIndex, VI); + } - unsigned GEPChainUseCount = 0; - recursivelyCollectGEPs(GEP, FlattenedArrayType, PtrOperand, GEPChainUseCount); - - // NOTE: hasNUses(0) is not the same as GEPChainUseCount == 0. - // Here recursion is used to get the length of the GEP chain. - // Handle zero uses here because there won't be an update via - // a child in the chain later. 
- if (GEPChainUseCount == 0) { - SmallVector<Value *> Indices; - SmallVector<uint64_t> Dims; - bool AllIndicesAreConstInt = true; - - // Collect indices and dimensions from the GEP - collectIndicesAndDimsFromGEP(GEP, Indices, Dims, AllIndicesAreConstInt); - GEPData GEPInfo{std::move(FlattenedArrayType), PtrOperand, - std::move(Indices), std::move(Dims), AllIndicesAreConstInt}; - return visitGetElementPtrInstInGEPChainBase(GEPInfo, GEP); + // Construct a new GEP for the flattened array to replace the current GEP + Value *NewGEP = Builder.CreateGEP( + Info.RootFlattenedArrayType, Info.RootPointerOperand, + {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags()); + + // If the pointer operand is a global variable and all indices are 0, + // IRBuilder::CreateGEP will return the global variable instead of creating + // a GEP instruction or GEP ConstantExpr. In this case we have to create and + // insert our own GEP instruction. + if (!isa<GEPOperator>(NewGEP)) + NewGEP = GetElementPtrInst::Create( + Info.RootFlattenedArrayType, Info.RootPointerOperand, + {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(), + Builder.GetInsertPoint()); + + // Replace the current GEP with the new GEP. Store GEPInfo into the map + // for later use in case this GEP was not the end of the chain + GEPChainInfoMap.insert({cast<GEPOperator>(NewGEP), std::move(Info)}); + GEP.replaceAllUsesWith(NewGEP); + GEP.eraseFromParent(); + return true; } + // This GEP is potentially dead at the end of the pass since it may not have + // any users anymore after GEP chains have been collapsed. We retain store + // GEPInfo for GEPs down the chain to use to compute their indices. 
+ GEPChainInfoMap.insert({cast<GEPOperator>(&GEP), std::move(Info)}); PotentiallyDeadInstrs.emplace_back(&GEP); return false; } @@ -416,9 +431,8 @@ static Constant *transformInitializer(Constant *Init, Type *OrigType, return ConstantArray::get(FlattenedType, FlattenedElements); } -static void -flattenGlobalArrays(Module &M, - DenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap) { +static void flattenGlobalArrays( + Module &M, SmallDenseMap<GlobalVariable *, GlobalVariable *> &GlobalMap) { LLVMContext &Ctx = M.getContext(); for (GlobalVariable &G : M.globals()) { Type *OrigType = G.getValueType(); @@ -456,9 +470,9 @@ flattenGlobalArrays(Module &M, static bool flattenArrays(Module &M) { bool MadeChange = false; - DXILFlattenArraysVisitor Impl; - DenseMap<GlobalVariable *, GlobalVariable *> GlobalMap; + SmallDenseMap<GlobalVariable *, GlobalVariable *> GlobalMap; flattenGlobalArrays(M, GlobalMap); + DXILFlattenArraysVisitor Impl(GlobalMap); for (auto &F : make_early_inc_range(M.functions())) { if (F.isDeclaration()) continue; diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp index 76a46c7a2b76..c73648f21e8d 100644 --- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp +++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp @@ -98,9 +98,9 @@ static void fixI8UseChain(Instruction &I, ElementType = AI->getAllocatedType(); if (auto *GEP = dyn_cast<GetElementPtrInst>(NewOperands[0])) { ElementType = GEP->getSourceElementType(); - if (ElementType->isArrayTy()) - ElementType = ElementType->getArrayElementType(); } + if (ElementType->isArrayTy()) + ElementType = ElementType->getArrayElementType(); LoadInst *NewLoad = Builder.CreateLoad(ElementType, NewOperands[0]); ReplacedValues[Load] = NewLoad; ToRemove.push_back(Load); @@ -347,7 +347,6 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, if (ByteLength == 0) return; - LLVMContext &Ctx = Builder.getContext(); const DataLayout &DL = 
Builder.GetInsertBlock()->getModule()->getDataLayout(); auto GetArrTyFromVal = [](Value *Val) -> ArrayType * { @@ -392,10 +391,11 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, assert(ByteLength % DstElemByteSize == 0 && "memcpy length must be divisible by array element type"); for (uint64_t I = 0; I < NumElemsToCopy; ++I) { - Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I); - Value *SrcPtr = Builder.CreateInBoundsGEP(SrcElemTy, Src, Offset, "gep"); + SmallVector<Value *, 2> Indices = {Builder.getInt32(0), + Builder.getInt32(I)}; + Value *SrcPtr = Builder.CreateInBoundsGEP(SrcArrTy, Src, Indices, "gep"); Value *SrcVal = Builder.CreateLoad(SrcElemTy, SrcPtr); - Value *DstPtr = Builder.CreateInBoundsGEP(DstElemTy, Dst, Offset, "gep"); + Value *DstPtr = Builder.CreateInBoundsGEP(DstArrTy, Dst, Indices, "gep"); Builder.CreateStore(SrcVal, DstPtr); } } @@ -403,7 +403,6 @@ static void emitMemcpyExpansion(IRBuilder<> &Builder, Value *Dst, Value *Src, static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val, ConstantInt *SizeCI, DenseMap<Value *, Value *> &ReplacedValues) { - LLVMContext &Ctx = Builder.getContext(); [[maybe_unused]] const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); [[maybe_unused]] uint64_t OrigSize = SizeCI->getZExtValue(); @@ -444,8 +443,9 @@ static void emitMemsetExpansion(IRBuilder<> &Builder, Value *Dst, Value *Val, } for (uint64_t I = 0; I < Size; ++I) { - Value *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), I); - Value *Ptr = Builder.CreateGEP(ElemTy, Dst, Offset, "gep"); + Value *Zero = Builder.getInt32(0); + Value *Offset = Builder.getInt32(I); + Value *Ptr = Builder.CreateGEP(ArrTy, Dst, {Zero, Offset}, "gep"); Builder.CreateStore(TypedVal, Ptr); } } @@ -478,9 +478,9 @@ static void legalizeMemCpy(Instruction &I, ToRemove.push_back(CI); } -static void removeMemSet(Instruction &I, - SmallVectorImpl<Instruction *> &ToRemove, - DenseMap<Value *, Value *> 
&ReplacedValues) { +static void legalizeMemSet(Instruction &I, + SmallVectorImpl<Instruction *> &ToRemove, + DenseMap<Value *, Value *> &ReplacedValues) { CallInst *CI = dyn_cast<CallInst>(&I); if (!CI) @@ -562,6 +562,53 @@ legalizeGetHighLowi64Bytes(Instruction &I, } } +static void +legalizeScalarLoadStoreOnArrays(Instruction &I, + SmallVectorImpl<Instruction *> &ToRemove, + DenseMap<Value *, Value *> &) { + + Value *PtrOp; + unsigned PtrOpIndex; + [[maybe_unused]] Type *LoadStoreTy; + if (auto *LI = dyn_cast<LoadInst>(&I)) { + PtrOp = LI->getPointerOperand(); + PtrOpIndex = LI->getPointerOperandIndex(); + LoadStoreTy = LI->getType(); + } else if (auto *SI = dyn_cast<StoreInst>(&I)) { + PtrOp = SI->getPointerOperand(); + PtrOpIndex = SI->getPointerOperandIndex(); + LoadStoreTy = SI->getValueOperand()->getType(); + } else + return; + + // If the load/store is not of a single-value type (i.e., scalar or vector) + // then we do not modify it. It shouldn't be a vector either because the + // dxil-data-scalarization pass is expected to run before this, but it's not + // incorrect to apply this transformation to vector load/stores. 
+ if (!LoadStoreTy->isSingleValueType()) + return; + + Type *ArrayTy; + if (auto *GlobalVarPtrOp = dyn_cast<GlobalVariable>(PtrOp)) + ArrayTy = GlobalVarPtrOp->getValueType(); + else if (auto *AllocaPtrOp = dyn_cast<AllocaInst>(PtrOp)) + ArrayTy = AllocaPtrOp->getAllocatedType(); + else + return; + + if (!isa<ArrayType>(ArrayTy)) + return; + + assert(ArrayTy->getArrayElementType() == LoadStoreTy && + "Expected array element type to be the same as to the scalar load or " + "store type"); + + Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0); + Value *GEP = GetElementPtrInst::Create( + ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator()); + I.setOperand(PtrOpIndex, GEP); +} + namespace { class DXILLegalizationPipeline { @@ -603,7 +650,7 @@ private: LegalizationPipeline[Stage1].push_back(legalizeGetHighLowi64Bytes); LegalizationPipeline[Stage1].push_back(legalizeFreeze); LegalizationPipeline[Stage1].push_back(legalizeMemCpy); - LegalizationPipeline[Stage1].push_back(removeMemSet); + LegalizationPipeline[Stage1].push_back(legalizeMemSet); LegalizationPipeline[Stage1].push_back(updateFnegToFsub); // Note: legalizeGetHighLowi64Bytes and // downcastI64toI32InsertExtractElements both modify extractelement, so they @@ -612,6 +659,7 @@ private: // downcastI64toI32InsertExtractElements needs to handle. 
LegalizationPipeline[Stage2].push_back( downcastI64toI32InsertExtractElements); + LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays); } }; diff --git a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp index 40fe6c6e639e..84751d2db226 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetMachine.cpp @@ -107,10 +107,10 @@ public: addPass(createDXILIntrinsicExpansionLegacyPass()); addPass(createDXILCBufferAccessLegacyPass()); addPass(createDXILDataScalarizationLegacyPass()); - addPass(createDXILFlattenArraysLegacyPass()); ScalarizerPassOptions DxilScalarOptions; DxilScalarOptions.ScalarizeLoadStore = true; addPass(createScalarizerPass(DxilScalarOptions)); + addPass(createDXILFlattenArraysLegacyPass()); addPass(createDXILForwardHandleAccessesLegacyPass()); addPass(createDXILLegalizeLegacyPass()); addPass(createDXILResourceImplicitBindingLegacyPass()); diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 5bd31707acb6..22cff7c80fa0 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -43,12 +43,12 @@ namespace { class HexagonDisassembler : public MCDisassembler { public: std::unique_ptr<MCInstrInfo const> const MCII; - std::unique_ptr<MCInst *> CurrentBundle; + mutable std::unique_ptr<MCInst> CurrentBundle; mutable MCInst const *CurrentExtender; HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII) - : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *), + : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(nullptr), CurrentExtender(nullptr) {} DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB, @@ -57,7 +57,23 @@ public: DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t 
Address, raw_ostream &CStream) const override; + + DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size, + ArrayRef<uint8_t> Bytes, uint64_t Address, + raw_ostream &CStream) const override; + void remapInstruction(MCInst &Instr) const; + +private: + bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address, + uint64_t &BytesToSkip, raw_ostream &CS) const; + + void resetBundle() const { + CurrentBundle.reset(); + CurrentInstruction = nullptr; + } + + mutable MCOperand *CurrentInstruction = nullptr; }; static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI, @@ -171,43 +187,88 @@ LLVMInitializeHexagonDisassembler() { createHexagonDisassembler); } -DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, - ArrayRef<uint8_t> Bytes, - uint64_t Address, - raw_ostream &CS) const { - CommentStream = &CS; - - DecodeStatus Result = DecodeStatus::Success; +bool HexagonDisassembler::makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address, + uint64_t &BytesToSkip, + raw_ostream &CS) const { bool Complete = false; - Size = 0; + DecodeStatus Result = DecodeStatus::Success; - *CurrentBundle = &MI; - MI.setOpcode(Hexagon::BUNDLE); - MI.addOperand(MCOperand::createImm(0)); + CurrentBundle.reset(new MCInst); + CurrentBundle->setOpcode(Hexagon::BUNDLE); + CurrentBundle->addOperand(MCOperand::createImm(0)); while (Result == Success && !Complete) { if (Bytes.size() < HEXAGON_INSTR_SIZE) - return MCDisassembler::Fail; + return false; MCInst *Inst = getContext().createMCInst(); - Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete); - MI.addOperand(MCOperand::createInst(Inst)); - Size += HEXAGON_INSTR_SIZE; + Result = getSingleInstruction(*Inst, *CurrentBundle, Bytes, Address, CS, + Complete); + CurrentBundle->addOperand(MCOperand::createInst(Inst)); + BytesToSkip += HEXAGON_INSTR_SIZE; Bytes = Bytes.slice(HEXAGON_INSTR_SIZE); } if (Result == MCDisassembler::Fail) - return Result; - if (Size > HEXAGON_MAX_PACKET_SIZE) - 
return MCDisassembler::Fail; + return false; + if (BytesToSkip > HEXAGON_MAX_PACKET_SIZE) + return false; const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI); const auto STI_ = (ArchSTI != nullptr) ? *ArchSTI : STI; - HexagonMCChecker Checker(getContext(), *MCII, STI_, MI, + HexagonMCChecker Checker(getContext(), *MCII, STI_, *CurrentBundle, *getContext().getRegisterInfo(), false); if (!Checker.check()) - return MCDisassembler::Fail; - remapInstruction(MI); + return false; + remapInstruction(*CurrentBundle); + return true; +} + +DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + + Size = 0; + uint64_t BytesToSkip = 0; + + if (!CurrentBundle) { + if (!makeBundle(Bytes, Address, BytesToSkip, CS)) { + Size = BytesToSkip; + resetBundle(); + return MCDisassembler::Fail; + } + CurrentInstruction = (CurrentBundle->begin() + 1); + } + + MI = *(CurrentInstruction->getInst()); + Size = HEXAGON_INSTR_SIZE; + if (++CurrentInstruction == CurrentBundle->end()) + resetBundle(); return MCDisassembler::Success; } +DecodeStatus HexagonDisassembler::getInstructionBundle(MCInst &MI, + uint64_t &Size, + ArrayRef<uint8_t> Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + Size = 0; + uint64_t BytesToSkip = 0; + assert(!CurrentBundle); + + if (!makeBundle(Bytes, Address, BytesToSkip, CS)) { + Size = BytesToSkip; + resetBundle(); + return MCDisassembler::Fail; + } + + MI = *CurrentBundle; + Size = HEXAGON_INSTR_SIZE * HexagonMCInstrInfo::bundleSize(MI); + resetBundle(); + + return Success; +} + void HexagonDisassembler::remapInstruction(MCInst &Instr) const { for (auto I: HexagonMCInstrInfo::bundleInstructions(Instr)) { auto &MI = const_cast<MCInst &>(*I.getInst()); @@ -482,7 +543,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, unsigned Offset = 1; bool Vector = HexagonMCInstrInfo::isVector(*MCII, 
MI); bool PrevVector = false; - auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle); + auto Instructions = HexagonMCInstrInfo::bundleInstructions(*CurrentBundle); auto i = Instructions.end() - 1; for (auto n = Instructions.begin() - 1;; --i, ++Offset) { if (i == n) diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp index bb7814c5226f..35da34ed0a89 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -1005,7 +1005,7 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) { SmallVector<MachineBasicBlock*,2> ToRemove; for (MachineBasicBlock *SB : B->successors()) { if (!Targets.count(SB)) - ToRemove.push_back(const_cast<MachineBasicBlock*>(SB)); + ToRemove.push_back(SB); Targets.remove(SB); } for (MachineBasicBlock *MBB : ToRemove) diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 53943de3bc59..e285e0454369 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { R = N; break; } + case ISD::AssertSext: { + EVT T = cast<VTSDNode>(N.getOperand(1))->getVT(); + if (T.getSizeInBits() == 32) + R = N.getOperand(0); + else + return false; + break; + } + default: return false; } diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index ec73e58ce5d4..facea646d4b6 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -236,7 +236,16 @@ MVT HexagonTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, SDValue HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { - return SDValue(); + unsigned IntNo = Op.getConstantOperandVal(0); + SDLoc 
dl(Op); + switch (IntNo) { + default: + return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(HexagonISD::THREAD_POINTER, dl, PtrVT); + } + } } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified @@ -1588,6 +1597,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); @@ -1963,6 +1973,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VROR: return "HexagonISD::VROR"; case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE"; case HexagonISD::READTIMER: return "HexagonISD::READTIMER"; + case HexagonISD::THREAD_POINTER: + return "HexagonISD::THREAD_POINTER"; case HexagonISD::PTRUE: return "HexagonISD::PTRUE"; case HexagonISD::PFALSE: return "HexagonISD::PFALSE"; case HexagonISD::D2P: return "HexagonISD::D2P"; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index f9e5478f457f..9ebbbc6399b4 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -31,6 +31,7 @@ namespace llvm { namespace HexagonISD { +// clang-format off enum NodeType : unsigned { OP_BEGIN = ISD::BUILTIN_OP_END, @@ -78,6 +79,7 @@ enum NodeType : unsigned { DCFETCH, READCYCLE, READTIMER, + THREAD_POINTER, PTRUE, PFALSE, D2P, // Convert 8-byte value to 8-bit predicate register. 
[*] @@ -121,6 +123,7 @@ enum NodeType : unsigned { }; } // end namespace HexagonISD +// clang-format on class HexagonSubtarget; diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td index 82d999ad820e..4b236708ca6d 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatterns.td +++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td @@ -3432,6 +3432,11 @@ def HexagonREADTIMER: SDNode<"HexagonISD::READTIMER", SDTInt64Leaf, def: Pat<(HexagonREADTIMER), (A4_tfrcpp UTIMER)>; +def SDTInt32Leaf : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>; +def HexagonTHREADPOINTER : SDNode<"HexagonISD::THREAD_POINTER", SDTPtrLeaf>; + +def : Pat<(HexagonTHREADPOINTER), (i32(COPY UGP))>; + // The declared return value of the store-locked intrinsics is i32, but // the instructions actually define i1. To avoid register copies from // IntRegs to PredRegs and back, fold the entire pattern checking the diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index de7bd5d4b2c6..7d3074ba6b5d 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -46,16 +46,15 @@ class HexagonAsmBackend : public MCAsmBackend { MCInst * Extender; unsigned MaxPacketSize; - void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF, - MCInst &HMB) const { + void ReplaceInstruction(MCCodeEmitter &E, MCFragment &RF, MCInst &HMB) const { SmallVector<MCFixup, 4> Fixups; SmallString<256> Code; E.encodeInstruction(HMB, Code, Fixups, *RF.getSubtargetInfo()); // Update the fragment. 
RF.setInst(HMB); - RF.setContents(Code); - RF.getFixups() = Fixups; + RF.setVarContents(Code); + RF.setVarFixups(Fixups); } public: @@ -200,7 +199,7 @@ public: } bool shouldForceRelocation(const MCFixup &Fixup) { - switch(Fixup.getTargetKind()) { + switch(Fixup.getKind()) { default: llvm_unreachable("Unknown Fixup Kind!"); @@ -438,21 +437,21 @@ public: /// fixupNeedsRelaxation - Target specific predicate for whether a given /// fixup requires the associated instruction to be relaxed. - bool fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, const MCValue &, - uint64_t Value, + bool fixupNeedsRelaxationAdvanced(const MCFragment &F, const MCFixup &Fixup, + const MCValue &, uint64_t Value, bool Resolved) const override { MCInst const &MCB = RelaxedMCB; assert(HexagonMCInstrInfo::isBundle(MCB)); *RelaxTarget = nullptr; MCInst &MCI = const_cast<MCInst &>(HexagonMCInstrInfo::instruction( - MCB, Fixup.getOffset() / HEXAGON_INSTR_SIZE)); + MCB, (Fixup.getOffset() - F.getFixedSize()) / HEXAGON_INSTR_SIZE)); bool Relaxable = isInstRelaxable(MCI); if (Relaxable == false) return false; // If we cannot resolve the fixup value, it requires relaxation. 
if (!Resolved) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case fixup_Hexagon_B22_PCREL: // GetFixupCount assumes B22 won't relax [[fallthrough]]; @@ -595,7 +594,7 @@ public: } case MCFragment::FT_Relaxable: { MCContext &Context = getContext(); - auto &RF = cast<MCRelaxableFragment>(*Frags[K]); + auto &RF = *Frags[K]; MCInst Inst = RF.getInst(); const bool WouldTraverseLabel = llvm::any_of( diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp index ed381c33225d..9752f3a13120 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp @@ -56,7 +56,7 @@ unsigned HexagonELFObjectWriter::getRelocType(const MCFixup &Fixup, default: break; } - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: report_fatal_error("Unrecognized relocation type"); break; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp index 9030e43b7149..f83e06cd3d93 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp @@ -33,30 +33,18 @@ void HexagonInstPrinter::printRegName(raw_ostream &O, MCRegister Reg) { void HexagonInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { - assert(HexagonMCInstrInfo::isBundle(*MI)); - assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE); - assert(HexagonMCInstrInfo::bundleSize(*MI) > 0); - HasExtender = false; - for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) { - MCInst const &MCI = *I.getInst(); - if (HexagonMCInstrInfo::isDuplex(MII, MCI)) { - printInstruction(MCI.getOperand(1).getInst(), Address, OS); - OS << '\v'; - HasExtender = false; - printInstruction(MCI.getOperand(0).getInst(), 
Address, OS); - } else - printInstruction(&MCI, Address, OS); - HasExtender = HexagonMCInstrInfo::isImmext(MCI); - OS << "\n"; - } - - bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI); - bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI); - if (IsLoop0) { - OS << (IsLoop1 ? " :endloop01" : " :endloop0"); - } else if (IsLoop1) { - OS << " :endloop1"; + if (HexagonMCInstrInfo::isDuplex(MII, *MI)) { + printInstruction(MI->getOperand(1).getInst(), Address, OS); + OS << '\v'; + HasExtender = false; + printInstruction(MI->getOperand(0).getInst(), Address, OS); + } else { + printInstruction(MI, Address, OS); } + HasExtender = HexagonMCInstrInfo::isImmext(*MI); + if ((MI->getOpcode() & HexagonII::INST_PARSE_MASK) == + HexagonII::INST_PARSE_PACKET_END) + HasExtender = false; } void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 980df819b2c2..bfea50e2d6dc 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -252,8 +252,21 @@ public: std::string Buffer; { raw_string_ostream TempStream(Buffer); - InstPrinter.printInst(&Inst, Address, "", STI, TempStream); + for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) { + InstPrinter.printInst(I.getInst(), Address, "", STI, TempStream); + TempStream << "\n"; + } + } + + std::string LoopString = ""; + bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(Inst); + bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(Inst); + if (IsLoop0) { + LoopString += (IsLoop1 ? 
" :endloop01" : " :endloop0"); + } else if (IsLoop1) { + LoopString += " :endloop1"; } + StringRef Contents(Buffer); auto PacketBundle = Contents.rsplit('\n'); auto HeadTail = PacketBundle.first.split('\n'); @@ -275,9 +288,9 @@ public: } if (HexagonMCInstrInfo::isMemReorderDisabled(Inst)) - OS << "\n\t} :mem_noshuf" << PacketBundle.second; + OS << "\n\t} :mem_noshuf" << LoopString; else - OS << "\t}" << PacketBundle.second; + OS << "\t}" << LoopString; } void finish() override { finishAttributeSection(); } diff --git a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp index 93beaec7eeff..3c3924bd5018 100644 --- a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp +++ b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp @@ -59,8 +59,7 @@ void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const { // ADJDYNALLOC pseudo instructions with a Lanai:ADDI with the // maximum call frame size as the immediate. void LanaiFrameLowering::replaceAdjDynAllocPseudo(MachineFunction &MF) const { - const LanaiInstrInfo &LII = - *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); + const LanaiInstrInfo &LII = *STI.getInstrInfo(); unsigned MaxCallFrameSize = MF.getFrameInfo().getMaxCallFrameSize(); for (MachineBasicBlock &MBB : MF) { @@ -88,8 +87,7 @@ void LanaiFrameLowering::emitPrologue(MachineFunction &MF, assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineFrameInfo &MFI = MF.getFrameInfo(); - const LanaiInstrInfo &LII = - *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); + const LanaiInstrInfo &LII = *STI.getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); // Debug location must be unknown since the first debug location is used @@ -173,8 +171,7 @@ MachineBasicBlock::iterator LanaiFrameLowering::eliminateCallFramePseudoInstr( void LanaiFrameLowering::emitEpilogue(MachineFunction & /*MF*/, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - const 
LanaiInstrInfo &LII = - *static_cast<const LanaiInstrInfo *>(STI.getInstrInfo()); + const LanaiInstrInfo &LII = *STI.getInstrInfo(); DebugLoc DL = MBBI->getDebugLoc(); // Restore the stack pointer using the callee's frame pointer value. @@ -195,8 +192,7 @@ void LanaiFrameLowering::determineCalleeSaves(MachineFunction &MF, TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS); MachineFrameInfo &MFI = MF.getFrameInfo(); - const LanaiRegisterInfo *LRI = - static_cast<const LanaiRegisterInfo *>(STI.getRegisterInfo()); + const LanaiRegisterInfo *LRI = STI.getRegisterInfo(); int Offset = -4; // Reserve 4 bytes for the saved RCA diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d5a5f17348e4..36c3011be2b9 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the baisc single-precision floating-point instructions. +// This file describes the basic single-precision floating-point instructions. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index ac5e7f3891c7..1493bf4cba69 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized( // estimateStackSize has been observed to under-estimate the final stack // size, so give ourselves wiggle-room by checking for stack size // representable an 11-bit signed field rather than 12-bits. - if (!isInt<11>(MFI.estimateStackSize(MF))) + // For [x]vstelm.{b/h/w/d} memory instructions with 8 imm offset, 7-bit + // signed field is fine. 
+ unsigned EstimateStackSize = MFI.estimateStackSize(MF); + if (!isInt<11>(EstimateStackSize) || + (MF.getSubtarget<LoongArchSubtarget>().hasExtLSX() && + !isInt<7>(EstimateStackSize))) ScavSlotsNum = std::max(ScavSlotsNum, 1u); // For CFR spill. diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index c47987fbf683..2378664ca815 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT VecTy = Op->getOperand(0)->getValueType(0); SDValue Idx = Op->getOperand(1); - EVT EltTy = VecTy.getVectorElementType(); unsigned NumElts = VecTy.getVectorNumElements(); - if (isa<ConstantSDNode>(Idx) && - (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || - EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2)) + if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts) return Op; return SDValue(); @@ -6003,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register ScratchReg1 = XSrc; if (Idx >= HalfSize) { ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) .addReg(XSrc) - .addReg(XSrc) - .addImm(1); + .addImm(14); } Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 95e9fd49d1c0..a0107e44b421 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF<CondCode CC, string Inst> { (!cast<LAInst>(Inst#"_D") LASX256:$xj, LASX256:$xk)>; } +multiclass PairInsertExtractPatV8<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 
in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 4))), + !add(imm2, 4)), + (XVEXTRINS_W $xd, $xj, Imm)>; + } + } +} + +multiclass PairInsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 2))), + !add(imm2, 2)), + (XVEXTRINS_D $xd, $xj, Imm)>; + } + } +} + let Predicates = [HasExtLASX] in { // XVADD_{B/H/W/D} @@ -1582,6 +1608,38 @@ defm : PatCCXrXrF<SETUNE, "XVFCMP_CUNE">; defm : PatCCXrXrF<SETO, "XVFCMP_COR">; defm : PatCCXrXrF<SETUO, "XVFCMP_CUN">; +// Insert two elements extracted from vector into vector. (The positions +// of the two elements must be same in the source or destination vector's +// front and back 128bits.) 
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D} +// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v32i8:$xd, + (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2), + (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))), + !add(imm2, 16)), + (XVEXTRINS_B $xd, $xj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v16i16:$xd, + (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2), + (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))), + !add(imm2, 8)), + (XVEXTRINS_H $xd, $xj, Imm)>; + } +} + +defm : PairInsertExtractPatV8<v8i32, GRLenVT>; +defm : PairInsertExtractPatV8<v8f32, f32>; +defm : PairInsertExtractPatV4<v4i64, GRLenVT>; +defm : PairInsertExtractPatV4<v4f64, f64>; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; @@ -1593,11 +1651,18 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; - -def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm), - (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm), - (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), + (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSGR2VR_W $xd, 
(XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; +def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; +def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), + (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), + (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), @@ -1790,7 +1855,25 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { def : RegRegStPat<store, XVSTX, LASX256, vt>; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))), + (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))), + (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>; + // Vector extraction with constant index. 
+foreach imm = 16...31 in { + defvar Imm = !and(imm, 15); + def : Pat<(i64 (vector_extract v32i8:$xj, imm)), + (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128), + Imm)>; +} +foreach imm = 8...15 in { + defvar Imm = !and(imm, 7); + def : Pat<(i64 (vector_extract v16i16:$xj, imm)), + (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128), + Imm)>; +} def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index d73d78083ddc..962e7c21431b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1482,6 +1482,28 @@ multiclass VstelmPat<PatFrag StoreOp, ValueType vt, LAInst Inst, (Inst vt:$vd, BaseAddr:$rj, ImmOpnd:$imm, IdxOpnd:$idx)>; } +multiclass InsertExtractPatV4<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_W $vd, $vj, Imm)>; + } + } +} + +multiclass InsertExtractPatV2<ValueType vecty, ValueType elemty> { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_D $vd, $vj, Imm)>; + } + } +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1782,6 +1804,31 @@ defm : PatCCVrVrF<SETUNE, "VFCMP_CUNE">; defm : PatCCVrVrF<SETO, "VFCMP_COR">; defm : PatCCVrVrF<SETUO, "VFCMP_CUN">; +// Insert element extracted from vector into vector. 
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v16i8:$vd, + (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2), + (VEXTRINS_B $vd, $vj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v8i16:$vd, + (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2), + (VEXTRINS_H $vd, $vj, Imm)>; + } +} + +defm : InsertExtractPatV4<v4i32, GRLenVT>; +defm : InsertExtractPatV4<v4f32, f32>; +defm : InsertExtractPatV2<v2i64, GRLenVT>; +defm : InsertExtractPatV2<v2f64, f64>; + // VINSGR2VR_{B/H/W/D} def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; @@ -1791,7 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; - +def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm), + (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), + (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), @@ -1990,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { def : RegRegStPat<store, VSTX, LSX128, vt>; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))), + (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))), + (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>; + // Vector extraction with constant index. 
def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)), (VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index 1b8893029bb3..7b9f1156f910 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -90,7 +90,7 @@ static void reportOutOfRangeError(MCContext &Ctx, SMLoc Loc, unsigned N) { static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind"); case FK_Data_1: @@ -157,7 +157,7 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, MCContext &Ctx = getContext(); // Fixup leb128 separately. - if (Fixup.getTargetKind() == FK_Data_leb128) + if (Fixup.getKind() == FK_Data_leb128) return fixupLeb128(Ctx, Fixup, Data, Value); // Apply any target-specific value adjustments. 
@@ -247,7 +247,7 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign(MCAssembler &Asm, bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: return STI.hasFeature(LoongArch::FeatureRelax); case FK_Data_1: @@ -279,23 +279,23 @@ getRelocPairForSize(unsigned Size) { } } -std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCLEBFragment &LF, +std::pair<bool, bool> LoongArchAsmBackend::relaxLEB128(MCFragment &F, int64_t &Value) const { - const MCExpr &Expr = LF.getValue(); - if (LF.isSigned() || !Expr.evaluateKnownAbsolute(Value, *Asm)) + const MCExpr &Expr = F.getLEBValue(); + if (F.isLEBSigned() || !Expr.evaluateKnownAbsolute(Value, *Asm)) return std::make_pair(false, false); - LF.addFixup(MCFixup::create(0, &Expr, FK_Data_leb128)); + F.setVarFixups({MCFixup::create(0, &Expr, FK_Data_leb128)}); return std::make_pair(true, true); } -bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, +bool LoongArchAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { MCContext &C = getContext(); - int64_t LineDelta = DF.getLineDelta(); - const MCExpr &AddrDelta = DF.getAddrDelta(); + int64_t LineDelta = F.getDwarfLineDelta(); + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector<MCFixup, 1> Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) @@ -349,17 +349,16 @@ bool LoongArchAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, OS << uint8_t(dwarf::DW_LNS_copy); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; } -bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const { - const MCExpr &AddrDelta = DF.getAddrDelta(); +bool LoongArchAsmBackend::relaxDwarfCFA(MCFragment &F, 
bool &WasRelaxed) const { + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector<MCFixup, 2> Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarContents().size(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) @@ -371,9 +370,9 @@ bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, assert(getContext().getAsmInfo()->getMinInstAlignment() == 1 && "expected 1-byte alignment"); if (Value == 0) { - DF.clearContents(); - DF.clearFixups(); - WasRelaxed = OldSize != DF.getContents().size(); + F.clearVarContents(); + F.clearVarFixups(); + WasRelaxed = OldSize != 0; return true; } @@ -405,8 +404,8 @@ bool LoongArchAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, } else { llvm_unreachable("unsupported CFA encoding"); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 4446cadf11e2..b32ba067810c 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -59,11 +59,9 @@ public: MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; - bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, - bool &WasRelaxed) const override; - bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const override; - std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, + bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override; + bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override; + std::pair<bool, bool> relaxLEB128(MCFragment &F, int64_t &Value) const override; bool writeNopData(raw_ostream &OS, uint64_t Count, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp 
b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index faf3cba59a53..fb741afa77e5 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -68,7 +68,7 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, break; } - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); if (mc::isRelocation(Fixup.getKind())) return Kind; switch (Kind) { diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp index 1fdc1f799fe5..117dd31e7f05 100644 --- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp +++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp @@ -30,7 +30,7 @@ protected: unsigned getRelocType(const MCFixup &Fixup, const MCValue &, bool IsPCRel) const override { // Translate fixup kind to ELF relocation type. - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case FK_Data_1: return ELF::R_MSP430_8; case FK_Data_2: return ELF::R_MSP430_16_BYTE; case FK_Data_4: return ELF::R_MSP430_32; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 01e4d17f6236..259b71b37d9a 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, TOut.getStreamer().emitRelocDirective( *TmpExpr, inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - RelocJalrExpr, IDLoc, *STI); + RelocJalrExpr); TOut.getStreamer().emitLabel(TmpLabel); } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt index 8b73a7bdd4bc..8ccd42ea0abf 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt +++ b/llvm/lib/Target/Mips/MCTargetDesc/CMakeLists.txt @@ -8,7 +8,6 @@ add_llvm_component_library(LLVMMipsDesc MipsMCAsmInfo.cpp MipsMCCodeEmitter.cpp MipsMCTargetDesc.cpp - MipsNaClELFStreamer.cpp MipsOptionRecord.cpp MipsTargetStreamer.cpp MipsWinCOFFObjectWriter.cpp diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 25e31941bbb4..ad8f5f0a0974 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -156,7 +156,7 @@ unsigned MipsELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { // Determine the type of the relocation. - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); switch (Target.getSpecifier()) { case Mips::S_DTPREL: case Mips::S_DTPREL_HI: diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h deleted file mode 100644 index 94b2f412c8cd..000000000000 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h +++ /dev/null @@ -1,31 +0,0 @@ -//===-- MipsMCNaCl.h - NaCl-related declarations --------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H -#define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCNACL_H - -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/Support/Alignment.h" - -namespace llvm { - -// NaCl MIPS sandbox's instruction bundle size. -static const Align MIPS_NACL_BUNDLE_ALIGN = Align(16); - -bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, - bool *IsStore = nullptr); -bool baseRegNeedsLoadStoreMask(MCRegister Reg); - -// This function creates an MCELFStreamer for Mips NaCl. -MCELFStreamer * -createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, - std::unique_ptr<MCCodeEmitter> Emitter); -} - -#endif diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index ab1eda0f48e1..2cc634154bff 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -16,7 +16,6 @@ #include "MipsELFStreamer.h" #include "MipsInstPrinter.h" #include "MipsMCAsmInfo.h" -#include "MipsMCNaCl.h" #include "MipsTargetStreamer.h" #include "TargetInfo/MipsTargetInfo.h" #include "llvm/DebugInfo/CodeView/CodeView.h" @@ -199,12 +198,8 @@ static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, std::unique_ptr<MCObjectWriter> &&OW, std::unique_ptr<MCCodeEmitter> &&Emitter) { MCStreamer *S; - if (!T.isOSNaCl()) - S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW), - std::move(Emitter)); - else - S = createMipsNaClELFStreamer(Context, std::move(MAB), std::move(OW), - std::move(Emitter)); + S = createMipsELFStreamer(Context, std::move(MAB), std::move(OW), + std::move(Emitter)); return S; } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp 
b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp deleted file mode 100644 index 3410726c8e55..000000000000 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp +++ /dev/null @@ -1,274 +0,0 @@ -//===-- MipsNaClELFStreamer.cpp - ELF Object Output for Mips NaCl ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements MCELFStreamer for Mips NaCl. It emits .o object files -// as required by NaCl's SFI sandbox. It inserts address-masking instructions -// before dangerous control-flow and memory access instructions. It inserts -// address-masking instructions after instructions that change the stack -// pointer. It ensures that the mask and the dangerous instruction are always -// emitted in the same bundle. It aligns call + branch delay to the bundle end, -// so that return address is always aligned to the start of next bundle. -// -//===----------------------------------------------------------------------===// - -#include "MipsELFStreamer.h" -#include "MipsMCNaCl.h" -#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCELFStreamer.h" -#include "llvm/MC/MCInst.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/Support/ErrorHandling.h" -#include <cassert> - -using namespace llvm; - -#define DEBUG_TYPE "mips-mc-nacl" - -namespace { - -const unsigned IndirectBranchMaskReg = Mips::T6; -const unsigned LoadStoreStackMaskReg = Mips::T7; - -/// Extend the generic MCELFStreamer class so that it can mask dangerous -/// instructions. 
- -class MipsNaClELFStreamer : public MipsELFStreamer { -public: - MipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, - std::unique_ptr<MCCodeEmitter> Emitter) - : MipsELFStreamer(Context, std::move(TAB), std::move(OW), - std::move(Emitter)) {} - - ~MipsNaClELFStreamer() override = default; - -private: - // Whether we started the sandboxing sequence for calls. Calls are bundled - // with branch delays and aligned to the bundle end. - bool PendingCall = false; - - bool isIndirectJump(const MCInst &MI) { - if (MI.getOpcode() == Mips::JALR) { - // MIPS32r6/MIPS64r6 doesn't have a JR instruction and uses JALR instead. - // JALR is an indirect branch if the link register is $0. - assert(MI.getOperand(0).isReg()); - return MI.getOperand(0).getReg() == Mips::ZERO; - } - return MI.getOpcode() == Mips::JR; - } - - bool isStackPointerFirstOperand(const MCInst &MI) { - return (MI.getNumOperands() > 0 && MI.getOperand(0).isReg() - && MI.getOperand(0).getReg() == Mips::SP); - } - - bool isCall(const MCInst &MI, bool *IsIndirectCall) { - unsigned Opcode = MI.getOpcode(); - - *IsIndirectCall = false; - - switch (Opcode) { - default: - return false; - - case Mips::JAL: - case Mips::BAL: - case Mips::BAL_BR: - case Mips::BLTZAL: - case Mips::BGEZAL: - return true; - - case Mips::JALR: - // JALR is only a call if the link register is not $0. Otherwise it's an - // indirect branch. 
- assert(MI.getOperand(0).isReg()); - if (MI.getOperand(0).getReg() == Mips::ZERO) - return false; - - *IsIndirectCall = true; - return true; - } - } - - void emitMask(MCRegister AddrReg, unsigned MaskReg, - const MCSubtargetInfo &STI) { - MCInst MaskInst; - MaskInst.setOpcode(Mips::AND); - MaskInst.addOperand(MCOperand::createReg(AddrReg)); - MaskInst.addOperand(MCOperand::createReg(AddrReg)); - MaskInst.addOperand(MCOperand::createReg(MaskReg)); - MipsELFStreamer::emitInstruction(MaskInst, STI); - } - - // Sandbox indirect branch or return instruction by inserting mask operation - // before it. - void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) { - MCRegister AddrReg = MI.getOperand(0).getReg(); - - emitBundleLock(false); - emitMask(AddrReg, IndirectBranchMaskReg, STI); - MipsELFStreamer::emitInstruction(MI, STI); - emitBundleUnlock(); - } - - // Sandbox memory access or SP change. Insert mask operation before and/or - // after the instruction. - void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx, - const MCSubtargetInfo &STI, bool MaskBefore, - bool MaskAfter) { - emitBundleLock(false); - if (MaskBefore) { - // Sandbox memory access. - MCRegister BaseReg = MI.getOperand(AddrIdx).getReg(); - emitMask(BaseReg, LoadStoreStackMaskReg, STI); - } - MipsELFStreamer::emitInstruction(MI, STI); - if (MaskAfter) { - // Sandbox SP change. - MCRegister SPReg = MI.getOperand(0).getReg(); - assert((Mips::SP == SPReg) && "Unexpected stack-pointer register."); - emitMask(SPReg, LoadStoreStackMaskReg, STI); - } - emitBundleUnlock(); - } - -public: - /// This function is the one used to emit instruction data into the ELF - /// streamer. We override it to mask dangerous instructions. - void emitInstruction(const MCInst &Inst, - const MCSubtargetInfo &STI) override { - // Sandbox indirect jumps. 
- if (isIndirectJump(Inst)) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - sandboxIndirectJump(Inst, STI); - return; - } - - // Sandbox loads, stores and SP changes. - unsigned AddrIdx = 0; - bool IsStore = false; - bool IsMemAccess = isBasePlusOffsetMemoryAccess(Inst.getOpcode(), &AddrIdx, - &IsStore); - bool IsSPFirstOperand = isStackPointerFirstOperand(Inst); - if (IsMemAccess || IsSPFirstOperand) { - bool MaskBefore = (IsMemAccess - && baseRegNeedsLoadStoreMask(Inst.getOperand(AddrIdx) - .getReg())); - bool MaskAfter = IsSPFirstOperand && !IsStore; - if (MaskBefore || MaskAfter) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - sandboxLoadStoreStackChange(Inst, AddrIdx, STI, MaskBefore, MaskAfter); - return; - } - // fallthrough - } - - // Sandbox calls by aligning call and branch delay to the bundle end. - // For indirect calls, emit the mask before the call. - bool IsIndirectCall; - if (isCall(Inst, &IsIndirectCall)) { - if (PendingCall) - report_fatal_error("Dangerous instruction in branch delay slot!"); - - // Start the sandboxing sequence by emitting call. - emitBundleLock(true); - if (IsIndirectCall) { - MCRegister TargetReg = Inst.getOperand(1).getReg(); - emitMask(TargetReg, IndirectBranchMaskReg, STI); - } - MipsELFStreamer::emitInstruction(Inst, STI); - PendingCall = true; - return; - } - if (PendingCall) { - // Finish the sandboxing sequence by emitting branch delay. - MipsELFStreamer::emitInstruction(Inst, STI); - emitBundleUnlock(); - PendingCall = false; - return; - } - - // None of the sandboxing applies, just emit the instruction. 
- MipsELFStreamer::emitInstruction(Inst, STI); - } -}; - -} // end anonymous namespace - -namespace llvm { - -bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx, - bool *IsStore) { - if (IsStore) - *IsStore = false; - - switch (Opcode) { - default: - return false; - - // Load instructions with base address register in position 1. - case Mips::LB: - case Mips::LBu: - case Mips::LH: - case Mips::LHu: - case Mips::LW: - case Mips::LWC1: - case Mips::LDC1: - case Mips::LL: - case Mips::LL_R6: - case Mips::LWL: - case Mips::LWR: - *AddrIdx = 1; - return true; - - // Store instructions with base address register in position 1. - case Mips::SB: - case Mips::SH: - case Mips::SW: - case Mips::SWC1: - case Mips::SDC1: - case Mips::SWL: - case Mips::SWR: - *AddrIdx = 1; - if (IsStore) - *IsStore = true; - return true; - - // Store instructions with base address register in position 2. - case Mips::SC: - case Mips::SC_R6: - *AddrIdx = 2; - if (IsStore) - *IsStore = true; - return true; - } -} - -bool baseRegNeedsLoadStoreMask(MCRegister Reg) { - // The contents of SP and thread pointer register do not require masking. - return Reg != Mips::SP && Reg != Mips::T8; -} - -MCELFStreamer * -createMipsNaClELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> TAB, - std::unique_ptr<MCObjectWriter> OW, - std::unique_ptr<MCCodeEmitter> Emitter) { - MipsNaClELFStreamer *S = new MipsNaClELFStreamer( - Context, std::move(TAB), std::move(OW), std::move(Emitter)); - - // Set bundle-alignment as required by the NaCl ABI for the target. 
- S->emitBundleAlignMode(MIPS_NACL_BUNDLE_ALIGN); - - return S; -} - -} // end namespace llvm diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index c69fc68ab5af..b89d6890903d 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -1033,42 +1033,42 @@ MCELFStreamer &MipsTargetELFStreamer::getStreamer() { } void MipsTargetELFStreamer::emitGPRel32Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_GPREL32)); DF->appendContents(4, 0); } void MipsTargetELFStreamer::emitGPRel64Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_GPREL32)); DF->appendContents(8, 0); } void MipsTargetELFStreamer::emitDTPRel32Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_DTPREL32)); DF->appendContents(4, 0); } void MipsTargetELFStreamer::emitDTPRel64Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_DTPREL64)); DF->appendContents(8, 0); } void MipsTargetELFStreamer::emitTPRel32Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_TPREL32)); 
DF->appendContents(4, 0); } void MipsTargetELFStreamer::emitTPRel64Value(const MCExpr *Value) { - MCDataFragment *DF = getStreamer().getOrCreateDataFragment(); + MCFragment *DF = getStreamer().getOrCreateDataFragment(); DF->addFixup(MCFixup::create(DF->getContents().size(), Value, Mips::fixup_Mips_TPREL64)); DF->appendContents(8, 0); diff --git a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp index b0de8dacf691..4633df5d1b6a 100644 --- a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp +++ b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp @@ -781,7 +781,7 @@ bool MicroMipsSizeReduce::runOnMachineFunction(MachineFunction &MF) { Subtarget->hasMips32r6()) return false; - MipsII = static_cast<const MipsInstrInfo *>(Subtarget->getInstrInfo()); + MipsII = Subtarget->getInstrInfo(); bool Modified = false; MachineFunction::iterator I = MF.begin(), E = MF.end(); diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 87e06a6d3c08..ca0331006be7 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -16,7 +16,6 @@ #include "MCTargetDesc/MipsBaseInfo.h" #include "MCTargetDesc/MipsInstPrinter.h" #include "MCTargetDesc/MipsMCAsmInfo.h" -#include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MCTargetDesc/MipsTargetStreamer.h" #include "Mips.h" @@ -87,10 +86,6 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) { StubsNeeded.insert(I); MCP = MF.getConstantPool(); - // In NaCl, all indirect jump targets must be aligned to bundle size. - if (Subtarget->isTargetNaCl()) - NaClAlignIndirectJumpTargets(MF); - AsmPrinter::runOnMachineFunction(MF); emitXRayTable(); @@ -171,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, OutStreamer.emitRelocDirective( *OffsetExpr, Subtarget.inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo()); + CaleeExpr); OutStreamer.emitLabel(OffsetLabel); return; } @@ -401,11 +396,6 @@ const char *MipsAsmPrinter::getCurrentABIString() const { void MipsAsmPrinter::emitFunctionEntryLabel() { MipsTargetStreamer &TS = getTargetStreamer(); - // NaCl sandboxing requires that indirect call instructions are masked. - // This means that function entry points should be bundle-aligned. - if (Subtarget->isTargetNaCl()) - emitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN)); - if (Subtarget->inMicroMipsMode()) { TS.emitDirectiveSetMicroMips(); TS.setUsesMicroMips(); @@ -1263,27 +1253,6 @@ void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const { AsmPrinter::emitDebugValue(Value, Size); } -// Align all targets of indirect branches on bundle size. Used only if target -// is NaCl. -void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) { - // Align all blocks that are jumped to through jump table. - if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) { - const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables(); - for (const auto &I : JT) { - const std::vector<MachineBasicBlock *> &MBBs = I.MBBs; - - for (MachineBasicBlock *MBB : MBBs) - MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN); - } - } - - // If basic block address is taken, block can be target of indirect branch. 
- for (auto &MBB : MF) { - if (MBB.hasAddressTaken()) - MBB.setAlignment(MIPS_NACL_BUNDLE_ALIGN); - } -} - bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const { return (Opcode == Mips::LONG_BRANCH_LUi || Opcode == Mips::LONG_BRANCH_LUi2Op diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h index bbaa3b3cef9d..8b2fb32dc552 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.h +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h @@ -112,8 +112,6 @@ private: void EmitFPCallStub(const char *, const Mips16HardFloatInfo::FuncSignature *); - void NaClAlignIndirectJumpTargets(MachineFunction &MF); - bool isLongBranchPseudo(int Opcode) const; public: diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp index 6e897fe87668..3720c936643b 100644 --- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp +++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp @@ -74,7 +74,6 @@ #include "MCTargetDesc/MipsABIInfo.h" #include "MCTargetDesc/MipsBaseInfo.h" -#include "MCTargetDesc/MipsMCNaCl.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "Mips.h" #include "MipsInstrInfo.h" @@ -518,27 +517,19 @@ void MipsBranchExpansion::expandToLongBranch(MBBInfo &I) { BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA) .addReg(Mips::SP) .addImm(0); - if (STI->isTargetNaCl()) - // Bundle-align the target of indirect branch JR. - TgtMBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN); - // In NaCl, modifying the sp is not allowed in branch delay slot. // For MIPS32R6, we can skip using a delay slot branch. 
bool hasDelaySlot = buildProperJumpMI(BalTgtMBB, Pos, DL); - if (STI->isTargetNaCl() || !hasDelaySlot) { + if (!hasDelaySlot) { BuildMI(*BalTgtMBB, std::prev(Pos), DL, TII->get(Mips::ADDiu), Mips::SP) .addReg(Mips::SP) .addImm(8); } if (hasDelaySlot) { - if (STI->isTargetNaCl()) { - TII->insertNop(*BalTgtMBB, Pos, DL); - } else { - BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) - .addReg(Mips::SP) - .addImm(8); - } + BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP) + .addReg(Mips::SP) + .addImm(8); BalTgtMBB->rbegin()->bundleWithPred(); } } else { @@ -899,14 +890,6 @@ bool MipsBranchExpansion::handlePossibleLongBranch() { (Br->isUnconditionalBranch() && IsPIC))) { int64_t Offset = computeOffset(&*Br); - if (STI->isTargetNaCl()) { - // The offset calculation does not include sandboxing instructions - // that will be added later in the MC layer. Since at this point we - // don't know the exact amount of code that "sandboxing" will add, we - // conservatively estimate that code will not grow more than 100%. - Offset *= 2; - } - if (ForceLongBranchFirstPass || !TII->isBranchOffsetInRange(Br->getOpcode(), Offset)) { MBBInfos[I].Offset = Offset; @@ -941,7 +924,7 @@ bool MipsBranchExpansion::runOnMachineFunction(MachineFunction &MF) { IsPIC = TM.isPositionIndependent(); ABI = static_cast<const MipsTargetMachine &>(TM).getABI(); STI = &MF.getSubtarget<MipsSubtarget>(); - TII = static_cast<const MipsInstrInfo *>(STI->getInstrInfo()); + TII = STI->getInstrInfo(); if (IsPIC && ABI.IsO32() && MF.getInfo<MipsFunctionInfo>()->globalBaseRegSet()) diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 3c60114f507b..39e184a6303a 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -267,15 +267,8 @@ def CC_Mips_FastCC : CallingConv<[ // Integer arguments are passed in integer registers. 
All scratch registers, // except for AT, V0 and T9, are available to be used as argument registers. - CCIfType<[i32], CCIfSubtargetNot<"isTargetNaCl()", - CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>>, - - // In NaCl, T6, T7 and T8 are reserved and not available as argument - // registers for fastcc. T6 contains the mask for sandboxing control flow - // (indirect jumps and calls). T7 contains the mask for sandboxing memory - // accesses (loads and stores). T8 contains the thread pointer. - CCIfType<[i32], CCIfSubtarget<"isTargetNaCl()", - CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, V1]>>>, + CCIfType<[i32], + CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, V1]>>, // f32 arguments are passed in single-precision floating pointer registers. CCIfType<[f32], CCIfSubtarget<"useOddSPReg()", diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index b13394a607f6..dfbbcbe60219 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "MCTargetDesc/MipsMCNaCl.h" #include "Mips.h" #include "MipsInstrInfo.h" #include "MipsSubtarget.h" @@ -727,18 +726,6 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin, continue; const MipsSubtarget &STI = MBB.getParent()->getSubtarget<MipsSubtarget>(); - if (STI.isTargetNaCl()) { - // In NaCl, instructions that must be masked are forbidden in delay slots. - // We only check for loads, stores and SP changes. Calls, returns and - // branches are not checked because non-NaCl targets never put them in - // delay slots. 
- unsigned AddrIdx; - if ((isBasePlusOffsetMemoryAccess(CurrI->getOpcode(), &AddrIdx) && - baseRegNeedsLoadStoreMask(CurrI->getOperand(AddrIdx).getReg())) || - CurrI->modifiesRegister(Mips::SP, STI.getRegisterInfo())) - continue; - } - bool InMicroMipsMode = STI.inMicroMipsMode(); const MipsInstrInfo *TII = STI.getInstrInfo(); unsigned Opcode = (*Slot).getOpcode(); diff --git a/llvm/lib/Target/Mips/MipsInstrFPU.td b/llvm/lib/Target/Mips/MipsInstrFPU.td index 14590ddacfcb..4ca329d21498 100644 --- a/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -622,15 +622,13 @@ let AdditionalPredicates = [NotInMicroMips] in { // Indexed loads and stores. // Base register + offset register addressing mode (indicated by "x" in the -// instruction mnemonic) is disallowed under NaCl. -let AdditionalPredicates = [IsNotNaCl] in { - def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, - INSN_MIPS4_32R2_NOT_32R6_64R6; - def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, - INSN_MIPS4_32R2_NOT_32R6_64R6; -} +// instruction mnemonic). +def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>, + INSN_MIPS4_32R2_NOT_32R6_64R6; +def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>, + INSN_MIPS4_32R2_NOT_32R6_64R6; -let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in { +let AdditionalPredicates = [NotInMicroMips] in { def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>, INSN_MIPS4_32R2_NOT_32R6_64R6, FGR_32; def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>, @@ -646,14 +644,14 @@ let DecoderNamespace="MipsFP64" in { // Load/store doubleword indexed unaligned. // FIXME: This instruction should not be defined for FGR_32. 
-let AdditionalPredicates = [IsNotNaCl, NotInMicroMips] in { +let AdditionalPredicates = [NotInMicroMips] in { def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>, INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_32; } -let AdditionalPredicates = [IsNotNaCl, NotInMicroMips], +let AdditionalPredicates = [NotInMicroMips], DecoderNamespace="MipsFP64" in { def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>, INSN_MIPS5_32R2_NOT_32R6_64R6, FGR_64; diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index b6125b972717..a124e84e9ca5 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -236,7 +236,6 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, AssemblerPredicate<(all_of (not FeatureMicroMips))>; def IsLE : Predicate<"Subtarget->isLittle()">; def IsBE : Predicate<"!Subtarget->isLittle()">; -def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def UseTCCInDIV : AssemblerPredicate<(all_of FeatureUseTCCInDIV)>; def HasEVA : Predicate<"Subtarget->hasEVA()">, AssemblerPredicate<(all_of FeatureEVA)>; diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index ae4b2377ad21..539288e8da59 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -162,13 +162,6 @@ getReservedRegs(const MachineFunction &MF) const { for (MCPhysReg R : ReservedGPR32) Reserved.set(R); - // Reserve registers for the NaCl sandbox. - if (Subtarget.isTargetNaCl()) { - Reserved.set(Mips::T6); // Reserved for control flow mask. - Reserved.set(Mips::T7); // Reserved for memory access mask. - Reserved.set(Mips::T8); // Reserved for thread pointer. 
- } - for (MCPhysReg R : ReservedGPR64) Reserved.set(R); diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index d775f5a16bcd..f08704a7e799 100644 --- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -403,8 +403,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo()); - const MipsRegisterInfo &RegInfo = - *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); + const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo(); MachineBasicBlock::iterator MBBI = MBB.begin(); DebugLoc dl; @@ -658,8 +657,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsSEInstrInfo &TII = *static_cast<const MipsSEInstrInfo *>(STI.getInstrInfo()); - const MipsRegisterInfo &RegInfo = - *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); + const MipsRegisterInfo &RegInfo = *STI.getRegisterInfo(); DebugLoc DL = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); MipsABIInfo ABI = STI.getABI(); diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h index bb026f565512..52f892a160c3 100644 --- a/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/llvm/lib/Target/Mips/MipsSubtarget.h @@ -355,7 +355,6 @@ public: bool os16() const { return Os16; } - bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } bool isXRaySupported() const override { return true; } diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 443db4391a52..8eec91562ecf 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -268,8 +268,8 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, llvm_unreachable("Empty Modifier"); } -void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, - raw_ostream &O, StringRef Modifier) { +void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum, + raw_ostream &O, StringRef Modifier) { const MCOperand &MO = MI->getOperand(OpNum); int Imm = (int)MO.getImm(); if (Modifier == "sem") { @@ -286,22 +286,24 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::Ordering::Release: O << ".release"; return; + case NVPTX::Ordering::AcquireRelease: + O << ".acq_rel"; + return; + case NVPTX::Ordering::SequentiallyConsistent: + O << ".seq_cst"; + return; case NVPTX::Ordering::Volatile: O << ".volatile"; return; case NVPTX::Ordering::RelaxedMMIO: O << ".mmio.relaxed"; return; - default: - report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" sem modifier. 
" - "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.", - OrderingToString(Ordering))); } } else if (Modifier == "scope") { auto S = NVPTX::Scope(Imm); switch (S) { case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: return; case NVPTX::Scope::System: O << ".sys"; @@ -316,9 +318,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, O << ".gpu"; return; } - report_fatal_error( - formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.", - ScopeToString(S))); + report_fatal_error(formatv( + "NVPTX AtomicCode Printer does not support \"{}\" scope modifier.", + ScopeToString(S))); } else if (Modifier == "addsp") { auto A = NVPTX::AddressSpace(Imm); switch (A) { @@ -334,7 +336,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, return; } report_fatal_error(formatv( - "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.", + "NVPTX AtomicCode Printer does not support \"{}\" addsp modifier.", AddressSpaceToString(A))); } else if (Modifier == "sign") { switch (Imm) { diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index 193c436939f6..c3ff3469150e 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -40,8 +40,8 @@ public: StringRef Modifier = {}); void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); - void printLdStCode(const MCInst *MI, int OpNum, raw_ostream &O, - StringRef Modifier = {}); + void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O, + StringRef Modifier = {}); void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O, StringRef Modifier = {}); void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O, diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 15997bc3878d..77a0e03d4075 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ 
b/llvm/lib/Target/NVPTX/NVPTX.h @@ -164,7 +164,6 @@ enum Ordering : OrderingUnderlyingType { (OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent, Volatile = SequentiallyConsistent + 1, RelaxedMMIO = Volatile + 1, - LASTORDERING = RelaxedMMIO }; using ScopeUnderlyingType = unsigned int; @@ -174,7 +173,8 @@ enum Scope : ScopeUnderlyingType { Cluster = 2, Device = 3, System = 4, - LASTSCOPE = System + DefaultDevice = 5, // For SM < 70: denotes PTX op implicit/default .gpu scope + LASTSCOPE = DefaultDevice }; using AddressSpaceUnderlyingType = unsigned int; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ae73d8da79f8..65e7c5677454 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -494,7 +494,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { return true; } -static std::optional<unsigned> convertAS(unsigned AS) { +static std::optional<NVPTX::AddressSpace> convertAS(unsigned AS) { switch (AS) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::AddressSpace::Local; @@ -515,11 +515,42 @@ static std::optional<unsigned> convertAS(unsigned AS) { } } -static unsigned int getCodeAddrSpace(const MemSDNode *N) { +NVPTX::AddressSpace NVPTXDAGToDAGISel::getAddrSpace(const MemSDNode *N) { return convertAS(N->getMemOperand()->getAddrSpace()) .value_or(NVPTX::AddressSpace::Generic); } +NVPTX::Ordering NVPTXDAGToDAGISel::getMemOrder(const MemSDNode *N) const { + // No "sem" orderings for SM/PTX versions which do not support memory ordering + if (!Subtarget->hasMemoryOrdering()) + return NVPTX::Ordering::NotAtomic; + auto Ordering = N->getMergedOrdering(); + switch (Ordering) { + case AtomicOrdering::NotAtomic: + return NVPTX::Ordering::NotAtomic; + case AtomicOrdering::Unordered: + case AtomicOrdering::Monotonic: + return NVPTX::Ordering::Relaxed; + case AtomicOrdering::Acquire: + return NVPTX::Ordering::Acquire; + case 
AtomicOrdering::Release: + return NVPTX::Ordering::Release; + case AtomicOrdering::AcquireRelease: + return NVPTX::Ordering::AcquireRelease; + case AtomicOrdering::SequentiallyConsistent: + return NVPTX::Ordering::SequentiallyConsistent; + } + llvm_unreachable("Invalid atomic ordering"); +} + +NVPTX::Scope NVPTXDAGToDAGISel::getAtomicScope(const MemSDNode *N) const { + // No "scope" modifier for SM/PTX versions which do not support scoped atomics + // Functionally, these atomics are at device scope + if (!Subtarget->hasAtomScope()) + return NVPTX::Scope::DefaultDevice; + return Scopes[N->getSyncScopeID()]; +} + namespace { struct OperationOrderings { @@ -532,7 +563,7 @@ struct OperationOrderings { static OperationOrderings getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { AtomicOrdering Ordering = N->getSuccessOrdering(); - auto CodeAddrSpace = getCodeAddrSpace(N); + auto CodeAddrSpace = NVPTXDAGToDAGISel::getAddrSpace(N); bool HasMemoryOrdering = Subtarget->hasMemoryOrdering(); bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO(); @@ -756,7 +787,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, } static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget, - unsigned CodeAddrSpace) { + NVPTX::AddressSpace CodeAddrSpace) { // We use ldg (i.e. ld.global.nc) for invariant loads from the global address // space. return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global && @@ -788,6 +819,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -807,6 +839,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? 
NVPTX::atomic_thread_fence_release_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -826,6 +859,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error( formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.", ScopeToString(S))); @@ -846,6 +880,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S, return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu : NVPTX::INT_MEMBAR_GL; case NVPTX::Scope::Thread: + case NVPTX::Scope::DefaultDevice: report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.", ScopeToString(S))); } @@ -1025,7 +1060,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { const MVT LoadedVT = LoadedEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1097,7 +1132,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { const MVT MemVT = MemEVT.getSimpleVT(); // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(LD); + const auto CodeAddrSpace = getAddrSpace(LD); if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace)) return tryLDG(LD); @@ -1313,7 +1348,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { return false; // Address Space Setting - const unsigned CodeAddrSpace = getCodeAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); SDLoc DL(ST); SDValue Chain = ST->getChain(); @@ -1363,7 +1398,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { assert(StoreVT.isSimple() && "Store value is not simple"); // Address Space Setting - const unsigned CodeAddrSpace = 
getCodeAddrSpace(ST); + const auto CodeAddrSpace = getAddrSpace(ST); if (CodeAddrSpace == NVPTX::AddressSpace::Const) { report_fatal_error("Cannot store to pointer that points to constant " "memory space"); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 88e5328ff69c..b99b4ef2d307 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -100,6 +100,8 @@ private: inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } + NVPTX::Ordering getMemOrder(const MemSDNode *N) const; + NVPTX::Scope getAtomicScope(const MemSDNode *N) const; bool SelectADDR(SDValue Addr, SDValue &Base, SDValue &Offset); SDValue getPTXCmpMode(const CondCodeSDNode &CondCode); @@ -114,6 +116,9 @@ private: std::pair<NVPTX::Ordering, NVPTX::Scope> insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N); NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const; + +public: + static NVPTX::AddressSpace getAddrSpace(const MemSDNode *N); }; class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3d010e04824c..7aa06f9079b0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -57,6 +57,7 @@ #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/NVPTXAddrSpace.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -1047,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, MVT::v32i32, MVT::v64i32, MVT::v128i32}, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol - 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom); + // Enable custom lowering for the following: + // * MVT::i128 - clusterlaunchcontrol + // * MVT::i32 - prmt + // * MVT::Other - internal.addrspace.wrap + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other}, + Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -1087,7 +1091,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::StoreV8) MAKE_CASE(NVPTXISD::FSHL_CLAMP) MAKE_CASE(NVPTXISD::FSHR_CLAMP) - MAKE_CASE(NVPTXISD::BFE) MAKE_CASE(NVPTXISD::BFI) MAKE_CASE(NVPTXISD::PRMT) MAKE_CASE(NVPTXISD::FCOPYSIGN) @@ -2060,6 +2063,19 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, + {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); +} + +static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. 
@@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32); R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32); } - return DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return getPRMT(L, R, SelectionValue, DL, DAG); }; auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340); auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340); auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + return DAG.getBitcast(VT, PRMT3210); } // Get value or the Nth operand as an APInt(32). Undef values treated as 0. @@ -2173,14 +2186,17 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, EVT VectorVT = Vector.getValueType(); if (VectorVT == MVT::v4i8) { - SDValue BFE = - DAG.getNode(NVPTXISD::BFE, DL, MVT::i32, - {Vector, - DAG.getNode(ISD::MUL, DL, MVT::i32, - DAG.getZExtOrTrunc(Index, DL, MVT::i32), - DAG.getConstant(8, DL, MVT::i32)), - DAG.getConstant(8, DL, MVT::i32)}); - return DAG.getAnyExtOrTrunc(BFE, DL, Op->getValueType(0)); + SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32, + DAG.getZExtOrTrunc(Index, DL, MVT::i32), + DAG.getConstant(0x7770, DL, MVT::i32)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector), + DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG); + SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDNodeFlags Flags; + Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8); + Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8); + Ext->setFlags(Flags); + return Ext; } // Constant index will be matched by tablegen. 
@@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } SDLoc DL(Op); - return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, - DAG.getConstant(Selector, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1), + DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG); + return DAG.getBitcast(Op.getValueType(), PRMT); } /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift @@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } +static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { + const unsigned Mode = [&]() { + switch (Op->getConstantOperandVal(0)) { + case Intrinsic::nvvm_prmt: + return NVPTX::PTXPrmtMode::NONE; + case Intrinsic::nvvm_prmt_b4e: + return NVPTX::PTXPrmtMode::B4E; + case Intrinsic::nvvm_prmt_ecl: + return NVPTX::PTXPrmtMode::ECL; + case Intrinsic::nvvm_prmt_ecr: + return NVPTX::PTXPrmtMode::ECR; + case Intrinsic::nvvm_prmt_f4e: + return NVPTX::PTXPrmtMode::F4E; + case Intrinsic::nvvm_prmt_rc16: + return NVPTX::PTXPrmtMode::RC16; + case Intrinsic::nvvm_prmt_rc8: + return NVPTX::PTXPrmtMode::RC8; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + SDLoc DL(Op); + SDValue A = Op->getOperand(1); + SDValue B = Op.getNumOperands() == 4 ? 
Op.getOperand(2) + : DAG.getConstant(0, DL, MVT::i32); + SDValue Selector = (Op->op_end() - 1)->get(); + return getPRMT(A, B, Selector, DL, DAG, Mode); +} static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: return Op; + case Intrinsic::nvvm_prmt: + case Intrinsic::nvvm_prmt_b4e: + case Intrinsic::nvvm_prmt_ecl: + case Intrinsic::nvvm_prmt_ecr: + case Intrinsic::nvvm_prmt_f4e: + case Intrinsic::nvvm_prmt_rc16: + case Intrinsic::nvvm_prmt_rc8: + return lowerPrmtIntrinsic(Op, DAG); case Intrinsic::nvvm_internal_addrspace_wrap: return Op.getOperand(1); case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: @@ -5271,31 +5323,6 @@ static SDValue PerformANDCombine(SDNode *N, SDValue AExt; - // Convert BFE-> truncate i16 -> and 255 - // To just BFE-> truncate i16, as the value already has all the bits in the - // right places. - if (Val.getOpcode() == ISD::TRUNCATE) { - SDValue BFE = Val.getOperand(0); - if (BFE.getOpcode() != NVPTXISD::BFE) - return SDValue(); - - ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0)); - if (!BFEBits) - return SDValue(); - uint64_t BFEBitsVal = BFEBits->getZExtValue(); - - ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask); - if (!MaskCnst) { - // Not an AND with a constant - return SDValue(); - } - uint64_t MaskVal = MaskCnst->getZExtValue(); - - if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1) - return SDValue(); - // If we get here, the AND is unnecessary. 
Just replace it with the trunc - DCI.CombineTo(N, Val, false); - } // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and if (Val.getOpcode() == ISD::ANY_EXTEND) { AExt = Val; @@ -5800,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto &DAG = DCI.DAG; - auto PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT); + auto PRMT = + getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1), + (Op1Bytes << 8) | Op0Bytes, DL, DAG); + return DAG.getBitcast(VT, PRMT); } static SDValue combineADDRSPACECAST(SDNode *N, @@ -5822,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N, return SDValue(); } +// Given a constant selector value and a prmt mode, return the selector value +// normalized to the generic prmt mode. See the PTX ISA documentation for more +// details: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { + if (Mode == NVPTX::PTXPrmtMode::NONE) + return Selector; + + const unsigned V = Selector.trunc(2).getZExtValue(); + + const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2, + unsigned S3) { + return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12)); + }; + + switch (Mode) { + case NVPTX::PTXPrmtMode::F4E: + return GetSelector(V, V + 1, V + 2, V + 3); + case NVPTX::PTXPrmtMode::B4E: + return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7); + case NVPTX::PTXPrmtMode::RC8: + return GetSelector(V, V, V, V); + case NVPTX::PTXPrmtMode::ECL: + return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U); + case NVPTX::PTXPrmtMode::ECR: + return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V); + case NVPTX::PTXPrmtMode::RC16: { + unsigned V1 = 
(V & 1) << 1; + return GetSelector(V1, V1 + 1, V1, V1 + 1); + } + default: + llvm_unreachable("Invalid PRMT mode"); + } +} + +static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) { + // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + APInt BitField = B.concat(A); + APInt SelectorVal = getPRMTSelector(Selector, Mode); + APInt Result(32, 0); + for (unsigned I : llvm::seq(4U)) { + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + APInt Byte = BitField.extractBits(8, Idx * 8); + if (Sign) + Byte = Byte.ashr(8); + Result.insertBits(Byte, I * 8); + } + return Result; +} + +static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + // Constant fold PRMT + if (isa<ConstantSDNode>(N->getOperand(0)) && + isa<ConstantSDNode>(N->getOperand(1)) && + isa<ConstantSDNode>(N->getOperand(2))) + return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0), + N->getConstantOperandAPInt(1), + N->getConstantOperandAPInt(2), + N->getConstantOperandVal(3)), + SDLoc(N), N->getValueType(0)); + + return SDValue(); +} + SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); switch (N->getOpcode()) { - default: break; - case ISD::ADD: - return PerformADDCombine(N, DCI, OptLevel); - case ISD::FADD: - return PerformFADDCombine(N, DCI, OptLevel); - case ISD::MUL: - return PerformMULCombine(N, DCI, OptLevel); - case ISD::SHL: - return PerformSHLCombine(N, DCI, OptLevel); - case ISD::AND: - return PerformANDCombine(N, DCI); - case ISD::UREM: - case ISD::SREM: - return PerformREMCombine(N, DCI, OptLevel); - case ISD::SETCC: - return PerformSETCCCombine(N, DCI, STI.getSmVersion()); - case ISD::LOAD: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadV2: - case 
NVPTXISD::LoadV4: - return combineUnpackingMovIntoLoad(N, DCI); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); - case ISD::STORE: - case NVPTXISD::StoreV2: - case NVPTXISD::StoreV4: - return PerformStoreCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACTCombine(N, DCI); - case ISD::VSELECT: - return PerformVSELECTCombine(N, DCI); - case ISD::BUILD_VECTOR: - return PerformBUILD_VECTORCombine(N, DCI); - case ISD::ADDRSPACECAST: - return combineADDRSPACECAST(N, DCI); + default: + break; + case ISD::ADD: + return PerformADDCombine(N, DCI, OptLevel); + case ISD::ADDRSPACECAST: + return combineADDRSPACECAST(N, DCI); + case ISD::AND: + return PerformANDCombine(N, DCI); + case ISD::BUILD_VECTOR: + return PerformBUILD_VECTORCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACTCombine(N, DCI); + case ISD::FADD: + return PerformFADDCombine(N, DCI, OptLevel); + case ISD::LOAD: + case NVPTXISD::LoadParamV2: + case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + return combineUnpackingMovIntoLoad(N, DCI); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case NVPTXISD::PRMT: + return combinePRMT(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI, STI.getSmVersion()); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::SREM: + case ISD::UREM: + return PerformREMCombine(N, DCI, OptLevel); + case NVPTXISD::StoreParam: + case NVPTXISD::StoreParamV2: + case NVPTXISD::StoreParamV4: + return PerformStoreParamCombine(N, DCI); + case ISD::STORE: + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + return PerformStoreCombine(N, DCI); + case ISD::VSELECT: + return PerformVSELECTCombine(N, DCI); } return SDValue(); } @@ -6340,10 +6439,12 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder, // Specialize for cmpxchg // Emit a fence.sc leading fence for cmpxchg seq_cst which are not 
emulated + SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID(); if (isReleaseOrStronger(Ord)) - return Ord == AtomicOrdering::SequentiallyConsistent - ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) - : Builder.CreateFence(AtomicOrdering::Release); + return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent + ? Ord + : AtomicOrdering::Release, + SSID); return nullptr; } @@ -6355,15 +6456,15 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder, if (!isa<AtomicCmpXchgInst>(Inst)) return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord); + auto *CI = cast<AtomicCmpXchgInst>(Inst); auto CASWidth = - cast<IntegerType>( - dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType()) - ->getBitWidth(); + cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth(); + SyncScope::ID SSID = CI->getSyncScopeID(); // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated if (isAcquireOrStronger(Ord) && (Ord != AtomicOrdering::SequentiallyConsistent || CASWidth < STI.getMinCmpXchgSizeInBits())) - return Builder.CreateFence(AtomicOrdering::Acquire); + return Builder.CreateFence(AtomicOrdering::Acquire, SSID); return nullptr; } @@ -6402,3 +6503,45 @@ MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal( const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const { return getDataSection(); } + +static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, + const SelectionDAG &DAG, unsigned Depth) { + SDValue A = Op.getOperand(0); + SDValue B = Op.getOperand(1); + ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + unsigned Mode = Op.getConstantOperandVal(3); + + if (!Selector) + return; + + KnownBits AKnown = DAG.computeKnownBits(A, Depth); + KnownBits BKnown = DAG.computeKnownBits(B, Depth); + + // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + KnownBits BitField = BKnown.concat(AKnown); + + APInt SelectorVal = 
getPRMTSelector(Selector->getAPIntValue(), Mode); + for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + KnownBits Byte = BitField.extractBits(8, Idx * 8); + if (Sign) + Byte = KnownBits::ashr(Byte, 8); + Known.insertBits(Byte, I * 8); + } +} + +void NVPTXTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth) const { + Known.resetAll(); + + switch (Op.getOpcode()) { + case NVPTXISD::PRMT: + computeKnownBitsForPRMT(Op, Known, DAG, Depth); + break; + default: + break; + } +}
\ No newline at end of file diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 2477e1fb6159..bc3548c0272b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -50,7 +50,6 @@ enum NodeType : unsigned { MUL_WIDE_UNSIGNED, SETP_F16X2, SETP_BF16X2, - BFE, BFI, PRMT, @@ -272,6 +271,11 @@ public: unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override; + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + private: const NVPTXSubtarget &STI; // cache the subtarget here mutable unsigned GlobalUniqueCallSite; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index db6b411509e9..a5bb83dfadb8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1372,11 +1372,6 @@ def BREV64 : // restriction in PTX? // // dest and src may be int32 or int64, but start and end are always int32. 
-def SDTBFE : - SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisInt<0>, - SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; -def bfe : SDNode<"NVPTXISD::BFE", SDTBFE>; - def SDTBFI : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; @@ -1387,22 +1382,13 @@ def SDTPRMT : SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>; -multiclass BFE<string Instr, ValueType T, RegisterClass RC> { +multiclass BFE<string Instr, RegisterClass RC> { def rrr - : BasicNVPTXInst<(outs RC:$d), - (ins RC:$a, B32:$b, B32:$c), - Instr, - [(set T:$d, (bfe T:$a, i32:$b, i32:$c))]>; + : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, B32:$c), Instr>; def rri - : BasicNVPTXInst<(outs RC:$d), - (ins RC:$a, B32:$b, i32imm:$c), - Instr, - [(set T:$d, (bfe T:$a, i32:$b, imm:$c))]>; + : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, B32:$b, i32imm:$c), Instr>; def rii - : BasicNVPTXInst<(outs RC:$d), - (ins RC:$a, i32imm:$b, i32imm:$c), - Instr, - [(set T:$d, (bfe T:$a, imm:$b, imm:$c))]>; + : BasicNVPTXInst<(outs RC:$d), (ins RC:$a, i32imm:$b, i32imm:$c), Instr>; } multiclass BFI<string Instr, ValueType T, RegisterClass RC, Operand ImmCls> { @@ -1447,10 +1433,10 @@ let hasSideEffects = false in { // the same patterns, so the first one wins. Having unsigned byte extraction // has the benefit of always having zero in unused bits, which makes some // optimizations easier (e.g. no need to mask them). 
- defm BFE_U32 : BFE<"bfe.u32", i32, B32>; - defm BFE_S32 : BFE<"bfe.s32", i32, B32>; - defm BFE_U64 : BFE<"bfe.u64", i64, B64>; - defm BFE_S64 : BFE<"bfe.s64", i64, B64>; + defm BFE_U32 : BFE<"bfe.u32", B32>; + defm BFE_S32 : BFE<"bfe.s32", B32>; + defm BFE_U64 : BFE<"bfe.u64", B64>; + defm BFE_S64 : BFE<"bfe.s64", B64>; defm BFI_B32 : BFI<"bfi.b32", i32, B32, i32imm>; defm BFI_B64 : BFI<"bfi.b64", i64, B64, i64imm>; @@ -1467,18 +1453,33 @@ let hasSideEffects = false in { (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32rir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins B32:$a, i32imm:$b, B32:$c), + (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; def PRMT_B32rii : BasicFlagsNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b, Hexu32imm:$c), (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>; - def PRMT_B32rir + def PRMT_B32irr : BasicFlagsNVPTXInst<(outs B32:$d), - (ins B32:$a, i32imm:$b, B32:$c), - (ins PrmtMode:$mode), + (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>; + def PRMT_B32iri + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode), "prmt.b32$mode", - [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; + [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32iir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>; } @@ -1487,19 +1488,26 @@ def : Pat<(fshr i32:$hi, i32:$lo, (shl i32:$amt, (i32 3))), (PRMT_B32rrr $lo, $hi, $amt, PrmtF4E)>; +def byte_extract_prmt : ImmLeaf<i32, [{ + return (Imm == 0x7770) || (Imm == 0x7771) || (Imm == 0x7772) || (Imm == 0x7773); +}]>; + +def to_sign_extend_selector : SDNodeXForm<imm, [{ + 
const APInt &V = N->getAPIntValue(); + const APInt B = V.trunc(4); + const APInt BSext = B | 8; + const APInt R = BSext.concat(BSext).concat(BSext).concat(B).zext(32); + return CurDAG->getTargetConstant(R, SDLoc(N), MVT::i32); +}]>; + + // byte extraction + signed/unsigned extension to i32. -def : Pat<(i32 (sext_inreg (bfe i32:$s, i32:$o, 8), i8)), - (BFE_S32rri $s, $o, 8)>; -def : Pat<(i32 (sext_inreg (bfe i32:$s, imm:$o, 8), i8)), - (BFE_S32rii $s, imm:$o, 8)>; -def : Pat<(i32 (and (bfe i32:$s, i32:$o, 8), 255)), - (BFE_U32rri $s, $o, 8)>; -def : Pat<(i32 (and (bfe i32:$s, imm:$o, 8), 255)), - (BFE_U32rii $s, imm:$o, 8)>; +def : Pat<(i32 (sext_inreg (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE), i8)), + (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE)>; // byte extraction + signed extension to i16 -def : Pat<(i16 (sext_inreg (trunc (bfe i32:$s, imm:$o, 8)), i8)), - (CVT_s8_s32 (BFE_S32rii $s, imm:$o, 8), CvtNONE)>; +def : Pat<(i16 (sext_inreg (trunc (prmt i32:$s, 0, byte_extract_prmt:$sel, PrmtNONE)), i8)), + (CVT_u16_u32 (PRMT_B32rii $s, 0, (to_sign_extend_selector $sel), PrmtNONE), CvtNONE)>; // Byte extraction via shift/trunc/sext @@ -1615,8 +1623,8 @@ def ADDR : Operand<pAny> { let MIOperandInfo = (ops ADDR_base, i32imm); } -def LdStCode : Operand<i32> { - let PrintMethod = "printLdStCode"; +def AtomicCode : Operand<i32> { + let PrintMethod = "printAtomicCode"; } def MmaCode : Operand<i32> { @@ -1709,28 +1717,36 @@ def cond_not_signed : PatLeaf<(cond), [{ return !isSignedIntSetCC(N->get()); }]>; -// comparisons of i8 extracted with BFE as i32 -// It's faster to do comparison directly on i32 extracted by BFE, +// comparisons of i8 extracted with PRMT as i32 +// It's faster to do comparison directly on i32 extracted by PRMT, // instead of the long conversion and sign extending. 
-def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (bfe B32:$a, B32:$oa, 8))), i8)), - (i16 (sext_inreg (i16 (trunc (bfe B32:$b, B32:$ob, 8))), i8)), +def: Pat<(setcc (i16 (sext_inreg (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), i8)), + (i16 (sext_inreg (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), i8)), cond_signed:$cc), - (SETP_i32rr (BFE_S32rri $a, $oa, 8), (BFE_S32rri $b, $ob, 8), (cond2cc $cc))>; + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -def: Pat<(setcc (i16 (sext_inreg (trunc (bfe B32:$a, imm:$oa, 8)), i8)), - (i16 (sext_inreg (trunc (bfe B32:$b, imm:$ob, 8)), i8)), +def: Pat<(setcc (i16 (sext_inreg (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE)), i8)), + (i16 (sext_inreg (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE)), i8)), cond_signed:$cc), - (SETP_i32rr (BFE_S32rii $a, imm:$oa, 8), (BFE_S32rii $b, imm:$ob, 8), (cond2cc $cc))>; + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, B32:$oa, 8)), 255)), - (i16 (and (trunc (bfe B32:$b, B32:$ob, 8)), 255)), +def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), + (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), cond_signed:$cc), - (SETP_i32rr (BFE_U32rri $a, $oa, 8), (BFE_U32rri $b, $ob, 8), (cond2cc $cc))>; + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; -def: Pat<(setcc (i16 (and (trunc (bfe B32:$a, imm:$oa, 8)), 255)), - (i16 (and (trunc (bfe B32:$b, imm:$ob, 8)), 255)), +def: Pat<(setcc (i16 (trunc (prmt i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE))), + (i16 (trunc (prmt i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE))), 
cond_not_signed:$cc), - (SETP_i32rr (BFE_U32rii $a, imm:$oa, 8), (BFE_U32rii $b, imm:$ob, 8), (cond2cc $cc))>; + (SETP_i32rr (PRMT_B32rii i32:$a, 0, byte_extract_prmt:$sel_a, PrmtNONE), + (PRMT_B32rii i32:$b, 0, byte_extract_prmt:$sel_b, PrmtNONE), + (cond2cc $cc))>; def SDTDeclareArrayParam : SDTypeProfile<0, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; @@ -1961,7 +1977,7 @@ defm ProxyRegB64 : ProxyRegInst<"b64", B64>; class LD<NVPTXRegClass regclass> : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; @@ -1977,7 +1993,7 @@ class ST<DAGOperand O> : NVPTXInst< (outs), (ins O:$src, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$toWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth" " \t[$addr], $src;", []>; @@ -1995,21 +2011,21 @@ let mayStore=1, hasSideEffects=0 in { multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> { def _v2 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, + AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " 
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, + (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " @@ -2026,14 +2042,14 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { def _v2 : NVPTXInst< (outs), (ins O:$src1, O:$src2, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v4 : NVPTXInst< (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; @@ -2042,7 +2058,7 @@ multiclass ST_VEC<DAGOperand O, bit support_v8 = false> { (outs), (ins O:$src1, O:$src2, O:$src3, O:$src4, O:$src5, O:$src6, O:$src7, O:$src8, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, i32imm:$fromWidth, + AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth " "\t[$addr], " diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 93827be5c281..70150bdfc8d1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -41,6 +41,46 @@ def AS_match { }]; } + 
+//===----------------------------------------------------------------------===// +// NVPTX Scope Constants +// These map to the Scope enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Scope_thread : PatLeaf<(i32 0)>; // Thread = 0 +def Scope_cta : PatLeaf<(i32 1)>; // Block = 1 +def Scope_cluster : PatLeaf<(i32 2)>; // Cluster = 2 +def Scope_device : PatLeaf<(i32 3)>; // Device = 3 +def Scope_sys : PatLeaf<(i32 4)>; // System = 4 + +//===----------------------------------------------------------------------===// +// NVPTX Address Space Constants +// These map to the AddressSpace enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def AddrSpace_gen : PatLeaf<(i32 0)>; // Generic = 0 +def AddrSpace_global : PatLeaf<(i32 1)>; // Global = 1 +def AddrSpace_shared : PatLeaf<(i32 3)>; // Shared = 3 +def AddrSpace_const : PatLeaf<(i32 4)>; // Const = 4 +def AddrSpace_local : PatLeaf<(i32 5)>; // Local = 5 +def AddrSpace_shared_cluster : PatLeaf<(i32 7)>; // SharedCluster = 7 +def AddrSpace_param : PatLeaf<(i32 101)>; // Param = 101 + +//===----------------------------------------------------------------------===// +// NVPTX Ordering Constants +// These map to the Ordering enum in NVPTX.h +//===----------------------------------------------------------------------===// + +def Ordering_not_atomic : PatLeaf<(i32 0)>; // NotAtomic = 0 +def Ordering_relaxed : PatLeaf<(i32 2)>; // Relaxed = 2 +def Ordering_acquire : PatLeaf<(i32 4)>; // Acquire = 4 +def Ordering_release : PatLeaf<(i32 5)>; // Release = 5 +def Ordering_acquire_release : PatLeaf<(i32 6)>; // AcquireRelease = 6 +def Ordering_sequentially_consistent : PatLeaf<(i32 7)>; // SequentiallyConsistent = 7 +def Ordering_volatile : PatLeaf<(i32 8)>; // Volatile = 8 +def Ordering_relaxed_mmio : PatLeaf<(i32 9)>; // RelaxedMMIO = 9 + + // A node that will be replaced with the current PTX version. 
class PTX { SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{ @@ -1007,24 +1047,6 @@ class F_MATH_3<string OpcStr, NVPTXRegClass t_regclass, // MISC // -class PRMT3Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode> - : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c), - (PRMT_B32rrr $a, $b, $c, prmt_mode)>; - -class PRMT2Pat<Intrinsic prmt_intrinsic, PatLeaf prmt_mode> - : Pat<(prmt_intrinsic i32:$a, i32:$c), - (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>; - -def : PRMT3Pat<int_nvvm_prmt, PrmtNONE>; -def : PRMT3Pat<int_nvvm_prmt_f4e, PrmtF4E>; -def : PRMT3Pat<int_nvvm_prmt_b4e, PrmtB4E>; - -def : PRMT2Pat<int_nvvm_prmt_rc8, PrmtRC8>; -def : PRMT2Pat<int_nvvm_prmt_ecl, PrmtECL>; -def : PRMT2Pat<int_nvvm_prmt_ecr, PrmtECR>; -def : PRMT2Pat<int_nvvm_prmt_rc16, PrmtRC16>; - - def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32", [(int_nvvm_nanosleep imm:$i)]>, Requires<[hasPTX<63>, hasSM<70>]>; @@ -1860,35 +1882,50 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str, } } -// has 3 operands -multiclass F_ATOMIC_3<RegTyInfo t, string sem_str, string as_str, string op_str, - SDPatternOperator op, list<Predicate> preds> { - defvar asm_str = "atom" # sem_str # as_str # "." 
# op_str; +multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode atomic> { + defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str; + let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, - Requires<preds>; + def _rr : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ir : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.RC:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, - Requires<preds>; + def _ir : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.RC:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ri : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.RC:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, - Requires<preds>; + def _ri : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.RC:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; - def ii : BasicNVPTXInst<(outs t.RC:$dst), - (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), - asm_str, - [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, - Requires<preds>; + def _ii : BasicFlagsNVPTXInst<(outs t.RC:$dst), + (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp), + asm_str>; } + + defvar GetSem = SDNodeXForm<atomic, [{ + return getI32Imm(getMemOrder(cast<MemSDNode>(N)), SDLoc(N)); + }]>; + + defvar GetScope = SDNodeXForm<atomic, [{ + return getI32Imm(getAtomicScope(cast<MemSDNode>(N)), SDLoc(N)); + }]>; + + defvar GetAddSp = SDNodeXForm<atomic, [{ + return getI32Imm(getAddrSpace(cast<MemSDNode>(N)), SDLoc(N)); + }]>; + + def : Pat<(op:$this addr:$addr, 
t.Ty:$b, t.Ty:$c), + (!cast<Instruction>(NAME # _rr) ADDR:$addr, t.Ty:$b, t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c), + (!cast<Instruction>(NAME # _ir) ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)), + (!cast<Instruction>(NAME # _ri) ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; + + def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)), + (!cast<Instruction>(NAME # _ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>; } multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, list<Predicate> preds = []> { @@ -1899,14 +1936,6 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>; } -multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> { - defvar frag_pat = (frag node:$a, node:$b, node:$c); - defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>; - defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>; - defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>; - defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>; -} - // atom_add defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS<I32RT, atomic_load_add_i32, "add.u32">; defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS<I64RT, atomic_load_add_i64, "add.u64">; @@ -1951,23 +1980,12 @@ defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS<I64RT, atomic_load_xor_i64, "xor.b64", // Define atom.cas for all combinations of size x addrspace x memory order // 
supported in PTX *and* on the hardware. -foreach t = [I32RT, I64RT] in { - foreach order = ["acquire", "release", "acq_rel", "monotonic"] in { - defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order); - defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size#_#order); - // Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it. - // Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions- - // for SM70+, and "old" ones which lower to "atom.cas", for earlier archs. - defm INT_PTX_ATOM_CAS_#t.Size#_#order - : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, cas_order_string, "cas.b"#t.Size, [hasSM<70>, hasPTX<63>]>; - defm INT_PTX_ATOM_CAS_#t.Size#_#order#_old - : F_ATOMIC_3_AS<t, atomic_cmp_swap_pat, "", "cas.b"#t.Size, []>; - } +foreach t = [I16RT, I32RT, I64RT] in { + defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#t.Size); + defm INT_PTX_ATOM_CAS_#t.Size + : F_ATOMIC_3<t, ".cas.b"#t.Size, atomic_cmp_swap_pat, atomic_cmp_swap>; } -// Note that 16-bit CAS support in PTX is emulated. -defm INT_PTX_ATOM_CAS_16 : F_ATOMIC_3_AS<I16RT, atomic_cmp_swap_i16, "", "cas.b16", [hasSM<70>, hasPTX<63>]>; - // Support for scoped atomic operations. Matches // int_nvvm_atomic_{op}_{space}_{type}_{scope} // and converts it into the appropriate instruction. @@ -1991,19 +2009,6 @@ multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr, # !if(!empty(ScopeStr), "", "_" # ScopeStr)), preds = Preds>; } -multiclass ATOM3N_impl<string OpStr, string IntTypeStr, string TypeStr, - string ScopeStr, string SpaceStr, - RegTyInfo t, list<Predicate> Preds> { - defm "" : F_ATOMIC_3<t, - as_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr), - sem_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr), - op_str = OpStr # "." 
# TypeStr, - op = !cast<Intrinsic>( - "int_nvvm_atomic_" # OpStr - # "_" # SpaceStr # "_" # IntTypeStr - # !if(!empty(ScopeStr), "", "_" # ScopeStr)), - preds = Preds>; -} // Constructs variants for different scopes of atomic op. multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr, @@ -2018,15 +2023,22 @@ multiclass ATOM2S_impl<string OpStr, string IntTypeStr, string TypeStr, } } } -multiclass ATOM3S_impl<string OpStr, string IntTypeStr, string TypeStr, - RegTyInfo t, list<Predicate> Preds> { - // No need to define ".gpu"-scoped atomics. They do the same thing - // as the regular, non-scoped atomics defined elsewhere. + +multiclass F_ATOMIC_3_INTRINSIC_PATTERN<RegTyInfo t, string OpStr, string InstructionName> { foreach scope = ["cta", "sys"] in { - // For now we only need variants for generic space pointers. foreach space = ["gen"] in { - defm _#scope#space : ATOM3N_impl<OpStr, IntTypeStr, TypeStr, scope, space, - t, !listconcat(Preds, [hasAtomScope])>; + defvar intrinsic = !cast<SDPatternOperator>("int_nvvm_atomic_" # OpStr # "_" # space # "_i_" # scope); + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, t.Ty:$c)), + (!cast<Instruction>(InstructionName # "_rr") ADDR:$addr, t.Ty:$b, t.Ty:$c, Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c)), + (!cast<Instruction>(InstructionName # "_ir") ADDR:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c, Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c))), + (!cast<Instruction>(InstructionName # "_ri") ADDR:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>; + + def : Pat<(t.Ty (intrinsic addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c))), + (!cast<Instruction>(InstructionName # "_ii") ADDR:$addr, (t.Ty 
t.ImmNode:$b), (t.Ty t.ImmNode:$c), Ordering_not_atomic, !cast<PatLeaf>("Scope_" # scope), !cast<PatLeaf>("AddrSpace_" # space))>; } } } @@ -2069,9 +2081,9 @@ multiclass ATOM2_incdec_impl<string OpStr> { // atom.cas multiclass ATOM3_cas_impl<string OpStr> { - defm _b16 : ATOM3S_impl<OpStr, "i", "b16", I16RT, []>; - defm _b32 : ATOM3S_impl<OpStr, "i", "b32", I32RT, []>; - defm _b64 : ATOM3S_impl<OpStr, "i", "b64", I64RT, []>; + defm _b16 : F_ATOMIC_3_INTRINSIC_PATTERN<I16RT, OpStr, "INT_PTX_ATOM_CAS_16">; + defm _b32 : F_ATOMIC_3_INTRINSIC_PATTERN<I32RT, OpStr, "INT_PTX_ATOM_CAS_32">; + defm _b64 : F_ATOMIC_3_INTRINSIC_PATTERN<I64RT, OpStr, "INT_PTX_ATOM_CAS_64">; } defm INT_PTX_SATOM_ADD : ATOM2_add_impl<"add">; @@ -2137,7 +2149,7 @@ def LDU_GLOBAL_v4i32 : VLDU_G_ELE_V4<"b32", B32>; // during the lifetime of the kernel. class LDG_G<NVPTXRegClass regclass> - : NVPTXInst<(outs regclass:$result), (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + : NVPTXInst<(outs regclass:$result), (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.${Sign:sign}$fromWidth \t$result, [$src];", []>; def LD_GLOBAL_NC_i8 : LDG_G<B16>; @@ -2150,19 +2162,19 @@ def LD_GLOBAL_NC_i64 : LDG_G<B64>; // Elementized vector ldg class VLDG_G_ELE_V2<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v2.${Sign:sign}$fromWidth \t{{$dst1, $dst2}}, [$src];", []>; class VLDG_G_ELE_V4<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v4.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; class VLDG_G_ELE_V8<NVPTXRegClass regclass> : NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, 
regclass:$dst7, regclass:$dst8), - (ins LdStCode:$Sign, i32imm:$fromWidth, ADDR:$src), + (ins AtomicCode:$Sign, i32imm:$fromWidth, ADDR:$src), "ld.global.nc.v8.${Sign:sign}$fromWidth \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 88d3eefcc521..4eb452f39822 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -165,6 +165,8 @@ inline std::string ScopeToString(Scope S) { return "Cluster"; case Scope::Device: return "Device"; + case Scope::DefaultDevice: + return "DefaultDevice"; } report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".", static_cast<ScopeUnderlyingType>(S))); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 53312e36fb9d..a5d3be40c5cf 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -96,7 +96,7 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup, // determine the type of the relocation unsigned Type = 0; if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unimplemented"); case PPC::fixup_ppc_br24: @@ -173,8 +173,9 @@ unsigned PPCELFObjectWriter::getRelocType(const MCFixup &Fixup, break; } } else { - switch (Fixup.getTargetKind()) { - default: llvm_unreachable("invalid fixup kind!"); + switch (Fixup.getKind()) { + default: + llvm_unreachable("invalid fixup kind!"); case PPC::fixup_ppc_br24abs: Type = ELF::R_PPC_ADDR24; break; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp index ee99cfc7d655..2dbc31fce72c 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp 
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp @@ -155,11 +155,10 @@ void PPCELFStreamer::emitGOTToPCRelReloc(const MCInst &Inst) { const MCExpr *SubExpr2 = MCBinaryExpr::createSub(CurrentLocationExpr, SubExpr, getContext()); - MCDataFragment *DF = static_cast<MCDataFragment *>(LabelSym->getFragment()); - assert(DF && "Expecting a valid data fragment."); - MCFixupKind FixupKind = static_cast<MCFixupKind>(FirstLiteralRelocationKind + - ELF::R_PPC64_PCREL_OPT); - DF->addFixup(MCFixup::create(LabelSym->getOffset() - 8, SubExpr2, FixupKind)); + MCFragment *F = LabelSym->getFragment(); + F->addFixup( + MCFixup::create(LabelSym->getOffset() - 8, SubExpr2, + FirstLiteralRelocationKind + ELF::R_PPC64_PCREL_OPT)); emitLabel(CurrentLocation, Inst.getLoc()); } diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 1521ad5f4502..a091b21f4a79 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -2425,8 +2425,7 @@ void PPCAIXAsmPrinter::emitTracebackTable() { // Set the 4th byte of the mandatory field. FirstHalfOfMandatoryField |= TracebackTable::IsFunctionNamePresentMask; - const PPCRegisterInfo *RegInfo = - static_cast<const PPCRegisterInfo *>(Subtarget->getRegisterInfo()); + const PPCRegisterInfo *RegInfo = Subtarget->getRegisterInfo(); Register FrameReg = RegInfo->getFrameRegister(*MF); if (FrameReg == (Subtarget->isPPC64() ? 
PPC::X31 : PPC::R31)) FirstHalfOfMandatoryField |= TracebackTable::IsAllocaUsedMask; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 66f4aade380f..a143d85f61ec 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -1199,6 +1199,14 @@ public: addExpr(Inst, getImm(), isRV64Imm()); } + void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + int64_t Imm; + [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm); + assert(IsConstant); + Inst.addOperand(MCOperand::createImm(SignExtend64<10>(Imm))); + } + void addFPImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); if (isImm()) { @@ -1650,6 +1658,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidSImm26: return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 25), (1 << 25) - 1); + // HACK: See comment before `BareSymbolQC_E_LI` in RISCVInstrInfoXqci.td. 
+ case Match_InvalidBareSymbolQC_E_LI: + LLVM_FALLTHROUGH; + // END HACK case Match_InvalidBareSImm32: return generateImmOutOfRangeError(Operands, ErrorInfo, std::numeric_limits<int32_t>::min(), diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index b723958a6ff2..fa7bcfa0e813 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -774,7 +774,8 @@ static constexpr FeatureBitset XTHeadGroup = { RISCV::FeatureVendorXTHeadVdot}; static constexpr FeatureBitset XAndesGroup = { - RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesVBFHCvt, + RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesBFHCvt, + RISCV::FeatureVendorXAndesVBFHCvt, RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 89a87798d71e..f76f8b3060d2 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -76,12 +76,13 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_riscv_branch", 0, 32, 0}, {"fixup_riscv_rvc_jump", 2, 11, 0}, {"fixup_riscv_rvc_branch", 0, 16, 0}, + {"fixup_riscv_rvc_imm", 0, 16, 0}, {"fixup_riscv_call", 0, 64, 0}, {"fixup_riscv_call_plt", 0, 64, 0}, {"fixup_riscv_qc_e_branch", 0, 48, 0}, {"fixup_riscv_qc_e_32", 16, 32, 0}, - {"fixup_riscv_qc_abs20_u", 12, 20, 0}, + {"fixup_riscv_qc_abs20_u", 0, 32, 0}, {"fixup_riscv_qc_e_call_plt", 0, 48, 0}, // Andes fixups @@ -103,12 +104,13 @@ MCFixupKindInfo RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { return Infos[Kind - FirstTargetFixupKind]; } -bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool 
RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &, uint64_t Value, bool Resolved) const { int64_t Offset = int64_t(Value); - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); // Return true if the symbol is unresolved. if (!Resolved) @@ -134,6 +136,10 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, // For jump instructions the immediate must be in the range // [-1048576, 1048574] return Offset > 1048574 || Offset < -1048576; + case RISCV::fixup_riscv_rvc_imm: + // This fixup can never be emitted as a relocation, so always needs to be + // relaxed. + return true; } } @@ -152,6 +158,18 @@ static unsigned getRelaxedOpcode(unsigned Opcode, ArrayRef<MCOperand> Operands, // This only relaxes one "step" - i.e. from C.J to JAL, not from C.J to // QC.E.J, because we can always relax again if needed. return RISCV::JAL; + case RISCV::C_LI: + if (!STI.hasFeature(RISCV::FeatureVendorXqcili)) + break; + // We only need this because `QC.E.LI` can be compressed into a `C.LI`. This + // happens because the `simm6` MCOperandPredicate accepts bare symbols, and + // `QC.E.LI` is the only instruction that accepts bare symbols at parse-time + // and compresses to `C.LI`. `C.LI` does not itself accept bare symbols at + // parse time. + // + // If we have a bare symbol, we need to turn this back to a `QC.E.LI`, as we + // have no way to emit a relocation on a `C.LI` instruction. + return RISCV::QC_E_LI; case RISCV::JAL: { // We can only relax JAL if we have Xqcilb if (!STI.hasFeature(RISCV::FeatureVendorXqcilb)) @@ -240,6 +258,23 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, Res.addOperand(Inst.getOperand(1)); break; } + case RISCV::C_LI: { + // This should only be hit when trying to relax a `C.LI` into a `QC.E.LI` + // because the `C.LI` has a bare symbol. We cannot use + // `RISCVRVC::uncompress` because it will use decompression patterns. 
The + // `QC.E.LI` compression pattern to `C.LI` is compression-only (because we + // don't want `c.li` ever printed as `qc.e.li`, which might be done if the + // pattern applied to decompression), but that doesn't help much because + // `C.LI` with a bare symbol will decompress to an `ADDI` anyway (because + // `simm12`'s MCOperandPredicate accepts a bare symbol and that pattern + // comes first), and we still cannot emit an `ADDI` with a bare symbol. + assert(STI.hasFeature(RISCV::FeatureVendorXqcili) && + "C.LI is only relaxable with Xqcili"); + Res.setOpcode(getRelaxedOpcode(Inst.getOpcode(), Inst.getOperands(), STI)); + Res.addOperand(Inst.getOperand(0)); + Res.addOperand(Inst.getOperand(1)); + break; + } case RISCV::BEQ: case RISCV::BNE: case RISCV::BLT: @@ -267,14 +302,14 @@ void RISCVAsmBackend::relaxInstruction(MCInst &Inst, Inst = std::move(Res); } -bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, +bool RISCVAsmBackend::relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const { MCContext &C = getContext(); - int64_t LineDelta = DF.getLineDelta(); - const MCExpr &AddrDelta = DF.getAddrDelta(); + int64_t LineDelta = F.getDwarfLineDelta(); + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector<MCFixup, 1> Fixups; - size_t OldSize = DF.getContents().size(); + size_t OldSize = F.getVarSize(); int64_t Value; [[maybe_unused]] bool IsAbsolute = @@ -327,17 +362,16 @@ bool RISCVAsmBackend::relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, OS << uint8_t(dwarf::DW_LNS_copy); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; } -bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const { - const MCExpr &AddrDelta = DF.getAddrDelta(); +bool RISCVAsmBackend::relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const { + const MCExpr &AddrDelta = F.getDwarfAddrDelta(); SmallVector<MCFixup, 2> Fixups; - size_t OldSize 
= DF.getContents().size(); + size_t OldSize = F.getVarSize(); int64_t Value; if (AddrDelta.evaluateAsAbsolute(Value, *Asm)) @@ -349,9 +383,9 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, assert(getContext().getAsmInfo()->getMinInstAlignment() == 1 && "expected 1-byte alignment"); if (Value == 0) { - DF.clearContents(); - DF.clearFixups(); - WasRelaxed = OldSize != DF.getContents().size(); + F.clearVarContents(); + F.clearVarFixups(); + WasRelaxed = OldSize != 0; return true; } @@ -382,20 +416,20 @@ bool RISCVAsmBackend::relaxDwarfCFA(MCDwarfCallFrameFragment &DF, } else { llvm_unreachable("unsupported CFA encoding"); } - DF.setContents(Data); - DF.setFixups(Fixups); + F.setVarContents(Data); + F.setVarFixups(Fixups); WasRelaxed = OldSize != Data.size(); return true; } -std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCLEBFragment &LF, +std::pair<bool, bool> RISCVAsmBackend::relaxLEB128(MCFragment &LF, int64_t &Value) const { - if (LF.isSigned()) + if (LF.isLEBSigned()) return std::make_pair(false, false); - const MCExpr &Expr = LF.getValue(); + const MCExpr &Expr = LF.getLEBValue(); if (ULEB128Reloc) { - LF.addFixup(MCFixup::create(0, &Expr, FK_Data_leb128)); + LF.setVarFixups({MCFixup::create(0, &Expr, FK_Data_leb128)}); } return std::make_pair(Expr.evaluateKnownAbsolute(Value, *Asm), false); } @@ -440,7 +474,7 @@ bool RISCVAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count, static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, MCContext &Ctx) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: llvm_unreachable("Unknown fixup kind!"); case FK_Data_1: @@ -539,10 +573,18 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, (Bit5 << 2); return Value; } + case RISCV::fixup_riscv_rvc_imm: { + if (!isInt<6>(Value)) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + unsigned Bit5 = (Value >> 5) & 0x1; + unsigned Bit4_0 = Value & 0x1f; + Value = (Bit5 << 12) | 
(Bit4_0 << 2); + return Value; + } case RISCV::fixup_riscv_qc_e_32: { if (!isInt<32>(Value)) Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); - return ((Value & 0xffffffff) << 16); + return Value & 0xffffffffu; } case RISCV::fixup_riscv_qc_abs20_u: { if (!isInt<20>(Value)) @@ -620,14 +662,13 @@ static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr, const MCSymbol *AUIPCSymbol = AUIPCLoc.getAddSym(); if (!AUIPCSymbol) return nullptr; - const auto *DF = dyn_cast_or_null<MCDataFragment>(AUIPCSymbol->getFragment()); - + const auto *DF = AUIPCSymbol->getFragment(); if (!DF) return nullptr; uint64_t Offset = AUIPCSymbol->getOffset(); if (DF->getContents().size() == Offset) { - DF = dyn_cast_or_null<MCDataFragment>(DF->getNext()); + DF = DF->getNext(); if (!DF) return nullptr; Offset = 0; @@ -636,7 +677,7 @@ static const MCFixup *getPCRelHiFixup(const MCSpecifierExpr &Expr, for (const MCFixup &F : DF->getFixups()) { if (F.getOffset() != Offset) continue; - auto Kind = F.getTargetKind(); + auto Kind = F.getKind(); if (!mc::isRelocation(F.getKind())) { if (Kind == RISCV::fixup_riscv_pcrel_hi20) { *DFOut = DF; @@ -664,7 +705,7 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &, const MCFixup *AUIPCFixup; const MCFragment *AUIPCDF; MCValue AUIPCTarget; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: // Use default handling for `Value` and `IsResolved`. 
return {}; @@ -703,14 +744,14 @@ std::optional<bool> RISCVAsmBackend::evaluateFixup(const MCFragment &, Value = Asm->getSymbolOffset(SA) + AUIPCTarget.getConstant(); Value -= Asm->getFragmentOffset(*AUIPCDF) + AUIPCFixup->getOffset(); - return AUIPCFixup->getTargetKind() == RISCV::fixup_riscv_pcrel_hi20 && + return AUIPCFixup->getKind() == RISCV::fixup_riscv_pcrel_hi20 && isPCRelFixupResolved(AUIPCTarget.getAddSym(), *AUIPCDF); } void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F, const MCFixup &Fixup) { StringRef VendorIdentifier; - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: // No Vendor Relocation Required. return; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h index 1f1a6f5fe31a..8c10fbec3c8f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h @@ -60,7 +60,8 @@ public: std::unique_ptr<MCObjectTargetWriter> createObjectTargetWriter() const override; - bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t, + bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &, + const MCValue &, uint64_t, bool) const override; std::optional<MCFixupKind> getFixupKind(StringRef Name) const override; @@ -72,11 +73,9 @@ public: void relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const override; - bool relaxDwarfLineAddr(MCDwarfLineAddrFragment &DF, - bool &WasRelaxed) const override; - bool relaxDwarfCFA(MCDwarfCallFrameFragment &DF, - bool &WasRelaxed) const override; - std::pair<bool, bool> relaxLEB128(MCLEBFragment &LF, + bool relaxDwarfLineAddr(MCFragment &F, bool &WasRelaxed) const override; + bool relaxDwarfCFA(MCFragment &F, bool &WasRelaxed) const override; + std::pair<bool, bool> relaxLEB128(MCFragment &LF, int64_t &Value) const override; bool writeNopData(raw_ostream &OS, uint64_t Count, diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index f41ad419db1a..7ad5d5f3118b 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -339,7 +339,6 @@ enum OperandType : unsigned { OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, OPERAND_SIMM10, - OPERAND_SIMM10_UNSIGNED, OPERAND_SIMM10_LSB0000_NONZERO, OPERAND_SIMM11, OPERAND_SIMM12, @@ -495,6 +494,17 @@ inline static bool isValidRoundingMode(unsigned Mode) { } } // namespace RISCVVXRndMode +namespace RISCVExceptFlags { +enum ExceptionFlag { + NX = 0x01, // Inexact + UF = 0x02, // Underflow + OF = 0x04, // Overflow + DZ = 0x08, // Divide by zero + NV = 0x10, // Invalid operation + ALL = 0x1F // Mask for all accrued exception flags +}; +} + //===----------------------------------------------------------------------===// // Floating-point Immediates // diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 8ab2c56ae317..9bf7896e1f1e 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -48,7 +48,7 @@ RISCVELFObjectWriter::~RISCVELFObjectWriter() = default; unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup, const MCValue &Target, bool IsPCRel) const { - unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); auto Spec = Target.getSpecifier(); switch (Spec) { case ELF::R_RISCV_TPREL_HI20: @@ -135,6 +135,9 @@ unsigned RISCVELFObjectWriter::getRelocType(const MCFixup &Fixup, return ELF::R_RISCV_LO12_I; case RISCV::fixup_riscv_lo12_s: return ELF::R_RISCV_LO12_S; + case RISCV::fixup_riscv_rvc_imm: + reportError(Fixup.getLoc(), "No relocation for CI-type instructions"); + return ELF::R_RISCV_NONE; case RISCV::fixup_riscv_qc_e_32: return ELF::R_RISCV_QC_E_32; case RISCV::fixup_riscv_qc_abs20_u: diff --git 
a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index c1cdf511fae5..f816561ccf3f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -40,12 +40,16 @@ enum Fixups { fixup_riscv_rvc_jump, // 8-bit fixup for symbol references in the compressed branch instruction fixup_riscv_rvc_branch, + // 6-bit fixup for symbol references in instructions like c.li + fixup_riscv_rvc_imm, // Fixup representing a legacy no-pic function call attached to the auipc // instruction in a pair composed of adjacent auipc+jalr instructions. fixup_riscv_call, // Fixup representing a function call attached to the auipc instruction in a // pair composed of adjacent auipc+jalr instructions. fixup_riscv_call_plt, + + // Qualcomm specific fixups // 12-bit fixup for symbol references in the 48-bit Xqcibi branch immediate // instructions fixup_riscv_qc_e_branch, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 2ed7cd9f008a..cbeabdddb937 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -650,6 +650,8 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_rvc_jump; } else if (MIFrm == RISCVII::InstFormatCB) { FixupKind = RISCV::fixup_riscv_rvc_branch; + } else if (MIFrm == RISCVII::InstFormatCI) { + FixupKind = RISCV::fixup_riscv_rvc_imm; } else if (MIFrm == RISCVII::InstFormatI) { FixupKind = RISCV::fixup_riscv_12_i; } else if (MIFrm == RISCVII::InstFormatQC_EB) { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp index f66c2d5f99cb..61ecfb278a7d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp +++ 
b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include <bitset> #define GET_INSTRINFO_MC_DESC @@ -305,6 +306,47 @@ public: } } + /// Returns (PLT virtual address, GOT virtual address) pairs for PLT entries. + std::vector<std::pair<uint64_t, uint64_t>> + findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents, + const MCSubtargetInfo &STI) const override { + uint32_t LoadInsnOpCode; + if (const Triple &T = STI.getTargetTriple(); T.isRISCV64()) + LoadInsnOpCode = 0x3003; // ld + else if (T.isRISCV32()) + LoadInsnOpCode = 0x2003; // lw + else + return {}; + + constexpr uint64_t FirstEntryAt = 32, EntrySize = 16; + if (PltContents.size() < FirstEntryAt + EntrySize) + return {}; + + std::vector<std::pair<uint64_t, uint64_t>> Results; + for (uint64_t EntryStart = FirstEntryAt, + EntryStartEnd = PltContents.size() - EntrySize; + EntryStart <= EntryStartEnd; EntryStart += EntrySize) { + const uint32_t AuipcInsn = + support::endian::read32le(PltContents.data() + EntryStart); + const bool IsAuipc = (AuipcInsn & 0x7F) == 0x17; + if (!IsAuipc) + continue; + + const uint32_t LoadInsn = + support::endian::read32le(PltContents.data() + EntryStart + 4); + const bool IsLoad = (LoadInsn & 0x707F) == LoadInsnOpCode; + if (!IsLoad) + continue; + + const uint64_t GotPltSlotVA = PltSectionVA + EntryStart + + (AuipcInsn & 0xFFFFF000) + + SignExtend64<12>(LoadInsn >> 20); + Results.emplace_back(PltSectionVA + EntryStart, GotPltSlotVA); + } + + return Results; + } + private: static bool maybeReturnAddress(MCRegister Reg) { // X1 is used for normal returns, X5 for returns from outlined functions. 
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index bf58226e0bd3..f9c0b54be7a2 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -179,7 +179,6 @@ def FeatureStdExtZicfiss def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">, AssemblerPredicate<(all_of FeatureStdExtZicfiss), "'Zicfiss' (Shadow stack)">; -def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">; def FeatureStdExtZilsd : RISCVExtension<1, 0, @@ -188,7 +187,6 @@ def FeatureStdExtZilsd def HasStdExtZilsd : Predicate<"Subtarget->hasStdExtZilsd()">, AssemblerPredicate<(all_of FeatureStdExtZilsd), "'Zilsd' (Load/Store pair instructions)">; -def NoHasStdExtZilsd : Predicate<"!Subtarget->hasStdExtZilsd()">; // Multiply Extensions @@ -1487,6 +1485,11 @@ def HasVendorXqcics : Predicate<"Subtarget->hasVendorXqcics()">, AssemblerPredicate<(all_of FeatureVendorXqcics), "'Xqcics' (Qualcomm uC Conditional Select Extension)">; +def NoVendorXqcics + : Predicate<"!Subtarget->hasVendorXqcics()">; + +def HasVendorXqcicsOrXqcicm + : Predicate<"Subtarget->hasVendorXqcics() || Subtarget->hasVendorXqcicm()">; def FeatureVendorXqcicsr : RISCVExperimentalExtension<0, 4, "Qualcomm uC CSR Extension">; @@ -1599,6 +1602,14 @@ def HasVendorXAndesPerf AssemblerPredicate<(all_of FeatureVendorXAndesPerf), "'XAndesPerf' (Andes Performance Extension)">; +def FeatureVendorXAndesBFHCvt + : RISCVExtension<5, 0, "Andes Scalar BFLOAT16 Conversion Extension", + [FeatureStdExtF]>; +def HasVendorXAndesBFHCvt + : Predicate<"Subtarget->hasVendorXAndesBFHCvt()">, + AssemblerPredicate<(all_of FeatureVendorXAndesBFHCvt), + "'XAndesBFHCvt' (Andes Scalar BFLOAT16 Conversion Extension)">; + def FeatureVendorXAndesVBFHCvt : RISCVExtension<5, 0, "Andes Vector BFLOAT16 Conversion Extension", [FeatureStdExtZve32f]>; diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 
a796c910bd44..23b455434900 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -95,6 +95,11 @@ static const std::pair<MCPhysReg, int8_t> FixedCSRFIQCIInterruptMap[] = { /* -21, -22, -23, -24 are reserved */ }; +/// Returns true if DWARF CFI instructions ("frame moves") should be emitted. +static bool needsDwarfCFI(const MachineFunction &MF) { + return MF.needsFrameMoves(); +} + // For now we use x3, a.k.a gp, as pointer to shadow call stack. // User should not use x3 in their asm. static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, @@ -141,6 +146,9 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameSetup); + if (!needsDwarfCFI(MF)) + return; + // Emit a CFI instruction that causes SlotSize to be subtracted from the value // of the shadow stack pointer when unwinding past this frame. char DwarfSCSReg = TRI->getDwarfRegNum(SCSPReg, /*IsEH*/ true); @@ -199,8 +207,10 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(SCSPReg) .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameDestroy); - // Restore the SCS pointer - CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); + if (needsDwarfCFI(MF)) { + // Restore the SCS pointer + CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); + } } // Insert instruction to swap mscratchsw with sp @@ -738,7 +748,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, MachineFunction &MF, uint64_t Offset, uint64_t RealStackSize, bool EmitCFI, bool NeedProbe, uint64_t ProbeSize, - bool DynAllocation) const { + bool DynAllocation, + MachineInstr::MIFlag Flag) const { DebugLoc DL; const RISCVRegisterInfo *RI = STI.getRegisterInfo(); const RISCVInstrInfo *TII = STI.getInstrInfo(); @@ -748,7 +759,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, // Simply allocate the stack if it's not 
big enough to require a probe. if (!NeedProbe || Offset <= ProbeSize) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Offset), - MachineInstr::FrameSetup, getStackAlign()); + Flag, getStackAlign()); if (EmitCFI) CFIBuilder.buildDefCFAOffset(RealStackSize); @@ -759,7 +770,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); } return; @@ -770,14 +781,13 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, uint64_t CurrentOffset = 0; while (CurrentOffset + ProbeSize <= Offset) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, - StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup, - getStackAlign()); + StackOffset::getFixed(-ProbeSize), Flag, getStackAlign()); // s[d|w] zero, 0(sp) BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); CurrentOffset += ProbeSize; if (EmitCFI) @@ -787,8 +797,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, uint64_t Residual = Offset - CurrentOffset; if (Residual) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, - StackOffset::getFixed(-Residual), MachineInstr::FrameSetup, - getStackAlign()); + StackOffset::getFixed(-Residual), Flag, getStackAlign()); if (EmitCFI) CFIBuilder.buildDefCFAOffset(Offset); @@ -798,7 +807,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); } } @@ -812,8 +821,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, Register TargetReg = RISCV::X6; // SUB TargetReg, SP, RoundedSize RI->adjustReg(MBB, MBBI, DL, TargetReg, SPReg, - StackOffset::getFixed(-RoundedSize), MachineInstr::FrameSetup, - getStackAlign()); + StackOffset::getFixed(-RoundedSize), Flag, getStackAlign()); if (EmitCFI) { // Set 
the CFA register to TargetReg. @@ -830,14 +838,14 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB, if (Residual) { RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual), - MachineInstr::FrameSetup, getStackAlign()); + Flag, getStackAlign()); if (DynAllocation) { // s[d|w] zero, 0(sp) BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW)) .addReg(RISCV::X0) .addReg(SPReg) .addImm(0) - .setMIFlags(MachineInstr::FrameSetup); + .setMIFlags(Flag); } } @@ -937,6 +945,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() + getUnmanagedCSI(MF, CSI).size()); CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + bool NeedsDwarfCFI = needsDwarfCFI(MF); // If libcalls are used to spill and restore callee-saved registers, the frame // has two sections; the opaque section managed by the libcalls, and the @@ -964,10 +973,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign()); RVFI->setLibCallStackSize(LibCallFrameSize); - CFIBuilder.buildDefCFAOffset(LibCallFrameSize); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + CFIBuilder.buildDefCFAOffset(LibCallFrameSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } // FIXME (note copied from Lanai): This appears to be overallocating. Needs @@ -998,14 +1009,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // could only be the next instruction. ++PossiblePush; - // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)` - // could be. The PUSH will also get its own CFI metadata for its own - // modifications, which should come after the PUSH. 
- CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, MachineInstr::FrameSetup); - PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); - for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) - PushCFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)` + // could be. The PUSH will also get its own CFI metadata for its own + // modifications, which should come after the PUSH. + CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, + MachineInstr::FrameSetup); + PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); + for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) + PushCFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } if (RVFI->isPushable(MF) && PossiblePush != MBB.end() && @@ -1019,10 +1033,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, PossiblePush->getOperand(1).setImm(StackAdj); StackSize -= StackAdj; - CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } // Allocate space on the stack if necessary. 
@@ -1033,8 +1049,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, bool DynAllocation = MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation(); if (StackSize != 0) - allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true, - NeedProbe, ProbeSize, DynAllocation); + allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, NeedsDwarfCFI, + NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); // Save SiFive CLIC CSRs into Stack emitSiFiveCLICPreemptibleSaves(MF, MBB, MBBI, DL); @@ -1050,8 +1067,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_offset // directives. - for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); // Generate new FP. 
if (hasFP(MF)) { @@ -1070,7 +1089,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); + if (NeedsDwarfCFI) + CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); } uint64_t SecondSPAdjustAmount = 0; @@ -1081,15 +1101,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, "SecondSPAdjustAmount should be greater than zero"); allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, - getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe, - ProbeSize, DynAllocation); + getStackSizeWithRVVPadding(MF), NeedsDwarfCFI && !hasFP(MF), + NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); } if (RVVStackSize) { if (NeedProbe) { allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize, - MachineInstr::FrameSetup, !hasFP(MF), - DynAllocation); + MachineInstr::FrameSetup, + NeedsDwarfCFI && !hasFP(MF), DynAllocation); } else { // We must keep the stack pointer aligned through any intermediate // updates. @@ -1098,14 +1119,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - if (!hasFP(MF)) { + if (NeedsDwarfCFI && !hasFP(MF)) { // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb". 
CFIBuilder.insertCFIInst(createDefCFAExpression( *RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8)); } std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); - emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); + if (NeedsDwarfCFI) + emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); } if (hasFP(MF)) { @@ -1172,8 +1194,9 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF, MachineInstr::FrameDestroy, getStackAlign()); StackSize = 0; - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) - .buildDefCFAOffset(CFAOffset); + if (needsDwarfCFI(MF)) + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) + .buildDefCFAOffset(CFAOffset); } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -1213,6 +1236,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn, MachineInstr::FrameDestroy); + bool NeedsDwarfCFI = needsDwarfCFI(MF); uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); uint64_t RealStackSize = FirstSPAdjustAmount ? 
FirstSPAdjustAmount @@ -1233,10 +1257,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getScalable(RVVStackSize), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) - CFIBuilder.buildDefCFA(SPReg, RealStackSize); - - emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); + if (NeedsDwarfCFI) { + if (!hasFP(MF)) + CFIBuilder.buildDefCFA(SPReg, RealStackSize); + emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); + } } if (FirstSPAdjustAmount) { @@ -1252,7 +1277,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getFixed(SecondSPAdjustAmount), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) + if (NeedsDwarfCFI && !hasFP(MF)) CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount); } @@ -1273,7 +1298,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, getStackAlign()); } - if (hasFP(MF)) + if (NeedsDwarfCFI && hasFP(MF)) CFIBuilder.buildDefCFA(SPReg, RealStackSize); // Skip to after the restores of scalar callee-saved registers @@ -1296,8 +1321,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, } // Recover callee-saved registers. 
- for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) - CFIBuilder.buildRestore(CS.getReg()); + if (NeedsDwarfCFI) + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) { // Use available stack adjustment in pop instruction to deallocate stack @@ -1316,15 +1342,17 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto NextI = next_nodbg(MBBI, MBB.end()); if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) { ++MBBI; - CFIBuilder.setInsertPoint(MBBI); + if (NeedsDwarfCFI) { + CFIBuilder.setInsertPoint(MBBI); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildRestore(CS.getReg()); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); - // Update CFA Offset. If this is a QCI interrupt function, there will be a - // leftover offset which is deallocated by `QC.C.MILEAVERET`, otherwise - // getQCIInterruptStackSize() will be 0. - CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize()); + // Update CFA Offset. If this is a QCI interrupt function, there will + // be a leftover offset which is deallocated by `QC.C.MILEAVERET`, + // otherwise getQCIInterruptStackSize() will be 0. + CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize()); + } } } @@ -1813,8 +1841,10 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( // allocateStack. 
bool DynAllocation = MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation(); - allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF), - /*NeedProbe=*/true, ProbeSize, DynAllocation); + allocateStack(MBB, MI, MF, -Amount, -Amount, + needsDwarfCFI(MF) && !hasFP(MF), + /*NeedProbe=*/true, ProbeSize, DynAllocation, + MachineInstr::NoFlags); } else { const RISCVRegisterInfo &RI = *STI.getRegisterInfo(); RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount), diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index d013755ce58a..6af63a4885f3 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -81,7 +81,8 @@ public: void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineFunction &MF, uint64_t Offset, uint64_t RealStackSize, bool EmitCFI, bool NeedProbe, - uint64_t ProbeSize, bool DynAllocation) const; + uint64_t ProbeSize, bool DynAllocation, + MachineInstr::MIFlag Flag) const; protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index c97b14a254cd..cfec46d23d65 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -689,10 +689,16 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { if (!isShiftedMask_32(C1) || isInt<12>(C1)) return false; + // INSBI will clobber the input register in N0. Bail out if we need a copy to + // preserve this value. + SDValue N0 = Node->getOperand(0); + if (!N0.hasOneUse()) + return false; + // If C1 is a shifted mask (but can't be formed as an ORI), // use a bitfield insert of -1. 
// Transform (or x, C1) - // -> (qc.insbi x, width, shift) + // -> (qc.insbi x, -1, width, shift) const unsigned Leading = llvm::countl_zero((uint32_t)C1); const unsigned Trailing = llvm::countr_zero((uint32_t)C1); const unsigned Width = 32 - Leading - Trailing; @@ -705,7 +711,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); - SDValue Ops[] = {CurDAG->getSignedTargetConstant(-1, DL, VT), + SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT), CurDAG->getTargetConstant(Width, DL, VT), CurDAG->getTargetConstant(Trailing, DL, VT)}; SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); @@ -2842,56 +2848,6 @@ static bool isWorthFoldingAdd(SDValue Add) { return true; } -bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, - unsigned MaxShiftAmount, - SDValue &Base, SDValue &Index, - SDValue &Scale) { - EVT VT = Addr.getSimpleValueType(); - auto UnwrapShl = [this, VT, MaxShiftAmount](SDValue N, SDValue &Index, - SDValue &Shift) { - uint64_t ShiftAmt = 0; - Index = N; - - if (N.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N.getOperand(1))) { - // Only match shifts by a value in range [0, MaxShiftAmount]. 
- if (N.getConstantOperandVal(1) <= MaxShiftAmount) { - Index = N.getOperand(0); - ShiftAmt = N.getConstantOperandVal(1); - } - } - - Shift = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), VT); - return ShiftAmt != 0; - }; - - if (Addr.getOpcode() == ISD::ADD) { - if (auto *C1 = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) { - SDValue AddrB = Addr.getOperand(0); - if (AddrB.getOpcode() == ISD::ADD && - UnwrapShl(AddrB.getOperand(0), Index, Scale) && - !isa<ConstantSDNode>(AddrB.getOperand(1)) && - isInt<12>(C1->getSExtValue())) { - // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) - SDValue C1Val = - CurDAG->getTargetConstant(C1->getZExtValue(), SDLoc(Addr), VT); - Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, - AddrB.getOperand(1), C1Val), - 0); - return true; - } - } else if (UnwrapShl(Addr.getOperand(0), Index, Scale)) { - Base = Addr.getOperand(1); - return true; - } else { - UnwrapShl(Addr.getOperand(1), Index, Scale); - Base = Addr.getOperand(0); - return true; - } - } - - return false; -} - bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) @@ -2908,7 +2864,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, if (CurDAG->isBaseWithConstantOffset(Addr)) { int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); - if (isInt<12>(CVal) && isInt<12>(CVal)) { + if (isInt<12>(CVal)) { Base = Addr.getOperand(0); if (Base.getOpcode() == RISCVISD::ADD_LO) { SDValue LoOperand = Base.getOperand(1); @@ -2942,8 +2898,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, // Handle ADD with large immediates. 
if (Addr.getOpcode() == ISD::ADD && isa<ConstantSDNode>(Addr.getOperand(1))) { int64_t CVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue(); - assert(!(isInt<12>(CVal) && isInt<12>(CVal)) && - "simm12 not already handled?"); + assert(!isInt<12>(CVal) && "simm12 not already handled?"); // Handle immediates in the range [-4096,-2049] or [2048, 4094]. We can use // an ADDI for part of the offset and fold the rest into the load/store. @@ -2984,12 +2939,11 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, return true; } -/// Similar to SelectAddrRegImm, except that the offset restricted for -/// unsinged nine bits. +/// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - if (SelectAddrFrameIndex(Addr, Base, Offset)) - return true; + // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only + // a 9-bit immediate can be folded. SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2999,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base)) - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); + // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only + // a 9-bit immediate can be folded. 
Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } @@ -3078,6 +3032,80 @@ bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, return true; } +bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, + unsigned MaxShiftAmount, + SDValue &Base, SDValue &Index, + SDValue &Scale) { + if (Addr.getOpcode() != ISD::ADD) + return false; + SDValue LHS = Addr.getOperand(0); + SDValue RHS = Addr.getOperand(1); + + EVT VT = Addr.getSimpleValueType(); + auto SelectShl = [this, VT, MaxShiftAmount](SDValue N, SDValue &Index, + SDValue &Shift) { + if (N.getOpcode() != ISD::SHL || !isa<ConstantSDNode>(N.getOperand(1))) + return false; + + // Only match shifts by a value in range [0, MaxShiftAmount]. + unsigned ShiftAmt = N.getConstantOperandVal(1); + if (ShiftAmt > MaxShiftAmount) + return false; + + Index = N.getOperand(0); + Shift = CurDAG->getTargetConstant(ShiftAmt, SDLoc(N), VT); + return true; + }; + + if (auto *C1 = dyn_cast<ConstantSDNode>(RHS)) { + // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) + if (LHS.getOpcode() == ISD::ADD && + !isa<ConstantSDNode>(LHS.getOperand(1)) && + isInt<12>(C1->getSExtValue())) { + if (SelectShl(LHS.getOperand(1), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(0), C1Val), + 0); + return true; + } + + // Add is commutative so we need to check both operands. + if (SelectShl(LHS.getOperand(0), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(1), C1Val), + 0); + return true; + } + } + + // Don't match add with constants. + // FIXME: Is this profitable for large constants that have 0s in the lower + // 12 bits that we can materialize with LUI? 
+ return false; + } + + // Try to match a shift on the RHS. + if (SelectShl(RHS, Index, Scale)) { + Base = LHS; + return true; + } + + // Try to match a shift on the LHS. + if (SelectShl(LHS, Index, Scale)) { + Base = RHS; + return true; + } + + Base = LHS; + Index = RHS; + Scale = CurDAG->getTargetConstant(0, SDLoc(Addr), VT); + return true; +} + bool RISCVDAGToDAGISel::SelectAddrRegReg(SDValue Addr, SDValue &Base, SDValue &Offset) { if (Addr.getOpcode() != ISD::ADD) @@ -3776,21 +3804,18 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, // Select a constant that can be represented as (sign_extend(imm5) << imm2). bool RISCVDAGToDAGISel::selectSimm5Shl2(SDValue N, SDValue &Simm5, SDValue &Shl2) { - if (auto *C = dyn_cast<ConstantSDNode>(N)) { - int64_t Offset = C->getSExtValue(); - unsigned Shift; - for (Shift = 0; Shift < 4; Shift++) - if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) - break; - - // Constant cannot be encoded. - if (Shift == 4) - return false; + auto *C = dyn_cast<ConstantSDNode>(N); + if (!C) + return false; - EVT Ty = N->getValueType(0); - Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), Ty); - Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), Ty); - return true; + int64_t Offset = C->getSExtValue(); + for (unsigned Shift = 0; Shift < 4; Shift++) { + if (isInt<5>(Offset >> Shift) && ((Offset % (1LL << Shift)) == 0)) { + EVT VT = N->getValueType(0); + Simm5 = CurDAG->getSignedTargetConstant(Offset >> Shift, SDLoc(N), VT); + Shl2 = CurDAG->getTargetConstant(Shift, SDLoc(N), VT); + return true; + } } return false; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 7c72d074a35b..4845a9c84e01 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -39,7 +39,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsRISCV.h" -#include 
"llvm/IR/PatternMatch.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/Support/CommandLine.h" @@ -129,7 +128,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtZfhmin()) addRegisterClass(MVT::f16, &RISCV::FPR16RegClass); - if (Subtarget.hasStdExtZfbfmin()) + if (Subtarget.hasStdExtZfbfmin() || Subtarget.hasVendorXAndesBFHCvt()) addRegisterClass(MVT::bf16, &RISCV::FPR16RegClass); if (Subtarget.hasStdExtF()) addRegisterClass(MVT::f32, &RISCV::FPR32RegClass); @@ -656,6 +655,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GET_FPENV, XLenVT, Custom); setOperationAction(ISD::SET_FPENV, XLenVT, Custom); setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom); + setOperationAction(ISD::GET_FPMODE, XLenVT, Custom); + setOperationAction(ISD::SET_FPMODE, XLenVT, Custom); + setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom); } setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool, @@ -8226,6 +8228,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSET_FPENV(Op, DAG); case ISD::RESET_FPENV: return lowerRESET_FPENV(Op, DAG); + case ISD::GET_FPMODE: + return lowerGET_FPMODE(Op, DAG); + case ISD::SET_FPMODE: + return lowerSET_FPMODE(Op, DAG); + case ISD::RESET_FPMODE: + return lowerRESET_FPMODE(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); case ISD::VP_MERGE: @@ -11969,7 +11977,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, // Store with unit-stride store and load it back with segmented load. MVT XLenVT = Subtarget.getXLenVT(); - SDValue VL = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget).second; + auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); SDValue Passthru = DAG.getUNDEF(ConcatVT); // Allocate a stack slot. 
@@ -11990,16 +11998,20 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op, MachineMemOperand::MOStore, LocationSize::beforeOrAfterPointer()); static const Intrinsic::ID VlsegIntrinsicsIds[] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, Intrinsic::riscv_vlseg4, - Intrinsic::riscv_vlseg5, Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; + Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask, + Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask, + Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, + Intrinsic::riscv_vlseg8_mask}; SDValue LoadOps[] = { Chain, DAG.getTargetConstant(VlsegIntrinsicsIds[Factor - 2], DL, XLenVT), Passthru, StackPtr, + Mask, VL, + DAG.getTargetConstant( + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC, DL, XLenVT), DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)}; unsigned Sz = @@ -12051,7 +12063,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, } MVT XLenVT = Subtarget.getXLenVT(); - SDValue VL = DAG.getRegister(RISCV::X0, XLenVT); + auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); // If the VT is larger than LMUL=8, we need to split and reassemble. 
if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) > @@ -12100,10 +12112,10 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex); static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8, + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask, }; unsigned Sz = @@ -12119,6 +12131,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op, DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT), StoredVal, StackPtr, + Mask, VL, DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)}; @@ -13998,6 +14011,52 @@ SDValue RISCVTargetLowering::lowerRESET_FPENV(SDValue Op, EnvValue); } +const uint64_t ModeMask64 = ~RISCVExceptFlags::ALL; +const uint32_t ModeMask32 = ~RISCVExceptFlags::ALL; + +SDValue RISCVTargetLowering::lowerGET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other); + SDValue Result = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo); + Chain = Result.getValue(1); + return DAG.getMergeValues({Result, Chain}, DL); +} + +SDValue RISCVTargetLowering::lowerSET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? 
ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue EnvValue = Op->getOperand(1); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + + EnvValue = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, EnvValue); + EnvValue = DAG.getNode(ISD::AND, DL, XLenVT, EnvValue, ModeMask); + Chain = DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo, + ModeMask); + return DAG.getNode(RISCVISD::SET_CSR, DL, MVT::Other, Chain, SysRegNo, + EnvValue); +} + +SDValue RISCVTargetLowering::lowerRESET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + + return DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo, + ModeMask); +} + SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -15032,10 +15091,15 @@ static SDValue combineBinOpToReduce(SDNode *N, SelectionDAG &DAG, // Optimize (add (shl x, c0), (shl y, c1)) -> // (SLLI (SH*ADD x, y), c0), if c1-c0 equals to [1|2|3]. +// or +// (SLLI (QC.SHLADD x, y, c1 - c0), c0), if 4 <= (c1-c0) <=31. static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { - // Perform this optimization only in the zba/xandesperf extension. - if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXAndesPerf()) + const bool HasStdExtZba = Subtarget.hasStdExtZba(); + const bool HasVendorXAndesPerf = Subtarget.hasVendorXAndesPerf(); + const bool HasVendorXqciac = Subtarget.hasVendorXqciac(); + // Perform this optimization only in the zba/xandesperf/xqciac extension. 
+ if (!HasStdExtZba && !HasVendorXAndesPerf && !HasVendorXqciac) return SDValue(); // Skip for vector types and larger types. @@ -15060,14 +15124,22 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, if (C0 <= 0 || C1 <= 0) return SDValue(); - // Skip if SH1ADD/SH2ADD/SH3ADD are not applicable. - int64_t Bits = std::min(C0, C1); int64_t Diff = std::abs(C0 - C1); - if (Diff != 1 && Diff != 2 && Diff != 3) + bool IsShXaddDiff = Diff == 1 || Diff == 2 || Diff == 3; + bool HasShXadd = HasStdExtZba || HasVendorXAndesPerf; + + // Skip if SH1ADD/SH2ADD/SH3ADD are not applicable. + if ((!IsShXaddDiff && HasShXadd && !HasVendorXqciac) || + (IsShXaddDiff && !HasShXadd && HasVendorXqciac)) + return SDValue(); + + // Skip if QC_SHLADD is not applicable. + if (Diff == 0 || Diff > 31) return SDValue(); // Build nodes. SDLoc DL(N); + int64_t Bits = std::min(C0, C1); SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 00e969056df7..e0a8c07b4206 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -429,7 +429,7 @@ public: bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; @@ -437,14 +437,12 @@ public: bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst 
*SI, ArrayRef<Value *> InterleaveValues) const override; - - bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveRes) const override; + Instruction *Store, Value *Mask, + ArrayRef<Value *> InterleaveValues) const override; bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef<Value *> InterleaveOps) const override; @@ -562,6 +560,9 @@ private: SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index b6b64b57b1b3..e23001a3a0bf 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -193,7 +193,9 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr, let AsmString = opcodestr # !if(!empty(argstr), "", "\t" # argstr); let Pattern = pattern; - let TSFlags{4-0} = format.Value; + InstFormat Format = format; + + let TSFlags{4-0} = Format.Value; // Defaults RISCVVConstraint RVVConstraint = NoConstraint; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 2723229859a5..64f9e3eb8d86 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2806,7 +2806,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, CASE_OPERAND_UIMM(7) CASE_OPERAND_UIMM(8) CASE_OPERAND_UIMM(9) - CASE_OPERAND_UIMM(10) + CASE_OPERAND_UIMM(10) CASE_OPERAND_UIMM(12) CASE_OPERAND_UIMM(16) CASE_OPERAND_UIMM(20) @@ 
-2823,6 +2823,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, case RISCVOp::OPERAND_UIMM5_NONZERO: Ok = isUInt<5>(Imm) && (Imm != 0); break; + case RISCVOp::OPERAND_UIMM5_GT3: + Ok = isUInt<5>(Imm) && (Imm > 3); + break; case RISCVOp::OPERAND_UIMM5_PLUS1: Ok = (isUInt<5>(Imm) && (Imm != 0)) || (Imm == 32); break; @@ -4809,6 +4812,8 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { return true; if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel) return true; + if (LHS.isImm() && LHS.getImm() == 0) + return true; if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) return false; if (!LHS.isImm() || !RHS.isImm()) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index f63531a0109b..653607827282 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -120,6 +120,20 @@ def riscv_swap_csr : RVSDNode<"SWAP_CSR", SDTCisInt<2>]>, [SDNPHasChain]>; +// Clear bits of CSR. The first operand is the address of the required CSR, +// the second is the bitmask of cleared bits. +def riscv_clear_csr : RVSDNode<"CLEAR_CSR", + SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisInt<1>]>, + [SDNPHasChain]>; + +// Set bits of CSR. The first operand is the address of the required CSR, +// the second is the bitmask of bits to set. +def riscv_set_csr : RVSDNode<"SET_CSR", + SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisInt<1>]>, + [SDNPHasChain]>; + // A read of the 64-bit counter CSR on a 32-bit target (returns (Lo, Hi)). // It takes a chain operand and another two target constant operands (the // CSR numbers of the low and high parts of the counter). 
@@ -2038,6 +2052,42 @@ class SwapSysRegImm<SysReg SR, list<Register> Regs> let Defs = Regs; } +class ClearSysReg<SysReg SR, list<Register> Regs> + : Pseudo<(outs), (ins GPR:$val), + [(riscv_clear_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>, + PseudoInstExpansion<(CSRRC X0, SR.Encoding, GPR:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class ClearSysRegImm<SysReg SR, list<Register> Regs> + : Pseudo<(outs), (ins uimm5:$val), + [(riscv_clear_csr (XLenVT SR.Encoding), uimm5:$val)]>, + PseudoInstExpansion<(CSRRCI X0, SR.Encoding, uimm5:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class SetSysReg<SysReg SR, list<Register> Regs> + : Pseudo<(outs), (ins GPR:$val), + [(riscv_set_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>, + PseudoInstExpansion<(CSRRS X0, SR.Encoding, GPR:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class SetSysRegImm<SysReg SR, list<Register> Regs> + : Pseudo<(outs), (ins uimm5:$val), + [(riscv_set_csr (XLenVT SR.Encoding), uimm5:$val)]>, + PseudoInstExpansion<(CSRRSI X0, SR.Encoding, uimm5:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + def ReadFRM : ReadSysReg<SysRegFRM, [FRM]>; let hasPostISelHook = 1 in { def WriteFRM : WriteSysReg<SysRegFRM, [FRM]>; @@ -2056,6 +2106,10 @@ let hasPostISelHook = 1 in { def ReadFCSR : ReadSysReg<SysRegFCSR, [FRM, FFLAGS]>; def WriteFCSR : WriteSysReg<SysRegFCSR, [FRM, FFLAGS]>; def WriteFCSRImm : WriteSysRegImm<SysRegFCSR, [FRM, FFLAGS]>; +def ClearFCSR : ClearSysReg<SysRegFCSR, [FRM, FFLAGS]>; +def ClearFCSRImm : ClearSysRegImm<SysRegFCSR, [FRM, FFLAGS]>; +def SetFCSR : SetSysReg<SysRegFCSR, [FRM, FFLAGS]>; +def SetFCSRImm : SetSysRegImm<SysRegFCSR, [FRM, FFLAGS]>; } /// Other pseudo-instructions diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index aa9e7b5635de..aef410fb4cc6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ 
b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -20,18 +20,22 @@ def simm10 : RISCVSImmLeafOp<10>; +def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> { + let RenderMethod = "addSImm10UnsignedOperands"; +} + // A 10-bit signed immediate allowing range [-512, 1023] -// but will decode to [-512, 511]. +// but represented as [-512, 511]. def simm10_unsigned : RISCVOp { - let ParserMatchClass = SImmAsmOperand<10, "Unsigned">; + let ParserMatchClass = SImm10UnsignedAsmOperand; let EncoderMethod = "getImmOpValue"; let DecoderMethod = "decodeSImmOperand<10>"; - let OperandType = "OPERAND_SIMM10_UNSIGNED"; + let OperandType = "OPERAND_SIMM10"; let MCOperandPredicate = [{ int64_t Imm; if (!MCOp.evaluateAsConstantImm(Imm)) return false; - return isInt<10>(Imm) || isUInt<10>(Imm); + return isInt<10>(Imm); }]; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index ec38201cd28c..522081533644 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -348,6 +348,17 @@ class NDSRVInstSDGP<bits<3> funct3, string opcodestr> let mayStore = 1; } +class NDSRVInstBFHCvt<bits<7> funct7, bits<5> rs1val, DAGOperand rdty, + DAGOperand rs2ty, string opcodestr> + : RVInstR<funct7, 0b100, OPC_CUSTOM_2, (outs rdty:$rd), + (ins rs2ty:$rs2), opcodestr, "$rd, $rs2"> { + let rs1 = rs1val; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let mayRaiseFPException = 1; +} + class NDSRVInstVFPMAD<bits<6> funct6, string opcodestr> : RVInst<(outs VR:$vd), (ins VR:$vs2, FPR32:$rs1, VMaskOp:$vm), opcodestr # "." 
# "vf", "$vd, $rs1, $vs2$vm", [], InstFormatR>, @@ -631,6 +642,19 @@ def NDS_SDGP : NDSRVInstSDGP<0b111, "nds.sdgp">; } // Predicates = [HasVendorXAndesPerf, IsRV64] //===----------------------------------------------------------------------===// +// XAndesBFHCvt +//===----------------------------------------------------------------------===// + +let Predicates = [HasVendorXAndesBFHCvt] in { +def NDS_FCVT_S_BF16 : NDSRVInstBFHCvt<0b0000000, 0b00010, + FPR32, FPR16, "nds.fcvt.s.bf16">, + Sched<[WriteFCvtF16ToF32, ReadFCvtF16ToF32]>; +def NDS_FCVT_BF16_S : NDSRVInstBFHCvt<0b0000000, 0b00011, + FPR16, FPR32, "nds.fcvt.bf16.s">, + Sched<[WriteFCvtF32ToF16, ReadFCvtF32ToF16]>; +} + +//===----------------------------------------------------------------------===// // XAndesVBFHCvt //===----------------------------------------------------------------------===// @@ -743,6 +767,13 @@ def : Sh2AddPat<NDS_LEA_W_ZE>; def : Sh3AddPat<NDS_LEA_D_ZE>; } // Predicates = [HasVendorXAndesPerf, IsRV64] +let Predicates = [HasVendorXAndesBFHCvt] in { +def : Pat<(fpextend (bf16 FPR16:$rs)), + (NDS_FCVT_S_BF16 (bf16 FPR16:$rs))>; +def : Pat<(bf16 (fpround FPR32:$rs)), + (NDS_FCVT_BF16_S FPR32:$rs)>; +} // Predicates = [HasVendorXAndesBFHCvt] + let Predicates = [HasVendorXAndesVBFHCvt] in { defm PseudoNDS_VFWCVT_S_BF16 : VPseudoVWCVT_S_BF16; defm PseudoNDS_VFNCVT_BF16_S : VPseudoVNCVT_BF16_S; @@ -801,13 +832,13 @@ defm : VPatTernaryVD4DOT_VV<"int_riscv_nds_vd4dotsu", "PseudoNDS_VD4DOTSU", let Predicates = [HasShortForwardBranchOpt], hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, Constraints = "$dst = $falsev" in { def PseudoCCNDS_BFOS : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, uimmlog2xlen:$msb, uimmlog2xlen:$lsb), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]>; def PseudoCCNDS_BFOZ : Pseudo<(outs GPR:$dst), - (ins GPR:$lhs, GPR:$rhs, ixlenimm:$cc, + (ins GPR:$lhs, 
GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, uimmlog2xlen:$msb, uimmlog2xlen:$lsb), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 7cc7f380c3f6..c7cb6e237aea 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -845,10 +845,11 @@ let Predicates = [HasVendorXqcibi, IsRV32] in { let Predicates = [HasVendorXqcibm, IsRV32] in { let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def QC_INSBRI : QCIRVInstRI<0b1, simm11, "qc.insbri">; - def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd), - (ins simm5:$imm5, uimm5_plus1:$width, + def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt), "qc.insbi", "$rd, $imm5, $width, $shamt"> { + let Constraints = "$rd = $rd_wb"; bits<5> imm5; bits<5> shamt; bits<5> width; @@ -1336,6 +1337,22 @@ class QCISELECTIICCPat<CondCode Cond, QCISELECTIICC Inst> : Pat<(select (i32 (setcc (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs1), Cond)), simm5:$simm1, simm5:$simm2), (Inst GPRNoX0:$rd, GPRNoX0:$rs1, simm5:$simm1, simm5:$simm2)>; +class QCILICCPat<CondCode Cond, QCILICC Inst> + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), (XLenVT GPRNoX0:$rs2), Cond)), simm5:$simm, (XLenVT GPRNoX0:$rd)), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm)>; + +class QCILICCPatInv<CondCode Cond, QCILICC Inst> + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), (XLenVT GPRNoX0:$rs2), Cond)), (XLenVT GPRNoX0:$rd), simm5:$simm), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, GPRNoX0:$rs2, simm5:$simm)>; + +class QCILICCIPat<CondCode Cond, QCILICC Inst, DAGOperand InTyImm> + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), simm5:$simm, (XLenVT GPRNoX0:$rd)), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, simm5:$simm)>; + +class QCILICCIPatInv<CondCode Cond, QCILICC 
Inst, DAGOperand InTyImm> + : Pat<(select (XLenVT (setcc (XLenVT GPRNoX0:$rs1), InTyImm:$imm, Cond)), (XLenVT GPRNoX0:$rd), simm5:$simm), + (Inst GPRNoX0:$rd, GPRNoX0:$rs1, InTyImm:$imm, simm5:$simm)>; + // Match `riscv_brcc` and lower to the appropriate XQCIBI branch instruction. class BcciPat<CondCode Cond, QCIBranchInst_rii Inst, DAGOperand InTyImm> : Pat<(riscv_brcc (i32 GPRNoX0:$rs1), InTyImm:$rs2, Cond, bb:$imm12), @@ -1359,6 +1376,10 @@ class SelectQCbi<CondCode Cond, DAGOperand InTyImm, Pseudo OpNode > let Predicates = [HasVendorXqciac, IsRV32] in { def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>; +def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations @@ -1417,7 +1438,7 @@ def : PatGprNoX0GprNoX0<sshlsat, QC_SHLSAT>; /// Branches -let Predicates = [HasVendorXqcibi, IsRV32], AddedComplexity = 2 in { +let Predicates = [HasVendorXqcibi, IsRV32] in { def : BcciPat<SETEQ, QC_BEQI, simm5nonzero>; def : BcciPat<SETNE, QC_BNEI, simm5nonzero>; def : BcciPat<SETLT, QC_BLTI, simm5nonzero>; @@ -1445,7 +1466,7 @@ def : SelectQCbi<SETLT, simm16nonzero, Select_GPRNoX0_Using_CC_SImm16NonZero_QC> def : SelectQCbi<SETGE, simm16nonzero, Select_GPRNoX0_Using_CC_SImm16NonZero_QC>; def : SelectQCbi<SETULT, uimm16nonzero, Select_GPRNoX0_Using_CC_UImm16NonZero_QC>; def : SelectQCbi<SETUGE, uimm16nonzero, Select_GPRNoX0_Using_CC_UImm16NonZero_QC>; -} // let Predicates = [HasVendorXqcibi, IsRV32], AddedComplexity = 2 +} // let Predicates = [HasVendorXqcibi, IsRV32] let Predicates = [HasVendorXqcibm, IsRV32] in { def : Pat<(sext_inreg (i32 GPR:$rs1), i1), (QC_EXT GPR:$rs1, 1, 0)>; @@ -1484,12 +1505,46 @@ def : 
QCIMVCCPat <SETNE, QC_MVNE>; def : QCIMVCCPat <SETLT, QC_MVLT>; def : QCIMVCCPat <SETULT, QC_MVLTU>; -def : QCIMVCCIPat <SETEQ, QC_MVEQI, simm5>; -def : QCIMVCCIPat <SETNE, QC_MVNEI, simm5>; def : QCIMVCCIPat <SETLT, QC_MVLTI, simm5>; def : QCIMVCCIPat <SETULT, QC_MVLTUI, uimm5>; } +// Prioritize Xqcics over these patterns. +let Predicates = [HasVendorXqcicm, NoVendorXqcics, IsRV32] in { +def : QCIMVCCIPat <SETEQ, QC_MVEQI, simm5>; +def : QCIMVCCIPat <SETNE, QC_MVNEI, simm5>; +} + +let Predicates = [HasVendorXqcicli, HasVendorXqcicsOrXqcicm, IsRV32] in { +def : QCILICCPat <SETEQ, QC_LIEQ>; +def : QCILICCPat <SETNE, QC_LINE>; +def : QCILICCPat <SETLT, QC_LILT>; +def : QCILICCPat <SETGE, QC_LIGE>; +def : QCILICCPat <SETULT, QC_LILTU>; +def : QCILICCPat <SETUGE, QC_LIGEU>; + +def : QCILICCIPat <SETEQ, QC_LIEQI, simm5>; +def : QCILICCIPat <SETNE, QC_LINEI, simm5>; +def : QCILICCIPat <SETLT, QC_LILTI, simm5>; +def : QCILICCIPat <SETGE, QC_LIGEI, simm5>; +def : QCILICCIPat <SETULT, QC_LILTUI, uimm5>; +def : QCILICCIPat <SETUGE, QC_LIGEUI, uimm5>; + +def : QCILICCPatInv <SETNE, QC_LIEQ>; +def : QCILICCPatInv <SETEQ, QC_LINE>; +def : QCILICCPatInv <SETGE, QC_LILT>; +def : QCILICCPatInv <SETLT, QC_LIGE>; +def : QCILICCPatInv <SETUGE, QC_LILTU>; +def : QCILICCPatInv <SETULT, QC_LIGEU>; + +def : QCILICCIPatInv <SETNE, QC_LIEQI, simm5>; +def : QCILICCIPatInv <SETEQ, QC_LINEI, simm5>; +def : QCILICCIPatInv <SETGE, QC_LILTI, simm5>; +def : QCILICCIPatInv <SETLT, QC_LIGEI, simm5>; +def : QCILICCIPatInv <SETUGE, QC_LILTUI, uimm5>; +def : QCILICCIPatInv <SETULT, QC_LIGEUI, uimm5>; +} + let Predicates = [HasVendorXqcics, IsRV32] in { def : Pat<(select (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs2),(i32 GPRNoX0:$rs3)), (QC_SELECTNEI GPRNoX0:$rd, (i32 0), GPRNoX0:$rs2, GPRNoX0:$rs3)>; @@ -1498,12 +1553,8 @@ def : Pat<(select (i32 GPRNoX0:$rd), (i32 GPRNoX0:$rs2), simm5:$simm2), def : Pat<(select (i32 GPRNoX0:$rd), simm5:$simm2,(i32 GPRNoX0:$rs2)), (QC_SELECTIEQI GPRNoX0:$rd, (i32 0), 
GPRNoX0:$rs2, simm5:$simm2)>; -// Below AddedComplexity is added to prefer these conditional select instructions over -// conditional move instructions -let AddedComplexity = 1 in { def : QCISELECTCCIPat <SETEQ, QC_SELECTEQI>; def : QCISELECTCCIPat <SETNE, QC_SELECTNEI>; -} def : QCISELECTICCIPat <SETEQ, QC_SELECTIEQI>; def : QCISELECTICCIPat <SETNE, QC_SELECTINEI>; @@ -1634,6 +1685,24 @@ def : CompressPat<(QC_E_ADDAI X2, simm10_lsb0000nonzero:$imm), (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>; def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm), (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>; + +def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; +def : CompressPat<(QC_E_ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; +def : CompressPat<(QC_E_ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; +def : CompressPat<(QC_E_XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), + (XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; + +def : CompressPat<(QC_E_ADDAI GPRNoX0:$rd, simm12:$imm), + (ADDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : CompressPat<(QC_E_ANDAI GPRNoX0:$rd, simm12:$imm), + (ANDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : CompressPat<(QC_E_ORAI GPRNoX0:$rd, simm12:$imm), + (ORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm), + (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; } // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32] let Predicates = [HasVendorXqciac, IsRV32] in { @@ -1655,3 +1724,82 @@ def : CompressPat<(QC_E_BGEUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0 def : CompressPat<(QC_E_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12), (QC_BLTUI GPRNoX0:$rs1, uimm5nonzero:$imm5, bare_simm13_lsb0:$imm12)>; } // let isCompressOnly = true, Predicates = [HasVendorXqcibi, IsRV32] + +// 
HACKS +// ----- +// The reasons for needing the definitions below are long and quite annoying. I'm writing +// this so they are explained in-line, rather than anywhere else. +// +// Emitting an instruction to an object proceeds as: +// - Compression (in emitInstruction) +// - Emit to Binary Code + Fixups +// - Assembler Relaxation +// - Fixup evaluation/application +// - If relaxed, re-emitted to Binary + Fixups +// - Relocation generation from Fixups +// +// Unfortunately, the `QC.E.LI` -> `C.LI` compression pattern has an edge case that has +// caused crashes in the past. +// +// How the bug happens is: +// - QC.E.LI is parsed with a bare symbol, which is valid + expected, and can +// be handled by fixups/relocations. +// - Compression turns this into a `C.LI` because the `simm6` +// MCOperandPredicate accepts bare symbols. +// - Binary Code emission didn't know how to create a fixup for a CI-type +// instruction containing a bare symbol. +// +// The solution to the last bullet is that we added the `fixup_riscv_rvc_imm`, +// so that we could proceed past the last error, and then use Assembler Relaxation +// to turn the `C.LI` with a bare symbol back into a `QC.E.LI`. +// +// This is good enough for emitting objects, but doesn't work for emitting +// assembly. Emitting assembly is why we need the following Hacks. +// +// Emitting an instruction to assembly proceeds as: +// - Compression (in emitInstruction) +// - Decompression (in RISCVInstPrinter::printInst) +// - InstAliases are applied +// +// So in the case of `QC.E.LI` with a bare symbol, first it is compressed to +// `C.LI` with a bare symbol, and then it is decompressed to `ADDI` with a bare +// symbol for printing, which is printed via an alias as `li <reg>, <symbol>`. +// Both the decompression and the alias use the MCOperandPredicate from +// `simm12`, which accepts bare symbols. 
+// +// The problem here is that `li <reg>, <symbol>` fails to parse, because the +// parsers do not accept bare symbols, they only accept symbols with specifiers +// or immediates. +// +// Our solution is to add another alias, which will be prioritised above the +// `li` alias, but only when `qc.e.li` is available. We originally intended to +// use the `bare_symbol` Operand type, but this had no MCOperandPredicate, and +// adding one changed the error messages when parsing `qc.e.li` with a +// too-large constant. So instead, we add a new `AsmOperand` and `Operand` type, +// just for the alias, which parse just like a BareSymbol, but they +// have both an MCOperandPredicate, and the error message that corresponds to +// the existing one on `qc.e.li` for too-large immediates (which fail to parse +// as both an immediate, and a bare symbol). +// +// This is fairly unpleasant, but it's the least disruptive thing we can do +// and keeps all the hacks confined to the RISC-V backend code. + +def BareSymbolQC_E_LI : AsmOperandClass { + let Name = "BareSymbolQC_E_LI"; + let PredicateMethod = "isBareSymbol"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidBareSymbolQC_E_LI"; + let ParserMethod = "parseBareSymbol"; +} + +def hack_bare_symbol_qc_e_li : Operand<XLenVT> { + let ParserMatchClass = BareSymbolQC_E_LI; + let MCOperandPredicate = [{ + return MCOp.isExpr() && MCOp.isBareSymbolRef(); + }]; +} + +let Predicates = [HasVendorXqcili, IsRV32] in { +def : InstAlias<"qc.e.li $rd, $sym", (ADDI GPR:$rd, X0, hack_bare_symbol_qc_e_li:$sym), 3>; +} // Predicates = [HasVendorXqcili, IsRV32] +// END HACKS diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td index 878b85b14157..0723b2f568a7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZicbo.td @@ -41,6 +41,7 @@ class Prefetch_ri<bits<5> optype, string opcodestr> opcodestr, "${imm12}(${rs1})"> { let 
Inst{11-7} = 0b00000; let rs2 = optype; + let Format = InstFormatOther; // this does not follow the normal S format. } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index a6ff22c4b391..dd68a5556cdb 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -14,6 +14,7 @@ #include "RISCVISelLowering.h" #include "RISCVSubtarget.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -68,6 +69,89 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, Intrinsic::riscv_vlseg8_mask}; +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. 
+ uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_NUWMul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + +/// Do the common operand retrieval and validition required by the +/// routines below. +static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, + Instruction *I, Value *&Ptr, Value *&Mask, + Value *&VL, Align &Alignment) { + + IRBuilder<> Builder(I); + const DataLayout &DL = I->getDataLayout(); + ElementCount EC = VTy->getElementCount(); + if (auto *LI = dyn_cast<LoadInst>(I)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load"); + Mask = Builder.getAllOnesMask(EC); + VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC) + : Constant::getAllOnesValue(XLenTy); + return true; + } + if (auto *SI = dyn_cast<StoreInst>(I)) { + assert(SI->isSimple()); + Ptr = SI->getPointerOperand(); + Alignment = SI->getAlign(); + assert(!Mask && "Unexpected mask on a store"); + Mask = Builder.getAllOnesMask(EC); + VL = isa<FixedVectorType>(VTy) ? Builder.CreateElementCount(XLenTy, EC) + : Constant::getAllOnesValue(XLenTy); + return true; + } + auto *VPLdSt = cast<VPIntrinsic>(I); + assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load || + VPLdSt->getIntrinsicID() == Intrinsic::vp_store) && + "Unexpected intrinsic"); + Ptr = VPLdSt->getMemoryPointerParam(); + Alignment = VPLdSt->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); + + assert(Mask && "vp.load and vp.store needs a mask!"); + + Value *WideEVL = VPLdSt->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. 
+ if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + return true; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -81,21 +165,25 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); - - const DataLayout &DL = LI->getDataLayout(); + IRBuilder<> Builder(Load); + const DataLayout &DL = Load->getDataLayout(); auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; - auto *PtrTy = LI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + return false; // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -104,25 +192,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - Value *VL = Builder.getInt32(VTy->getNumElements()); - + Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); + // Note: Same VL as above, but i32 not xlen due to signature of + // vp.strided.load + VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + CI->addParamAttr(0, + Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -132,18 +218,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( return true; } -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - 
Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - /// Lower an interleaved store into a vssegN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): @@ -191,7 +265,8 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes); Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset); Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount()); - Value *VL = Builder.getInt32(VTy->getNumElements()); + Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_store, @@ -223,7 +298,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, // This VL should be OK (should be executable in one vsseg instruction, // potentially under larger LMULs) because we checked that the fixed vector // type fits in isLegalInterleavedAccessType - Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); + Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount()); Ops.append({SI->getPointerOperand(), StoreMask, VL}); @@ -233,58 +308,57 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, } bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const { - const unsigned Factor = DeinterleaveValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor > 8) return false; - assert(LI->isSimple()); - IRBuilder<> Builder(LI); + IRBuilder<> Builder(Load); - Value *FirstActive = - *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); - 
VectorType *ResVTy = cast<VectorType>(FirstActive->getType()); + VectorType *ResVTy = getDeinterleavedVectorType(DI); - const DataLayout &DL = LI->getDataLayout(); + const DataLayout &DL = Load->getDataLayout(); + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) + Value *Ptr, *VL; + Align Alignment; + if (!getMemOperands(Factor, ResVTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) return false; - Value *Return; - Type *PtrTy = LI->getPointerOperandType(); - Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(ResVTy, Factor, Alignment, AS, DL)) + return false; - if (auto *FVTy = dyn_cast<FixedVectorType>(ResVTy)) { - Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); + Value *Return; + if (isa<FixedVectorType>(ResVTy)) { Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {ResVTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + {ResVTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); } else { - static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, - Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5, - Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7, - Intrinsic::riscv_vlseg8}; - unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType()); unsigned NumElts = ResVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( - LI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(LI->getContext()), + Load->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), NumElts * SEW / 8), Factor); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + Load->getModule(), ScalableVlsegIntrIds[Factor - 2], 
+ {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); - Value *VL = Constant::getAllOnesValue(XLenTy); + Value *Operands[] = { + PoisonValue::get(VecTupTy), + Ptr, + Mask, + VL, + ConstantInt::get(XLenTy, + RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; - Value *Vlseg = Builder.CreateIntrinsic( - IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}, - {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); + CallInst *Vlseg = Builder.CreateCall(VlsegNFunc, Operands); SmallVector<Type *, 2> AggrTypes{Factor, ResVTy}; - Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes)); + Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); for (unsigned i = 0; i < Factor; ++i) { Value *VecExtract = Builder.CreateIntrinsic( Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}, @@ -293,217 +367,61 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } } - for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { - if (!DIV) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)}); - DIV->replaceAllUsesWith(NewEV); - } - + DI->replaceAllUsesWith(Return); return true; } bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef<Value *> InterleaveValues) const { + Instruction *Store, Value *Mask, ArrayRef<Value *> InterleaveValues) const { unsigned Factor = InterleaveValues.size(); if (Factor > 8) return false; - assert(SI->isSimple()); - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Store); auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType()); - auto *PtrTy = SI->getPointerOperandType(); - const DataLayout &DL = SI->getDataLayout(); + const DataLayout &DL = Store->getDataLayout(); + Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) + Value *Ptr, *VL; + Align Alignment; + if (!getMemOperands(Factor, InVTy, XLenTy, Store, Ptr, Mask, VL, Alignment)) + return false; + Type *PtrTy = Ptr->getType(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL)) return false; - Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - - if (auto *FVTy = dyn_cast<FixedVectorType>(InVTy)) { + if (isa<FixedVectorType>(InVTy)) { Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); - + Store->getModule(), FixedVssegIntrIds[Factor - 2], + {InVTy, PtrTy, XLenTy}); SmallVector<Value *, 10> Ops(InterleaveValues); - Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); - Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount()); - Ops.append({SI->getPointerOperand(), Mask, VL}); - + Ops.append({Ptr, Mask, VL}); Builder.CreateCall(VssegNFunc, Ops); - } else { - static const Intrinsic::ID IntrIds[] = { - Intrinsic::riscv_vsseg2, 
Intrinsic::riscv_vsseg3, - Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5, - Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7, - Intrinsic::riscv_vsseg8}; - - unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); - unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - SI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy}); - - Value *VL = Constant::getAllOnesValue(XLenTy); - - Value *StoredVal = PoisonValue::get(VecTupTy); - for (unsigned i = 0; i < Factor; ++i) - StoredVal = Builder.CreateIntrinsic( - Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, - {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); - - Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL, - ConstantInt::get(XLenTy, Log2_64(SEW))}); - } - - return true; -} - -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. - uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_c_Mul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); } + unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); + unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); + Type *VecTupTy = TargetExtType::get( + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), + NumElts * SEW / 8), + Factor); - return false; -} - -/// Lower an interleaved vp.load into a vlsegN intrinsic. -/// -/// E.g. 
Lower an interleaved vp.load (Factor = 2): -/// %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> } -/// @llvm.vector.deinterleave2.nxv64i8( -/// <vscale x 64 x i8> %l) -/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0 -/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1 -/// -/// Into: -/// %rvl = udiv %wide.rvl, 2 -/// %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> } -/// @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef, -/// <vscale x 32 x i8> undef, -/// ptr %ptr, -/// %mask, -/// i64 %rvl, -/// i64 1) -/// %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0 -/// %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1 -/// -/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be -/// removed by the caller -/// TODO: We probably can loosen the dependency on matching extractvalue when -/// dealing with factor of 2 (extractvalue is still required for most of other -/// factors though). 
-bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef<Value *> DeinterleaveResults) const { - const unsigned Factor = DeinterleaveResults.size(); - assert(Mask && "Expect a valid mask"); - assert(Load->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - - Value *FirstActive = *llvm::find_if(DeinterleaveResults, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast<VectorType>(FirstActive->getType()); - - auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Load); - - Value *WideEVL = Load->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. - if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Load->getArgOperand(0)->getType(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); - - Value *Return = nullptr; - if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) { - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {FVTy, PtrTy, XLenTy}, - {Load->getArgOperand(0), Mask, EVL}); - } else { - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); - - Value *PoisonVal = PoisonValue::get(VecTupTy); - - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), 
ScalableVlsegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = { - PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - - SmallVector<Type *, 8> AggrTypes{Factor, VTy}; - Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } + Value *StoredVal = PoisonValue::get(VecTupTy); + for (unsigned i = 0; i < Factor; ++i) + StoredVal = Builder.CreateIntrinsic( + Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}, + {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); - for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { - if (!DIO) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)}); - DIO->replaceAllUsesWith(NewEV); - } + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + Store->getModule(), ScalableVssegIntrIds[Factor - 2], + {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); + Value *Operands[] = {StoredVal, Ptr, Mask, VL, + ConstantInt::get(XLenTy, Log2_64(SEW))}; + Builder.CreateCall(VssegNFunc, Operands); return true; } @@ -557,15 +475,15 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( auto *PtrTy = Store->getArgOperand(1)->getType(); auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + Value *EVL = + Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) { + if (isa<FixedVectorType>(VTy)) { SmallVector<Value *, 8> Operands(InterleaveOperands); Operands.append({Store->getArgOperand(1), Mask, EVL}); Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2], - {FVTy, PtrTy, XLenTy}, Operands); + {VTy, PtrTy, XLenTy}, Operands); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index d257f56cf412..28d64031f8bc 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -123,7 +123,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, SmallSet<std::pair<const MachineInstr *, unsigned>, 4> Visited; SmallVector<std::pair<const MachineInstr *, unsigned>, 4> Worklist; - Worklist.push_back(std::make_pair(&OrigMI, OrigBits)); + Worklist.emplace_back(&OrigMI, OrigBits); while (!Worklist.empty()) { auto P = Worklist.pop_back_val(); @@ -158,7 +158,6 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::MULW: case RISCV::REMUW: case RISCV::REMW: - case 
RISCV::SLLIW: case RISCV::SLLW: case RISCV::SRAIW: case RISCV::SRAW: @@ -188,6 +187,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, if (Bits >= 32) break; return false; + case RISCV::SEXT_B: case RISCV::PACKH: if (Bits >= 8) @@ -213,7 +213,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, // as an N-Bit user. unsigned ShAmt = UserMI->getOperand(2).getImm(); if (Bits > ShAmt) { - Worklist.push_back(std::make_pair(UserMI, Bits - ShAmt)); + Worklist.emplace_back(UserMI, Bits - ShAmt); break; } return false; @@ -225,21 +225,29 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, unsigned ShAmt = UserMI->getOperand(2).getImm(); if (Bits >= (ST.getXLen() - ShAmt)) break; - Worklist.push_back(std::make_pair(UserMI, Bits + ShAmt)); + Worklist.emplace_back(UserMI, Bits + ShAmt); + break; + } + case RISCV::SLLIW: { + unsigned ShAmt = UserMI->getOperand(2).getImm(); + if (Bits >= 32 - ShAmt) + break; + Worklist.emplace_back(UserMI, Bits + ShAmt); break; } + case RISCV::ANDI: { uint64_t Imm = UserMI->getOperand(2).getImm(); if (Bits >= (unsigned)llvm::bit_width(Imm)) break; - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; } case RISCV::ORI: { uint64_t Imm = UserMI->getOperand(2).getImm(); if (Bits >= (unsigned)llvm::bit_width<uint64_t>(~Imm)) break; - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; } @@ -253,7 +261,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, break; return false; } - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; case RISCV::SRA: @@ -272,7 +280,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, // Operand 1 is implicitly zero extended. 
if (OpIdx == 1 && Bits >= 32) break; - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; case RISCV::BEXTI: @@ -320,13 +328,13 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::BSETI: case RISCV::BCLRI: case RISCV::BINVI: - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; case RISCV::BREV8: case RISCV::ORC_B: // BREV8 and ORC_B work on bytes. Round Bits down to the nearest byte. - Worklist.push_back(std::make_pair(UserMI, alignDown(Bits, 8))); + Worklist.emplace_back(UserMI, alignDown(Bits, 8)); break; case RISCV::PseudoCCMOVGPR: @@ -336,7 +344,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, // of operand 4 and 5 is used. if (OpIdx != 4 && OpIdx != 5) return false; - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; case RISCV::CZERO_EQZ: @@ -345,7 +353,7 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::VT_MASKCN: if (OpIdx != 1) return false; - Worklist.push_back(std::make_pair(UserMI, Bits)); + Worklist.emplace_back(UserMI, Bits); break; } } diff --git a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td index da0ceee0c084..5ef858a787c7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedAndes45.td +++ b/llvm/lib/Target/RISCV/RISCVSchedAndes45.td @@ -54,6 +54,12 @@ def : WriteRes<WriteShiftImm32, [Andes45ALU]>; def : WriteRes<WriteShiftReg, [Andes45ALU]>; def : WriteRes<WriteShiftReg32, [Andes45ALU]>; +// Short forward branch +def : WriteRes<WriteSFB, [Andes45ALU]> { + let Latency = 1; + let NumMicroOps = 2; +} + // Branching def : WriteRes<WriteJmp, [Andes45ALU]>; def : WriteRes<WriteJal, [Andes45ALU]>; @@ -231,6 +237,8 @@ def : ReadAdvance<ReadShiftImm, 0>; def : ReadAdvance<ReadShiftImm32, 0>; def : ReadAdvance<ReadShiftReg, 0>; def : ReadAdvance<ReadShiftReg32, 0>; +def : ReadAdvance<ReadSFBJmp, 0>; +def : ReadAdvance<ReadSFBALU, 0>; def 
: ReadAdvance<ReadJalr, 0>; def : ReadAdvance<ReadJmp, 0>; def : ReadAdvance<ReadIMul, 0>; @@ -328,7 +336,6 @@ def : ReadAdvance<ReadCSR, 0>; //===----------------------------------------------------------------------===// // Unsupported extensions defm : UnsupportedSchedQ; -defm : UnsupportedSchedSFB; defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbkb; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 05388f2d1311..3e286a754e4e 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,6 +13,17 @@ // //===----------------------------------------------------------------------===// +class SMX60IsWorstCaseMX<string mx, list<string> MxList> { + string LLMUL = LargestLMUL<MxList>.r; + bit c = !eq(mx, LLMUL); +} + +class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> { + string LLMUL = LargestLMUL<MxList>.r; + int SSEW = SmallestSEW<mx, isF>.r; + bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -44,6 +55,19 @@ let BufferSize = 0 in { // floating point instructions, this model assumes single issue as // increasing it reduces the gains we saw in performance def SMX60_FP : ProcResource<1>; + + // Vector pipeline + // Single issue for vector store/load instructions + def SMX60_VLS : ProcResource<1>; + + // The C908 user manual says: "Vector floating-point units support vector + // floating-point computation of different bits. In addition, vector integer + // units are added". 
Developer confirmed it's a separate VIEU + def SMX60_VIEU : ProcResource<1>; + + // The C908 user manual says: "The vector execution unit is developed by + // extending the floating-point unit", so let's assume single issue for now + def SMX60_VFP : ProcResource<1>; } //===----------------------------------------------------------------------===// @@ -232,9 +256,341 @@ let Latency = 4 in { def : WriteRes<WriteFMovI32ToF32, [SMX60_IEU]>; } +// 6. Configuration-Setting Instructions +def : WriteRes<WriteVSETVLI, [SMX60_IEUA]>; +def : WriteRes<WriteVSETIVLI, [SMX60_IEUA]>; +def : WriteRes<WriteVSETVL, [SMX60_IEUA]>; + +// 7. Vector Loads and Stores +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + // Unit-stride loads and stores + defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + + // Mask loads and stores + defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + + // Strided and indexed loads and stores + foreach eew = [8, 16, 32, 64] in { + defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + } +} + +// Segmented loads and stores +foreach mx = SchedMxList in { + foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + // Unit-stride segmented + defm "" : 
LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Strided/indexed segmented + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Indexed segmented + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + } + } +} + +// Whole register move/load/store +foreach LMul = [1, 2, 4, 8] in { + def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SMX60_VLS]>; + def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SMX60_VLS]>; + + def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; +} + +// 11. 
Vector Integer Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, 
IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Vector Integer Division and Remainder +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +// Narrowing Shift and Clips +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 12. 
Vector Fixed-Point Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 13. Vector Floating-Point Instructions +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, isF=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", 
[SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c; + + defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, 
IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Narrowing +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; + + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { + + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Vector Floating-Point Division and Square Root +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 14. 
Vector Reduction Operations +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListWRed in { + foreach sew = SchedSEWSet<mx, 0, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet<mx, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFWRed in { + foreach sew = SchedSEWSet<mx, 1, 1>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 15. Vector Mask Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 16. 
Vector Permutation Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; +} + +def : WriteRes<WriteVMovXS, [SMX60_VIEU]>; +def : WriteRes<WriteVMovSX, [SMX60_VIEU]>; + +def : WriteRes<WriteVMovFS, [SMX60_VIEU]>; +def : WriteRes<WriteVMovSF, [SMX60_VIEU]>; + +// Gather and Compress +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet<mx>.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; + + defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; +} + // Others def : WriteRes<WriteCSR, [SMX60_IEU]>; def : WriteRes<WriteNop, [SMX60_IEU]>; +def : WriteRes<WriteRdVLENB, [SMX60_IEUA]>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -341,10 +697,184 @@ def : ReadAdvance<ReadCLMUL, 0>; def : ReadAdvance<ReadSingleBit, 0>; def : ReadAdvance<ReadSingleBitImm, 0>; +// 6. Configuration-Setting Instructions +def : ReadAdvance<ReadVSETVLI, 0>; +def : ReadAdvance<ReadVSETVL, 0>; + +// 7. 
Vector Loads and Stores +def : ReadAdvance<ReadVLDX, 0>; +def : ReadAdvance<ReadVSTX, 0>; +defm "" : LMULReadAdvance<"ReadVSTEV", 0>; +defm "" : LMULReadAdvance<"ReadVSTM", 0>; +def : ReadAdvance<ReadVLDSX, 0>; +def : ReadAdvance<ReadVSTSX, 0>; +defm "" : LMULReadAdvance<"ReadVSTS8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS64V", 0>; +defm "" : LMULReadAdvance<"ReadVLDUXV", 0>; +defm "" : LMULReadAdvance<"ReadVLDOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTUXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>; +// LMUL Aware +def : ReadAdvance<ReadVST1R, 0>; +def : ReadAdvance<ReadVST2R, 0>; +def : ReadAdvance<ReadVST4R, 0>; +def : ReadAdvance<ReadVST8R, 0>; + +// 12. 
Vector Integer Arithmetic Instructions +defm : LMULReadAdvance<"ReadVIALUV", 0>; +defm : LMULReadAdvance<"ReadVIALUX", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUV", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUX", 0>; +defm : LMULReadAdvance<"ReadVExtV", 0>; +defm : LMULReadAdvance<"ReadVICALUV", 0>; +defm : LMULReadAdvance<"ReadVICALUX", 0>; +defm : LMULReadAdvance<"ReadVShiftV", 0>; +defm : LMULReadAdvance<"ReadVShiftX", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftV", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftX", 0>; +defm : LMULReadAdvance<"ReadVICmpV", 0>; +defm : LMULReadAdvance<"ReadVICmpX", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxV", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxX", 0>; +defm : LMULReadAdvance<"ReadVIMulV", 0>; +defm : LMULReadAdvance<"ReadVIMulX", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivV", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulX", 0>; +defm : LMULReadAdvance<"ReadVIMulAddV", 0>; +defm : LMULReadAdvance<"ReadVIMulAddX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>; +defm : LMULReadAdvance<"ReadVIMergeV", 0>; +defm : LMULReadAdvance<"ReadVIMergeX", 0>; +defm : LMULReadAdvance<"ReadVIMovV", 0>; +defm : LMULReadAdvance<"ReadVIMovX", 0>; + +// 13. Vector Fixed-Point Arithmetic Instructions +defm "" : LMULReadAdvance<"ReadVSALUV", 0>; +defm "" : LMULReadAdvance<"ReadVSALUX", 0>; +defm "" : LMULReadAdvance<"ReadVAALUV", 0>; +defm "" : LMULReadAdvance<"ReadVAALUX", 0>; +defm "" : LMULReadAdvance<"ReadVSMulV", 0>; +defm "" : LMULReadAdvance<"ReadVSMulX", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftV", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftX", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>; + +// 14. 
Vector Floating-Point Instructions +defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>; +defm "" : LMULReadAdvance<"ReadVFClassV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeF", 0>; +defm "" : LMULReadAdvance<"ReadVFMovF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>; +defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>; +defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>; +defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; + +// 15. 
Vector Reduction Operations +def : ReadAdvance<ReadVIRedV, 0>; +def : ReadAdvance<ReadVIRedV0, 0>; +def : ReadAdvance<ReadVIWRedV, 0>; +def : ReadAdvance<ReadVIWRedV0, 0>; +def : ReadAdvance<ReadVFRedV, 0>; +def : ReadAdvance<ReadVFRedV0, 0>; +def : ReadAdvance<ReadVFRedOV, 0>; +def : ReadAdvance<ReadVFRedOV0, 0>; +def : ReadAdvance<ReadVFWRedV, 0>; +def : ReadAdvance<ReadVFWRedV0, 0>; +def : ReadAdvance<ReadVFWRedOV, 0>; +def : ReadAdvance<ReadVFWRedOV0, 0>; + +// 16. Vector Mask Instructions +defm "" : LMULReadAdvance<"ReadVMALUV", 0>; +defm "" : LMULReadAdvance<"ReadVMPopV", 0>; +defm "" : LMULReadAdvance<"ReadVMFFSV", 0>; +defm "" : LMULReadAdvance<"ReadVMSFSV", 0>; +defm "" : LMULReadAdvance<"ReadVIotaV", 0>; + +// 17. Vector Permutation Instructions +def : ReadAdvance<ReadVMovXS, 0>; +def : ReadAdvance<ReadVMovSX_V, 0>; +def : ReadAdvance<ReadVMovSX_X, 0>; +def : ReadAdvance<ReadVMovFS, 0>; +def : ReadAdvance<ReadVMovSF_V, 0>; +def : ReadAdvance<ReadVMovSF_F, 0>; +defm "" : LMULReadAdvance<"ReadVISlideV", 0>; +defm "" : LMULReadAdvance<"ReadVISlideX", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideV", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideF", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>; +// LMUL Aware +def : ReadAdvance<ReadVMov1V, 0>; +def : ReadAdvance<ReadVMov2V, 0>; +def : ReadAdvance<ReadVMov4V, 0>; +def : ReadAdvance<ReadVMov8V, 0>; + +// Others +def : ReadAdvance<ReadVMask, 0>; +def : ReadAdvance<ReadVPassthru_WorstCase, 0>; +foreach mx = SchedMxList in { + def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx), 0>; + foreach sew = 
SchedSEWSet<mx>.val in + def : ReadAdvance<!cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew), 0>; +} + //===----------------------------------------------------------------------===// // Unsupported extensions defm : UnsupportedSchedQ; -defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 2d9f38221d42..e656e8bb99d8 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { return TwoTimes ? MILog2SEW + 1 : MILog2SEW; } + // Vector Register Gather with 16-bit Index Elements Instruction + // Dest and source data EEW=SEW. Index vector EEW=16. + case RISCV::VRGATHEREI16_VV: { + if (MO.getOperandNo() == 2) + return 4; + return MILog2SEW; + } + default: return std::nullopt; } @@ -966,6 +974,13 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VADC_VIM: case RISCV::VADC_VVM: case RISCV::VADC_VXM: + case RISCV::VMADC_VIM: + case RISCV::VMADC_VVM: + case RISCV::VMADC_VXM: + case RISCV::VSBC_VVM: + case RISCV::VSBC_VXM: + case RISCV::VMSBC_VVM: + case RISCV::VMSBC_VXM: // Vector Widening Integer Multiply-Add Instructions case RISCV::VWMACCU_VV: case RISCV::VWMACCU_VX: @@ -1051,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VSLIDEDOWN_VI: case RISCV::VSLIDE1UP_VX: case RISCV::VFSLIDE1UP_VF: + // Vector Register Gather Instructions + case RISCV::VRGATHER_VI: + case RISCV::VRGATHER_VV: + case RISCV::VRGATHER_VX: + case RISCV::VRGATHEREI16_VV: // Vector Single-Width Floating-Point Add/Subtract Instructions case RISCV::VFADD_VF: case RISCV::VFADD_VV: @@ -1132,6 +1152,8 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VMFLE_VV: case RISCV::VMFGT_VF: case RISCV::VMFGE_VF: + // Vector Floating-Point 
Classify Instruction + case RISCV::VFCLASS_V: // Vector Floating-Point Merge Instruction case RISCV::VFMERGE_VFM: // Vector Floating-Point Move Instruction @@ -1346,9 +1368,7 @@ RISCVVLOptimizer::checkUsers(const MachineInstr &MI) const { const MachineInstr &UserMI = *UserOp.getParent(); LLVM_DEBUG(dbgs() << " Checking user: " << UserMI << "\n"); - if (UserMI.isCopy() && UserMI.getOperand(0).getReg().isVirtual() && - UserMI.getOperand(0).getSubReg() == RISCV::NoSubRegister && - UserMI.getOperand(1).getSubReg() == RISCV::NoSubRegister) { + if (UserMI.isFullCopy() && UserMI.getOperand(0).getReg().isVirtual()) { LLVM_DEBUG(dbgs() << " Peeking through uses of COPY\n"); Worklist.insert_range(llvm::make_pointer_range( MRI->use_operands(UserMI.getOperand(0).getReg()))); diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp index be54a8c95a97..3bd2705f021a 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp @@ -10,6 +10,10 @@ // instructions and masked instructions, so that we can reduce the live range // overlaps of mask registers. // +// If there are multiple masks producers followed by multiple masked +// instructions, then at each masked instructions add dependency edges between +// every producer and masked instruction. +// // The reason why we need to do this: // 1. When tracking register pressure, we don't track physical registers. // 2. 
We have a RegisterClass for mask register (which is `VMV0`), but we don't @@ -40,9 +44,8 @@ namespace llvm { static bool isCopyToV0(const MachineInstr &MI) { - return MI.isCopy() && MI.getOperand(0).getReg() == RISCV::V0 && - MI.getOperand(1).getReg().isVirtual() && - MI.getOperand(1).getSubReg() == RISCV::NoSubRegister; + return MI.isFullCopy() && MI.getOperand(0).getReg() == RISCV::V0 && + MI.getOperand(1).getReg().isVirtual(); } static bool isSoleUseCopyToV0(SUnit &SU) { @@ -68,11 +71,27 @@ public: void apply(ScheduleDAGInstrs *DAG) override { SUnit *NearestUseV0SU = nullptr; + SmallVector<SUnit *, 2> DefMask; for (SUnit &SU : DAG->SUnits) { const MachineInstr *MI = SU.getInstr(); - if (MI->findRegisterUseOperand(RISCV::V0, TRI)) + bool UseV0 = MI->findRegisterUseOperand(RISCV::V0, TRI); + if (isSoleUseCopyToV0(SU) && !UseV0) + DefMask.push_back(&SU); + + if (UseV0) { NearestUseV0SU = &SU; + // Copy may not be a real use, so skip it here. + if (DefMask.size() > 1 && !MI->isCopy()) { + for (SUnit *Def : DefMask) + if (DAG->canAddEdge(Def, &SU)) + DAG->addEdge(Def, SDep(&SU, SDep::Artificial)); + } + + if (!DefMask.empty()) + DefMask.erase(DefMask.begin()); + } + if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) && // For LMUL=8 cases, there will be more possibilities to spill. 
// FIXME: We should use RegPressureTracker to do fine-grained diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 2a424e673ddf..a7f6fbceffc3 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -19,7 +19,6 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 6897865eb4e1..ea78dcd13526 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>; defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>; defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>; -defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>; +defm : 
DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>; // GetQuery builtin records: defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>; @@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>; defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>; defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, 
GetQuery, GlobalSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>; defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fd0bea0b9047..6608b3f2cbef 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); + case Intrinsic::spv_refract: + return selectExtInst(ResVReg, ResType, I, GL::Refract); case Intrinsic::spv_reflect: return selectExtInst(ResVReg, ResType, I, GL::Reflect); case Intrinsic::spv_rsqrt: diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp index 2a581d381d4a..4a9c88bfa6d3 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcELFObjectWriter.cpp @@ -68,7 +68,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup, // Extract the relocation type from the fixup kind, after applying STT_TLS as // needed. 
- unsigned Kind = Fixup.getTargetKind(); + auto Kind = Fixup.getKind(); if (mc::isRelocation(Fixup.getKind())) return Kind; @@ -93,7 +93,7 @@ unsigned SparcELFObjectWriter::getRelocType(const MCFixup &Fixup, } // clang-format off - switch(Fixup.getTargetKind()) { + switch(Fixup.getKind()) { default: llvm_unreachable("Unimplemented fixup -> relocation"); case FK_NONE: return ELF::R_SPARC_NONE; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index 233585346946..cfa3511436b9 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/SparcFixupKinds.h" -#include "MCTargetDesc/SparcMCAsmInfo.h" #include "SparcMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td index 8588d2d28b71..cee671e34951 100644 --- a/llvm/lib/Target/Sparc/Sparc.td +++ b/llvm/lib/Target/Sparc/Sparc.td @@ -64,6 +64,10 @@ def FeatureOSA2011 : SubtargetFeature<"osa2011", "IsOSA2011", "true", "Enable Oracle SPARC Architecture 2011 extensions", [FeatureV9, FeatureVIS, FeatureVIS2, FeatureVIS3]>; +def FeatureCrypto + : SubtargetFeature<"crypto", "IsCrypto", "true", + "Enable cryptographic extensions", + [FeatureOSA2011]>; def FeatureLeon : SubtargetFeature<"leon", "IsLeon", "true", "Enable LEON extensions">; @@ -175,7 +179,8 @@ def : Proc<"niagara3", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureUA2005, FeatureUA2007]>; def : Proc<"niagara4", [FeatureV9, FeatureV8Deprecated, UsePopc, FeatureVIS, FeatureVIS2, FeatureVIS3, - FeatureUA2005, FeatureUA2007, FeatureOSA2011]>; + FeatureUA2005, FeatureUA2007, FeatureOSA2011, + FeatureCrypto]>; // LEON 2 FT generic def : Processor<"leon2", LEON2Itineraries, diff --git 
a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 21dbe8f585b3..9b434d87c267 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -1828,16 +1828,8 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, // .umul works for both signed and unsigned setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); - setLibcallImpl(RTLIB::MUL_I32, RTLIB::sparc_umul); - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setLibcallImpl(RTLIB::SDIV_I32, RTLIB::sparc_div); - setOperationAction(ISD::UDIV, MVT::i32, Expand); - setLibcallImpl(RTLIB::UDIV_I32, RTLIB::sparc_udiv); - - setLibcallImpl(RTLIB::SREM_I32, RTLIB::sparc_rem); - setLibcallImpl(RTLIB::UREM_I32, RTLIB::sparc_urem); } if (Subtarget->is64Bit()) { @@ -1896,14 +1888,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FABS, MVT::f128, Custom); } - - if (!Subtarget->is64Bit()) { - setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Q_qtoll); - setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Q_qtoull); - setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Q_lltoq); - setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Q_ulltoq); - } - } else { // Custom legalize f128 operations. 
@@ -1948,10 +1932,6 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setLibcallImpl(RTLIB::FPTOUINT_F128_I32, RTLIB::_Q_qtou); setLibcallImpl(RTLIB::SINTTOFP_I32_F128, RTLIB::_Q_itoq); setLibcallImpl(RTLIB::UINTTOFP_I32_F128, RTLIB::_Q_utoq); - setLibcallImpl(RTLIB::FPTOSINT_F128_I64, RTLIB::_Q_qtoll); - setLibcallImpl(RTLIB::FPTOUINT_F128_I64, RTLIB::_Q_qtoull); - setLibcallImpl(RTLIB::SINTTOFP_I64_F128, RTLIB::_Q_lltoq); - setLibcallImpl(RTLIB::UINTTOFP_I64_F128, RTLIB::_Q_ulltoq); setLibcallImpl(RTLIB::FPEXT_F32_F128, RTLIB::_Q_stoq); setLibcallImpl(RTLIB::FPEXT_F64_F128, RTLIB::_Q_dtoq); setLibcallImpl(RTLIB::FPROUND_F128_F32, RTLIB::_Q_qtos); diff --git a/llvm/lib/Target/Sparc/SparcInstrCrypto.td b/llvm/lib/Target/Sparc/SparcInstrCrypto.td new file mode 100644 index 000000000000..04b116c2ded8 --- /dev/null +++ b/llvm/lib/Target/Sparc/SparcInstrCrypto.td @@ -0,0 +1,98 @@ +//===----------- SparcInstrCrypto.td - cryptographic extensions -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction formats, definitions and patterns needed for +// cryptographic instructions on SPARC. 
+//===----------------------------------------------------------------------===// + + +// Convenience template for 4-operand instructions +class FourOpImm<string OpcStr, bits<6> op3val, bits<4> op5val, + RegisterClass RC> + : F3_4<op3val, op5val, (outs RC:$rd), (ins RC:$rs1, RC:$rs2, simm5Op:$rs3), + !strconcat(OpcStr, " $rs1, $rs2, $rs3, $rd")>; + +let Predicates = [HasCrypto] in { +def AES_EROUND01 : FourOp<"aes_eround01", 0b011001, 0b0000, DFPRegs>; +def AES_EROUND23 : FourOp<"aes_eround23", 0b011001, 0b0001, DFPRegs>; +def AES_DROUND01 : FourOp<"aes_dround01", 0b011001, 0b0010, DFPRegs>; +def AES_DROUND23 : FourOp<"aes_dround23", 0b011001, 0b0011, DFPRegs>; +def AES_EROUND01_LAST : FourOp<"aes_eround01_l", 0b011001, 0b0100, DFPRegs>; +def AES_EROUND23_LAST : FourOp<"aes_eround23_l", 0b011001, 0b0101, DFPRegs>; +def AES_DROUND01_LAST : FourOp<"aes_dround01_l", 0b011001, 0b0110, DFPRegs>; +def AES_DROUND23_LAST : FourOp<"aes_dround23_l", 0b011001, 0b0111, DFPRegs>; +def AES_KEXPAND0 : F3_3<2, 0b110110, 0b100110000, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + "aes_kexpand0 $rs1, $rs2, $rd", []>; +def AES_KEXPAND1 : FourOpImm<"aes_kexpand1", 0b011001, 0b1000, DFPRegs>; +def AES_KEXPAND2 : F3_3<2, 0b110110, 0b100110001, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + "aes_kexpand2 $rs1, $rs2, $rd", []>; + +def CAMELLIA_F : FourOp<"camellia_f", 0b011001, 0b1100, DFPRegs>; +def CAMELLIA_FL : F3_3<2, 0b110110, 0b100111100, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + "camellia_fl $rs1, $rs2, $rd", []>; +def CAMELLIA_FLI : F3_3<2, 0b110110, 0b100111101, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + "camellia_fli $rs1, $rs2, $rd", []>; + +def CRC32C : F3_3<2, 0b110110, 0b101000111, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1, DFPRegs:$rs2), + "crc32c $rs1, $rs2, $rd", []>; + +def DES_ROUND : FourOp<"des_round", 0b011001, 0b1001, DFPRegs>; +let rs2 = 0 in { +def DES_IP : F3_3<2, 0b110110, 0b100110100, + (outs DFPRegs:$rd), (ins 
DFPRegs:$rs1), + "des_ip $rs1, $rd", []>; +def DES_IIP : F3_3<2, 0b110110, 0b100110101, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1), + "des_iip $rs1, $rd", []>; +} +def DES_KEXPAND : F3_3<2, 0b110110, 0b100110110, + (outs DFPRegs:$rd), (ins DFPRegs:$rs1, simm5Op:$rs2), + "des_kexpand $rs1, $rs2, $rd", []>; + +let rs1 = 0, rs2 = 0, rd = 0 in { +let Uses = [D0, D1, D2, D5, D6, D7, D8, D9, D10, D11], + Defs = [D0, D1, D2, D3, D4, D5, D6, D7] in +def MD5 : F3_3<2, 0b110110, 0b101000000, (outs), (ins), "md5", []>; +let Uses = [D0, D1, D2, D4, D5, D6, D7, D8, D9, D10, D11], + Defs = [D0, D1, D2] in +def SHA1 : F3_3<2, 0b110110, 0b101000001, (outs), (ins), "sha1", []>; +let Uses = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11], + Defs = [D0, D1, D2, D3] in +def SHA256 : F3_3<2, 0b110110, 0b101000010, (outs), (ins), "sha256", []>; +let Uses = [D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, + D12, D13, D14, D15, D16, D17, D18, D19, D20, D21, D22, D23], + Defs = [D0, D1, D2, D3, D4, D5, D6, D7] in +def SHA512 : F3_3<2, 0b110110, 0b101000011, (outs), (ins), "sha512", []>; +} + +// These instructions use and clobber all DFP and non-reserved Int registers. 
+let rs1 = 0, rd = 0, +Uses = [ D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, + O0, O1, O2, O3, O4, O5, + L0, L1, L2, L3, L4, L5, L6, L7, + I0, I1, I2, I3, I4, I5 ], +Defs = [ D0, D1, D2, D3, D4, D5, D6, D7, + D8, D9, D10, D11, D12, D13, D14, D15, + D16, D17, D18, D19, D20, D21, D22, D23, + D24, D25, D26, D27, D28, D29, D30, D31, + O0, O1, O2, O3, O4, O5, + L0, L1, L2, L3, L4, L5, L6, L7, + I0, I1, I2, I3, I4, I5 ] in { +def MPMUL : F3_3<2, 0b110110, 0b101001000, (outs), (ins simm5Op:$rs2), "mpmul $rs2", []>; +def MONTMUL : F3_3<2, 0b110110, 0b101001001, (outs), (ins simm5Op:$rs2), "montmul $rs2", []>; +def MONTSQR : F3_3<2, 0b110110, 0b101001010, (outs), (ins simm5Op:$rs2), "montsqr $rs2", []>; +} +} // Predicates = [HasCrypto] diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 1be017be1c64..1a32eafb0e83 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -60,6 +60,10 @@ def HasUA2007 : Predicate<"Subtarget->isUA2007()">, def HasOSA2011 : Predicate<"Subtarget->isOSA2011()">, AssemblerPredicate<(all_of FeatureOSA2011)>; +// HasCrypto - This is true when the target processor has cryptographic extensions. +def HasCrypto : Predicate<"Subtarget->isCrypto()">, + AssemblerPredicate<(all_of FeatureCrypto)>; + // HasHardQuad - This is true when the target processor supports quad floating // point instructions. 
def HasHardQuad : Predicate<"Subtarget->hasHardQuad()">; @@ -2011,4 +2015,5 @@ def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)), include "SparcInstr64Bit.td" include "SparcInstrVIS.td" include "SparcInstrUAOSA.td" +include "SparcInstrCrypto.td" include "SparcInstrAliases.td" diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp index 711bf9b31a37..b19196475908 100644 --- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "SparcTargetObjectFile.h" -#include "MCTargetDesc/SparcMCAsmInfo.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" diff --git a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index 6ae529e97418..31b4f1196392 100644 --- a/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -327,6 +327,8 @@ DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &CS) const { + CommentStream = &CS; + // Get the first two bytes of the instruction. 
Size = 0; if (Bytes.size() < 2) diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp index 9121f0d44936..3ef6030ba518 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.cpp @@ -137,10 +137,10 @@ void SystemZHLASMAsmStreamer::EmitComment() { } void SystemZHLASMAsmStreamer::emitValueToAlignment(Align Alignment, - int64_t Value, - unsigned ValueSize, + int64_t Fill, + uint8_t FillLen, unsigned MaxBytesToEmit) { - emitAlignmentDS(Alignment.value(), Value, ValueSize, MaxBytesToEmit); + emitAlignmentDS(Alignment.value(), Fill, FillLen, MaxBytesToEmit); } void SystemZHLASMAsmStreamer::emitCodeAlignment(Align Alignment, diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h index c5275339ce01..93b1ac4d901a 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZHLASMAsmStreamer.h @@ -86,9 +86,8 @@ public: void emitAlignmentDS(uint64_t ByteAlignment, std::optional<int64_t> Value, unsigned ValueSize, unsigned MaxBytesToEmit); - void emitValueToAlignment(Align Alignment, int64_t Value = 0, - unsigned ValueSize = 1, - unsigned MaxBytesToEmit = 0) override; + void emitValueToAlignment(Align Alignment, int64_t Fill, uint8_t FillLen, + unsigned MaxBytesToEmit) override; void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0) override; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 4bef8ff9bbac..629791631080 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -533,7 +533,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF, const SystemZSubtarget &STI = 
MF.getSubtarget<SystemZSubtarget>(); const SystemZTargetLowering &TLI = *STI.getTargetLowering(); MachineFrameInfo &MFFrame = MF.getFrameInfo(); - auto *ZII = static_cast<const SystemZInstrInfo *>(STI.getInstrInfo()); + auto *ZII = STI.getInstrInfo(); SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); MachineBasicBlock::iterator MBBI = MBB.begin(); const MCRegisterInfo *MRI = MF.getContext().getRegisterInfo(); @@ -1239,7 +1239,7 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF, const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>(); SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); MachineBasicBlock::iterator MBBI = MBB.begin(); - auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto *ZII = Subtarget.getInstrInfo(); auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); MachineFrameInfo &MFFrame = MF.getFrameInfo(); MachineInstr *StoreInstr = nullptr; @@ -1354,7 +1354,7 @@ void SystemZXPLINKFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); MachineFrameInfo &MFFrame = MF.getFrameInfo(); - auto *ZII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + auto *ZII = Subtarget.getInstrInfo(); auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>(); // Skip the return instruction. 
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 0920c3345ecf..9b03e85ca45b 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -191,6 +191,35 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, } } +void TargetLoweringObjectFile::emitPseudoProbeDescMetadata(MCStreamer &Streamer, + Module &M) const { + NamedMDNode *FuncInfo = M.getNamedMetadata(PseudoProbeDescMetadataName); + if (!FuncInfo) + return; + + // Emit a descriptor for every function including functions that have an + // available external linkage. We may not want this for imported functions + // that have code in another ThinLTO module but we don't have a good way to + // tell them apart from inline functions defined in header files. Therefore + // we put each descriptor in a separate comdat section and rely on the + // linker to deduplicate. + auto &C = getContext(); + for (const auto *Operand : FuncInfo->operands()) { + const auto *MD = cast<MDNode>(Operand); + auto *GUID = mdconst::extract<ConstantInt>(MD->getOperand(0)); + auto *Hash = mdconst::extract<ConstantInt>(MD->getOperand(1)); + auto *Name = cast<MDString>(MD->getOperand(2)); + auto *S = C.getObjectFileInfo()->getPseudoProbeDescSection( + TM->getFunctionSections() ? Name->getString() : StringRef()); + + Streamer.switchSection(S); + Streamer.emitInt64(GUID->getZExtValue()); + Streamer.emitInt64(Hash->getZExtValue()); + Streamer.emitULEB128IntValue(Name->getString().size()); + Streamer.emitBytes(Name->getString()); + } +} + /// getKindForGlobal - This is a top-level target-independent classifier for /// a global object. Given a global variable and information from the TM, this /// function classifies the global in a target independent manner.
This function diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp index e09a916d48c9..f98762152247 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp @@ -154,7 +154,7 @@ public: void VEAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, bool IsResolved) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case VE::fixup_ve_tls_gd_hi32: case VE::fixup_ve_tls_gd_lo32: case VE::fixup_ve_tpoff_hi32: diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp index 1597e7d080f0..41f31eb3b819 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp @@ -56,7 +56,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup, } if (IsPCRel) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind"); return ELF::R_VE_NONE; @@ -84,7 +84,7 @@ unsigned VEELFObjectWriter::getRelocType(const MCFixup &Fixup, } } - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { default: reportError(Fixup.getLoc(), "Unknown ELF relocation type"); return ELF::R_VE_NONE; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index c591e5ef181a..d13862f12773 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1536,6 +1536,10 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c", vec.prefix#".relaxed_nmadd", simdopS, reqs>; + + def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract 
(vec.vt V128:$b), (vec.vt V128:$c))), + (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>; + } defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 656d5dd32773..28f65990120c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -261,7 +261,6 @@ /// ///===----------------------------------------------------------------------===// -#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" #include "WebAssemblyTargetMachine.h" #include "llvm/ADT/StringExtras.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index ad47cb8ea2fe..6827ee652794 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -26,7 +26,6 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/InitializePasses.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" #include "llvm/Target/TargetOptions.h" diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index 6614eea3901b..564636959f00 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -14,6 +14,7 @@ #include "X86ATTInstPrinter.h" #include "X86BaseInfo.h" #include "X86InstComments.h" +#include "llvm/ADT/SmallString.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" @@ -35,6 +36,21 @@ using namespace llvm; #define PRINT_ALIAS_INSTR #include "X86GenAsmWriter.inc" +// Print an MCExpr as an operand. 
Similar to GCC, wrap the output in parentheses +// if it begins with '$', as '$' in an operand position indicates an immediate +// value in the AT&T syntax. +void X86ATTInstPrinter::printExprOperand(raw_ostream &OS, const MCExpr &E) { + SmallString<128> S; + { + raw_svector_ostream SOS(S); + MAI.printExpr(SOS, E); + } + if (S.starts_with("$")) + OS << '(' << S << ')'; + else + OS << S; +} + void X86ATTInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) { markup(OS, Markup::Register) << '%' << getRegisterName(Reg); } @@ -446,7 +462,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, O << formatImm(DispVal); } else { assert(DispSpec.isExpr() && "non-immediate displacement for LEA?"); - MAI.printExpr(O, *DispSpec.getExpr()); + printExprOperand(O, *DispSpec.getExpr()); } if (IndexReg.getReg() || BaseReg.getReg()) { @@ -501,7 +517,7 @@ void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op, O << formatImm(DispSpec.getImm()); } else { assert(DispSpec.isExpr() && "non-immediate displacement?"); - MAI.printExpr(O, *DispSpec.getExpr()); + printExprOperand(O, *DispSpec.getExpr()); } } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h index f49f09c5dcf3..1452622ebcea 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h @@ -23,6 +23,7 @@ public: const MCRegisterInfo &MRI) : X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {} + void printExprOperand(raw_ostream &OS, const MCExpr &E) override; void printRegName(raw_ostream &OS, MCRegister Reg) override; void printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) override; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index ff2df3d5b192..3d060c6f4a78 100644 --- 
a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" @@ -177,20 +178,20 @@ public: bool mayNeedRelaxation(unsigned Opcode, ArrayRef<MCOperand> Operands, const MCSubtargetInfo &STI) const override; - bool fixupNeedsRelaxationAdvanced(const MCFixup &, const MCValue &, uint64_t, + bool fixupNeedsRelaxationAdvanced(const MCFragment &, const MCFixup &, + const MCValue &, uint64_t, bool) const override; void relaxInstruction(MCInst &Inst, const MCSubtargetInfo &STI) const override; - bool padInstructionViaRelaxation(MCRelaxableFragment &RF, - MCCodeEmitter &Emitter, + bool padInstructionViaRelaxation(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const; - bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + bool padInstructionViaPrefix(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const; - bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter, + bool padInstructionEncoding(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const; bool finishLayout(const MCAssembler &Asm) const override; @@ -409,10 +410,9 @@ isRightAfterData(MCFragment *CurrentFragment, // it, returns true. // - Otherwise returns false. // - If the fragment is not a DataFragment, returns false. 
- if (auto *DF = dyn_cast_or_null<MCDataFragment>(F)) - return DF->getContents().size() && - (DF != PrevInstPosition.first || - DF->getContents().size() != PrevInstPosition.second); + if (F->getKind() == MCFragment::FT_Data) + return F->getFixedSize() && (F != PrevInstPosition.first || + F->getFixedSize() != PrevInstPosition.second); return false; } @@ -421,11 +421,7 @@ isRightAfterData(MCFragment *CurrentFragment, static size_t getSizeForInstFragment(const MCFragment *F) { if (!F || !F->hasInstructions()) return 0; - // MCEncodedFragmentWithContents being templated makes this tricky. - if (auto *DF = dyn_cast<MCEncodedFragment>(F)) - return DF->getContents().size(); - else - llvm_unreachable("Unknown fragment with instructions!"); + return F->getSize(); } /// Return true if we can insert NOP or prefixes automatically before the @@ -468,10 +464,6 @@ bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const { if (!OS.getCurrentSectionOnly()->isText()) return false; - // To be Done: Currently don't deal with Bundle cases. - if (OS.getAssembler().isBundlingEnabled()) - return false; - // Branches only need to be aligned in 32-bit or 64-bit mode. if (!(STI.hasFeature(X86::Is64Bit) || STI.hasFeature(X86::Is32Bit))) return false; @@ -551,8 +543,8 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) { MCFragment *CF = OS.getCurrentFragment(); - if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF)) - F->setAllowAutoPadding(canPadInst(Inst, OS)); + if (CF->getKind() == MCFragment::FT_Relaxable) + CF->setAllowAutoPadding(canPadInst(Inst, OS)); // Update PrevInstOpcode here, canPadInst() reads that. PrevInstOpcode = Inst.getOpcode(); @@ -575,8 +567,7 @@ void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, // DataFragment, so that we can get the size of instructions later in // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty // DataFragment. 
- if (isa_and_nonnull<MCDataFragment>(CF)) - OS.insert(OS.getContext().allocFragment<MCDataFragment>()); + OS.insert(OS.getContext().allocFragment<MCFragment>()); // Update the maximum alignment on the current section if necessary. MCSection *Sec = OS.getCurrentSectionOnly(); @@ -686,7 +677,7 @@ std::optional<bool> X86AsmBackend::evaluateFixup(const MCFragment &, MCFixup &Fixup, MCValue &Target, uint64_t &) { if (Fixup.isPCRel()) { - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case FK_Data_1: Target.setConstant(Target.getConstant() - 1); break; @@ -756,7 +747,8 @@ bool X86AsmBackend::mayNeedRelaxation(unsigned Opcode, Operands[Operands.size() - 1 - SkipOperands].isExpr()); } -bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup, +bool X86AsmBackend::fixupNeedsRelaxationAdvanced(const MCFragment &, + const MCFixup &Fixup, const MCValue &Target, uint64_t Value, bool Resolved) const { @@ -785,7 +777,7 @@ void X86AsmBackend::relaxInstruction(MCInst &Inst, Inst.setOpcode(RelaxedOp); } -bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF, +bool X86AsmBackend::padInstructionViaPrefix(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const { if (!RF.getAllowAutoPadding()) @@ -798,7 +790,7 @@ bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF, *RF.getSubtargetInfo())) return false; - const unsigned OldSize = RF.getContents().size(); + const unsigned OldSize = RF.getVarSize(); if (OldSize == 15) return false; @@ -827,19 +819,18 @@ bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF, SmallString<256> Code; Code.append(PrefixBytesToAdd, Prefix); - Code.append(RF.getContents().begin(), RF.getContents().end()); - RF.setContents(Code); + Code.append(RF.getVarContents().begin(), RF.getVarContents().end()); + RF.setVarContents(Code); // Adjust the fixups for the change in offsets - for (auto &F : RF.getFixups()) { - F.setOffset(F.getOffset() + PrefixBytesToAdd); - } + for (auto &F : 
RF.getVarFixups()) + F.setOffset(PrefixBytesToAdd + F.getOffset()); RemainingSize -= PrefixBytesToAdd; return true; } -bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF, +bool X86AsmBackend::padInstructionViaRelaxation(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const { if (!mayNeedRelaxation(RF.getOpcode(), RF.getOperands(), @@ -854,20 +845,20 @@ bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF, SmallVector<MCFixup, 4> Fixups; SmallString<15> Code; Emitter.encodeInstruction(Relaxed, Code, Fixups, *RF.getSubtargetInfo()); - const unsigned OldSize = RF.getContents().size(); + const unsigned OldSize = RF.getVarContents().size(); const unsigned NewSize = Code.size(); assert(NewSize >= OldSize && "size decrease during relaxation?"); unsigned Delta = NewSize - OldSize; if (Delta > RemainingSize) return false; RF.setInst(Relaxed); - RF.setContents(Code); - RF.setFixups(Fixups); + RF.setVarContents(Code); + RF.setVarFixups(Fixups); RemainingSize -= Delta; return true; } -bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF, +bool X86AsmBackend::padInstructionEncoding(MCFragment &RF, MCCodeEmitter &Emitter, unsigned &RemainingSize) const { bool Changed = false; @@ -900,7 +891,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { if (!Sec.isText()) continue; - SmallVector<MCRelaxableFragment *, 4> Relaxable; + SmallVector<MCFragment *, 4> Relaxable; for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) { MCFragment &F = *I; @@ -911,7 +902,7 @@ bool X86AsmBackend::finishLayout(const MCAssembler &Asm) const { continue; if (F.getKind() == MCFragment::FT_Relaxable) { - auto &RF = cast<MCRelaxableFragment>(*I); + auto &RF = cast<MCFragment>(*I); Relaxable.push_back(&RF); continue; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 7523d2aedcce..1c5f1663d4f5 100644 --- 
a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -26,6 +26,10 @@ using namespace llvm; +void X86InstPrinterCommon::printExprOperand(raw_ostream &OS, const MCExpr &E) { + MAI.printExpr(OS, E); +} + void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm(); @@ -374,7 +378,7 @@ void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address, markup(O, Markup::Immediate) << formatHex((uint64_t)Address); } else { // Otherwise, just print the expression. - MAI.printExpr(O, *Op.getExpr()); + printExprOperand(O, *Op.getExpr()); } } } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h index 2a7b750bd675..2c9467ca7c61 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -17,11 +17,13 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { +class MCExpr; class X86InstPrinterCommon : public MCInstPrinter { public: using MCInstPrinter::MCInstPrinter; + virtual void printExprOperand(raw_ostream &OS, const MCExpr &E); virtual void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) = 0; void printCondCode(const MCInst *MI, unsigned Op, raw_ostream &OS); void printCondFlags(const MCInst *MI, unsigned Op, raw_ostream &OS); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index c34425f6661b..0dabd98a38f4 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -258,7 +258,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( // x86_64 distinguishes movq foo@GOTPCREL so that the linker can // rewrite the movq to an leaq at link time if the symbol ends up in // the same linkage unit. 
- if (Fixup.getTargetKind() == X86::reloc_riprel_4byte_movq_load) + if (Fixup.getKind() == X86::reloc_riprel_4byte_movq_load) Type = MachO::X86_64_RELOC_GOT_LOAD; else Type = MachO::X86_64_RELOC_GOT; @@ -320,7 +320,7 @@ void X86MachObjectWriter::RecordX86_64Relocation( return; } else { Type = MachO::X86_64_RELOC_UNSIGNED; - if (Fixup.getTargetKind() == X86::reloc_signed_4byte) { + if (Fixup.getKind() == X86::reloc_signed_4byte) { reportError( Fixup.getLoc(), "32-bit absolute addressing is not supported in 64-bit mode"); diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 0e4add27cce0..7b2b9dda99b4 100644 --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -239,8 +239,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { TFL = STI->getFrameLowering(); MRI = &MF.getRegInfo(); - const X86RegisterInfo &RegInfo = - *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo()); + const X86RegisterInfo &RegInfo = *STI->getRegisterInfo(); SlotSize = RegInfo.getSlotSize(); assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size"); Log2SlotSize = Log2_32(SlotSize); @@ -356,8 +355,7 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, CallContext &Context) { // Check that this particular call sequence is amenable to the // transformation. 
- const X86RegisterInfo &RegInfo = - *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo()); + const X86RegisterInfo &RegInfo = *STI->getRegisterInfo(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 0b4c63f7a81f..5d5a70589324 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } +/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized +/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment. +/// Technically only fp128 has a specified ABI, but it makes sense to handle +/// i128 the same until we hear differently. +static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + assert(ValVT == MVT::i32 && "Should have i32 parts"); + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + assert(PendingMembers.size() == 4 && "Should have four parts"); + + int64_t Offset = State.AllocateStack(16, Align(16)); + PendingMembers[0].convertToMem(Offset); + PendingMembers[1].convertToMem(Offset + 4); + PendingMembers[2].convertToMem(Offset + 8); + PendingMembers[3].convertToMem(Offset + 12); + + State.addLoc(PendingMembers[0]); + State.addLoc(PendingMembers[1]); + State.addLoc(PendingMembers[2]); + State.addLoc(PendingMembers[3]); + PendingMembers.clear(); + return true; +} + // Provides entry points of CC_X86 and RetCC_X86. 
#include "X86GenCallingConv.inc" diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 823e0caa0226..f020e0b55141 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[ // The 'nest' parameter, if any, is passed in ECX. CCIfNest<CCAssignToReg<[ECX]>>, + // i128 and fp128 need to be passed on the stack with a higher alignment than + // their legal types. Handle this with a custom function. + CCIfType<[i32], + CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>, + // On swifttailcc pass swiftself in ECX. CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>, diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index c7abb367fad2..0e6b4dffec3a 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -376,8 +376,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::EH_RETURN64: { MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); - const bool Uses64BitFramePtr = - STI->isTarget64BitLP64() || STI->isTargetNaCl64(); + const bool Uses64BitFramePtr = STI->isTarget64BitLP64(); Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index c96d3c15a882..95ed5908e231 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -54,8 +54,8 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, SlotSize = TRI->getSlotSize(); Is64Bit = STI.is64Bit(); IsLP64 = STI.isTarget64BitLP64(); - // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. 
- Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); + // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit. + Uses64BitFramePtr = STI.isTarget64BitLP64(); StackPtr = TRI->getStackRegister(); } @@ -2412,7 +2412,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); - // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. + // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit. const bool Is64BitILP32 = STI.isTarget64BitILP32(); Register FramePtr = TRI->getFrameRegister(MF); Register MachineFramePtr = @@ -4241,7 +4241,7 @@ void X86FrameLowering::adjustFrameForMsvcCxxEh(MachineFunction &MF) const { for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) { for (WinEHHandlerType &H : TBME.HandlerArray) { int FrameIndex = H.CatchObj.FrameIndex; - if (FrameIndex != INT_MAX) { + if ((FrameIndex != INT_MAX) && MFI.getObjectOffset(FrameIndex) == 0) { // Ensure alignment. unsigned Align = MFI.getObjectAlign(FrameIndex).value(); MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align; diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 32c7d2bfea6c..62073ec125e8 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5428,10 +5428,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::BRIND: case X86ISD::NT_BRIND: { - if (Subtarget->isTargetNaCl()) - // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We - // leave the instruction alone. - break; if (Subtarget->isTarget64BitILP32()) { // Converts a 32-bit register to a 64-bit, zero-extended version of // it. 
This is needed because x86-64 can do many things, but jmp %r32 diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5e35d5630d66..d91ea1ea1bb1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36615,8 +36615,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), sizeVReg = MI.getOperand(1).getReg(), - physSPReg = - IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; + physSPReg = IsLP64 ? X86::RSP : X86::ESP; MachineFunction::iterator MBBIter = ++BB->getIterator(); @@ -37121,8 +37120,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, // restoreMBB: if (RegInfo->hasBasePointer(*MF)) { - const bool Uses64BitFramePtr = - Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); + const bool Uses64BitFramePtr = Subtarget.isTarget64BitLP64(); X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); X86FI->setRestoreBasePointer(MF); Register FramePtr = RegInfo->getFrameRegister(*MF); @@ -37550,8 +37548,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. if (RI.hasBasePointer(*MF)) { - const bool FPIs64Bit = - Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); + const bool FPIs64Bit = Subtarget.isTarget64BitLP64(); X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); MFI->setRestoreBasePointer(MF); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6bcb7a36e91b..26369792db26 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1661,7 +1661,7 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. 
- bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 9ad355311527..b4639ac2577e 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { - // i128 split into i64 needs to be allocated to two consecutive registers, - // or spilled to the stack as a whole. - return Ty->isIntegerTy(128); + // On x86-64 i128 is split into two i64s and needs to be allocated to two + // consecutive registers, or spilled to the stack as a whole. On x86-32 i128 + // is split to four i32s and never actually passed in registers, but we use + // the consecutive register mark to match it in TableGen. + if (Ty->isIntegerTy(128)) + return true; + + // On x86-32, fp128 acts the same as i128. 
+ if (Subtarget.is32Bit() && Ty->isFP128Ty()) + return true; + + return false; } /// Helper for getByValTypeAlignment to determine diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 307c03c8ef54..df1541e9085b 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -214,8 +214,6 @@ def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" } def IsPS : Predicate<"Subtarget->isTargetPS()">; def NotPS : Predicate<"!Subtarget->isTargetPS()">; -def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; -def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3b2cd1..360293bce54e 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles, + Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles, ArrayRef<unsigned> Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast<LoadInst>(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + // Create an interleaved access group. 
IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index a8ee9f55611b..8ad8d423d10c 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -302,13 +302,12 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, reportFatalUsageError("64-bit code requested on a subtarget that doesn't " "support it!"); - // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, NaCl, and for all + // Stack alignment is 16 bytes on Darwin, Linux, kFreeBSD, and for all // 64-bit targets. On Solaris (32-bit), stack alignment is 4 bytes // following the i386 psABI, while on Illumos it is always 16 bytes. if (StackAlignOverride) stackAlignment = *StackAlignOverride; - else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || - isTargetNaCl() || Is64Bit) + else if (isTargetDarwin() || isTargetLinux() || isTargetKFreeBSD() || Is64Bit) stackAlignment = Align(16); // Consume the vector width attribute or apply any target specific limit. diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 38b8c246eb29..be49214e041e 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -170,14 +170,10 @@ public: #include "X86GenSubtargetInfo.inc" /// Is this x86_64 with the ILP32 programming model (x32 ABI)? - bool isTarget64BitILP32() const { - return Is64Bit && (TargetTriple.isX32() || TargetTriple.isOSNaCl()); - } + bool isTarget64BitILP32() const { return Is64Bit && (TargetTriple.isX32()); } /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)? 
- bool isTarget64BitLP64() const { - return Is64Bit && (!TargetTriple.isX32() && !TargetTriple.isOSNaCl()); - } + bool isTarget64BitLP64() const { return Is64Bit && (!TargetTriple.isX32()); } PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } @@ -299,9 +295,6 @@ public: bool isTargetKFreeBSD() const { return TargetTriple.isOSKFreeBSD(); } bool isTargetGlibc() const { return TargetTriple.isOSGlibc(); } bool isTargetAndroid() const { return TargetTriple.isAndroid(); } - bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } - bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } - bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); } diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 85cc5b43d40b..6d9c6cdedd9e 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -131,7 +131,7 @@ static std::string computeDataLayout(const Triple &TT) { Ret += DataLayout::getManglingComponent(TT); // X86 and x32 have 32 bit pointers. - if (!TT.isArch64Bit() || TT.isX32() || TT.isOSNaCl()) + if (!TT.isArch64Bit() || TT.isX32()) Ret += "-p:32:32"; // Address spaces for 32 bit signed, 32 bit unsigned, and 64 bit pointers. @@ -140,7 +140,7 @@ static std::string computeDataLayout(const Triple &TT) { // Some ABIs align 64 bit integers and doubles to 64 bits, others to 32. // 128 bit integers are not specified in the 32-bit ABIs but are used // internally for lowering f128, so we match the alignment to that. 
- if (TT.isArch64Bit() || TT.isOSWindows() || TT.isOSNaCl()) + if (TT.isArch64Bit() || TT.isOSWindows()) Ret += "-i64:64-i128:128"; else if (TT.isOSIAMCU()) Ret += "-i64:32-f64:32"; @@ -148,7 +148,7 @@ static std::string computeDataLayout(const Triple &TT) { Ret += "-i128:128-f64:32:64"; // Some ABIs align long double to 128 bits, others to 32. - if (TT.isOSNaCl() || TT.isOSIAMCU()) + if (TT.isOSIAMCU()) ; // No f80 else if (TT.isArch64Bit() || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment()) Ret += "-f80:128"; diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 27111fce4566..a650f6f069e5 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -811,7 +811,7 @@ void WinEHStatePass::updateEspForInAllocas(Function &F) { if (auto *Alloca = dyn_cast<AllocaInst>(&I)) { if (Alloca->isStaticAlloca()) continue; - IRBuilder<> Builder(Alloca->getNextNonDebugInstruction()); + IRBuilder<> Builder(Alloca->getNextNode()); // SavedESP = llvm.stacksave() Value *SP = Builder.CreateStackSave(); Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); @@ -820,7 +820,7 @@ void WinEHStatePass::updateEspForInAllocas(Function &F) { if (auto *II = dyn_cast<IntrinsicInst>(&I)) { if (II->getIntrinsicID() != Intrinsic::stackrestore) continue; - IRBuilder<> Builder(II->getNextNonDebugInstruction()); + IRBuilder<> Builder(II->getNextNode()); // SavedESP = llvm.stacksave() Value *SP = Builder.CreateStackSave(); Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp index 671f1d04daf2..9167794a51e8 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaAsmBackend.cpp @@ -144,7 +144,7 @@ std::optional<bool> XtensaAsmBackend::evaluateFixup(const MCFragment &F, // For a few PC-relative 
fixups, offsets need to be aligned down. We // compensate here because the default handler's `Value` decrement doesn't // account for this alignment. - switch (Fixup.getTargetKind()) { + switch (Fixup.getKind()) { case Xtensa::fixup_xtensa_call_18: case Xtensa::fixup_xtensa_l32r_16: Value = (Asm->getFragmentOffset(F) + Fixup.getOffset()) % 4; |
