diff options
Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 617 |
1 files changed, 554 insertions, 63 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d70a46b0e893..5ffaf2c49b4c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; @@ -1918,6 +1919,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // Handle non-aliasing elements mask + if (Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming())) { + // FIXME: Support wider fixed-length types when msve-vector-bits is used. + for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) { + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom); + } + for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) { + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom); + } + } + // Handle operations that are only available in non-streaming SVE mode. if (Subtarget->isSVEAvailable()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64, @@ -2585,6 +2600,30 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known = Known.intersectWith(Known2); break; } + case AArch64ISD::CSNEG: + case AArch64ISD::CSINC: + case AArch64ISD::CSINV: { + KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); + KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1); + + // The result is either: + // CSINC: KnownOp0 or KnownOp1 + 1 + // CSINV: KnownOp0 or ~KnownOp1 + // CSNEG: KnownOp0 or KnownOp1 * -1 + if (Op.getOpcode() == AArch64ISD::CSINC) + KnownOp1 = KnownBits::add( + KnownOp1, + KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1))); + else if (Op.getOpcode() == AArch64ISD::CSINV) + std::swap(KnownOp1.Zero, KnownOp1.One); + else if (Op.getOpcode() == AArch64ISD::CSNEG) + KnownOp1 = + KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes( + Op.getScalarValueSizeInBits()))); + + Known = KnownOp0.intersectWith(KnownOp1); + break; + } case AArch64ISD::BICi: { // Compute the bit cleared value. APInt Mask = @@ -2626,6 +2665,32 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( << Op->getConstantOperandVal(1))); break; } + case AArch64ISD::MOVImsl: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1)); + Known = KnownBits::makeConstant(APInt( + Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt))); + break; + } + case AArch64ISD::MOVIedit: { + Known = KnownBits::makeConstant(APInt( + Known.getBitWidth(), + AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0)))); + break; + } + case AArch64ISD::MVNIshift: { + Known = KnownBits::makeConstant( + APInt(Known.getBitWidth(), + ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)), + /*isSigned*/ false, /*implicitTrunc*/ true)); + break; + } + case AArch64ISD::MVNImsl: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1)); + Known = KnownBits::makeConstant( + APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt), + /*isSigned*/ false, /*implicitTrunc*/ true)); + break; + } case AArch64ISD::LOADgot: case AArch64ISD::ADDlow: { if (!Subtarget->isTargetILP32()) @@ -2984,21 +3049,20 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); if (TPIDR2.Uses > 0) { + // Note: This case just needs to do `SVL << 48`. It is not implemented as we + // generally don't support big-endian SVE/SME. + if (!Subtarget->isLittleEndian()) + reportFatalInternalError( + "TPIDR2 block initialization is not supported on big-endian targets"); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - // Store the buffer pointer to the TPIDR2 stack object. - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + // Store buffer pointer and num_za_save_slices. + // Bytes 10-15 are implicitly zeroed. + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi)) .addReg(MI.getOperand(0).getReg()) + .addReg(MI.getOperand(1).getReg()) .addFrameIndex(TPIDR2.FrameIndex) .addImm(0); - // Set the reserved bytes (10-15) to zero - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2.FrameIndex) - .addImm(5); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2.FrameIndex) - .addImm(3); } else MFI.RemoveStackObject(TPIDR2.FrameIndex); @@ -3111,21 +3175,24 @@ MachineBasicBlock * AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + const DebugLoc &DL = MI.getDebugLoc(); Register ResultReg = MI.getOperand(0).getReg(); - if (FuncInfo->isPStateSMRegUsed()) { + if (MF->getRegInfo().use_empty(ResultReg)) { + // Nothing to do. Pseudo erased below. + } else if (Subtarget->hasSME()) { + BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg) + .addImm(AArch64SysReg::SVCR) + .addReg(AArch64::VG, RegState::Implicit); + } else { RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) + BuildMI(*BB, MI, DL, TII->get(AArch64::BL)) .addExternalSymbol(getLibcallName(LC)) .addReg(AArch64::X0, RegState::ImplicitDefine) .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg) + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg) .addReg(AArch64::X0); - } else { - assert(MI.getMF()->getRegInfo().use_empty(ResultReg) && - "Expected no users of the entry pstate.sm!"); } MI.eraseFromParent(); return BB; @@ -4912,6 +4979,18 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, if (DstWidth < SatWidth) return SDValue(); + if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) { + if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) { + SDValue CVTf32 = + DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal); + SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast, + DAG.getValueType(SatVT)); + } + SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal); + return DAG.getBitcast(DstVT, CVTf32); + } + SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT)); SDValue Sat; @@ -5242,6 +5321,56 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, static MVT getSVEContainerType(EVT ContentTy); +SDValue +AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + uint64_t EltSize = Op.getConstantOperandVal(2); + EVT VT = Op.getValueType(); + switch (EltSize) { + case 1: + if (VT != MVT::v16i8 && VT != MVT::nxv16i1) + return SDValue(); + break; + case 2: + if (VT != MVT::v8i8 && VT != MVT::nxv8i1) + return SDValue(); + break; + case 4: + if (VT != MVT::v4i16 && VT != MVT::nxv4i1) + return SDValue(); + break; + case 8: + if (VT != MVT::v2i32 && VT != MVT::nxv2i1) + return SDValue(); + break; + default: + // Other element sizes are incompatible with whilewr/rw, so expand instead + return SDValue(); + } + + SDValue PtrA = Op.getOperand(0); + SDValue PtrB = Op.getOperand(1); + + if (VT.isScalableVT()) + return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); + + // We can use the SVE whilewr/whilerw instruction to lower this + // intrinsic by creating the appropriate sequence of scalable vector + // operations and then extracting a fixed-width subvector from the scalable + // vector. Scalable vector variants are already legal. + EVT ContainerVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements(), true); + EVT WhileVT = ContainerVT.changeElementType(MVT::i1); + + SDValue Mask = + DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); + SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, + DAG.getVectorIdxConstant(0, DL)); +} + SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT OpVT = Op.getValueType(); @@ -6000,6 +6129,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); } + case Intrinsic::aarch64_sve_whilewr_b: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_h: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_s: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(4, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_d: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(8, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_b: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_h: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_s: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(4, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_d: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(8, DL, MVT::i64)); case Intrinsic::aarch64_neon_abs: { EVT Ty = Op.getValueType(); if (Ty == MVT::i64) { @@ -7359,6 +7520,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, default: llvm_unreachable("unimplemented operand"); return SDValue(); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return LowerLOOP_DEPENDENCE_MASK(Op, DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: @@ -7873,6 +8037,39 @@ static bool isPassedInFPR(EVT VT) { (VT.isFloatingPoint() && !VT.isScalableVector()); } +SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, + SelectionDAG &DAG) const { + assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); + SDValue Glue = Chain.getValue(1); + + MachineFunction &MF = DAG.getMachineFunction(); + SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs(); + + // The following conditions are true on entry to an exception handler: + // - PSTATE.SM is 0. + // - PSTATE.ZA is 0. + // - TPIDR2_EL0 is null. + // See: + // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions + // + // Therefore, if the function that contains this exception handler is a + // streaming[-compatible] function, we must re-enable streaming mode. + // + // These mode changes are usually optimized away in catch blocks as they + // occur before the __cxa_begin_catch (which is a non-streaming function), + // but are necessary in some cases (such as for cleanups). + + if (SMEFnAttrs.hasStreamingInterfaceOrBody()) + return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + + if (SMEFnAttrs.hasStreamingCompatibleInterface()) + return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); + + return Chain; +} + SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, @@ -8292,7 +8489,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) { + if (getTM().useNewSMEABILowering()) { + if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) { + SDValue Size; + if (Attrs.hasZAState()) { + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + } else if (Attrs.hasAgnosticZAInterface()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE; + SDValue Callee = DAG.getExternalSymbol( + getLibcallName(LC), getPointerTy(DAG.getDataLayout())); + auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + getLibcallCallingConv(LC), RetTy, Callee, {}); + std::tie(Size, Chain) = LowerCallTo(CLI); + } + if (Size) { + SDValue Buffer = DAG.getNode( + ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + Chain = Buffer.getValue(1); + + Register BufferPtr = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); + Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL, + DAG.getVTList(MVT::Other), Chain); + FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr); + MFI.CreateVariableSizedObject(Align(16), nullptr); + } + } + } else { // Old SME ABI lowering (deprecated): // Create a 16 Byte TPIDR2 object. The dynamic buffer // will be expanded and stored in the static object later using a @@ -8313,9 +8542,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); MFI.CreateVariableSizedObject(Align(16), nullptr); } + SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); Chain = DAG.getNode( AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), - {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0), + /*Num save slices*/ NumZaSaveSlices}); } else if (Attrs.hasAgnosticZAInterface()) { // Call __arm_sme_state_size(). SDValue BufferSize = @@ -8338,7 +8570,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Register BufferPtr = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); FuncInfo->setSMESaveBufferAddr(BufferPtr); - Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer); + Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer); } } @@ -8905,7 +9137,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL, SmallVector<SDValue> Ops = {Chain, MSROp}; unsigned Opcode; if (Condition != AArch64SME::Always) { - FuncInfo->setPStateSMRegUsed(true); Register PStateReg = FuncInfo->getPStateSMReg(); assert(PStateReg.isValid() && "PStateSM Register is invalid"); SDValue PStateSM = @@ -9078,17 +9309,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Determine whether we need any streaming mode changes. SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); + + std::optional<unsigned> ZAMarkerNode; bool UseNewSMEABILowering = getTM().useNewSMEABILowering(); - bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface(); - auto ZAMarkerNode = [&]() -> std::optional<unsigned> { - // TODO: Handle agnostic ZA functions. - if (!UseNewSMEABILowering || IsAgnosticZAFunction) - return std::nullopt; - if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State()) - return std::nullopt; - return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE - : AArch64ISD::INOUT_ZA_USE; - }(); + if (UseNewSMEABILowering) { + if (CallAttrs.requiresLazySave() || + CallAttrs.requiresPreservingAllZAState()) + ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE; + else if (CallAttrs.caller().hasZAState() || + CallAttrs.caller().hasZT0State()) + ZAMarkerNode = AArch64ISD::INOUT_ZA_USE; + } if (IsTailCall) { // Check if it's really possible to do a tail call. @@ -9163,21 +9394,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, }; bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave(); - bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState(); + bool RequiresSaveAllZA = + !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState(); if (RequiresLazySave) { - const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - MachinePointerInfo MPI = - MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue TPIDR2ObjAddr = DAG.getFrameIndex( TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - SDValue NumZaSaveSlicesAddr = - DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, - DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); - SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, - MPI, MVT::i16); Chain = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), @@ -17599,14 +17822,16 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) { bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - assert(!LaneMask && "Unexpected mask on store"); + assert(!LaneMask && GapMask.popcount() == Factor && + "Unexpected mask on store"); auto *VecTy = cast<FixedVectorType>(SVI->getType()); assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); @@ -20868,13 +21093,6 @@ static bool isNegatedInteger(SDValue Op) { return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)); } -static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) { - SDLoc DL(Op); - EVT VT = Op.getValueType(); - SDValue Zero = DAG.getConstant(0, DL, VT); - return DAG.getNode(ISD::SUB, DL, VT, Zero, Op); -} - // Try to fold // // (neg (csel X, Y)) -> (csel (neg X), (neg Y)) @@ -20893,16 +21111,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = CSel.getOperand(0); SDValue N1 = CSel.getOperand(1); - // If both of them is not negations, it's not worth the folding as it + // If neither of them are negations, it's not worth the folding as it // introduces two additional negations while reducing one negation. if (!isNegatedInteger(N0) && !isNegatedInteger(N1)) return SDValue(); - SDValue N0N = getNegatedInteger(N0, DAG); - SDValue N1N = getNegatedInteger(N1, DAG); - SDLoc DL(N); EVT VT = CSel.getValueType(); + + SDValue N0N = DAG.getNegative(N0, DL, VT); + SDValue N1N = DAG.getNegative(N1, DL, VT); + return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2), CSel.getOperand(3)); } @@ -22087,10 +22306,14 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op); } + unsigned PTest = AArch64ISD::PTEST; + if (Cond == AArch64CC::ANY_ACTIVE) + PTest = AArch64ISD::PTEST_ANY; + else if (Cond == AArch64CC::FIRST_ACTIVE) + PTest = AArch64ISD::PTEST_FIRST; + // Set condition code (CC) flags. - SDValue Test = DAG.getNode( - Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST, - DL, MVT::i32, Pg, Op); + SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op); // Convert CC to integer based on requested condition. // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. @@ -22158,6 +22381,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, Zero); } +static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode, + SelectionDAG &DAG) { + if (N->getValueType(0) != MVT::i16) + return SDValue(); + + SDLoc DL(N); + SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1)); + SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT); + return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast); +} + // If a merged operation has no inactive lanes we can relax it to a predicated // or unpredicated operation, which potentially allows better isel (perhaps // using immediate forms) or relaxing register reuse requirements. @@ -22411,6 +22645,26 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_uabd: return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fcvtzs: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtzu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtas: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtau: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtms: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtmu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtns: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtnu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtps: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG); + case Intrinsic::aarch64_neon_fcvtpu: + return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); @@ -22419,7 +22673,7 @@ static SDValue performIntrinsicCombine(SDNode *N, return tryCombineCRC32(0xffff, N, DAG); case Intrinsic::aarch64_sve_saddv: // There is no i64 version of SADDV because the sign is irrelevant. - if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64) + if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64) return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG); else return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG); @@ -24106,6 +24360,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { // Ensure that all elements' bits are either 0s or 1s. ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT); + bool IsLE = DAG.getDataLayout().isLittleEndian(); SmallVector<SDValue, 16> MaskConstants; if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() && VecVT == MVT::v16i8) { @@ -24113,7 +24368,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { // per entry. We split it into two halves, apply the mask, zip the halves to // create 8x 16-bit values, and the perform the vector reduce. for (unsigned Half = 0; Half < 2; ++Half) { - for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) { + for (unsigned I = 0; I < 8; ++I) { + // On big-endian targets, the lane order in sub-byte vector elements + // gets reversed, so we need to flip the bit index. + unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I)); MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32)); } } @@ -24131,8 +24389,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) { } // All other vector sizes. - unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1); - for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) { + unsigned NumEl = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumEl; ++I) { + unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I)); MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64)); } @@ -24444,6 +24703,105 @@ static SDValue performSTORECombine(SDNode *N, return SDValue(); } +static bool +isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) { + if (N->getOpcode() != ISD::CONCAT_VECTORS) + return false; + + unsigned NumParts = N->getNumOperands(); + + // We should be concatenating each sequential result from a + // VECTOR_INTERLEAVE. + SDNode *InterleaveOp = N->getOperand(0).getNode(); + if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE || + InterleaveOp->getNumOperands() != NumParts) + return false; + + for (unsigned I = 0; I < NumParts; I++) + if (N->getOperand(I) != SDValue(InterleaveOp, I)) + return false; + + Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end()); + return true; +} + +static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL, + SDValue WideMask, + unsigned RequiredNumParts) { + if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) { + SmallVector<SDValue, 4> MaskInterleaveOps; + if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(), + MaskInterleaveOps)) + return SDValue(); + + if (MaskInterleaveOps.size() != RequiredNumParts) + return SDValue(); + + // Make sure the inputs to the vector interleave are identical. + if (!llvm::all_equal(MaskInterleaveOps)) + return SDValue(); + + return MaskInterleaveOps[0]; + } + + if (WideMask->getOpcode() != ISD::SPLAT_VECTOR) + return SDValue(); + + ElementCount EC = WideMask.getValueType().getVectorElementCount(); + assert(EC.isKnownMultipleOf(RequiredNumParts) && + "Expected element count divisible by number of parts"); + EC = EC.divideCoefficientBy(RequiredNumParts); + return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC), + WideMask->getOperand(0)); +} + +static SDValue performInterleavedMaskedStoreCombine( + SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N); + SDValue WideValue = MST->getValue(); + + // Bail out if the stored value has an unexpected number of uses, since we'll + // have to perform manual interleaving and may as well just use normal masked + // stores. Also, discard masked stores that are truncating or indexed. + if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) || + !MST->isSimple() || !MST->getOffset().isUndef()) + return SDValue(); + + SmallVector<SDValue, 4> ValueInterleaveOps; + if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(), + ValueInterleaveOps)) + return SDValue(); + + unsigned NumParts = ValueInterleaveOps.size(); + if (NumParts != 2 && NumParts != 4) + return SDValue(); + + // At the moment we're unlikely to see a fixed-width vector interleave as + // we usually generate shuffles instead. + EVT SubVecTy = ValueInterleaveOps[0].getValueType(); + if (!SubVecTy.isScalableVT() || + SubVecTy.getSizeInBits().getKnownMinValue() != 128 || + !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy)) + return SDValue(); + + SDLoc DL(N); + SDValue NarrowMask = + getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts); + if (!NarrowMask) + return SDValue(); + + const Intrinsic::ID IID = + NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4; + SmallVector<SDValue, 8> NewStOps; + NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)}); + NewStOps.append(ValueInterleaveOps); + NewStOps.append({NarrowMask, MST->getBasePtr()}); + return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps); +} + static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -24453,6 +24811,9 @@ static SDValue performMSTORECombine(SDNode *N, SDValue Mask = MST->getMask(); SDLoc DL(N); + if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG)) + return Res; + // If this is a UZP1 followed by a masked store, fold this into a masked // truncating store. We can do this even if this is already a masked // truncstore. @@ -26523,6 +26884,26 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return DAG.getNode(SOpc, DL, N->getValueType(0), Ext); } + // Sign extend of CSET -> CSETM. + if (Opc == AArch64ISD::CSEL && + cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) { + EVT VT = N->getValueType(0); + SDValue TVal = Src.getOperand(0); + SDValue FVal = Src.getOperand(1); + + // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV + if (isNullConstant(TVal) && isOneConstant(FVal)) + return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, + DAG.getAllOnesConstant(DL, VT), Src.getOperand(2), + Src.getOperand(3)); + + // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV + if (isOneConstant(TVal) && isNullConstant(FVal)) + return DAG.getNode(AArch64ISD::CSEL, DL, VT, + DAG.getAllOnesConstant(DL, VT), FVal, + Src.getOperand(2), Src.getOperand(3)); + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -27020,6 +27401,83 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return NVCAST; } +static SDValue performVectorDeinterleaveCombine( + SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + unsigned NumParts = N->getNumOperands(); + if (NumParts != 2 && NumParts != 4) + return SDValue(); + + EVT SubVecTy = N->getValueType(0); + + // At the moment we're unlikely to see a fixed-width vector deinterleave as + // we usually generate shuffles instead. + unsigned MinNumElements = SubVecTy.getVectorMinNumElements(); + if (!SubVecTy.isScalableVector() || + SubVecTy.getSizeInBits().getKnownMinValue() != 128 || + !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy)) + return SDValue(); + + // Make sure each input operand is the correct extract_subvector of the same + // wider vector. + SDValue Op0 = N->getOperand(0); + for (unsigned I = 0; I < NumParts; I++) { + SDValue OpI = N->getOperand(I); + if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR || + OpI->getOperand(0) != Op0->getOperand(0)) + return SDValue(); + if (OpI->getConstantOperandVal(1) != (I * MinNumElements)) + return SDValue(); + } + + // Normal loads are currently already handled by the InterleavedAccessPass so + // we don't expect to see them here. Bail out if the masked load has an + // unexpected number of uses, since we want to avoid a situation where we have + // both deinterleaving loads and normal loads in the same block. Also, discard + // masked loads that are extending, indexed, have an unexpected offset or have + // an unsupported passthru value until we find a valid use case. + auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0)); + if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) || + !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) || + !MaskedLoad->getOffset().isUndef() || + (!MaskedLoad->getPassThru()->isUndef() && + !isZerosVector(MaskedLoad->getPassThru().getNode()))) + return SDValue(); + + // Now prove that the mask is an interleave of identical masks. + SDLoc DL(N); + SDValue NarrowMask = + getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts); + if (!NarrowMask) + return SDValue(); + + const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret + : Intrinsic::aarch64_sve_ld4_sret; + SDValue NewLdOps[] = {MaskedLoad->getChain(), + DAG.getConstant(IID, DL, MVT::i32), NarrowMask, + MaskedLoad->getBasePtr()}; + SDValue Res; + if (NumParts == 2) + Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + {SubVecTy, SubVecTy, MVT::Other}, NewLdOps); + else + Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other}, + NewLdOps); + + // We can now generate a structured load! + SmallVector<SDValue, 4> ResOps(NumParts); + for (unsigned Idx = 0; Idx < NumParts; Idx++) + ResOps[Idx] = SDValue(Res.getNode(), Idx); + + // Replace uses of the original chain result with the new chain result. + DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1), + SDValue(Res.getNode(), NumParts)); + return DCI.CombineTo(N, ResOps, false); +} + /// If the operand is a bitwise AND with a constant RHS, and the shift has a /// constant RHS and is the only use, we can pull it out of the shift, i.e. /// @@ -27088,6 +27546,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; + case ISD::VECTOR_DEINTERLEAVE: + return performVectorDeinterleaveCombine(N, DCI, DAG); case ISD::VECREDUCE_AND: case ISD::VECREDUCE_OR: case ISD::VECREDUCE_XOR: @@ -30640,10 +31100,41 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } +bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode( + SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, + bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { + + // TODO: Add more target nodes. + switch (Op.getOpcode()) { + case AArch64ISD::MOVI: + case AArch64ISD::MOVIedit: + case AArch64ISD::MOVImsl: + case AArch64ISD::MOVIshift: + case AArch64ISD::MVNImsl: + case AArch64ISD::MVNIshift: + case AArch64ISD::VASHR: + case AArch64ISD::VLSHR: + case AArch64ISD::VSHL: + return false; + } + return TargetLowering::canCreateUndefOrPoisonForTargetNode( + Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); +} + bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const { return Op.getOpcode() == AArch64ISD::DUP || Op.getOpcode() == AArch64ISD::MOVI || Op.getOpcode() == AArch64ISD::MOVIshift || + Op.getOpcode() == AArch64ISD::MOVImsl || + Op.getOpcode() == AArch64ISD::MOVIedit || + Op.getOpcode() == AArch64ISD::MVNIshift || + Op.getOpcode() == AArch64ISD::MVNImsl || + // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0), + // ISel will select fmov(mov i64 0x8000000000000000), resulting in a + // fmov from fpr to gpr, which is more expensive than fneg(movi(0)) + (Op.getOpcode() == ISD::FNEG && + Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit && + Op.getOperand(0).getConstantOperandVal(0) == 0) || (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && Op.getOperand(0).getOpcode() == AArch64ISD::DUP) || TargetLowering::isTargetCanonicalConstantNode(Op); |
