summary refs log tree commit diff
path: root/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp 617
1 files changed, 554 insertions, 63 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d70a46b0e893..5ffaf2c49b4c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::SHL);
+ setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@@ -1918,6 +1919,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ // Handle non-aliasing elements mask
+ if (Subtarget->hasSVE2() ||
+ (Subtarget->hasSME() && Subtarget->isStreaming())) {
+ // FIXME: Support wider fixed-length types when msve-vector-bits is used.
+ for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
+ setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom);
+ setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom);
+ }
+ for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
+ setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom);
+ setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom);
+ }
+ }
+
// Handle operations that are only available in non-streaming SVE mode.
if (Subtarget->isSVEAvailable()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
@@ -2585,6 +2600,30 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Known = Known.intersectWith(Known2);
break;
}
+ case AArch64ISD::CSNEG:
+ case AArch64ISD::CSINC:
+ case AArch64ISD::CSINV: {
+ KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+
+ // The result is either:
+ // CSINC: KnownOp0 or KnownOp1 + 1
+ // CSINV: KnownOp0 or ~KnownOp1
+ // CSNEG: KnownOp0 or KnownOp1 * -1
+ if (Op.getOpcode() == AArch64ISD::CSINC)
+ KnownOp1 = KnownBits::add(
+ KnownOp1,
+ KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
+ else if (Op.getOpcode() == AArch64ISD::CSINV)
+ std::swap(KnownOp1.Zero, KnownOp1.One);
+ else if (Op.getOpcode() == AArch64ISD::CSNEG)
+ KnownOp1 =
+ KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes(
+ Op.getScalarValueSizeInBits())));
+
+ Known = KnownOp0.intersectWith(KnownOp1);
+ break;
+ }
case AArch64ISD::BICi: {
// Compute the bit cleared value.
APInt Mask =
@@ -2626,6 +2665,32 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
<< Op->getConstantOperandVal(1)));
break;
}
+ case AArch64ISD::MOVImsl: {
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
+ Known = KnownBits::makeConstant(APInt(
+ Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
+ break;
+ }
+ case AArch64ISD::MOVIedit: {
+ Known = KnownBits::makeConstant(APInt(
+ Known.getBitWidth(),
+ AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
+ break;
+ }
+ case AArch64ISD::MVNIshift: {
+ Known = KnownBits::makeConstant(
+ APInt(Known.getBitWidth(),
+ ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
+ /*isSigned*/ false, /*implicitTrunc*/ true));
+ break;
+ }
+ case AArch64ISD::MVNImsl: {
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
+ Known = KnownBits::makeConstant(
+ APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
+ /*isSigned*/ false, /*implicitTrunc*/ true));
+ break;
+ }
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
if (!Subtarget->isTargetILP32())
@@ -2984,21 +3049,20 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
if (TPIDR2.Uses > 0) {
+ // Note: This case just needs to do `SVL << 48`. It is not implemented as we
+ // generally don't support big-endian SVE/SME.
+ if (!Subtarget->isLittleEndian())
+ reportFatalInternalError(
+ "TPIDR2 block initialization is not supported on big-endian targets");
+
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- // Store the buffer pointer to the TPIDR2 stack object.
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+ // Store buffer pointer and num_za_save_slices.
+ // Bytes 10-15 are implicitly zeroed.
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
.addReg(MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(1).getReg())
.addFrameIndex(TPIDR2.FrameIndex)
.addImm(0);
- // Set the reserved bytes (10-15) to zero
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
- .addReg(AArch64::WZR)
- .addFrameIndex(TPIDR2.FrameIndex)
- .addImm(5);
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
- .addReg(AArch64::WZR)
- .addFrameIndex(TPIDR2.FrameIndex)
- .addImm(3);
} else
MFI.RemoveStackObject(TPIDR2.FrameIndex);
@@ -3111,21 +3175,24 @@ MachineBasicBlock *
AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
Register ResultReg = MI.getOperand(0).getReg();
- if (FuncInfo->isPStateSMRegUsed()) {
+ if (MF->getRegInfo().use_empty(ResultReg)) {
+ // Nothing to do. Pseudo erased below.
+ } else if (Subtarget->hasSME()) {
+ BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
+ .addImm(AArch64SysReg::SVCR)
+ .addReg(AArch64::VG, RegState::Implicit);
+ } else {
RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
+ BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
.addExternalSymbol(getLibcallName(LC))
.addReg(AArch64::X0, RegState::ImplicitDefine)
.addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg)
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
.addReg(AArch64::X0);
- } else {
- assert(MI.getMF()->getRegInfo().use_empty(ResultReg) &&
- "Expected no users of the entry pstate.sm!");
}
MI.eraseFromParent();
return BB;
@@ -4912,6 +4979,18 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
if (DstWidth < SatWidth)
return SDValue();
+ if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
+ if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
+ SDValue CVTf32 =
+ DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
+ SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
+ DAG.getValueType(SatVT));
+ }
+ SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
+ return DAG.getBitcast(DstVT, CVTf32);
+ }
+
SDValue NativeCvt =
DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
SDValue Sat;
@@ -5242,6 +5321,56 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
static MVT getSVEContainerType(EVT ContentTy);
+SDValue
+AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
+ SelectionDAG &DAG) const { // Custom lowering for the loop-dependence mask nodes; an empty SDValue is returned for unsupported size/type pairs.
+ SDLoc DL(Op);
+ uint64_t EltSize = Op.getConstantOperandVal(2); // Operand 2 is a constant element size.
+ EVT VT = Op.getValueType();
+ switch (EltSize) { // Each supported size is paired with exactly one fixed-length and one scalable result type.
+ case 1:
+ if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
+ return SDValue();
+ break;
+ case 2:
+ if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
+ return SDValue();
+ break;
+ case 4:
+ if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
+ return SDValue();
+ break;
+ case 8:
+ if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
+ return SDValue();
+ break;
+ default:
+ // Other element sizes are incompatible with whilewr/whilerw, so expand instead.
+ return SDValue();
+ }
+ // Operands 0 and 1 are the two pointers the mask is computed from.
+ SDValue PtrA = Op.getOperand(0);
+ SDValue PtrB = Op.getOperand(1);
+ // Scalable result types need no conversion: return the same opcode with the same operands.
+ if (VT.isScalableVT())
+ return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
+
+ // We can use the SVE whilewr/whilerw instruction to lower this
+ // intrinsic by creating the appropriate sequence of scalable vector
+ // operations and then extracting a fixed-width subvector from the scalable
+ // vector. Scalable vector variants are already legal.
+ EVT ContainerVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements(), true);
+ EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
+ // Build the mask at the i1 container type, sign-extend it to the container type, then extract the fixed-width result at index 0.
+ SDValue Mask =
+ DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
+ SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
+ DAG.getVectorIdxConstant(0, DL));
+}
+
SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
SelectionDAG &DAG) const {
EVT OpVT = Op.getValueType();
@@ -6000,6 +6129,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
}
+ case Intrinsic::aarch64_sve_whilewr_b:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(1, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilewr_h:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(2, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilewr_s:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(4, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilewr_d:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(8, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_b:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(1, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_h:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(2, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_s:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(4, DL, MVT::i64));
+ case Intrinsic::aarch64_sve_whilerw_d:
+ return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(8, DL, MVT::i64));
case Intrinsic::aarch64_neon_abs: {
EVT Ty = Op.getValueType();
if (Ty == MVT::i64) {
@@ -7359,6 +7520,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
default:
llvm_unreachable("unimplemented operand");
return SDValue();
+ case ISD::LOOP_DEPENDENCE_RAW_MASK:
+ case ISD::LOOP_DEPENDENCE_WAR_MASK:
+ return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
case ISD::BITCAST:
return LowerBITCAST(Op, DAG);
case ISD::GlobalAddress:
@@ -7873,6 +8037,39 @@ static bool isPassedInFPR(EVT VT) {
(VT.isFloatingPoint() && !VT.isScalableVector());
}
+SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
+ SelectionDAG &DAG) const { // Emits the streaming-mode change (if any) needed on entry to an exception handler and returns the resulting chain.
+ assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
+ SDValue Glue = Chain.getValue(1); // Result 1 of the entry token is forwarded as the glue operand below.
+ // The SME attributes recorded for this function decide which mode change is emitted.
+ MachineFunction &MF = DAG.getMachineFunction();
+ SMEAttrs SMEFnAttrs = MF.getInfo<AArch64FunctionInfo>()->getSMEFnAttrs();
+
+ // The following conditions are true on entry to an exception handler:
+ // - PSTATE.SM is 0.
+ // - PSTATE.ZA is 0.
+ // - TPIDR2_EL0 is null.
+ // See:
+ // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
+ //
+ // Therefore, if the function that contains this exception handler is a
+ // streaming[-compatible] function, we must re-enable streaming mode.
+ //
+ // These mode changes are usually optimized away in catch blocks as they
+ // occur before the __cxa_begin_catch (which is a non-streaming function),
+ // but are necessary in some cases (such as for cleanups).
+ // Streaming interface or body: the enable is emitted unconditionally (AArch64SME::Always).
+ if (SMEFnAttrs.hasStreamingInterfaceOrBody())
+ return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
+ /*Glue*/ Glue, AArch64SME::Always);
+ // Streaming-compatible interface: the enable is conditional (AArch64SME::IfCallerIsStreaming).
+ if (SMEFnAttrs.hasStreamingCompatibleInterface())
+ return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
+ AArch64SME::IfCallerIsStreaming);
+ // Neither attribute applies: no mode change is emitted and the incoming chain is returned as-is.
+ return Chain;
+}
+
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -8292,7 +8489,39 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- if (!getTM().useNewSMEABILowering() || Attrs.hasAgnosticZAInterface()) {
+ if (getTM().useNewSMEABILowering()) {
+ if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
+ SDValue Size;
+ if (Attrs.hasZAState()) {
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
+ Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
+ } else if (Attrs.hasAgnosticZAInterface()) {
+ RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
+ SDValue Callee = DAG.getExternalSymbol(
+ getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
+ auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
+ getLibcallCallingConv(LC), RetTy, Callee, {});
+ std::tie(Size, Chain) = LowerCallTo(CLI);
+ }
+ if (Size) {
+ SDValue Buffer = DAG.getNode(
+ ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
+ {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
+ Chain = Buffer.getValue(1);
+
+ Register BufferPtr =
+ MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+ Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
+ DAG.getVTList(MVT::Other), Chain);
+ FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
+ }
+ }
+ } else {
// Old SME ABI lowering (deprecated):
// Create a 16 Byte TPIDR2 object. The dynamic buffer
// will be expanded and stored in the static object later using a
@@ -8313,9 +8542,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
{Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
MFI.CreateVariableSizedObject(Align(16), nullptr);
}
+ SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
+ DAG.getConstant(1, DL, MVT::i32));
Chain = DAG.getNode(
AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
- {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
+ {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
+ /*Num save slices*/ NumZaSaveSlices});
} else if (Attrs.hasAgnosticZAInterface()) {
// Call __arm_sme_state_size().
SDValue BufferSize =
@@ -8338,7 +8570,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
Register BufferPtr =
MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
FuncInfo->setSMESaveBufferAddr(BufferPtr);
- Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
+ Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
}
}
@@ -8905,7 +9137,6 @@ SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
SmallVector<SDValue> Ops = {Chain, MSROp};
unsigned Opcode;
if (Condition != AArch64SME::Always) {
- FuncInfo->setPStateSMRegUsed(true);
Register PStateReg = FuncInfo->getPStateSMReg();
assert(PStateReg.isValid() && "PStateSM Register is invalid");
SDValue PStateSM =
@@ -9078,17 +9309,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Determine whether we need any streaming mode changes.
SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI);
+
+ std::optional<unsigned> ZAMarkerNode;
bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
- bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
- auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
- // TODO: Handle agnostic ZA functions.
- if (!UseNewSMEABILowering || IsAgnosticZAFunction)
- return std::nullopt;
- if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
- return std::nullopt;
- return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
- : AArch64ISD::INOUT_ZA_USE;
- }();
+ if (UseNewSMEABILowering) {
+ if (CallAttrs.requiresLazySave() ||
+ CallAttrs.requiresPreservingAllZAState())
+ ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
+ else if (CallAttrs.caller().hasZAState() ||
+ CallAttrs.caller().hasZT0State())
+ ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
+ }
if (IsTailCall) {
// Check if it's really possible to do a tail call.
@@ -9163,21 +9394,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
};
bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
- bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+ bool RequiresSaveAllZA =
+ !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
if (RequiresLazySave) {
- const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
- MachinePointerInfo MPI =
- MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
TPIDR2.FrameIndex,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
- SDValue NumZaSaveSlicesAddr =
- DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
- DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
- SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
- DAG.getConstant(1, DL, MVT::i32));
- Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
- MPI, MVT::i16);
Chain = DAG.getNode(
ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
@@ -17599,14 +17822,16 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
- assert(!LaneMask && "Unexpected mask on store");
+ assert(!LaneMask && GapMask.popcount() == Factor &&
+ "Unexpected mask on store");
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
@@ -20868,13 +21093,6 @@ static bool isNegatedInteger(SDValue Op) {
return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
}
-static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- SDValue Zero = DAG.getConstant(0, DL, VT);
- return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
-}
-
// Try to fold
//
// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
@@ -20893,16 +21111,17 @@ static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = CSel.getOperand(0);
SDValue N1 = CSel.getOperand(1);
- // If both of them is not negations, it's not worth the folding as it
+ // If neither of them are negations, it's not worth the folding as it
// introduces two additional negations while reducing one negation.
if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
return SDValue();
- SDValue N0N = getNegatedInteger(N0, DAG);
- SDValue N1N = getNegatedInteger(N1, DAG);
-
SDLoc DL(N);
EVT VT = CSel.getValueType();
+
+ SDValue N0N = DAG.getNegative(N0, DL, VT);
+ SDValue N1N = DAG.getNegative(N1, DL, VT);
+
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
CSel.getOperand(3));
}
@@ -22087,10 +22306,14 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
}
+ unsigned PTest = AArch64ISD::PTEST;
+ if (Cond == AArch64CC::ANY_ACTIVE)
+ PTest = AArch64ISD::PTEST_ANY;
+ else if (Cond == AArch64CC::FIRST_ACTIVE)
+ PTest = AArch64ISD::PTEST_FIRST;
+
// Set condition code (CC) flags.
- SDValue Test = DAG.getNode(
- Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
- DL, MVT::i32, Pg, Op);
+ SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
// Convert CC to integer based on requested condition.
// NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
@@ -22158,6 +22381,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
Zero);
}
+static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
+ SelectionDAG &DAG) { // Rewrites an i16-result node as Opcode producing f32, bitcast to i32 and truncated to i16; returns an empty SDValue otherwise.
+ if (N->getValueType(0) != MVT::i16) // Only i16 results are rewritten.
+ return SDValue();
+ // NOTE(review): the type of operand 1 is not checked here; presumably it is f16, per the function name. Confirm against callers.
+ SDLoc DL(N);
+ SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1)); // Operand 1 is the value being converted.
+ SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
+}
+
// If a merged operation has no inactive lanes we can relax it to a predicated
// or unpredicated operation, which potentially allows better isel (perhaps
// using immediate forms) or relaxing register reuse requirements.
@@ -22411,6 +22645,26 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_neon_uabd:
return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
+ case Intrinsic::aarch64_neon_fcvtzs:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtzu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtas:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtau:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtms:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtmu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtns:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtnu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtps:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
+ case Intrinsic::aarch64_neon_fcvtpu:
+ return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
@@ -22419,7 +22673,7 @@ static SDValue performIntrinsicCombine(SDNode *N,
return tryCombineCRC32(0xffff, N, DAG);
case Intrinsic::aarch64_sve_saddv:
// There is no i64 version of SADDV because the sign is irrelevant.
- if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
+ if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
else
return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
@@ -24106,6 +24360,7 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
// Ensure that all elements' bits are either 0s or 1s.
ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
SmallVector<SDValue, 16> MaskConstants;
if (DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable() &&
VecVT == MVT::v16i8) {
@@ -24113,7 +24368,10 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
// per entry. We split it into two halves, apply the mask, zip the halves to
// create 8x 16-bit values, and the perform the vector reduce.
for (unsigned Half = 0; Half < 2; ++Half) {
- for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
+ for (unsigned I = 0; I < 8; ++I) {
+ // On big-endian targets, the lane order in sub-byte vector elements
+ // gets reversed, so we need to flip the bit index.
+ unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
}
}
@@ -24131,8 +24389,9 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
}
// All other vector sizes.
- unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
- for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
+ unsigned NumEl = VecVT.getVectorNumElements();
+ for (unsigned I = 0; I < NumEl; ++I) {
+ unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
}
@@ -24444,6 +24703,105 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
+static bool
+isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) { // Returns true if N is a CONCAT_VECTORS whose operand I is result I of one VECTOR_INTERLEAVE; on success that node's operands are appended to Ops.
+ if (N->getOpcode() != ISD::CONCAT_VECTORS)
+ return false;
+
+ unsigned NumParts = N->getNumOperands();
+
+ // We should be concatenating each sequential result from a
+ // VECTOR_INTERLEAVE.
+ SDNode *InterleaveOp = N->getOperand(0).getNode();
+ if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
+ InterleaveOp->getNumOperands() != NumParts)
+ return false;
+ // Every concat operand must be the matching result of that same interleave node, in order.
+ for (unsigned I = 0; I < NumParts; I++)
+ if (N->getOperand(I) != SDValue(InterleaveOp, I))
+ return false;
+ // Ops is only touched on the success path.
+ Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
+ return true;
+}
+
+static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL, // NOTE(review): DL appears to be read-only here; a const reference would presumably suffice.
+ SDValue WideMask,
+ unsigned RequiredNumParts) { // Derives the per-part mask from WideMask, or returns an empty SDValue when WideMask is not one of the two accepted forms.
+ if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
+ SmallVector<SDValue, 4> MaskInterleaveOps;
+ if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
+ MaskInterleaveOps))
+ return SDValue();
+ // The interleave must have exactly RequiredNumParts inputs.
+ if (MaskInterleaveOps.size() != RequiredNumParts)
+ return SDValue();
+
+ // Make sure the inputs to the vector interleave are identical.
+ if (!llvm::all_equal(MaskInterleaveOps))
+ return SDValue();
+ // All inputs are equal, so the first one is returned as the narrow mask.
+ return MaskInterleaveOps[0];
+ }
+ // The only other accepted form is a splat, rebuilt below with the element count divided by RequiredNumParts.
+ if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
+ return SDValue();
+
+ ElementCount EC = WideMask.getValueType().getVectorElementCount();
+ assert(EC.isKnownMultipleOf(RequiredNumParts) &&
+ "Expected element count divisible by number of parts");
+ EC = EC.divideCoefficientBy(RequiredNumParts);
+ return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
+ WideMask->getOperand(0));
+}
+
+static SDValue performInterleavedMaskedStoreCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Tries to turn a masked store of a concatenated VECTOR_INTERLEAVE into an SVE st2/st4 intrinsic node; returns an empty SDValue if any check fails.
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+
+ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ SDValue WideValue = MST->getValue();
+
+ // Bail out if the stored value has an unexpected number of uses, since we'll
+ // have to perform manual interleaving and may as well just use normal masked
+ // stores. Also, discard masked stores that are truncating or indexed.
+ if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
+ !MST->isSimple() || !MST->getOffset().isUndef())
+ return SDValue();
+ // The stored value must be the in-order concat of a single interleave's results.
+ SmallVector<SDValue, 4> ValueInterleaveOps;
+ if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
+ ValueInterleaveOps))
+ return SDValue();
+ // Only two- and four-part interleaves are handled (st2 and st4 below).
+ unsigned NumParts = ValueInterleaveOps.size();
+ if (NumParts != 2 && NumParts != 4)
+ return SDValue();
+
+ // At the moment we're unlikely to see a fixed-width vector interleave as
+ // we usually generate shuffles instead. The part type must also be legal with a known minimum size of 128 bits.
+ EVT SubVecTy = ValueInterleaveOps[0].getValueType();
+ if (!SubVecTy.isScalableVT() ||
+ SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+ return SDValue();
+ // The store mask must reduce to one narrow mask shared by every part.
+ SDLoc DL(N);
+ SDValue NarrowMask =
+ getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
+ if (!NarrowMask)
+ return SDValue();
+ // Operand order of the new node: chain, intrinsic ID, the interleave inputs, the narrow mask, the base pointer.
+ const Intrinsic::ID IID =
+ NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
+ SmallVector<SDValue, 8> NewStOps;
+ NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
+ NewStOps.append(ValueInterleaveOps);
+ NewStOps.append({NarrowMask, MST->getBasePtr()});
+ return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
+}
+
static SDValue performMSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
@@ -24453,6 +24811,9 @@ static SDValue performMSTORECombine(SDNode *N,
SDValue Mask = MST->getMask();
SDLoc DL(N);
+ if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
+ return Res;
+
// If this is a UZP1 followed by a masked store, fold this into a masked
// truncating store. We can do this even if this is already a masked
// truncstore.
@@ -26523,6 +26884,26 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
}
+ // Sign extend of CSET -> CSETM.
+ if (Opc == AArch64ISD::CSEL &&
+ cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
+ EVT VT = N->getValueType(0);
+ SDValue TVal = Src.getOperand(0);
+ SDValue FVal = Src.getOperand(1);
+
+ // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
+ if (isNullConstant(TVal) && isOneConstant(FVal))
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
+ DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
+ Src.getOperand(3));
+
+ // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
+ if (isOneConstant(TVal) && isNullConstant(FVal))
+ return DAG.getNode(AArch64ISD::CSEL, DL, VT,
+ DAG.getAllOnesConstant(DL, VT), FVal,
+ Src.getOperand(2), Src.getOperand(3));
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -27020,6 +27401,83 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
+static SDValue performVectorDeinterleaveCombine(
+ SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Tries to replace a deinterleave of the extracted parts of one masked load with an SVE ld2/ld4 intrinsic node; returns an empty SDValue if any check fails.
+ if (!DCI.isBeforeLegalize())
+ return SDValue();
+ // Only two- and four-way deinterleaves are handled (ld2 and ld4 below).
+ unsigned NumParts = N->getNumOperands();
+ if (NumParts != 2 && NumParts != 4)
+ return SDValue();
+
+ EVT SubVecTy = N->getValueType(0);
+
+ // At the moment we're unlikely to see a fixed-width vector deinterleave as
+ // we usually generate shuffles instead. The part type must also be legal with a known minimum size of 128 bits.
+ unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
+ if (!SubVecTy.isScalableVector() ||
+ SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
+ return SDValue();
+
+ // Make sure each input operand is the correct extract_subvector of the same
+ // wider vector.
+ SDValue Op0 = N->getOperand(0);
+ for (unsigned I = 0; I < NumParts; I++) {
+ SDValue OpI = N->getOperand(I);
+ if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ OpI->getOperand(0) != Op0->getOperand(0))
+ return SDValue();
+ if (OpI->getConstantOperandVal(1) != (I * MinNumElements)) // Part I must be extracted at index I * MinNumElements.
+ return SDValue();
+ }
+
+ // Normal loads are currently already handled by the InterleavedAccessPass so
+ // we don't expect to see them here. Bail out if the masked load has an
+ // unexpected number of uses, since we want to avoid a situation where we have
+ // both deinterleaving loads and normal loads in the same block. Also, discard
+ // masked loads that are extending, indexed, have an unexpected offset or have
+ // an unsupported passthru value until we find a valid use case.
+ auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
+ if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
+ !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
+ !MaskedLoad->getOffset().isUndef() ||
+ (!MaskedLoad->getPassThru()->isUndef() &&
+ !isZerosVector(MaskedLoad->getPassThru().getNode())))
+ return SDValue();
+
+ // Now prove that the mask is an interleave of identical masks (or a splat).
+ SDLoc DL(N);
+ SDValue NarrowMask =
+ getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
+ if (!NarrowMask)
+ return SDValue();
+ // Build the structured load. Operands: chain, intrinsic ID, narrow mask, base pointer. Results: one SubVecTy per part, then a chain.
+ const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
+ : Intrinsic::aarch64_sve_ld4_sret;
+ SDValue NewLdOps[] = {MaskedLoad->getChain(),
+ DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
+ MaskedLoad->getBasePtr()};
+ SDValue Res;
+ if (NumParts == 2)
+ Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
+ else
+ Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+ {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
+ NewLdOps);
+
+ // Collect the per-part results of the structured load; they replace N's results.
+ SmallVector<SDValue, 4> ResOps(NumParts);
+ for (unsigned Idx = 0; Idx < NumParts; Idx++)
+ ResOps[Idx] = SDValue(Res.getNode(), Idx);
+
+ // Replace uses of the original chain result with the new chain result (result index NumParts of the new node).
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
+ SDValue(Res.getNode(), NumParts));
+ return DCI.CombineTo(N, ResOps, false);
+}
+
/// If the operand is a bitwise AND with a constant RHS, and the shift has a
/// constant RHS and is the only use, we can pull it out of the shift, i.e.
///
@@ -27088,6 +27546,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
+ case ISD::VECTOR_DEINTERLEAVE:
+ return performVectorDeinterleaveCombine(N, DCI, DAG);
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:
@@ -30640,10 +31100,41 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
+bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
+ SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+ bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
+ // Returns false for the AArch64 opcodes listed below; every other opcode is forwarded unchanged to the base TargetLowering implementation.
+ // TODO: Add more target nodes.
+ switch (Op.getOpcode()) {
+ case AArch64ISD::MOVI:
+ case AArch64ISD::MOVIedit:
+ case AArch64ISD::MOVImsl:
+ case AArch64ISD::MOVIshift:
+ case AArch64ISD::MVNImsl:
+ case AArch64ISD::MVNIshift:
+ case AArch64ISD::VASHR:
+ case AArch64ISD::VLSHR:
+ case AArch64ISD::VSHL:
+ return false;
+ }
+ return TargetLowering::canCreateUndefOrPoisonForTargetNode(
+ Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
+}
+
bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
return Op.getOpcode() == AArch64ISD::DUP ||
Op.getOpcode() == AArch64ISD::MOVI ||
Op.getOpcode() == AArch64ISD::MOVIshift ||
+ Op.getOpcode() == AArch64ISD::MOVImsl ||
+ Op.getOpcode() == AArch64ISD::MOVIedit ||
+ Op.getOpcode() == AArch64ISD::MVNIshift ||
+ Op.getOpcode() == AArch64ISD::MVNImsl ||
+ // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
+ // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
+ // fmov from fpr to gpr, which is more expensive than fneg(movi(0))
+ (Op.getOpcode() == ISD::FNEG &&
+ Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
+ Op.getOperand(0).getConstantOperandVal(0) == 0) ||
(Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
TargetLowering::isTargetCanonicalConstantNode(Op);