diff options
| author | Mingming Liu <mingmingl@google.com> | 2025-09-10 15:25:31 -0700 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-09-10 15:25:31 -0700 |
| commit | 1417dafa1db9cb1b2b09438aa9f53ea5ab6e36e2 (patch) | |
| tree | 57f4b1f313c8cf74eed8819870f39c36ea263c68 /llvm/lib/Target/LoongArch | |
| parent | 898b813bc8a6d0276bf0f4769f5f2f64b34e632d (diff) | |
| parent | b8cefcb601ddaa18482555c4ff363c01a270c2fe (diff) | |
Merge branch 'main' into users/mingmingl-llvm/samplefdo-profile-formatusers/mingmingl-llvm/samplefdo-profile-format
Diffstat (limited to 'llvm/lib/Target/LoongArch')
16 files changed, 732 insertions, 116 deletions
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td index 39948b31fb9b..6497ff999f6f 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.td +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -39,7 +39,7 @@ def IsLA32 "LA32 Basic Integer and Privilege Instruction Set">; defvar LA32 = DefaultMode; -def LA64 : HwMode<"+64bit", [IsLA64]>; +def LA64 : HwMode<[IsLA64]>; // Single Precision floating point def FeatureBasicF diff --git a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td index 9844163163a5..7dcf65ce2b82 100644 --- a/llvm/lib/Target/LoongArch/LoongArchCallingConv.td +++ b/llvm/lib/Target/LoongArch/LoongArchCallingConv.td @@ -21,3 +21,7 @@ def CSR_ILP32D_LP64D // Needed for implementation of LoongArchRegisterInfo::getNoPreservedMask() def CSR_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_MostRegs : CalleeSavedRegs<(add CSR_ILP32S_LP64S, + (sequence "R%u", 4, 11), + (sequence "R%u", 16, 19))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index 36c3011be2b9..c45975431d83 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -10,6 +10,9 @@ // //===----------------------------------------------------------------------===// +def NotBoolXor : PatFrags<(ops node:$val), + [(xor node:$val, -1), (xor node:$val, 1)]>; + //===----------------------------------------------------------------------===// // LoongArch specific DAG Nodes. //===----------------------------------------------------------------------===// @@ -22,6 +25,9 @@ def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +// ISD::BRCOND is custom-lowered to LoongArchISD::BRCOND for floating-point +// comparisons to prevent recursive lowering. +def loongarch_brcond : SDNode<"LoongArchISD::BRCOND", SDTBrcond, [SDNPHasChain]>; def loongarch_movgr2fr_w_la64 : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; def loongarch_movfr2gr_s_la64 @@ -208,16 +214,18 @@ def : PatFPSetcc<SETUO, FCMP_CUN_S, FPR32>; def : PatFPSetcc<SETLT, FCMP_CLT_S, FPR32>; multiclass PatFPBrcond<CondCode cc, LAInst CmpInst, RegisterClass RegTy> { - def : Pat<(brcond (xor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), -1), - bb:$imm21), + def : Pat<(loongarch_brcond (NotBoolXor (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc))), + bb:$imm21), (BCEQZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>; - def : Pat<(brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21), + def : Pat<(loongarch_brcond (GRLenVT (setcc RegTy:$fj, RegTy:$fk, cc)), bb:$imm21), (BCNEZ (CmpInst RegTy:$fj, RegTy:$fk), bb:$imm21)>; } defm : PatFPBrcond<SETOEQ, FCMP_CEQ_S, FPR32>; +defm : PatFPBrcond<SETEQ , FCMP_CEQ_S, FPR32>; defm : PatFPBrcond<SETOLT, FCMP_CLT_S, FPR32>; defm : PatFPBrcond<SETOLE, FCMP_CLE_S, FPR32>; +defm : PatFPBrcond<SETLE, FCMP_CLE_S, FPR32>; defm : PatFPBrcond<SETONE, FCMP_CNE_S, FPR32>; defm : PatFPBrcond<SETO, FCMP_COR_S, FPR32>; defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_S, FPR32>; diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 616640152c8d..965ad8a0a35c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -184,8 +184,10 @@ def : PatFPSetcc<SETUO, FCMP_CUN_D, FPR64>; def : PatFPSetcc<SETLT, FCMP_CLT_D, FPR64>; defm : PatFPBrcond<SETOEQ, FCMP_CEQ_D, FPR64>; +defm : PatFPBrcond<SETEQ, FCMP_CEQ_D, FPR64>; defm : PatFPBrcond<SETOLT, FCMP_CLT_D, FPR64>; defm : PatFPBrcond<SETOLE, FCMP_CLE_D, FPR64>; +defm : PatFPBrcond<SETLE, FCMP_CLE_D, FPR64>; defm : PatFPBrcond<SETONE, FCMP_CNE_D, FPR64>; defm : PatFPBrcond<SETO, FCMP_COR_D, FPR64>; defm : PatFPBrcond<SETUEQ, FCMP_CUEQ_D, FPR64>; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index 71d0263fe376..07e722b9a659 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -114,7 +114,7 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { unsigned SplatBitSize; bool HasAnyUndefs; unsigned Op; - EVT ViaVecTy; + EVT ResTy = BVN->getValueType(0); bool Is128Vec = BVN->getValueType(0).is128BitVector(); bool Is256Vec = BVN->getValueType(0).is256BitVector(); @@ -129,28 +129,25 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { break; case 8: Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B; - ViaVecTy = Is256Vec ? MVT::v32i8 : MVT::v16i8; break; case 16: Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H; - ViaVecTy = Is256Vec ? MVT::v16i16 : MVT::v8i16; break; case 32: Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W; - ViaVecTy = Is256Vec ? MVT::v8i32 : MVT::v4i32; break; case 64: Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D; - ViaVecTy = Is256Vec ? MVT::v4i64 : MVT::v2i64; break; } SDNode *Res; // If we have a signed 10 bit integer, we can splat it directly. if (SplatValue.isSignedIntN(10)) { - SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL, - ViaVecTy.getVectorElementType()); - Res = CurDAG->getMachineNode(Op, DL, ViaVecTy, Imm); + EVT EleType = ResTy.getVectorElementType(); + APInt Val = SplatValue.sextOrTrunc(EleType.getSizeInBits()); + SDValue Imm = CurDAG->getTargetConstant(Val, DL, EleType); + Res = CurDAG->getMachineNode(Op, DL, ResTy, Imm); ReplaceNode(Node, Res); return; } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 5b2d185594f4..634914d3b3fd 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -127,6 +127,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BR_CC, GRLenVT, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::SELECT_CC, GRLenVT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, GRLenVT, Expand); @@ -340,6 +341,14 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {MVT::v16i8, MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v8i16, MVT::v4i16, MVT::v2i16, MVT::v4i32, MVT::v2i32, MVT::v2i64}) { setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); + setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); } } @@ -377,6 +386,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); + setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); } for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32}) setOperationAction(ISD::BITREVERSE, VT, Custom); @@ -413,6 +423,11 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); } + // Set DAG combine for 'LASX' feature. + + if (Subtarget.hasExtLASX()) + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); + // Compute derived properties from the register classes. computeRegisterProperties(Subtarget.getRegisterInfo()); @@ -514,6 +529,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerPREFETCH(Op, DAG); case ISD::SELECT: return lowerSELECT(Op, DAG); + case ISD::BRCOND: + return lowerBRCOND(Op, DAG); case ISD::FP_TO_FP16: return lowerFP_TO_FP16(Op, DAG); case ISD::FP16_TO_FP: @@ -522,10 +539,109 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerFP_TO_BF16(Op, DAG); case ISD::BF16_TO_FP: return lowerBF16_TO_FP(Op, DAG); + case ISD::VECREDUCE_ADD: + return lowerVECREDUCE_ADD(Op, DAG); + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + return lowerVECREDUCE(Op, DAG); } return SDValue(); } +// Lower vecreduce_add using vhaddw instructions. +// For Example: +// call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) +// can be lowered to: +// VHADDW_D_W vr0, vr0, vr0 +// VHADDW_Q_D vr0, vr0, vr0 +// VPICKVE2GR_D a0, vr0, 0 +// ADDI_W a0, a0, 0 +SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + MVT OpVT = Op.getSimpleValueType(); + SDValue Val = Op.getOperand(0); + + unsigned NumEles = Val.getSimpleValueType().getVectorNumElements(); + unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits(); + + unsigned LegalVecSize = 128; + bool isLASX256Vector = + Subtarget.hasExtLASX() && Val.getValueSizeInBits() == 256; + + // Ensure operand type legal or enable it legal. + while (!isTypeLegal(Val.getSimpleValueType())) { + Val = DAG.WidenVector(Val, DL); + } + + // NumEles is designed for iterations count, v4i32 for LSX + // and v8i32 for LASX should have the same count. + if (isLASX256Vector) { + NumEles /= 2; + LegalVecSize = 256; + } + + for (unsigned i = 1; i < NumEles; i *= 2, EleBits *= 2) { + MVT IntTy = MVT::getIntegerVT(EleBits); + MVT VecTy = MVT::getVectorVT(IntTy, LegalVecSize / EleBits); + Val = DAG.getNode(LoongArchISD::VHADDW, DL, VecTy, Val, Val); + } + + if (isLASX256Vector) { + SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val, + DAG.getConstant(2, DL, MVT::i64)); + Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val, + DAG.getConstant(0, DL, Subtarget.getGRLenVT())); +} + +// Lower vecreduce_and/or/xor/[s/u]max/[s/u]min. +// For Example: +// call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) +// can be lowered to: +// VBSRL_V vr1, vr0, 8 +// VMAX_W vr0, vr1, vr0 +// VBSRL_V vr1, vr0, 4 +// VMAX_W vr0, vr1, vr0 +// VPICKVE2GR_W a0, vr0, 0 +// For 256 bit vector, it is illegal and will be spilt into +// two 128 bit vector by default then processed by this. +SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + MVT OpVT = Op.getSimpleValueType(); + SDValue Val = Op.getOperand(0); + + unsigned NumEles = Val.getSimpleValueType().getVectorNumElements(); + unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits(); + + // Ensure operand type legal or enable it legal. + while (!isTypeLegal(Val.getSimpleValueType())) { + Val = DAG.WidenVector(Val, DL); + } + + unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode()); + MVT VecTy = Val.getSimpleValueType(); + + for (int i = NumEles; i > 1; i /= 2) { + SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, MVT::i64); + SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Val, ShiftAmt); + Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val); + } + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val, + DAG.getConstant(0, DL, Subtarget.getGRLenVT())); +} + SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const { unsigned IsData = Op.getConstantOperandVal(4); @@ -859,6 +975,35 @@ SDValue LoongArchTargetLowering::lowerSELECT(SDValue Op, return DAG.getNode(LoongArchISD::SELECT_CC, DL, VT, Ops); } +SDValue LoongArchTargetLowering::lowerBRCOND(SDValue Op, + SelectionDAG &DAG) const { + SDValue CondV = Op.getOperand(1); + SDLoc DL(Op); + MVT GRLenVT = Subtarget.getGRLenVT(); + + if (CondV.getOpcode() == ISD::SETCC) { + if (CondV.getOperand(0).getValueType() == GRLenVT) { + SDValue LHS = CondV.getOperand(0); + SDValue RHS = CondV.getOperand(1); + ISD::CondCode CCVal = cast<CondCodeSDNode>(CondV.getOperand(2))->get(); + + translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); + + SDValue TargetCC = DAG.getCondCode(CCVal); + return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(), + Op.getOperand(0), LHS, RHS, TargetCC, + Op.getOperand(2)); + } else if (CondV.getOperand(0).getValueType().isFloatingPoint()) { + return DAG.getNode(LoongArchISD::BRCOND, DL, Op.getValueType(), + Op.getOperand(0), CondV, Op.getOperand(2)); + } + } + + return DAG.getNode(LoongArchISD::BR_CC, DL, Op.getValueType(), + Op.getOperand(0), CondV, DAG.getConstant(0, DL, GRLenVT), + DAG.getCondCode(ISD::SETNE), Op.getOperand(2)); +} + SDValue LoongArchTargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { @@ -1031,6 +1176,7 @@ static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget, const APInt &Zeroable) { int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -1057,7 +1203,7 @@ static SDValue lowerVECTOR_SHUFFLEAsShift(const SDLoc &DL, ArrayRef<int> Mask, "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(Opcode, DL, ShiftVT, V, - DAG.getConstant(ShiftAmt, DL, MVT::i64)); + DAG.getConstant(ShiftAmt, DL, Subtarget.getGRLenVT())); return DAG.getBitcast(VT, V); } @@ -1226,10 +1372,10 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, /// (VBSRL_V $v1, $v1, 8) /// (VBSLL_V $v0, $v0, 8) /// (VOR_V $v0, $V0, $v1) -static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, - ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { SDValue Lo = V1, Hi = V2; int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); @@ -1242,11 +1388,12 @@ static SDValue lowerVECTOR_SHUFFLEAsByteRotate(const SDLoc &DL, int LoByteShift = 16 - ByteRotation; int HiByteShift = ByteRotation; + MVT GRLenVT = Subtarget.getGRLenVT(); SDValue LoShift = DAG.getNode(LoongArchISD::VBSLL, DL, ByteVT, Lo, - DAG.getConstant(LoByteShift, DL, MVT::i64)); + DAG.getConstant(LoByteShift, DL, GRLenVT)); SDValue HiShift = DAG.getNode(LoongArchISD::VBSRL, DL, ByteVT, Hi, - DAG.getConstant(HiByteShift, DL, MVT::i64)); + DAG.getConstant(HiByteShift, DL, GRLenVT)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, ByteVT, LoShift, HiShift)); } @@ -1351,9 +1498,10 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL, /// /// When undef's appear in the mask they are treated as if they were whatever /// value is necessary in order to fit the above form. -static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { int SplatIndex = -1; for (const auto &M : Mask) { if (M != -1) { @@ -1369,7 +1517,7 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, if (fitsRegularPattern<int>(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) { APInt Imm(64, SplatIndex); return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, Subtarget.getGRLenVT())); } return SDValue(); @@ -1393,9 +1541,10 @@ static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, /// (VSHUF4I_H $v0, $v1, 27) /// where the 27 comes from: /// 3 + (2 << 2) + (1 << 4) + (0 << 6) -static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { unsigned SubVecSize = 4; if (VT == MVT::v2f64 || VT == MVT::v2i64) @@ -1437,13 +1586,15 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, Imm |= M & 0x3; } + MVT GRLenVT = Subtarget.getGRLenVT(); + // Return vshuf4i.d if (VT == MVT::v2f64 || VT == MVT::v2i64) return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, V2, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, GRLenVT)); return DAG.getNode(LoongArchISD::VSHUF4I, DL, VT, V1, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, GRLenVT)); } /// Lower VECTOR_SHUFFLE into VPACKEV (if possible). @@ -1723,7 +1874,8 @@ static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef<int> Mask, /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG) { + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { assert((VT.SimpleTy == MVT::v16i8 || VT.SimpleTy == MVT::v8i16 || VT.SimpleTy == MVT::v4i32 || VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v4f32 || VT.SimpleTy == MVT::v2f64) && @@ -1741,9 +1893,11 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue Result; // TODO: Add more comparison patterns. if (V2.isUndef()) { - if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG, + Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG))) + if ((Result = + lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; // TODO: This comment may be enabled in the future to better match the @@ -1766,15 +1920,17 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, if ((Result = lowerVECTOR_SHUFFLE_VPICKOD(DL, Mask, VT, V1, V2, DAG))) return Result; if ((VT.SimpleTy == MVT::v2i64 || VT.SimpleTy == MVT::v2f64) && - (Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG))) + (Result = + lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; if ((Result = lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(DL, Mask, VT, V1, V2, DAG, Zeroable))) return Result; - if ((Result = - lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Zeroable))) + if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget, + Zeroable))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG, + Subtarget))) return Result; if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG)) return NewShuffle; @@ -1791,10 +1947,10 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, /// /// When undef's appear in the mask they are treated as if they were whatever /// value is necessary in order to fit the above form. -static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, - ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { int SplatIndex = -1; for (const auto &M : Mask) { if (M != -1) { @@ -1816,21 +1972,64 @@ static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, 0)) { APInt Imm(64, SplatIndex); return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, - DAG.getConstant(Imm, DL, MVT::i64)); + DAG.getConstant(Imm, DL, Subtarget.getGRLenVT())); } return SDValue(); } /// Lower VECTOR_SHUFFLE into XVSHUF4I (if possible). -static SDValue lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { +static SDValue +lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { // When the size is less than or equal to 4, lower cost instructions may be // used. if (Mask.size() <= 4) return SDValue(); - return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG); + return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget); +} + +/// Lower VECTOR_SHUFFLE into XVPERM (if possible). +static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef<int> Mask, + MVT VT, SDValue V1, SDValue V2, + SelectionDAG &DAG) { + // LoongArch LASX only have XVPERM_W. + if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32)) + return SDValue(); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfSize = NumElts / 2; + bool FrontLo = true, FrontHi = true; + bool BackLo = true, BackHi = true; + + auto inRange = [](int val, int low, int high) { + return (val == -1) || (val >= low && val < high); + }; + + for (unsigned i = 0; i < HalfSize; ++i) { + int Fronti = Mask[i]; + int Backi = Mask[i + HalfSize]; + + FrontLo &= inRange(Fronti, 0, HalfSize); + FrontHi &= inRange(Fronti, HalfSize, NumElts); + BackLo &= inRange(Backi, 0, HalfSize); + BackHi &= inRange(Backi, HalfSize, NumElts); + } + + // If both the lower and upper 128-bit parts access only one half of the + // vector (either lower or upper), avoid using xvperm.w. The latency of + // xvperm.w(3) is higher than using xvshuf(1) and xvori(1). + if ((FrontLo || FrontHi) && (BackLo || BackHi)) + return SDValue(); + + SmallVector<SDValue, 8> Masks; + for (unsigned i = 0; i < NumElts; ++i) + Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64) + : DAG.getConstant(Mask[i], DL, MVT::i64)); + SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks); + + return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec); } /// Lower VECTOR_SHUFFLE into XVPACKEV (if possible). @@ -2060,15 +2259,15 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef<int> Mask, /// cases need to be converted to it for processing. /// /// This function may modify V1, V2 and Mask -static void canonicalizeShuffleVectorByLane(const SDLoc &DL, - MutableArrayRef<int> Mask, MVT VT, - SDValue &V1, SDValue &V2, - SelectionDAG &DAG) { +static void canonicalizeShuffleVectorByLane( + const SDLoc &DL, MutableArrayRef<int> Mask, MVT VT, SDValue &V1, + SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { enum HalfMaskType { HighLaneTy, LowLaneTy, None }; int MaskSize = Mask.size(); int HalfSize = Mask.size() / 2; + MVT GRLenVT = Subtarget.getGRLenVT(); HalfMaskType preMask = None, postMask = None; @@ -2106,13 +2305,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL, if (preMask == LowLaneTy && postMask == HighLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, - DAG.getConstant(0b01001110, DL, MVT::i64)); + DAG.getConstant(0b01001110, DL, GRLenVT)); V1 = DAG.getBitcast(VT, V1); if (!V2.isUndef()) { V2 = DAG.getBitcast(MVT::v4i64, V2); V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, - DAG.getConstant(0b01001110, DL, MVT::i64)); + DAG.getConstant(0b01001110, DL, GRLenVT)); V2 = DAG.getBitcast(VT, V2); } @@ -2125,13 +2324,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL, } else if (preMask == LowLaneTy && postMask == LowLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, - DAG.getConstant(0b11101110, DL, MVT::i64)); + DAG.getConstant(0b11101110, DL, GRLenVT)); V1 = DAG.getBitcast(VT, V1); if (!V2.isUndef()) { V2 = DAG.getBitcast(MVT::v4i64, V2); V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, - DAG.getConstant(0b11101110, DL, MVT::i64)); + DAG.getConstant(0b11101110, DL, GRLenVT)); V2 = DAG.getBitcast(VT, V2); } @@ -2141,13 +2340,13 @@ static void canonicalizeShuffleVectorByLane(const SDLoc &DL, } else if (preMask == HighLaneTy && postMask == HighLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); V1 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V1, - DAG.getConstant(0b01000100, DL, MVT::i64)); + DAG.getConstant(0b01000100, DL, GRLenVT)); V1 = DAG.getBitcast(VT, V1); if (!V2.isUndef()) { V2 = DAG.getBitcast(MVT::v4i64, V2); V2 = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, V2, - DAG.getConstant(0b01000100, DL, MVT::i64)); + DAG.getConstant(0b01000100, DL, GRLenVT)); V2 = DAG.getBitcast(VT, V2); } @@ -2209,7 +2408,8 @@ static SDValue lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(const SDLoc &DL, /// This routine breaks down the specific type of 256-bit shuffle and /// dispatches to the lowering routines accordingly. static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG) { + SDValue V1, SDValue V2, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { assert((VT.SimpleTy == MVT::v32i8 || VT.SimpleTy == MVT::v16i16 || VT.SimpleTy == MVT::v8i32 || VT.SimpleTy == MVT::v4i64 || VT.SimpleTy == MVT::v8f32 || VT.SimpleTy == MVT::v4f64) && @@ -2223,7 +2423,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, // canonicalize non cross-lane shuffle vector SmallVector<int> NewMask(Mask); - canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG); + canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget); APInt KnownUndef, KnownZero; computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero); @@ -2232,9 +2432,13 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, SDValue Result; // TODO: Add more comparison patterns. if (V2.isUndef()) { - if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG, + Subtarget))) + return Result; + if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG, + Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG))) return Result; if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT, V1, V2, DAG))) @@ -2259,10 +2463,11 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, return Result; if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG))) return Result; - if ((Result = - lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, Zeroable))) + if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, + Subtarget, Zeroable))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG, + Subtarget))) return Result; if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG)) return NewShuffle; @@ -2314,10 +2519,10 @@ SDValue LoongArchTargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, // For each vector width, delegate to a specialized lowering routine. if (VT.is128BitVector()) - return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG); + return lower128BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget); if (VT.is256BitVector()) - return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG); + return lower256BitShuffle(DL, OrigMask, VT, V1, V2, DAG, Subtarget); return SDValue(); } @@ -2414,11 +2619,14 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp, } // make sure that this load is valid and only has one user. - if (!IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode())) + if (!IsIdeneity || !IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode())) return SDValue(); - if (IsIdeneity) { - auto *LN = cast<LoadSDNode>(IdentitySrc); + auto *LN = cast<LoadSDNode>(IdentitySrc); + auto ExtType = LN->getExtensionType(); + + if ((ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD) && + VT.getScalarSizeInBits() == LN->getMemoryVT().getScalarSizeInBits()) { SDVTList Tys = LN->isIndexed() ? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other) @@ -2461,6 +2669,16 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, SplatBitSize != 64) return SDValue(); + if (SplatBitSize == 64 && !Subtarget.is64Bit()) { + // We can only handle 64-bit elements that are within + // the signed 32-bit range on 32-bit targets. + if (!SplatValue.isSignedIntN(32)) + return SDValue(); + if ((Is128Vec && ResTy == MVT::v4i32) || + (Is256Vec && ResTy == MVT::v8i32)) + return Op; + } + EVT ViaVecTy; switch (SplatBitSize) { @@ -2609,14 +2827,58 @@ SDValue LoongArchTargetLowering::lowerCONCAT_VECTORS(SDValue Op, SDValue LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { - EVT VecTy = Op->getOperand(0)->getValueType(0); + MVT EltVT = Op.getSimpleValueType(); + SDValue Vec = Op->getOperand(0); + EVT VecTy = Vec->getValueType(0); SDValue Idx = Op->getOperand(1); - unsigned NumElts = VecTy.getVectorNumElements(); + SDLoc DL(Op); + MVT GRLenVT = Subtarget.getGRLenVT(); + + assert(VecTy.is256BitVector() && "Unexpected EXTRACT_VECTOR_ELT vector type"); - if (isa<ConstantSDNode>(Idx) && Idx->getAsZExtVal() < NumElts) + if (isa<ConstantSDNode>(Idx)) return Op; - return SDValue(); + switch (VecTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("Unexpected type"); + case MVT::v32i8: + case MVT::v16i16: + case MVT::v4i64: + case MVT::v4f64: { + // Extract the high half subvector and place it to the low half of a new + // vector. It doesn't matter what the high half of the new vector is. + EVT HalfTy = VecTy.getHalfNumVectorElementsVT(*DAG.getContext()); + SDValue VecHi = + DAG.getExtractSubvector(DL, HalfTy, Vec, HalfTy.getVectorNumElements()); + SDValue TmpVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecTy, DAG.getUNDEF(VecTy), + VecHi, DAG.getConstant(0, DL, GRLenVT)); + + // Shuffle the origin Vec and the TmpVec using MaskVec, the lowest element + // of MaskVec is Idx, the rest do not matter. ResVec[0] will hold the + // desired element. + SDValue IdxCp = + DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, DL, MVT::f32, Idx); + SDValue IdxVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f32, IdxCp); + SDValue MaskVec = + DAG.getBitcast((VecTy == MVT::v4f64) ? MVT::v4i64 : VecTy, IdxVec); + SDValue ResVec = + DAG.getNode(LoongArchISD::VSHUF, DL, VecTy, MaskVec, TmpVec, Vec); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ResVec, + DAG.getConstant(0, DL, GRLenVT)); + } + case MVT::v8i32: + case MVT::v8f32: { + SDValue SplatIdx = DAG.getSplatBuildVector(MVT::v8i32, DL, Idx); + SDValue SplatValue = + DAG.getNode(LoongArchISD::XVPERM, DL, VecTy, Vec, SplatIdx); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SplatValue, + DAG.getConstant(0, DL, GRLenVT)); + } + } } SDValue @@ -4740,13 +5002,29 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, UseLASX = true; break; }; - if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX())) - return SDValue(); Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); - Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; - SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src); + SDValue V; + if (!Subtarget.has32S() || !Subtarget.hasExtLASX()) { + if (Src.getSimpleValueType() == MVT::v32i8) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Src, DL); + Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Lo); + Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Hi); + Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, + DAG.getConstant(16, DL, MVT::i8)); + V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); + } else if (UseLASX) { + return SDValue(); + } + } + + if (!V) { + Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; + V = DAG.getNode(Opc, DL, MVT::i64, Src); + } + EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); V = DAG.getZExtOrTrunc(V, DL, T); return DAG.getBitcast(VT, V); @@ -5154,6 +5432,145 @@ static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG, Src.getOperand(0)); } +// Perform common combines for BR_CC and SELECT_CC conditions. +static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL, + SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { + ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get(); + + // As far as arithmetic right shift always saves the sign, + // shift can be omitted. + // Fold setlt (sra X, N), 0 -> setlt X, 0 and + // setge (sra X, N), 0 -> setge X, 0 + if (isNullConstant(RHS) && (CCVal == ISD::SETGE || CCVal == ISD::SETLT) && + LHS.getOpcode() == ISD::SRA) { + LHS = LHS.getOperand(0); + return true; + } + + if (!ISD::isIntEqualitySetCC(CCVal)) + return false; + + // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt) + // Sometimes the setcc is introduced after br_cc/select_cc has been formed. + if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) && + LHS.getOperand(0).getValueType() == Subtarget.getGRLenVT()) { + // If we're looking for eq 0 instead of ne 0, we need to invert the + // condition. + bool Invert = CCVal == ISD::SETEQ; + CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); + if (Invert) + CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); + + RHS = LHS.getOperand(1); + LHS = LHS.getOperand(0); + translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG); + + CC = DAG.getCondCode(CCVal); + return true; + } + + // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, GRLen-1-C), 0, ge/lt) + if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() && + LHS.getOperand(1).getOpcode() == ISD::Constant) { + SDValue LHS0 = LHS.getOperand(0); + if (LHS0.getOpcode() == ISD::AND && + LHS0.getOperand(1).getOpcode() == ISD::Constant) { + uint64_t Mask = LHS0.getConstantOperandVal(1); + uint64_t ShAmt = LHS.getConstantOperandVal(1); + if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) { + CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT; + CC = DAG.getCondCode(CCVal); + + ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt; + LHS = LHS0.getOperand(0); + if (ShAmt != 0) + LHS = + DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0), + DAG.getConstant(ShAmt, DL, LHS.getValueType())); + return true; + } + } + } + + // (X, 1, setne) -> (X, 0, seteq) if we can prove X is 0/1. + // This can occur when legalizing some floating point comparisons. + APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1); + if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) { + CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType()); + CC = DAG.getCondCode(CCVal); + RHS = DAG.getConstant(0, DL, LHS.getValueType()); + return true; + } + + return false; +} + +static SDValue performBR_CCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + SDValue CC = N->getOperand(3); + SDLoc DL(N); + + if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget)) + return DAG.getNode(LoongArchISD::BR_CC, DL, N->getValueType(0), + N->getOperand(0), LHS, RHS, CC, N->getOperand(4)); + + return SDValue(); +} + +static SDValue performSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + // Transform + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + SDValue CC = N->getOperand(2); + ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get(); + SDValue TrueV = N->getOperand(3); + SDValue FalseV = N->getOperand(4); + SDLoc DL(N); + EVT VT = N->getValueType(0); + + // If the True and False values are the same, we don't need a select_cc. + if (TrueV == FalseV) + return TrueV; + + // (select (x < 0), y, z) -> x >> (GRLEN - 1) & (y - z) + z + // (select (x >= 0), y, z) -> x >> (GRLEN - 1) & (z - y) + y + if (isa<ConstantSDNode>(TrueV) && isa<ConstantSDNode>(FalseV) && + isNullConstant(RHS) && + (CCVal == ISD::CondCode::SETLT || CCVal == ISD::CondCode::SETGE)) { + if (CCVal == ISD::CondCode::SETGE) + std::swap(TrueV, FalseV); + + int64_t TrueSImm = cast<ConstantSDNode>(TrueV)->getSExtValue(); + int64_t FalseSImm = cast<ConstantSDNode>(FalseV)->getSExtValue(); + // Only handle simm12, if it is not in this range, it can be considered as + // register. + if (isInt<12>(TrueSImm) && isInt<12>(FalseSImm) && + isInt<12>(TrueSImm - FalseSImm)) { + SDValue SRA = + DAG.getNode(ISD::SRA, DL, VT, LHS, + DAG.getConstant(Subtarget.getGRLen() - 1, DL, VT)); + SDValue AND = + DAG.getNode(ISD::AND, DL, VT, SRA, + DAG.getSignedConstant(TrueSImm - FalseSImm, DL, VT)); + return DAG.getNode(ISD::ADD, DL, VT, AND, FalseV); + } + + if (CCVal == ISD::CondCode::SETGE) + std::swap(TrueV, FalseV); + } + + if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget)) + return DAG.getNode(LoongArchISD::SELECT_CC, DL, N->getValueType(0), + {LHS, RHS, CC, TrueV, FalseV}); + + return SDValue(); +} + template <unsigned N> static SDValue legalizeIntrinsicImmArg(SDNode *Node, unsigned ImmOp, SelectionDAG &DAG, @@ -5828,6 +6245,42 @@ performSPLIT_PAIR_F64Combine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue +performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + if (!DCI.isBeforeLegalize()) + return SDValue(); + + MVT EltVT = N->getSimpleValueType(0); + SDValue Vec = N->getOperand(0); + EVT VecTy = Vec->getValueType(0); + SDValue Idx = N->getOperand(1); + unsigned IdxOp = Idx.getOpcode(); + SDLoc DL(N); + + if (!VecTy.is256BitVector() || isa<ConstantSDNode>(Idx)) + return SDValue(); + + // Combine: + // t2 = truncate t1 + // t3 = {zero/sign/any}_extend t2 + // t4 = extract_vector_elt t0, t3 + // to: + // t4 = extract_vector_elt t0, t1 + if (IdxOp == ISD::ZERO_EXTEND || IdxOp == ISD::SIGN_EXTEND || + IdxOp == ISD::ANY_EXTEND) { + SDValue IdxOrig = Idx.getOperand(0); + if (!(IdxOrig.getOpcode() == ISD::TRUNCATE)) + return SDValue(); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec, + IdxOrig.getOperand(0)); + } + + return SDValue(); +} + SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5846,6 +6299,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, return performBITCASTCombine(N, DAG, DCI, Subtarget); case LoongArchISD::BITREV_W: return performBITREV_WCombine(N, DAG, DCI, Subtarget); + case LoongArchISD::BR_CC: + return performBR_CCCombine(N, DAG, DCI, Subtarget); + case LoongArchISD::SELECT_CC: + return performSELECT_CCCombine(N, DAG, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return performINTRINSIC_WO_CHAINCombine(N, DAG, DCI, Subtarget); case LoongArchISD::MOVGR2FR_W_LA64: @@ -5857,6 +6314,8 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N, return performVMSKLTZCombine(N, DAG, DCI, Subtarget); case LoongArchISD::SPLIT_PAIR_F64: return performSPLIT_PAIR_F64Combine(N, DAG, DCI, Subtarget); + case ISD::EXTRACT_VECTOR_ELT: + return performEXTRACT_VECTOR_ELTCombine(N, DAG, DCI, Subtarget); } return SDValue(); } @@ -6575,6 +7034,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(TAIL_MEDIUM) NODE_NAME_CASE(TAIL_LARGE) NODE_NAME_CASE(SELECT_CC) + NODE_NAME_CASE(BR_CC) + NODE_NAME_CASE(BRCOND) NODE_NAME_CASE(SLL_W) NODE_NAME_CASE(SRA_W) NODE_NAME_CASE(SRL_W) @@ -6637,6 +7098,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VREPLVEI) NODE_NAME_CASE(VREPLGR2VR) NODE_NAME_CASE(XVPERMI) + NODE_NAME_CASE(XVPERM) NODE_NAME_CASE(VPICK_SEXT_ELT) NODE_NAME_CASE(VPICK_ZEXT_ELT) NODE_NAME_CASE(VREPLVE) @@ -6659,6 +7121,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(XVMSKGEZ) NODE_NAME_CASE(XVMSKEQZ) NODE_NAME_CASE(XVMSKNEZ) + NODE_NAME_CASE(VHADDW) } #undef NODE_NAME_CASE return nullptr; @@ -7132,6 +7595,7 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( llvm_unreachable("Unsupported calling convention"); case CallingConv::C: case CallingConv::Fast: + case CallingConv::PreserveMost: break; case CallingConv::GHC: if (!MF.getSubtarget().hasFeature(LoongArch::FeatureBasicF) || @@ -7893,7 +8357,7 @@ LoongArchTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size < 32 && (AI->getOperation() == AtomicRMWInst::And || AI->getOperation() == AtomicRMWInst::Or || AI->getOperation() == AtomicRMWInst::Xor)) - return AtomicExpansionKind::Expand; + return AtomicExpansionKind::CustomExpand; if (AI->getOperation() == AtomicRMWInst::Nand || Size < 32) return AtomicExpansionKind::CmpXChg; } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index f79ba7450cc3..9d14934a9d36 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -37,6 +37,10 @@ enum NodeType : unsigned { // Select SELECT_CC, + // Branch + BR_CC, + BRCOND, + // 32-bit shifts, directly matching the semantics of the named LoongArch // instructions. SLL_W, @@ -141,6 +145,7 @@ enum NodeType : unsigned { VREPLVEI, VREPLGR2VR, XVPERMI, + XVPERM, // Extended vector element extraction VPICK_SEXT_ELT, @@ -177,6 +182,9 @@ enum NodeType : unsigned { XVMSKEQZ, XVMSKNEZ, + // Vector Horizontal Addition with Widening‌ + VHADDW + // Intrinsic operations end ============================================= }; } // end namespace LoongArchISD @@ -382,10 +390,13 @@ private: SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp index 26d36f1c5058..c89212dae72d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.cpp @@ -25,8 +25,8 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "LoongArchGenInstrInfo.inc" -LoongArchInstrInfo::LoongArchInstrInfo(LoongArchSubtarget &STI) - : LoongArchGenInstrInfo(LoongArch::ADJCALLSTACKDOWN, +LoongArchInstrInfo::LoongArchInstrInfo(const LoongArchSubtarget &STI) + : LoongArchGenInstrInfo(STI, LoongArch::ADJCALLSTACKDOWN, LoongArch::ADJCALLSTACKUP), STI(STI) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h index 63b7112b8b40..f25958a32bec 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.h @@ -25,7 +25,7 @@ class LoongArchSubtarget; class LoongArchInstrInfo : public LoongArchGenInstrInfo { public: - explicit LoongArchInstrInfo(LoongArchSubtarget &STI); + explicit LoongArchInstrInfo(const LoongArchSubtarget &STI); MCInst getNop() const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 2b94e65cac0e..20ccc622f58d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -31,6 +31,10 @@ def SDT_LoongArchSelectCC : SDTypeProfile<1, 5, [SDTCisSameAs<1, 2>, SDTCisSameAs<0, 4>, SDTCisSameAs<4, 5>]>; +def SDT_LoongArchBrCC : SDTypeProfile<0, 4, [SDTCisSameAs<0, 1>, + SDTCisVT<2, OtherVT>, + SDTCisVT<3, OtherVT>]>; + def SDT_LoongArchBStrIns: SDTypeProfile<1, 4, [ SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisSameAs<3, 4> @@ -94,6 +98,8 @@ def loongarch_tail_large : SDNode<"LoongArchISD::TAIL_LARGE", SDT_LoongArchCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def loongarch_selectcc : SDNode<"LoongArchISD::SELECT_CC", SDT_LoongArchSelectCC>; +def loongarch_brcc : SDNode<"LoongArchISD::BR_CC", SDT_LoongArchBrCC, + [SDNPHasChain]>; def loongarch_sll_w : SDNode<"LoongArchISD::SLL_W", SDT_LoongArchIntBinOpW>; def loongarch_sra_w : SDNode<"LoongArchISD::SRA_W", SDT_LoongArchIntBinOpW>; def loongarch_srl_w : SDNode<"LoongArchISD::SRL_W", SDT_LoongArchIntBinOpW>; @@ -1537,47 +1543,29 @@ def : Pat<(select GPR:$cond, GPR:$t, GPR:$f), /// Branches and jumps -class BccPat<PatFrag CondOp, LAInst Inst> - : Pat<(brcond (GRLenVT (CondOp GPR:$rj, GPR:$rd)), bb:$imm16), - (Inst GPR:$rj, GPR:$rd, bb:$imm16)>; - -def : BccPat<seteq, BEQ>; -def : BccPat<setne, BNE>; -def : BccPat<setlt, BLT>; -def : BccPat<setge, BGE>; -def : BccPat<setult, BLTU>; -def : BccPat<setuge, BGEU>; - -class BccSwapPat<PatFrag CondOp, LAInst InstBcc> - : Pat<(brcond (GRLenVT (CondOp GPR:$rd, GPR:$rj)), bb:$imm16), - (InstBcc GPR:$rj, GPR:$rd, bb:$imm16)>; - -// Condition codes that don't have matching LoongArch branch instructions, but -// are trivially supported by swapping the two input operands. -def : BccSwapPat<setgt, BLT>; -def : BccSwapPat<setle, BGE>; -def : BccSwapPat<setugt, BLTU>; -def : BccSwapPat<setule, BGEU>; - let Predicates = [Has32S] in { -// An extra pattern is needed for a brcond without a setcc (i.e. where the -// condition was calculated elsewhere). -def : Pat<(brcond GPR:$rj, bb:$imm21), (BNEZ GPR:$rj, bb:$imm21)>; - -def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm21), - (BEQZ GPR:$rj, bb:$imm21)>; -def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm21), - (BNEZ GPR:$rj, bb:$imm21)>; +class BccZeroPat<CondCode Cond, LAInst Inst> + : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, Cond, bb:$imm21), + (Inst GPR:$rj, bb:$imm21)>; + +def : BccZeroPat<SETEQ, BEQZ>; +def : BccZeroPat<SETNE, BNEZ>; } // Predicates = [Has32S] -// An extra pattern is needed for a brcond without a setcc (i.e. where the -// condition was calculated elsewhere). -def : Pat<(brcond GPR:$rj, bb:$imm16), (BNE GPR:$rj, R0, bb:$imm16)>; +multiclass BccPat<CondCode Cond, LAInst Inst> { + def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), GPR:$rd, Cond, bb:$imm16), + (Inst GPR:$rj, GPR:$rd, bb:$imm16)>; + // Explicitly select 0 to R0. The register coalescer doesn't always do it. + def : Pat<(loongarch_brcc (GRLenVT GPR:$rj), 0, Cond, bb:$imm16), + (Inst GPR:$rj, (GRLenVT R0), bb:$imm16)>; +} -def : Pat<(brcond (GRLenVT (seteq GPR:$rj, 0)), bb:$imm16), - (BEQ GPR:$rj, R0, bb:$imm16)>; -def : Pat<(brcond (GRLenVT (setne GPR:$rj, 0)), bb:$imm16), - (BNE GPR:$rj, R0, bb:$imm16)>; +defm : BccPat<SETEQ, BEQ>; +defm : BccPat<SETNE, BNE>; +defm : BccPat<SETLT, BLT>; +defm : BccPat<SETGE, BGE>; +defm : BccPat<SETULT, BLTU>; +defm : BccPat<SETUGE, BGEU>; let isBarrier = 1, isBranch = 1, isTerminator = 1 in def PseudoBR : Pseudo<(outs), (ins simm26_b:$imm26), [(br bb:$imm26)]>, diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 0696b11d62ac..a79c01cbe577 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -10,8 +10,12 @@ // //===----------------------------------------------------------------------===// +def SDT_LoongArchXVPERM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVec<2>, SDTCisInt<2>]>; + // Target nodes. def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; +def loongarch_xvperm: SDNode<"LoongArchISD::XVPERM", SDT_LoongArchXVPERM>; def loongarch_xvmskltz: SDNode<"LoongArchISD::XVMSKLTZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskgez: SDNode<"LoongArchISD::XVMSKGEZ", SDT_LoongArchVMSKCOND>; def loongarch_xvmskeqz: SDNode<"LoongArchISD::XVMSKEQZ", SDT_LoongArchVMSKCOND>; @@ -1186,6 +1190,17 @@ multiclass PatXrXrXr<SDPatternOperator OpNode, string Inst> { (!cast<LAInst>(Inst#"_D") LASX256:$xd, LASX256:$xj, LASX256:$xk)>; } +multiclass PatXrXrW<SDPatternOperator OpNode, string Inst> { + def : Pat<(OpNode(v32i8 LASX256:$vj), (v32i8 LASX256:$vk)), + (!cast<LAInst>(Inst#"_H_B") LASX256:$vj, LASX256:$vk)>; + def : Pat<(OpNode(v16i16 LASX256:$vj), (v16i16 LASX256:$vk)), + (!cast<LAInst>(Inst#"_W_H") LASX256:$vj, LASX256:$vk)>; + def : Pat<(OpNode(v8i32 LASX256:$vj), (v8i32 LASX256:$vk)), + (!cast<LAInst>(Inst#"_D_W") LASX256:$vj, LASX256:$vk)>; + def : Pat<(OpNode(v4i64 LASX256:$vj), (v4i64 LASX256:$vk)), + (!cast<LAInst>(Inst#"_Q_D") LASX256:$vj, LASX256:$vk)>; +} + multiclass PatShiftXrXr<SDPatternOperator OpNode, string Inst> { def : Pat<(OpNode (v32i8 LASX256:$xj), (and vsplati8_imm_eq_7, (v32i8 LASX256:$xk))), @@ -1513,6 +1528,9 @@ def : Pat<(bswap (v8i32 LASX256:$xj)), (XVSHUF4I_B LASX256:$xj, 0b00011011)>; def : Pat<(bswap (v4i64 LASX256:$xj)), (XVSHUF4I_W (XVSHUF4I_B LASX256:$xj, 0b00011011), 0b10110001)>; +// XVHADDW_{H_B/W_H/D_W/Q_D} +defm : PatXrXrW<loongarch_vhaddw, "XVHADDW">; + // XVFADD_{S/D} defm : PatXrXrF<fadd, "XVFADD">; @@ -1852,6 +1870,12 @@ def : Pat<(loongarch_xvpermi v4i64:$xj, immZExt8: $ui8), def : Pat<(loongarch_xvpermi v4f64:$xj, immZExt8: $ui8), (XVPERMI_D v4f64:$xj, immZExt8: $ui8)>; +// XVPERM_W +def : Pat<(loongarch_xvperm v8i32:$xj, v8i32:$xk), + (XVPERM_W v8i32:$xj, v8i32:$xk)>; +def : Pat<(loongarch_xvperm v8f32:$xj, v8i32:$xk), + (XVPERM_W v8f32:$xj, v8i32:$xk)>; + // XVREPLVE0_{W/D} def : Pat<(lasxsplatf32 FPR32:$fj), (XVREPLVE0_W (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32))>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 3c9defb0366f..eb7120ffb41a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -22,7 +22,7 @@ def SDT_LoongArchVShuf : SDTypeProfile<1, 3, [SDTCisVec<0>, def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, - SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>; + SDTCisSameAs<0,1>, SDTCisVT<2, GRLenVT>]>; def SDT_LoongArchV2RUimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, @@ -71,6 +71,8 @@ def loongarch_vsrli : SDNode<"LoongArchISD::VSRLI", SDT_LoongArchV1RUimm>; def loongarch_vbsll : SDNode<"LoongArchISD::VBSLL", SDT_LoongArchV1RUimm>; def loongarch_vbsrl : SDNode<"LoongArchISD::VBSRL", SDT_LoongArchV1RUimm>; +def loongarch_vhaddw : SDNode<"LoongArchISD::VHADDW", SDT_LoongArchV2R>; + def loongarch_vldrepl : SDNode<"LoongArchISD::VLDREPL", SDT_LoongArchVLDREPL, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; @@ -1364,6 +1366,17 @@ multiclass PatVrVrVr<SDPatternOperator OpNode, string Inst> { (!cast<LAInst>(Inst#"_D") LSX128:$vd, LSX128:$vj, LSX128:$vk)>; } +multiclass PatVrVrW<SDPatternOperator OpNode, string Inst> { + def : Pat<(OpNode(v16i8 LSX128:$vj), (v16i8 LSX128:$vk)), + (!cast<LAInst>(Inst#"_H_B") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v8i16 LSX128:$vj), (v8i16 LSX128:$vk)), + (!cast<LAInst>(Inst#"_W_H") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v4i32 LSX128:$vj), (v4i32 LSX128:$vk)), + (!cast<LAInst>(Inst#"_D_W") LSX128:$vj, LSX128:$vk)>; + def : Pat<(OpNode(v2i64 LSX128:$vj), (v2i64 LSX128:$vk)), + (!cast<LAInst>(Inst#"_Q_D") LSX128:$vj, LSX128:$vk)>; +} + multiclass PatShiftVrVr<SDPatternOperator OpNode, string Inst> { def : Pat<(OpNode (v16i8 LSX128:$vj), (and vsplati8_imm_eq_7, (v16i8 LSX128:$vk))), @@ -1709,6 +1722,9 @@ def : Pat<(bswap (v4i32 LSX128:$vj)), (VSHUF4I_B LSX128:$vj, 0b00011011)>; def : Pat<(bswap (v2i64 LSX128:$vj)), (VSHUF4I_W (VSHUF4I_B LSX128:$vj, 0b00011011), 0b10110001)>; +// VHADDW_{H_B/W_H/D_W/Q_D} +defm : PatVrVrW<loongarch_vhaddw, "VHADDW">; + // VFADD_{S/D} defm : PatVrVrF<fadd, "VFADD">; diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp index 47fce37ce59f..9c5f8edfaf66 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.cpp @@ -41,6 +41,8 @@ LoongArchRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::GHC) return CSR_NoRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return CSR_MostRegs_SaveList; switch (Subtarget.getTargetABI()) { default: llvm_unreachable("Unrecognized ABI"); @@ -63,6 +65,8 @@ LoongArchRegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; + if (CC == CallingConv::PreserveMost) + return CSR_MostRegs_RegMask; switch (Subtarget.getTargetABI()) { default: llvm_unreachable("Unrecognized ABI"); diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index ede5477f04bd..f548a8dd0532 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -95,4 +95,20 @@ unsigned LoongArchTTIImpl::getPrefetchDistance() const { return 200; } bool LoongArchTTIImpl::enableWritePrefetching() const { return true; } +bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const { + switch (II->getIntrinsicID()) { + default: + return true; + case Intrinsic::vector_reduce_add: + case Intrinsic::vector_reduce_and: + case Intrinsic::vector_reduce_or: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_smin: + case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_reduce_umin: + case Intrinsic::vector_reduce_xor: + return false; + } +} + // TODO: Implement more hooks to provide TTI machinery for LoongArch. diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index d43d2cb0eb12..e3f16c780499 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -53,6 +53,8 @@ public: unsigned getPrefetchDistance() const override; bool enableWritePrefetching() const override; + bool shouldExpandReduction(const IntrinsicInst *II) const override; + // TODO: Implement more hooks to provide TTI machinery for LoongArch. }; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index 35277ce094a7..e5bd1c91edec 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" +#include <bitset> #define GET_INSTRINFO_MC_DESC #define ENABLE_INSTR_PREDICATE_VERIFIER @@ -95,10 +96,81 @@ createLoongArchAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, namespace { class LoongArchMCInstrAnalysis : public MCInstrAnalysis { + int64_t GPRState[31] = {}; + std::bitset<31> GPRValidMask; + + static bool isGPR(MCRegister Reg) { + return Reg >= LoongArch::R0 && Reg <= LoongArch::R31; + } + + static unsigned getRegIndex(MCRegister Reg) { + assert(isGPR(Reg) && Reg != LoongArch::R0 && "Invalid GPR reg"); + return Reg - LoongArch::R1; + } + + void setGPRState(MCRegister Reg, std::optional<int64_t> Value) { + if (Reg == LoongArch::R0) + return; + + auto Index = getRegIndex(Reg); + + if (Value) { + GPRState[Index] = *Value; + GPRValidMask.set(Index); + } else { + GPRValidMask.reset(Index); + } + } + + std::optional<int64_t> getGPRState(MCRegister Reg) const { + if (Reg == LoongArch::R0) + return 0; + + auto Index = getRegIndex(Reg); + + if (GPRValidMask.test(Index)) + return GPRState[Index]; + return std::nullopt; + } + public: explicit LoongArchMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {} + void resetState() override { GPRValidMask.reset(); } + + void updateState(const MCInst &Inst, uint64_t Addr) override { + // Terminators mark the end of a basic block which means the sequentially + // next instruction will be the first of another basic block and the current + // state will typically not be valid anymore. For calls, we assume all + // registers may be clobbered by the callee (TODO: should we take the + // calling convention into account?). + if (isTerminator(Inst) || isCall(Inst)) { + resetState(); + return; + } + + switch (Inst.getOpcode()) { + default: { + // Clear the state of all defined registers for instructions that we don't + // explicitly support. + auto NumDefs = Info->get(Inst.getOpcode()).getNumDefs(); + for (unsigned I = 0; I < NumDefs; ++I) { + auto DefReg = Inst.getOperand(I).getReg(); + if (isGPR(DefReg)) + setGPRState(DefReg, std::nullopt); + } + break; + } + case LoongArch::PCADDU18I: + setGPRState( + Inst.getOperand(0).getReg(), + Addr + SignExtend64<38>( + static_cast<uint64_t>(Inst.getOperand(1).getImm()) << 18)); + break; + } + } + bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, uint64_t &Target) const override { unsigned NumOps = Inst.getNumOperands(); @@ -108,6 +180,14 @@ public: return true; } + if (Inst.getOpcode() == LoongArch::JIRL) { + if (auto TargetRegState = getGPRState(Inst.getOperand(1).getReg())) { + Target = *TargetRegState + Inst.getOperand(2).getImm(); + return true; + } + return false; + } + return false; } |
