summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp230
1 files changed, 129 insertions, 101 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index efeddd7c9bd4..b5f8ee50cba3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,7 +28,6 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -3625,6 +3624,16 @@ X86TargetLowering::getJumpConditionMergingParams(Instruction::BinaryOps Opc,
match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
BaseCost += 1;
+
+ // For OR conditions with EQ comparisons, prefer splitting into branches
+ // (unless CCMP is available). OR+EQ cannot be optimized via bitwise ops,
+ // unlike OR+NE which becomes (P|Q)!=0. Similarly, don't split signed
+ // comparisons (SLT, SGT) that can be optimized.
+ if (BaseCost >= 0 && !Subtarget.hasCCMP() && Opc == Instruction::Or &&
+ match(Lhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())) &&
+ match(Rhs, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Value(), m_Value())))
+ return {-1, -1, -1};
+
return {BaseCost, BrMergingLikelyBias.getValue(),
BrMergingUnlikelyBias.getValue()};
}
@@ -3634,7 +3643,7 @@ bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
}
bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
- const SDNode *N, CombineLevel Level) const {
+ const SDNode *N) const {
assert(((N->getOpcode() == ISD::SHL &&
N->getOperand(0).getOpcode() == ISD::SRL) ||
(N->getOpcode() == ISD::SRL &&
@@ -3649,7 +3658,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
// the fold for non-splats yet.
return N->getOperand(1) == N->getOperand(0).getOperand(1);
}
- return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
+ return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N);
}
bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
@@ -3659,11 +3668,8 @@ bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
if (VT.isVector())
return false;
- // 64-bit shifts on 32-bit targets produce really bad bloated code.
- if (VT == MVT::i64 && !Subtarget.is64Bit())
- return false;
-
- return true;
+ unsigned MaxWidth = Subtarget.is64Bit() ? 64 : 32;
+ return VT.getScalarSizeInBits() <= MaxWidth;
}
TargetLowering::ShiftLegalizationStrategy
@@ -3791,7 +3797,7 @@ static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
/// Return true if every element in Mask, is an in-place blend/select mask or is
/// undef.
-LLVM_ATTRIBUTE_UNUSED static bool isBlendOrUndef(ArrayRef<int> Mask) {
+[[maybe_unused]] static bool isBlendOrUndef(ArrayRef<int> Mask) {
unsigned NumElts = Mask.size();
for (auto [I, M] : enumerate(Mask))
if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
@@ -4456,8 +4462,8 @@ SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
bool AllowAVX512 = true) {
assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
unsigned NumSubs = 1;
- if ((CheckBWI && Subtarget.useBWIRegs()) ||
- (!CheckBWI && AllowAVX512 && Subtarget.useAVX512Regs())) {
+ if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) ||
+ (!CheckBWI && Subtarget.useAVX512Regs()))) {
if (VT.getSizeInBits() > 512) {
NumSubs = VT.getSizeInBits() / 512;
assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
@@ -8100,7 +8106,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
return DstVec;
}
-LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
+[[maybe_unused]] static bool isHorizOp(unsigned Opcode) {
switch (Opcode) {
case X86ISD::PACKSS:
case X86ISD::PACKUS:
@@ -13783,10 +13789,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// so prevents folding a load into this instruction or making a copy.
const int UnpackLoMask[] = {0, 0, 1, 1};
const int UnpackHiMask[] = {2, 2, 3, 3};
- if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
- Mask = UnpackLoMask;
- else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
- Mask = UnpackHiMask;
+ if (!isSingleElementRepeatedMask(Mask)) {
+ if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
+ Mask = UnpackLoMask;
+ else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
+ Mask = UnpackHiMask;
+ }
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
@@ -20815,7 +20823,7 @@ SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// for DAG type consistency we have to match the FP operand type.
APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
- LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
+ [[maybe_unused]] APFloat::opStatus Status = APFloat::opOK;
bool LosesInfo = false;
if (TheVT == MVT::f64)
// The rounding mode is irrelevant as the conversion should be exact.
@@ -22858,7 +22866,7 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
- if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
+ if (isNullConstant(Y) && OpSize == 128 && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
@@ -29747,65 +29755,30 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
const X86Subtarget &Subtarget,
SelectionDAG &DAG,
SDValue *Low = nullptr) {
- unsigned NumElts = VT.getVectorNumElements();
-
// For vXi8 we will unpack the low and high half of each 128 bit lane to widen
// to a vXi16 type. Do the multiplies, shift the results and pack the half
// lane results back together.
// We'll take different approaches for signed and unsigned.
- // For unsigned we'll use punpcklbw/punpckhbw to put zero extend the bytes
- // and use pmullw to calculate the full 16-bit product.
+ // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes to
+ // words and use pmullw to calculate the full 16-bit product.
// For signed we'll use punpcklbw/punpckbw to extend the bytes to words and
// shift them left into the upper byte of each word. This allows us to use
// pmulhw to calculate the full 16-bit product. This trick means we don't
// need to sign extend the bytes to use pmullw.
-
- MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
+ MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
SDValue Zero = DAG.getConstant(0, dl, VT);
- SDValue ALo, AHi;
+ SDValue ALo, AHi, BLo, BHi;
if (IsSigned) {
ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
- } else {
- ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
- AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
- }
-
- SDValue BLo, BHi;
- if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
- // If the RHS is a constant, manually unpackl/unpackh and extend.
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (unsigned i = 0; i != NumElts; i += 16) {
- for (unsigned j = 0; j != 8; ++j) {
- SDValue LoOp = B.getOperand(i + j);
- SDValue HiOp = B.getOperand(i + j + 8);
-
- if (IsSigned) {
- LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
- LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
- DAG.getConstant(8, dl, MVT::i16));
- HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
- DAG.getConstant(8, dl, MVT::i16));
- } else {
- LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
- HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
- }
-
- LoOps.push_back(LoOp);
- HiOps.push_back(HiOp);
- }
- }
-
- BLo = DAG.getBuildVector(ExVT, dl, LoOps);
- BHi = DAG.getBuildVector(ExVT, dl, HiOps);
- } else if (IsSigned) {
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
} else {
+ ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
+ AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
}
@@ -29818,7 +29791,7 @@ static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
if (Low)
*Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
- return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
+ return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf=*/true);
}
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
@@ -30313,22 +30286,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
- if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
- // Hardware support for vector shifts is sparse which makes us scalarize the
- // vector operations in many cases. Also, on sandybridge ADD is faster than
- // shl: (shl V, 1) -> (add (freeze V), (freeze V))
- if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
- // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
- // must be 0). (add undef, undef) however can be any value. To make this
- // safe, we must freeze R to ensure that register allocation uses the same
- // register for an undefined value. This ensures that the result will
- // still be even and preserves the original semantics.
- R = DAG.getFreeze(R);
- return DAG.getNode(ISD::ADD, dl, VT, R, R);
- }
-
+ if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
- }
// i64 SRA needs to be performed as partial shifts.
if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
@@ -31229,16 +31188,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElts = VT.getVectorNumElements();
if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
- if (IsFSHR)
- std::swap(Op0, Op1);
if (IsCstSplat) {
+ if (IsFSHR)
+ std::swap(Op0, Op1);
uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
{Op0, Op1, Imm}, DAG, Subtarget);
}
- return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+ return getAVX512Node(IsFSHR ? ISD::FSHR : ISD::FSHL, DL, VT,
{Op0, Op1, Amt}, DAG, Subtarget);
}
assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
@@ -35153,8 +35112,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(VALIGN)
NODE_NAME_CASE(VSHLD)
NODE_NAME_CASE(VSHRD)
- NODE_NAME_CASE(VSHLDV)
- NODE_NAME_CASE(VSHRDV)
NODE_NAME_CASE(PSHUFD)
NODE_NAME_CASE(PSHUFHW)
NODE_NAME_CASE(PSHUFLW)
@@ -41889,7 +41846,7 @@ static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
return SDValue();
- Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+ Imm = llvm::rotl<uint8_t>(Imm, 4);
return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
DAG.getTargetConstant(Imm, DL, MVT::i8));
};
@@ -44631,8 +44588,11 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
APInt DemandedMask = OriginalDemandedBits << ShAmt;
- // If we just want the sign bit then we don't need to shift it.
- if (OriginalDemandedBits.isSignMask())
+ // If we only want bits that already match the signbit then we don't need
+ // to shift.
+ unsigned NumHiDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
+ if (TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1) >=
+ NumHiDemandedBits)
return TLO.CombineTo(Op, Op0);
// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
@@ -44853,10 +44813,16 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
- // iff we only need the sign bit then we can use R directly.
- if (OriginalDemandedBits.isSignMask() &&
- ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
- return TLO.CombineTo(Op, Op.getOperand(1));
+ if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) {
+ // iff we only need the signbit then we can use R directly.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // otherwise we just need R's signbit for the comparison.
+ APInt SignMask = APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(Op.getOperand(1), SignMask, OriginalDemandedElts,
+ Known, TLO, Depth + 1))
+ return true;
+ }
break;
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
@@ -45185,6 +45151,19 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
case X86ISD::Wrapper:
case X86ISD::WrapperRIP:
return true;
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: {
+ APInt DemandedLHS, DemandedRHS;
+ getPackDemandedElts(Op.getSimpleValueType(), DemandedElts, DemandedLHS,
+ DemandedRHS);
+ return (!DemandedLHS ||
+ DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), DemandedLHS,
+ PoisonOnly, Depth + 1)) &&
+ (!DemandedRHS ||
+ DAG.isGuaranteedNotToBeUndefOrPoison(Op.getOperand(1), DemandedRHS,
+ PoisonOnly, Depth + 1));
+ }
+ case X86ISD::INSERTPS:
case X86ISD::BLENDI:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
@@ -45254,7 +45233,12 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
case X86ISD::BLENDI:
case X86ISD::BLENDV:
return false;
+ // SSE packs.
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS:
+ return false;
// SSE target shuffles.
+ case X86ISD::INSERTPS:
case X86ISD::PSHUFB:
case X86ISD::PSHUFD:
case X86ISD::UNPCKL:
@@ -45452,7 +45436,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
const SDLoc &DL,
const X86Subtarget &Subtarget) {
EVT SrcVT = Src.getValueType();
- if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
+ if (Subtarget.useSoftFloat() || !SrcVT.isSimple() ||
+ SrcVT.getScalarType() != MVT::i1)
return SDValue();
// Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
@@ -46211,7 +46196,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
SDValue Zero = DAG.getConstant(0, DL, DpVT);
return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
- DpBuilder, false);
+ DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI());
}
// Create a PSADBW given two sources representable as zexts of vXi8.
@@ -47747,6 +47732,15 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
DL, DAG, Subtarget))
return V;
+ // If the sign bit is known then BLENDV can be folded away.
+ if (N->getOpcode() == X86ISD::BLENDV) {
+ KnownBits KnownCond = DAG.computeKnownBits(Cond);
+ if (KnownCond.isNegative())
+ return LHS;
+ if (KnownCond.isNonNegative())
+ return RHS;
+ }
+
if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) {
SmallVector<int, 64> CondMask;
if (createShuffleMaskFromVSELECT(CondMask, Cond,
@@ -52383,16 +52377,41 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
- EFLAGS.getValueType().isInteger() &&
- !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
- SDValue NewSub =
- DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
- EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ // If EFLAGS is from a CMP that compares the same operands as the earlier
+ // SUB producing X (i.e. CMP X, Y), we can directly use the carry flag with
+ // SBB/ADC without creating a flipped SUB.
+ if (EFLAGS.getOpcode() == X86ISD::CMP &&
+ EFLAGS.getValueType().isInteger() && X == EFLAGS.getOperand(0)) {
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
- DAG.getConstant(0, DL, VT), NewEFLAGS);
+ DAG.getConstant(0, DL, VT), EFLAGS);
+ }
+
+ if (EFLAGS.getOpcode() == X86ISD::SUB &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Only create NewSub if we know one of the folds will succeed to avoid
+ // introducing a temporary node that may persist and affect one-use checks
+ // below.
+ if (EFLAGS.getNode()->hasOneUse()) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(0, DL, VT), NewEFLAGS);
+ }
+
+ if (IsSub && X == EFLAGS.getValue(0)) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SBB, DL, DAG.getVTList(VT, MVT::i32),
+ EFLAGS.getOperand(0), EFLAGS.getOperand(1),
+ NewEFLAGS);
+ }
}
}
@@ -58104,6 +58123,14 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
return V;
+ // Prefer VSHLI to reduce uses, X86FixupInstTunings may revert this depending
+ // on the scheduler model. Limit multiple users to AVX+ targets to prevent
+ // introducing extra register moves.
+ if (Op0 == Op1 && supportedVectorShiftWithImm(VT, Subtarget, ISD::SHL))
+ if (Subtarget.hasAVX() || N->isOnlyUserOf(Op0.getNode()))
+ return getTargetVShiftByConstNode(X86ISD::VSHLI, DL, VT.getSimpleVT(),
+ Op0, 1, DAG);
+
// Canonicalize hidden LEA pattern:
// Fold (add (sub (shl x, c), y), z) -> (sub (add (shl x, c), z), y)
// iff c < 4
@@ -58295,11 +58322,12 @@ static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
} else if (Op1.getOpcode() == ISD::AND && Sub.getValue(0).use_empty()) {
SDValue Src = Op1;
SDValue Op10 = Op1.getOperand(0);
- if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1))) {
- // res, flags2 = sub 0, (and (xor X, -1), Y)
+ if (Op10.getOpcode() == ISD::XOR && isAllOnesConstant(Op10.getOperand(1)) &&
+ llvm::isOneConstant(Op1.getOperand(1))) {
+ // res, flags2 = sub 0, (and (xor X, -1), 1)
// cload/cstore ..., cond_ne, flag2
// ->
- // res, flags2 = sub 0, (and X, Y)
+ // res, flags2 = sub 0, (and X, 1)
// cload/cstore ..., cond_e, flag2
Src = DAG.getNode(ISD::AND, DL, Op1.getValueType(), Op10.getOperand(0),
Op1.getOperand(1));