diff options
Diffstat (limited to 'llvm/lib/Target/X86')
32 files changed, 550 insertions, 453 deletions
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index d7671ed19589..ce5e92135f70 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -15,10 +15,12 @@ #include "MCTargetDesc/X86TargetStreamer.h" #include "TargetInfo/X86TargetInfo.h" #include "X86Operand.h" +#include "X86RegisterInfo.h" #include "llvm-c/Visibility.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" @@ -29,6 +31,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" @@ -40,6 +43,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include <algorithm> +#include <cstdint> #include <memory> using namespace llvm; @@ -1172,7 +1176,7 @@ private: X86::CondCode ParseConditionCode(StringRef CCode); - bool ParseIntelMemoryOperandSize(unsigned &Size); + bool ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr); bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp, MCRegister BaseReg, MCRegister IndexReg, unsigned Scale, bool NonAbsMem, SMLoc Start, @@ -2574,7 +2578,8 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) { return false; } -bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { +bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size, + StringRef *SizeStr) { Size = StringSwitch<unsigned>(getTok().getString()) .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) @@ -2592,6 +2597,8 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { .Cases("ZMMWORD", "zmmword", 512) .Default(0); if (Size) { + if (SizeStr) + *SizeStr = getTok().getString(); const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word). if (!(Tok.getString() == "PTR" || Tok.getString() == "ptr")) return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); @@ -2600,6 +2607,19 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { return false; } +uint16_t RegSizeInBits(const MCRegisterInfo &MRI, MCRegister RegNo) { + if (X86MCRegisterClasses[X86::GR8RegClassID].contains(RegNo)) + return 8; + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(RegNo)) + return 16; + if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo)) + return 32; + if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo)) + return 64; + // Unknown register size + return 0; +} + bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); @@ -2607,7 +2627,8 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { // Parse optional Size directive. unsigned Size; - if (ParseIntelMemoryOperandSize(Size)) + StringRef SizeStr; + if (ParseIntelMemoryOperandSize(Size, &SizeStr)) return true; bool PtrInOperand = bool(Size); @@ -2624,9 +2645,29 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { return Error(Start, "rip can only be used as a base register"); // A Register followed by ':' is considered a segment override if (Tok.isNot(AsmToken::Colon)) { - if (PtrInOperand) - return Error(Start, "expected memory operand after 'ptr', " - "found register operand instead"); + if (PtrInOperand) { + if (!Parser.isParsingMasm()) + return Error(Start, "expected memory operand after 'ptr', " + "found register operand instead"); + + // If we are parsing MASM, we are allowed to cast registers to their own + // sizes, but not to other types. + uint16_t RegSize = + RegSizeInBits(*getContext().getRegisterInfo(), RegNo); + if (RegSize == 0) + return Error( + Start, + "cannot cast register '" + + StringRef(getContext().getRegisterInfo()->getName(RegNo)) + + "'; its size is not easily defined."); + if (RegSize != Size) + return Error( + Start, + std::to_string(RegSize) + "-bit register '" + + StringRef(getContext().getRegisterInfo()->getName(RegNo)) + + "' cannot be used as a " + std::to_string(Size) + "-bit " + + SizeStr.upper()); + } Operands.push_back(X86Operand::CreateReg(RegNo, Start, End)); return false; } diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 56a4cc3d65c2..865fc0ce8101 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS, if (!CanPadInst) return; - if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) { + if (PendingBA) { + auto *NextFragment = PendingBA->getNext(); + assert(NextFragment && "NextFragment should not be null"); + if (NextFragment == OS.getCurrentFragment()) + return; + // We eagerly create an empty fragment when inserting a fragment + // with a variable-size tail. + if (NextFragment->getNext() == OS.getCurrentFragment()) + return; + // Macro fusion actually happens and there is no other fragment inserted // after the previous instruction. // diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 547745fdba9d..76731437931a 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -1668,6 +1668,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VMOVSHZrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DecodeScalarMoveMask(8, false, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; + case X86::MOVPQI2QIrr: case X86::MOVZPQILo2PQIrr: case X86::VMOVPQI2QIrr: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index a15930c1433f..cfe5b1094811 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1047,9 +1047,6 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, Prefix.setL(TSFlags & X86II::VEX_L); Prefix.setL2(TSFlags & X86II::EVEX_L2); - if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) && - !STI.hasFeature(X86::FeatureEVEX512)) - report_fatal_error("ZMM registers are not supported without EVEX512"); switch (TSFlags & X86II::OpPrefixMask) { case X86II::PD: Prefix.setPP(0x1); // 66 diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index cc7bcd678cb3..bb1e716c33ed 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -397,18 +397,6 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, if (CPU.empty()) CPU = "generic"; - size_t posNoEVEX512 = FS.rfind("-evex512"); - // Make sure we won't be cheated by "-avx512fp16". - size_t posNoAVX512F = - FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,"); - size_t posEVEX512 = FS.rfind("+evex512"); - size_t posAVX512F = FS.rfind("+avx512"); // Any AVX512XXX will enable AVX512F. - - if (posAVX512F != StringRef::npos && - (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F)) - if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos) - ArchFS += ",+evex512"; - return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS); } diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 9cfe081b8710..7c9e821c02fd 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -113,6 +113,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", "Support 16-bit floating point conversion instructions", [FeatureAVX]>; +// Deprecated feature. Keep it here to suppress warnings in old IRs. def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true", "Support ZMM and 64-bit mask instructions">; def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512", @@ -329,20 +330,22 @@ def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true", "Support movdiri instruction (direct store integer)">; def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true", "Support movdir64b instruction (direct store 64 bytes)">; -def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true", - "Support AVX10.1 up to 256-bit instruction", +def FeatureAVX10_1 : SubtargetFeature<"avx10.1", "HasAVX10_1", "true", + "Support AVX10.1 instruction", [FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI, FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG, FeatureFP16, FeatureVLX, FeatureDQI]>; +// Deprecated feature. Keep it here to suppress warnings in old IRs. def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true", - "Support AVX10.1 up to 512-bit instruction", - [FeatureAVX10_1, FeatureEVEX512]>; -def FeatureAVX10_2 : SubtargetFeature<"avx10.2-256", "HasAVX10_2", "true", - "Support AVX10.2 up to 256-bit instruction", + "Support AVX10.1 instruction", + [FeatureAVX10_1]>; +def FeatureAVX10_2 : SubtargetFeature<"avx10.2", "HasAVX10_2", "true", + "Support AVX10.2 instruction", [FeatureAVX10_1]>; +// Deprecated feature. Keep it here to suppress warnings in old IRs. def FeatureAVX10_2_512 : SubtargetFeature<"avx10.2-512", "HasAVX10_2_512", "true", - "Support AVX10.2 up to 512-bit instruction", - [FeatureAVX10_2, FeatureAVX10_1_512]>; + "Support AVX10.2 instruction", + [FeatureAVX10_2]>; def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true", "Support extended general purpose register">; def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true", @@ -871,7 +874,6 @@ def ProcessorFeatures { ]; list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [ - FeatureEVEX512, FeatureBWI, FeatureCDI, FeatureDQI, @@ -996,7 +998,6 @@ def ProcessorFeatures { FeatureXSAVES, FeatureCLFLUSHOPT, FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, @@ -1039,7 +1040,6 @@ def ProcessorFeatures { // Cannonlake list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, @@ -1155,7 +1155,7 @@ def ProcessorFeatures { !listconcat(GNRFeatures, GNRDAdditionalFeatures); // Diamond Rapids - list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2_512, + list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2, FeatureSM4, FeatureCMPCCXADD, FeatureAVXIFMA, @@ -1368,7 +1368,6 @@ def ProcessorFeatures { FeatureF16C, FeatureFSGSBase, FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureADX, FeatureRDSEED, @@ -1586,7 +1585,6 @@ def ProcessorFeatures { list<SubtargetFeature> ZN4Tuning = !listconcat(ZN3Tuning, ZN4AdditionalTuning); list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512, - FeatureEVEX512, FeatureCDI, FeatureDQI, FeatureBWI, diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp index d406277e440b..ff22ee8c86fa 100644 --- a/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -476,7 +476,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) { return MI.getDesc().isIndirectBranch() /*Make below code in a good shape*/ || Opc == X86::TAILJMPr || Opc == X86::TAILJMPm || Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 || - Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi || + Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || + Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 || Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX || Opc == X86::TAILJMPm64_REX; diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 0e6b4dffec3a..9457e718de69 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -269,6 +269,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::TCRETURNdi: case X86::TCRETURNdicc: case X86::TCRETURNri: + case X86::TCRETURN_WIN64ri: + case X86::TCRETURN_HIPE32ri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNdi64cc: @@ -346,8 +348,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); - } else if ((Opcode == X86::TCRETURNri64) || - (Opcode == X86::TCRETURNri64_ImpCall)) { + } else if (Opcode == X86::TCRETURNri64 || + Opcode == X86::TCRETURNri64_ImpCall || + Opcode == X86::TCRETURN_WIN64ri) { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index d3c239250943..787b71d425cb 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) { MachineBasicBlock::iterator I; if (LastShapeMI && dominates(MBB, MI, LastShapeMI)) I = ++LastShapeMI->getIterator(); - else - I = ++MI.getIterator(); + else { + // Call can overwrite registers like rax, ensure the tile config + // instruction is sinked closer to first instruction that uses tile. + auto UseIt = MI.getIterator(); + while (UseIt != MBB.end()) { + if (HasTileOperand(MRI, *UseIt)) + break; + ++UseIt; + } + I = UseIt; + } Config(*I); HasUnconfigTile = false; continue; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index cba7843d53e3..a293b4c87cfe 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2398,7 +2398,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { } static bool isTailCallOpcode(unsigned Opc) { - return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi || + return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri || + Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 || Opc == X86::TCRETURNmi64; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 19131fbd4102..3631016b0f5c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -326,15 +326,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasAVX10_2()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal); for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64}) { setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal); } - if (Subtarget.hasAVX10_2_512()) { - setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); - setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal); - } if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal); @@ -2457,6 +2455,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) { + setOperationAction(ISD::FADD, MVT::v32bf16, Legal); + setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); + setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); + setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); + setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); + setOperationAction(ISD::FMA, MVT::v32bf16, Legal); + setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); + setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom); + setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom); + setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom); + setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom); for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { setOperationAction(ISD::FADD, VT, Legal); setOperationAction(ISD::FSUB, VT, Legal); @@ -2470,19 +2479,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMINIMUMNUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); } - if (Subtarget.hasAVX10_2_512()) { - setOperationAction(ISD::FADD, MVT::v32bf16, Legal); - setOperationAction(ISD::FSUB, MVT::v32bf16, Legal); - setOperationAction(ISD::FMUL, MVT::v32bf16, Legal); - setOperationAction(ISD::FDIV, MVT::v32bf16, Legal); - setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal); - setOperationAction(ISD::FMA, MVT::v32bf16, Legal); - setOperationAction(ISD::SETCC, MVT::v32bf16, Custom); - setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom); - setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom); - setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom); - setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom); - } for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) { setCondCodeAction(ISD::SETOEQ, VT, Custom); setCondCodeAction(ISD::SETUNE, VT, Custom); @@ -21252,7 +21248,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, // the truncation then we can use PACKSS by converting the srl to a sra. // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it. if (In.getOpcode() == ISD::SRL && In->hasOneUse()) - if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) { + if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) { if (*ShAmt == MinSignBits) { PackOpcode = X86ISD::PACKSS; return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops()); @@ -26269,10 +26265,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - - if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) - if (MaskConst->getZExtValue() & 0x1) - return Op; + auto *MaskConst = dyn_cast<ConstantSDNode>(Mask); + if (MaskConst && (MaskConst->getZExtValue() & 0x1)) + return Op; MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); @@ -26288,6 +26283,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, if (PreservedSrc.isUndef()) PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + + if (MaskConst) { + assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask"); + // Discard op and blend passthrough with scalar op src/dst. + SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); + std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0); + ShuffleMask[0] = VT.getVectorNumElements(); + return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc, + ShuffleMask); + } + return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); } @@ -31404,9 +31410,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, return R; // AVX512 implicitly uses modulo rotation amounts. - if ((Subtarget.hasVLX() || - (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) && - 32 <= EltSizeInBits) { + if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) { // Attempt to rotate by immediate. if (IsCstSplat) { unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI; @@ -38676,13 +38680,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); if (Opc == X86ISD::VSHLI) { - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // Low bits are known zero. Known.Zero.setLowBits(ShAmt); } else if (Opc == X86ISD::VSRLI) { - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits are known zero. Known.Zero.setHighBits(ShAmt); } else { @@ -44518,8 +44520,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero <<= ShAmt; - Known.One <<= ShAmt; + Known <<= ShAmt; // Low bits known zero. Known.Zero.setLowBits(ShAmt); @@ -44549,8 +44550,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // High bits known zero. Known.Zero.setHighBits(ShAmt); @@ -44598,8 +44598,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( TLO, Depth + 1)) return true; - Known.Zero.lshrInPlace(ShAmt); - Known.One.lshrInPlace(ShAmt); + Known >>= ShAmt; // If the input sign bit is known to be zero, or if none of the top bits // are demanded, turn this into an unsigned shift right. @@ -44957,6 +44956,44 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( Known.Zero.setLowBits(Known2.countMinTrailingZeros()); return false; } + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: { + KnownBits KnownOp0, KnownOp1, KnownOp2; + SDValue Op0 = Op.getOperand(0); + SDValue Op1 = Op.getOperand(1); + SDValue Op2 = Op.getOperand(2); + // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of + // operand 2). + APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52); + if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0, + TLO, Depth + 1)) + return true; + + if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1, + TLO, Depth + 1)) + return true; + + if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts, + KnownOp2, TLO, Depth + 1)) + return true; + + KnownBits KnownMul; + KnownOp0 = KnownOp0.trunc(52); + KnownOp1 = KnownOp1.trunc(52); + KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1) + : KnownBits::mulhu(KnownOp0, KnownOp1); + KnownMul = KnownMul.zext(64); + + // lo/hi(X * Y) + Z --> C + Z + if (KnownMul.isConstant()) { + SDLoc DL(Op); + SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT); + return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2)); + } + + Known = KnownBits::add(KnownMul, KnownOp2); + return false; + } } return TargetLowering::SimplifyDemandedBitsForTargetNode( @@ -45132,6 +45169,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { switch (Op.getOpcode()) { + // SSE bit logic. + case X86ISD::FAND: + case X86ISD::FOR: + case X86ISD::FXOR: + case X86ISD::FANDN: + case X86ISD::ANDNP: + case X86ISD::VPTERNLOG: + return false; // SSE vector insert/extracts use modulo indices. case X86ISD::PINSRB: case X86ISD::PINSRW: @@ -45167,6 +45212,11 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( // SSE signbit extraction. case X86ISD::MOVMSK: return false; + // GFNI instructions. + case X86ISD::GF2P8AFFINEINVQB: + case X86ISD::GF2P8AFFINEQB: + case X86ISD::GF2P8MULB: + return false; case ISD::INTRINSIC_WO_CHAIN: switch (Op->getConstantOperandVal(0)) { case Intrinsic::x86_sse2_pmadd_wd: @@ -48349,7 +48399,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC, // If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then // peek through and adjust the TEST bit. if (Src.getOpcode() == ISD::SHL) { - if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) { + if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) { Src = Src.getOperand(0); BitMask.lshrInPlace(*ShiftAmt); } @@ -50886,10 +50936,12 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, // Given a target type \p VT, we generate // or (and x, y), (xor z, zext(build_vector (constants))) // given x, y and z are of type \p VT. We can do so, if operands are either -// truncates from VT types, the second operand is a vector of constants or can -// be recursively promoted. +// truncates from VT types, the second operand is a vector of constants, can +// be recursively promoted or is an existing extension we can extend further. static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, - SelectionDAG &DAG, unsigned Depth) { + SelectionDAG &DAG, + const X86Subtarget &Subtarget, + unsigned Depth) { // Limit recursion to avoid excessive compile times. if (Depth >= SelectionDAG::MaxRecursionDepth) return SDValue(); @@ -50904,28 +50956,32 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT)) return SDValue(); - if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1)) + if (SDValue NN0 = + PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1)) N0 = NN0; else { - // The left side has to be a trunc. - if (N0.getOpcode() != ISD::TRUNCATE) - return SDValue(); - - // The type of the truncated inputs. - if (N0.getOperand(0).getValueType() != VT) + // The left side has to be a 'trunc'. + bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE && + N0.getOperand(0).getValueType() == VT; + if (LHSTrunc) + N0 = N0.getOperand(0); + else return SDValue(); - - N0 = N0.getOperand(0); } - if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1)) + if (SDValue NN1 = + PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1)) N1 = NN1; else { - // The right side has to be a 'trunc' or a (foldable) constant. + // The right side has to be a 'trunc', a (foldable) constant or an + // existing extension we can extend further. bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getValueType() == VT; if (RHSTrunc) N1 = N1.getOperand(0); + else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() && + Subtarget.hasInt256() && N1.hasOneUse()) + N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0)); else if (SDValue Cst = DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1})) N1 = Cst; @@ -50955,7 +51011,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT NarrowVT = Narrow.getValueType(); // Generate the wide operation. - SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0); + SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0); if (!Op) return SDValue(); switch (N.getOpcode()) { @@ -51804,6 +51860,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, SDValue X, Y; EVT CondVT = VT.changeVectorElementType(MVT::i1); if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && sd_match(N, m_And(m_Value(X), m_OneUse(m_SExt(m_AllOf( m_Value(Y), m_SpecificVT(CondVT), @@ -54135,10 +54193,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG, const SDLoc &DL) { assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode"); - std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N); + std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N); if (!ValidSrlConst) return SDValue(); - uint64_t SrlConstVal = *ValidSrlConst; + unsigned SrlConstVal = *ValidSrlConst; SDValue Op = N.getOperand(0); unsigned Opcode = Op.getOpcode(); @@ -55368,6 +55426,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 && + (VT.is512BitVector() || Subtarget.hasVLX()) && + (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse()) return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1, getZeroVector(VT, Subtarget, DAG, DL)); @@ -56247,7 +56307,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, SDValue Masked = BroadcastOp; if (N != 0) { - APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len); + unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits(); + unsigned NumDefinedElts = UndefElts.countTrailingZeros(); + + if (NumDefinedElts > BroadcastOpBitWidth) + return SDValue(); + + APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts); SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp, DAG.getConstant(N, DL, BroadcastOpVT)); Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue, @@ -57904,6 +57970,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, Cmov.getOperand(3)); } +// Attempt to turn ADD(MUL(x, y), acc)) -> VPMADD52L +// When upper 12 bits of x, y and MUL(x, y) are known to be 0 +static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, + EVT VT, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + if (!VT.isVector() || VT.getScalarSizeInBits() != 64 || + (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA())) + return SDValue(); + + // Need AVX-512VL vector length extensions if operating on XMM/YMM registers + if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() && + VT.getSizeInBits() < 512) + return SDValue(); + + const auto TotalSize = VT.getSizeInBits(); + if (TotalSize < 128 || !isPowerOf2_64(TotalSize)) + return SDValue(); + + SDValue X, Y, Acc; + if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc)))) + return SDValue(); + + KnownBits KnownX = DAG.computeKnownBits(X); + if (KnownX.countMinLeadingZeros() < 12) + return SDValue(); + KnownBits KnownY = DAG.computeKnownBits(Y); + if (KnownY.countMinLeadingZeros() < 12) + return SDValue(); + KnownBits KnownMul = KnownBits::mul(KnownX, KnownY); + if (KnownMul.countMinLeadingZeros() < 12) + return SDValue(); + + auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL, + ArrayRef<SDValue> SubOps) { + EVT SubVT = SubOps[0].getValueType(); + assert(SubVT.getScalarSizeInBits() == 64 && + "Unexpected element size, only supports 64bit size"); + return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/, + SubOps[2] /*Y*/, SubOps[0] /*Acc*/); + }; + + return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder, + /*CheckBWI*/ false); +} + static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -58007,6 +58118,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, Op0.getOperand(0), Op0.getOperand(2)); } + if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget)) + return IFMA52; + return combineAddOrSubToADCOrSBB(N, DL, DAG); } @@ -60068,6 +60182,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Simplify VPMADD52L/VPMADD52H operations. +static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + MVT VT = N->getSimpleValueType(0); + unsigned NumEltBits = VT.getScalarSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits), + DCI)) + return SDValue(N, 0); + + return SDValue(); +} + static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -60705,6 +60832,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); case X86ISD::VPMADDUBSW: case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI); + case X86ISD::VPMADD52L: + case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI); case X86ISD::KSHIFTL: case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); @@ -60932,117 +61061,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { // X86 Inline Assembly Support //===----------------------------------------------------------------------===// -// Helper to match a string separated by whitespace. -static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { - S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. - - for (StringRef Piece : Pieces) { - if (!S.starts_with(Piece)) // Check if the piece matches. - return false; - - S = S.substr(Piece.size()); - StringRef::size_type Pos = S.find_first_not_of(" \t"); - if (Pos == 0) // We matched a prefix. - return false; - - S = S.substr(Pos); - } - - return S.empty(); -} - -static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { - - if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { - if (llvm::is_contained(AsmPieces, "~{cc}") && - llvm::is_contained(AsmPieces, "~{flags}") && - llvm::is_contained(AsmPieces, "~{fpsr}")) { - - if (AsmPieces.size() == 3) - return true; - else if (llvm::is_contained(AsmPieces, "~{dirflag}")) - return true; - } - } - return false; -} - -bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { - InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); - - StringRef AsmStr = IA->getAsmString(); - - IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); - if (!Ty || Ty->getBitWidth() % 16 != 0) - return false; - - // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" - SmallVector<StringRef, 4> AsmPieces; - SplitString(AsmStr, AsmPieces, ";\n"); - - switch (AsmPieces.size()) { - default: return false; - case 1: - // FIXME: this should verify that we are targeting a 486 or better. If not, - // we will turn this bswap into something that will be lowered to logical - // ops instead of emitting the bswap asm. For now, we don't support 486 or - // lower so don't worry about this. - // bswap $0 - if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || - matchAsm(AsmPieces[0], {"bswapl", "$0"}) || - matchAsm(AsmPieces[0], {"bswapq", "$0"}) || - matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || - matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || - matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { - // No need to check constraints, nothing other than the equivalent of - // "=r,0" would be valid here. - return IntrinsicLowering::LowerToByteSwap(CI); - } - - // rorw $$8, ${0:w} --> llvm.bswap.i16 - if (CI->getType()->isIntegerTy(16) && - IA->getConstraintString().starts_with("=r,0,") && - (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || - matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { - AsmPieces.clear(); - StringRef ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - array_pod_sort(AsmPieces.begin(), AsmPieces.end()); - if (clobbersFlagRegisters(AsmPieces)) - return IntrinsicLowering::LowerToByteSwap(CI); - } - break; - case 3: - if (CI->getType()->isIntegerTy(32) && - IA->getConstraintString().starts_with("=r,0,") && - matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && - matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && - matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { - AsmPieces.clear(); - StringRef ConstraintsStr = IA->getConstraintString(); - SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); - array_pod_sort(AsmPieces.begin(), AsmPieces.end()); - if (clobbersFlagRegisters(AsmPieces)) - return IntrinsicLowering::LowerToByteSwap(CI); - } - - if (CI->getType()->isIntegerTy(64)) { - InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); - if (Constraints.size() >= 2 && - Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && - Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { - // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 - if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && - matchAsm(AsmPieces[1], {"bswap", "%edx"}) && - matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) - return IntrinsicLowering::LowerToByteSwap(CI); - } - } - break; - } - return false; -} - static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint) .Case("{@cca}", X86::COND_A) diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 97d3b6e2420d..0c9ba591b03e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1364,8 +1364,6 @@ namespace llvm { SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const; - bool ExpandInlineAsm(CallInst *CI) const override; - ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. @@ -1668,8 +1666,8 @@ namespace llvm { /// Lower interleaved store(s) into target specific /// instructions/intrinsics. bool lowerInterleavedStore(Instruction *Store, Value *Mask, - ShuffleVectorInst *SVI, - unsigned Factor) const override; + ShuffleVectorInst *SVI, unsigned Factor, + const APInt &GapMask) const override; SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 1c745a338a61..3bc46af4d130 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -302,7 +302,7 @@ EVT X86TargetLowering::getOptimalMemOpType( if (Op.size() >= 16 && (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { // FIXME: Check if unaligned 64-byte accesses are slow. - if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() && + if (Op.size() >= 64 && Subtarget.hasAVX512() && (Subtarget.getPreferVectorWidth() >= 512)) { return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; } @@ -416,7 +416,7 @@ bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, return true; return false; case 512: - if (Subtarget.hasAVX512() && Subtarget.hasEVEX512()) + if (Subtarget.hasAVX512()) return true; return false; default: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 1beaaafb159e..69a5115201ef 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -550,7 +550,7 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in { } // HasAMXMOVRS, In64BitMode multiclass m_tcvtrowd2ps { - let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, i32u8imm:$src2), @@ -561,12 +561,12 @@ multiclass m_tcvtrowd2ps { "tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, T8,XS, EVEX, VVVV, EVEX_V512; } - } // HasAMXAVX512, HasAVX10_2_512, In64BitMode + } // HasAMXAVX512, HasAVX10_2, In64BitMode } defm TCVTROWD2PS : m_tcvtrowd2ps; -let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { +let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), @@ -630,7 +630,7 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr, Prefix P1, Prefix P2> { - let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in { + let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in { let OpPrefix = P1 in def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst), (ins TILE:$src1, GR32:$src2), @@ -658,7 +658,7 @@ defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>; defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>; multiclass m_tilemovrow { - let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { + let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst), (ins TILE:$src1, u8imm:$src2), @@ -669,12 +669,12 @@ multiclass m_tilemovrow { "tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, T8,PD, EVEX, VVVV, EVEX_V512; } - } // HasAMXAVX512, HasAVX10_2_512, In64BitMode + } // HasAMXAVX512, HasAVX10_2, In64BitMode } defm TILEMOVROW : m_tilemovrow; -let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { +let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in { let SchedRW = [WriteSystem] in { let usesCustomInserter = 1 in { def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2), diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td index 2d2bf1f6c725..764ff998bb56 100644 --- a/llvm/lib/Target/X86/X86InstrAVX10.td +++ b/llvm/lib/Target/X86/X86InstrAVX10.td @@ -15,36 +15,36 @@ // VNNI FP16 let ExeDomain = SSEPackedSingle in defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T8, PS, EVEX_CD8<32, CD8VF>; // VNNI INT8 defm VPDPBSSD : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, XD; + [HasAVX10_2], [HasAVX10_2]>, XD; defm VPDPBSSDS : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, XD; + [HasAVX10_2], [HasAVX10_2]>, XD; defm VPDPBSUD : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPBSUDS : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPBUUD : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; defm VPDPBUUDS : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; // VNNI INT16 defm VPDPWSUD : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPWSUDS : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, XS; + [HasAVX10_2], [HasAVX10_2]>, XS; defm VPDPWUSD : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, PD; + [HasAVX10_2], [HasAVX10_2]>, PD; defm VPDPWUSDS : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0, - [HasAVX10_2], [HasAVX10_2_512]>, PD; + [HasAVX10_2], [HasAVX10_2]>, PD; defm VPDPWUUD : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; defm VPDPWUUDS : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1, - [HasAVX10_2], [HasAVX10_2_512]>, PS; + [HasAVX10_2], [HasAVX10_2]>, PS; // VMPSADBW defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW, @@ -94,9 +94,8 @@ multiclass avx10_minmax_packed_sae<string OpStr, AVX512VLVectorVTInfo VTI, SDNod } multiclass avx10_minmax_packed<string OpStr, AVX512VLVectorVTInfo VTI, SDNode OpNode> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512; let Predicates = [HasAVX10_2] in { + defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512; defm Z256 : avx10_minmax_packed_base<OpStr, VTI.info256, OpNode>, EVEX_V256; defm Z128 : avx10_minmax_packed_base<OpStr, VTI.info128, OpNode>, EVEX_V128; } @@ -201,7 +200,7 @@ multiclass avx10_sat_cvt_rmb<bits<8> Opc, string OpStr, X86FoldableSchedWrite sc multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo, SDNode MaskNode> { - let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in + let Predicates = [HasAVX10_2], Uses = [MXCSR] in defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512, (outs DestInfo.info512.RC:$dst), (ins SrcInfo.info512.RC:$src, AVX512RC:$rc), @@ -216,7 +215,7 @@ multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo, SDNode Node> { - let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in + let Predicates = [HasAVX10_2], Uses = [MXCSR] in defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512, (outs DestInfo.info512.RC:$dst), (ins SrcInfo.info512.RC:$src), @@ -229,12 +228,11 @@ multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sche multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched, SDNode MaskNode, AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM, - DestInfo.info512, SrcInfo.info512, - MaskNode>, - EVEX, EVEX_V512; let Predicates = [HasAVX10_2] in { + defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM, + DestInfo.info512, SrcInfo.info512, + MaskNode>, + EVEX, EVEX_V512; defm Z256 : avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM, DestInfo.info256, SrcInfo.info256, @@ -334,13 +332,11 @@ defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul, multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNodeSAE, sched.ZMM>, EVEX_V512; - } - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>, EVEX_V128; @@ -410,13 +406,11 @@ multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, OpNodeRnd, sched.ZMM>, EVEX_V512; - } - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, @@ -432,13 +426,11 @@ multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeRnd, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNodeRnd, sched.ZMM>, EVEX_V512; - } - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM, (v2i64 (OpNode (bc_v4f32 (v2f64 @@ -460,14 +452,11 @@ multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN multiclass avx10_cvttps2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, MaskOpNode, sched.ZMM>, avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, OpNodeSAE, sched.ZMM>, EVEX_V512; - } - - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, MaskOpNode, sched.XMM>, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, @@ -719,7 +708,7 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _SrcVTInfo, AVX512VLVectorVTInfo _DstVTInfo, SDNode OpNode, SDNode OpNodeRnd> { - let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in { + let Predicates = [HasAVX10_2] in { defm Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode, _SrcVTInfo.info512, _DstVTInfo.info512, _SrcVTInfo.info512>, @@ -727,8 +716,6 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr, _SrcVTInfo.info512, _DstVTInfo.info512, OpNodeRnd>, EVEX_V512, EVEX_CD8<32, CD8VF>; - } - let Predicates = [HasAVX10_2] in { defm Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode, _SrcVTInfo.info256, _DstVTInfo.info256, _SrcVTInfo.info256>, @@ -747,19 +734,19 @@ defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx", defm VCVT2PH2BF8 : avx512_binop_all<0x74, "vcvt2ph2bf8", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2bf8, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T8, XD; defm VCVT2PH2BF8S : avx512_binop_all<0x74, "vcvt2ph2bf8s", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2bf8s, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; defm VCVT2PH2HF8 : avx512_binop_all<0x18, "vcvt2ph2hf8", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2hf8, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; defm VCVT2PH2HF8S : avx512_binop_all<0x1b, "vcvt2ph2hf8s", SchedWriteCvtPD2PS, avx512vl_f16_info, avx512vl_i8_info, - X86vcvt2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>, + X86vcvt2ph2hf8s, [HasAVX10_2], [HasAVX10_2]>, EVEX_CD8<16, CD8VF>, T_MAP5, XD; //TODO: Merge into avx512_vcvt_fp, diffrence is one more source register here. @@ -836,11 +823,10 @@ multiclass avx10_convert_3op<bits<8> OpCode, string OpcodeStr, PatFrag bcast128 = vt_src.info128.BroadcastLdFrag, PatFrag loadVT128 = vt_src.info128.LdFrag, RegisterClass maskRC128 = vt_src.info128.KRCWM> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info256, vt_dst.info512, vt_src.info512, OpNode, OpNode, sched.ZMM>, EVEX_V512, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z256 : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info128, vt_dst.info256, vt_src.info256, OpNode, OpNode, sched.YMM>, EVEX_V256, EVEX_CD8<16, CD8VF>; @@ -920,25 +906,25 @@ defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s", defm VCVTPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2bf8, X86vmcvtph2bf8, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T8, XS, EVEX_CD8<16, CD8VF>; defm VCVTPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8s", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2bf8s, X86vmcvtph2bf8s, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; defm VCVTPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtph2hf8", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2hf8, X86vmcvtph2hf8, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; defm VCVTPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtph2hf8s", avx512vl_i8_info, avx512vl_f16_info, SchedWriteCvtPD2PS, X86vcvtph2hf8s, X86vmcvtph2hf8s, - [HasAVX10_2], [HasAVX10_2_512]>, + [HasAVX10_2], [HasAVX10_2]>, T_MAP5, XS, EVEX_CD8<16, CD8VF>; multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr, @@ -962,10 +948,9 @@ multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr, multiclass avx10_convert_2op_nomb<string OpcodeStr, AVX512VLVectorVTInfo _dest, AVX512VLVectorVTInfo _src, bits<8> opc, SDNode OpNode> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info512, _src.info256, OpNode, f256mem, WriteCvtPH2PSZ>, EVEX_V512; - let Predicates = [HasAVX10_2] in { defm Z128 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info128, _src.info128, OpNode, f64mem, WriteCvtPH2PSZ>, EVEX_V128; defm Z256 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info256, _src.info128, @@ -985,13 +970,12 @@ defm VCVTHF82PH : avx10_convert_2op_nomb<"vcvthf82ph", avx512vl_f16_info, multiclass avx10_fp_binop_int_bf16<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched, bit IsCommutable = 0> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fp_packed<opc, OpcodeStr, !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"), !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"), v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, PD, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fp_packed<opc, OpcodeStr, !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"), !cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"), @@ -1009,11 +993,10 @@ multiclass avx10_fp_binop_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator X86SchedWriteSizes sched, bit IsCommutable = 0, SDPatternOperator MaskOpNode = OpNode> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, PD, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5, PD, EVEX_CD8<16, CD8VF>; @@ -1086,9 +1069,8 @@ multiclass avx10_vcmp_common_bf16<X86FoldableSchedWrite sched, X86VectorVTInfo _ } multiclass avx10_vcmp_bf16<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512; let Predicates = [HasAVX10_2] in { + defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512; defm Z128 : avx10_vcmp_common_bf16<sched.XMM, _.info128>, EVEX_V128; defm Z256 : avx10_vcmp_common_bf16<sched.YMM, _.info256>, EVEX_V256; } @@ -1102,11 +1084,10 @@ defm VCMPBF16 : avx10_vcmp_bf16<SchedWriteFCmp, avx512vl_bf16_info>, // VSQRTBF16 multiclass avx10_sqrt_packed_bf16<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched> { - let Predicates = [HasAVX10_2_512] in - defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"), - sched.PH.ZMM, v32bf16_info>, - EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>; let Predicates = [HasAVX10_2] in { + defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"), + sched.PH.ZMM, v32bf16_info>, + EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>; defm Z128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"), sched.PH.XMM, v8bf16x_info>, EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>; @@ -1122,11 +1103,10 @@ defm VSQRTBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrt", SchedWriteFSqrtSizes>; // VRSQRTBF16, VRCPBF16, VSRQTBF16, VGETEXPBF16 multiclass avx10_fp14_bf16<bits<8> opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in - defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"), - OpNode, sched.ZMM, v32bf16_info>, - EVEX_V512; let Predicates = [HasAVX10_2] in { + defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"), + OpNode, sched.ZMM, v32bf16_info>, + EVEX_V512; defm BF16Z128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"), OpNode, sched.XMM, v8bf16x_info>, EVEX_V128; @@ -1146,10 +1126,9 @@ defm VGETEXP : avx10_fp14_bf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>, // VSCALEFBF16 multiclass avx10_fp_scalef_bf16<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8bf16x_info>, EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS; defm Z256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16bf16x_info>, @@ -1164,10 +1143,9 @@ defm VSCALEFBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>; multiclass avx10_common_unary_fp_packed_imm_bf16<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDPatternOperator OpNode, SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, _.info512>, EVEX_V512; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, _.info128>, EVEX_V128; defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode, @@ -1190,11 +1168,10 @@ defm VGETMANTBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_b // VFPCLASSBF16 multiclass avx10_fp_fpclass_bf16<string OpcodeStr, bits<8> opcVec, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_vector_fpclass<opcVec, OpcodeStr, sched.ZMM, avx512vl_bf16_info.info512, "z", []<Register>>, EVEX_V512; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_vector_fpclass<opcVec, OpcodeStr, sched.XMM, avx512vl_bf16_info.info128, "x", []<Register>>, EVEX_V128; @@ -1211,11 +1188,10 @@ defm VFPCLASSBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>, multiclass avx10_fma3p_213_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS, EVEX_CD8<16, CD8VF>; @@ -1239,11 +1215,10 @@ defm VFNMSUB213BF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213bf16", X86any_Fnmsub multiclass avx10_fma3p_231_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS, EVEX_CD8<16, CD8VF>; @@ -1267,11 +1242,10 @@ defm VFNMSUB231BF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231bf16", X86any_Fnmsub multiclass avx10_fma3p_132_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, SDNode MaskOpNode, X86SchedWriteWidths sched> { - let Predicates = [HasAVX10_2_512] in + let Predicates = [HasAVX10_2] in { defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>; - let Predicates = [HasAVX10_2] in { defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode, sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS, EVEX_CD8<16, CD8VF>; @@ -1440,9 +1414,8 @@ multiclass vmovrs_p<bits<8> opc, string OpStr, X86VectorVTInfo _> { } multiclass vmovrs_p_vl<bits<8> opc, string OpStr, AVX512VLVectorVTInfo _Vec> { - let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in - defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512; let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in { + defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512; defm Z128 : vmovrs_p<opc, OpStr, _Vec.info128>, EVEX_V128; defm Z256 : vmovrs_p<opc, OpStr, _Vec.info256>, EVEX_V256; } @@ -1464,7 +1437,7 @@ multiclass avx10_sm4_base<string OpStr> { defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128; defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256; } - let Predicates = [HasSM4, HasAVX10_2_512] in + let Predicates = [HasSM4, HasAVX10_2] in defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512; } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 0ab94cca4142..3401f6f04800 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -834,7 +834,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32bf16_info, v16bf16x_info, // A 128-bit extract from bits [255:128] of a 512-bit vector should use a // smaller extract to enable EVEX->VEX. -let Predicates = [NoVLX, HasEVEX512] in { +let Predicates = [NoVLX] in { def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))), (v2i64 (VEXTRACTI128rri (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)), @@ -3088,7 +3088,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>; } -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>; @@ -3119,7 +3119,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>; } -let Predicates = [HasBWI, NoVLX, HasEVEX512] in { +let Predicates = [HasBWI, NoVLX] in { defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>; defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>; @@ -3513,7 +3513,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow, // Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't // available. Use a 512-bit operation and extract. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>; defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>; defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>; @@ -3525,7 +3525,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>; } -let Predicates = [HasBWI, NoVLX, HasEVEX512] in { +let Predicates = [HasBWI, NoVLX] in { defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>; defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>; @@ -5021,8 +5021,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin, defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, SchedWriteVecALU, HasAVX512, 1>, T8; -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512. -let Predicates = [HasDQI, NoVLX, HasEVEX512] in { +// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. +let Predicates = [HasDQI, NoVLX] in { def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), (EXTRACT_SUBREG (VPMULLQZrr @@ -5078,7 +5078,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> { sub_xmm)>; } -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; defm : avx512_min_max_lowering<"VPMINUQZ", umin>; defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; @@ -6055,7 +6055,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SchedWriteVecShift>; // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))), (EXTRACT_SUBREG (v8i64 (VPSRAQZrr @@ -6184,14 +6184,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>; -defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>; -defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>; +defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>; +defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>; // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLVQZrr @@ -6242,7 +6242,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { } // Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORVQZrr @@ -9933,7 +9933,7 @@ defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, truncstore_us_vi8, masked_truncstore_us_vi8, X86vtruncus, X86vmtruncus>; -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))), (v8i16 (EXTRACT_SUBREG (v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), @@ -9944,7 +9944,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))), VR256X:$src, sub_ymm)))), sub_xmm))>; } -let Predicates = [HasBWI, NoVLX, HasEVEX512] in { +let Predicates = [HasBWI, NoVLX] in { def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))), (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src, sub_ymm))), sub_xmm))>; @@ -10487,7 +10487,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>, EVEX_V128; } - let Predicates = [prd, NoVLX, HasEVEX512] in { + let Predicates = [prd, NoVLX] in { defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>; defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>; } @@ -11283,7 +11283,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, SchedWriteVecALU>; // VPABS: Use 512bit version to implement 128/256 bit in case NoVLX. -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v4i64 (abs VR256X:$src)), (EXTRACT_SUBREG (VPABSQZrr @@ -11299,7 +11299,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { // Use 512bit version to implement 128/256 bit. multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { - let Predicates = [prd, NoVLX, HasEVEX512] in { + let Predicates = [prd, NoVLX] in { def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))), (EXTRACT_SUBREG (!cast<Instruction>(InstrStr # "Zrr") @@ -11918,7 +11918,7 @@ let Predicates = [HasAVX512] in { (VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>; } -let Predicates = [HasAVX512, NoVLX, HasEVEX512] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v16i8 (vnot VR128X:$src)), (EXTRACT_SUBREG (VPTERNLOGQZrri diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 927b2c8b22f0..5a0df058b27f 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1326,7 +1326,11 @@ def : Pat<(X86imp_call (i64 tglobaladdr:$dst)), // Match an X86tcret that uses less than 7 volatile registers. def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[Not64BitMode, NotUseIndirectThunkCalls]>; + Requires<[Not64BitMode, IsNotHiPECCFunc, NotUseIndirectThunkCalls]>; + +def : Pat<(X86tcret GR32:$dst, timm:$off), + (TCRETURN_HIPE32ri GR32:$dst, timm:$off)>, + Requires<[Not64BitMode, IsHiPECCFunc, NotUseIndirectThunkCalls]>; // FIXME: This is disabled for 32-bit PIC mode because the global base // register which is part of the address mode may be assigned a @@ -1346,7 +1350,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>, - Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; + Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; + +def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off), + (TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>, + Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>; def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off), (TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>, diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td index 22253bf0413a..139aedd473eb 100644 --- a/llvm/lib/Target/X86/X86InstrControl.td +++ b/llvm/lib/Target/X86/X86InstrControl.td @@ -282,6 +282,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, []>, Sched<[WriteJump]>; def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; + + def TCRETURN_HIPE32ri : PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; + let mayLoad = 1 in def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset), []>, Sched<[WriteJumpLd]>; @@ -357,6 +361,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, def TCRETURNri64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; + def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset), + []>, Sched<[WriteJump]>; + def TCRETURNri64_ImpCall : PseudoI<(outs), (ins GR64_A:$dst, i32imm:$offset), []>, Sched<[WriteJump]>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index abf365eedec3..a68edf4d2b7e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -83,8 +83,9 @@ static cl::opt<unsigned> UndefRegClearance( // Pin the vtable to this file. void X86InstrInfo::anchor() {} -X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 +X86InstrInfo::X86InstrInfo(const X86Subtarget &STI) + : X86GenInstrInfo(STI, + (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32), @@ -4399,13 +4400,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) { if (STI.hasFP16()) return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr; if (Load) - return STI.hasAVX512() ? X86::VMOVSSZrm - : STI.hasAVX() ? X86::VMOVSSrm - : X86::MOVSSrm; - else - return STI.hasAVX512() ? X86::VMOVSSZmr - : STI.hasAVX() ? X86::VMOVSSmr - : X86::MOVSSmr; + return X86::MOVSHPrm; + return X86::MOVSHPmr; } static unsigned getLoadStoreRegOpcode(Register Reg, @@ -4903,6 +4899,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, CmpMask = ~0; CmpValue = 0; return true; + case X86::TEST64ri32: + case X86::TEST32ri: + case X86::TEST16ri: + case X86::TEST8ri: + SrcReg = MI.getOperand(0).getReg(); + SrcReg2 = 0; + // Force identical compare. + CmpMask = 0; + CmpValue = 0; + return true; } return false; } @@ -4942,6 +4948,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::CMP32ri: case X86::CMP16ri: case X86::CMP8ri: + case X86::TEST64ri32: + case X86::TEST32ri: + case X86::TEST16ri: + case X86::TEST8ri: CASE_ND(SUB64ri32) CASE_ND(SUB32ri) CASE_ND(SUB16ri) @@ -6131,6 +6141,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { return true; } +static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI, + const TargetInstrInfo &TII, bool HasAVX) { + unsigned NewOpc; + if (MI.getOpcode() == X86::MOVSHPrm) { + NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; + Register Reg = MI.getOperand(0).getReg(); + if (Reg > X86::XMM15) + NewOpc = X86::VMOVSSZrm; + } else { + NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + Register Reg = MI.getOperand(5).getReg(); + if (Reg > X86::XMM15) + NewOpc = X86::VMOVSSZmr; + } + + MIB->setDesc(TII.get(NewOpc)); + return true; +} + bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); @@ -6203,6 +6232,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } + case X86::MOVSHPmr: + case X86::MOVSHPrm: + return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX()); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 9dc5f4b0e086..f087b7f20ff6 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -222,7 +222,7 @@ inline static bool isMemInstrWithGOTPCREL(const MachineInstr &MI) { } class X86InstrInfo final : public X86GenInstrInfo { - X86Subtarget &Subtarget; + const X86Subtarget &Subtarget; const X86RegisterInfo RI; LLVM_DECLARE_VIRTUAL_ANCHOR_FUNCTION(); @@ -238,7 +238,7 @@ class X86InstrInfo final : public X86GenInstrInfo { bool MakeChange) const; public: - explicit X86InstrInfo(X86Subtarget &STI); + explicit X86InstrInfo(const X86Subtarget &STI); /// Given a machine instruction descriptor, returns the register /// class constraint for OpNum, or NULL. Returned register class diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index df1541e9085b..8339c2081842 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -69,11 +69,8 @@ def NoAVX : Predicate<"!Subtarget->hasAVX()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; -def HasEVEX512 : Predicate<"Subtarget->hasEVEX512()">; def HasAVX10_1 : Predicate<"Subtarget->hasAVX10_1()">; -def HasAVX10_1_512 : Predicate<"Subtarget->hasAVX10_1_512()">; def HasAVX10_2 : Predicate<"Subtarget->hasAVX10_2()">; -def HasAVX10_2_512 : Predicate<"Subtarget->hasAVX10_2_512()">; def NoAVX10_2 : Predicate<"!Subtarget->hasAVX10_2()">; def HasAVX512 : Predicate<"Subtarget->hasAVX512()">; def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">; @@ -233,6 +230,13 @@ let RecomputePerFunction = 1 in { "!Subtarget->hasSSE41()">; def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">; def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">; + + def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">; + def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">; + def IsHiPECCFunc : Predicate<"MF->getFunction().getCallingConv() == CallingConv::HiPE">; + + def IsNotHiPECCFunc : Predicate< + "MF->getFunction().getCallingConv() != CallingConv::HiPE">; } def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 1acc0cd8da20..b7926497c92b 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, } } +// pseudo instruction for fp16 spilling. +let isPseudo = 1, Predicates = [HasSSE2] in { + let mayStore = 1 in + def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "", + [], SSEPackedSingle>, + Sched<[WriteFStore]>; + let mayLoad = 1 in + def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "", + [], SSEPackedSingle>, + Sched<[WriteFLoad]>; +} + defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", SSEPackedSingle, UseSSE1>, TB, XS; defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 632db7e4326e..4188487d7591 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad( bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, Value *LaneMask, ShuffleVectorInst *SVI, - unsigned Factor) const { + unsigned Factor, + const APInt &GapMask) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store, auto *SI = dyn_cast<StoreInst>(Store); if (!SI) return false; - assert(!LaneMask && "Unexpected mask on store"); + assert(!LaneMask && GapMask.popcount() == Factor && + "Unexpected mask on store"); // Holds the indices of SVI that correspond to the starting index of each // interleaved shuffle. diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 595ad3290eed..9ec04e740a08 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, // we can still use 64-bit register as long as we know the high bits // are zeros. // Reflect that in the returned register class. - if (Is64Bit) { - // When the target also allows 64-bit frame pointer and we do have a - // frame, this is fine to use it for the address accesses as well. - const X86FrameLowering *TFI = getFrameLowering(MF); - return TFI->hasFP(MF) && TFI->Uses64BitFramePtr - ? &X86::LOW32_ADDR_ACCESS_RBPRegClass - : &X86::LOW32_ADDR_ACCESSRegClass; - } - return &X86::GR32RegClass; + return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; @@ -228,25 +220,11 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, // NOSP does not contain RIP, so no special case here. return &X86::GR32_NOREX_NOSPRegClass; case 4: // Available for tailcall (not callee-saved GPRs). - return getGPRsForTailCall(MF); + return Is64Bit ? &X86::GR64_TCRegClass : &X86::GR32_TCRegClass; } } const TargetRegisterClass * -X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - if (IsWin64 || IsUEFI64 || (F.getCallingConv() == CallingConv::Win64)) - return &X86::GR64_TCW64RegClass; - else if (Is64Bit) - return &X86::GR64_TCRegClass; - - bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE); - if (hasHipeCC) - return &X86::GR32RegClass; - return &X86::GR32_TCRegClass; -} - -const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) @@ -1007,11 +985,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned X86RegisterInfo::findDeadCallerSavedReg( MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const { const MachineFunction *MF = MBB.getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); if (MF->callsEHReturn()) return 0; - const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF); - if (MBBI == MBB.end()) return 0; @@ -1026,6 +1003,8 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg( case X86::RETI64: case X86::TCRETURNdi: case X86::TCRETURNri: + case X86::TCRETURN_WIN64ri: + case X86::TCRETURN_HIPE32ri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: @@ -1033,20 +1012,16 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg( case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { - SmallSet<uint16_t, 8> Uses; - for (MachineOperand &MO : MBBI->operands()) { - if (!MO.isReg() || MO.isDef()) - continue; - Register Reg = MO.getReg(); - if (!Reg) - continue; - for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI) - Uses.insert(*AI); + LiveRegUnits LRU(*this); + LRU.addLiveOuts(MBB); + LRU.stepBackward(*MBBI); + + const TargetRegisterClass &RC = + Is64Bit ? X86::GR64_NOSPRegClass : X86::GR32_NOSPRegClass; + for (MCRegister Reg : RC) { + if (LRU.available(Reg) && !MRI.isReserved(Reg)) + return Reg; } - - for (auto CS : AvailableRegs) - if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP) - return CS; } } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h index 2f4c55cfad6d..d022e5ab8794 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/llvm/lib/Target/X86/X86RegisterInfo.h @@ -87,11 +87,6 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - /// getGPRsForTailCall - Returns a register class with registers that can be - /// used in forming tail calls. - const TargetRegisterClass * - getGPRsForTailCall(const MachineFunction &MF) const; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index e9ca25d808a5..99b7910131dc 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64, // which we do not have right now. def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>; -// When RBP is used as a base pointer in a 32-bit addresses environment, -// this is also safe to use the full register to access addresses. -// Since RBP will never be spilled, stick to a 32 alignment to save -// on memory consumption. +// FIXME: This is unused, but deleting it results in codegen changes def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32, (add LOW32_ADDR_ACCESS, RBP)>; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td index 9e271c1ee370..044b77f7aacf 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver3.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td @@ -992,14 +992,14 @@ def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> { def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>; def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1); } def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>; def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0); } @@ -1221,7 +1221,7 @@ def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>; def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0); } @@ -1235,7 +1235,7 @@ def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>; def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0); } @@ -1249,7 +1249,7 @@ def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>; def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 3]; let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0); } @@ -1263,7 +1263,7 @@ def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> { def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>; def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG2rr.Latency); let ReleaseAtCycles = [1, 1, 8]; let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1); } @@ -1338,14 +1338,14 @@ def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> { def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>; def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0); } def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rmi)>; def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, 7); + let Latency = !add(Znver3Model.VecLoadLatency, 7); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = 3; } @@ -1359,14 +1359,14 @@ def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> { def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>; def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency); + let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERMYri.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1); } def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>; def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> { - let Latency = !add(Znver3Model.LoadLatency, 5); + let Latency = !add(Znver3Model.VecLoadLatency, 5); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = 2; } diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td index 74d916d41f83..a93c7e3a82f1 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -1005,14 +1005,14 @@ def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> { def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>; def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1); } def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>; def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0); } @@ -1262,7 +1262,7 @@ def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>; def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0); } @@ -1276,7 +1276,7 @@ def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>; def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0); } @@ -1290,7 +1290,7 @@ def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>; def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG1rr.Latency); let ReleaseAtCycles = [1, 1, 3]; let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0); } @@ -1304,7 +1304,7 @@ def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> { def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>; def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG2rr.Latency); let ReleaseAtCycles = [1, 1, 8]; let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1); } @@ -1379,7 +1379,7 @@ def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>; def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency); let ReleaseAtCycles = [1, 1, 1]; let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0); } @@ -1393,7 +1393,7 @@ def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>; def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1); } @@ -1407,7 +1407,7 @@ def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>; def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1); } @@ -1421,7 +1421,7 @@ def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> { def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>; def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> { - let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency); + let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency); let ReleaseAtCycles = [1, 1, 2]; let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0); } @@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> { let NumMicroOps = 1; } def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex - "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", + "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz", "VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)", - "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz" + "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz" )>; // SCALE & REDUCE instructions @@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> { let NumMicroOps = 1; } def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex - "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)", + "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)", "VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)" )>; @@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)", "(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)", "(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)", - "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int" + "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int" )>; def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> { @@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex "VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)" )>; -// ALIGN Instructions -def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> { +// ALIGNR Instructions +def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> { + let Latency = 2; + let ReleaseAtCycles = [1]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WriteALIGNR], (instregex + "(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)" + )>; +def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> { let Latency = 2; let ReleaseAtCycles = [2]; let NumMicroOps = 1; } -def : InstRW<[Zn4WriteALIGN], (instregex - "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)" +def : InstRW<[Zn4WriteALIGNRZ], (instregex + "(V?)PALIGNRZ(rri|rrik|rrikz)" )>; -//PACK Instructions +// PACK Instructions def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> { let Latency = 2; - let ReleaseAtCycles = [2]; + let ReleaseAtCycles = [1]; let NumMicroOps = 1; } def : InstRW<[Zn4WritePACK], (instregex - "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)" + "(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)" + )>; +def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> { + let Latency = 2; + let ReleaseAtCycles = [2]; + let NumMicroOps = 1; +} +def : InstRW<[Zn4WritePACKZ], (instregex + "(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)" )>; // MAX and MIN Instructions diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp index 8ad8d423d10c..fd5f34b60efb 100644 --- a/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/llvm/lib/Target/X86/X86Subtarget.cpp @@ -261,26 +261,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU, if (!FS.empty()) FullFS = (Twine(FullFS) + "," + FS).str(); - // Attach EVEX512 feature when we have AVX512 features with a default CPU. - // "pentium4" is default CPU for 32-bit targets. - // "x86-64" is default CPU for 64-bit targets. - if (CPU == "generic" || CPU == "pentium4" || CPU == "x86-64") { - size_t posNoEVEX512 = FS.rfind("-evex512"); - // Make sure we won't be cheated by "-avx512fp16". - size_t posNoAVX512F = - FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,"); - size_t posEVEX512 = FS.rfind("+evex512"); - // Any AVX512XXX will enable AVX512F. - size_t posAVX512F = FS.rfind("+avx512"); - - if (posAVX512F != StringRef::npos && - (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F)) - if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos) - FullFS += ",+evex512"; - } - // Disable 64-bit only features in non-64-bit mode. - SmallVector<StringRef, 9> FeaturesIn64BitOnly = { + StringRef FeaturesIn64BitOnly[] = { "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"}; if (FullFS.find("-64bit-mode") != std::string::npos) for (StringRef F : FeaturesIn64BitOnly) diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index be49214e041e..fa3f3b59741d 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -226,8 +226,7 @@ public: // TODO: Currently we're always allowing widening on CPUs without VLX, // because for many cases we don't have a better option. bool canExtendTo512DQ() const { - return hasAVX512() && hasEVEX512() && - (!hasVLX() || getPreferVectorWidth() >= 512); + return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512); } bool canExtendTo512BW() const { return hasBWI() && canExtendTo512DQ(); @@ -247,8 +246,7 @@ public: // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. bool useAVX512Regs() const { - return hasAVX512() && hasEVEX512() && - (canExtendTo512DQ() || RequiredVectorWidth > 256); + return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256); } bool useLight256BitInstructions() const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 62f95277d016..3d8d0a236a3c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -213,7 +213,7 @@ X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { case TargetTransformInfo::RGK_Scalar: return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); case TargetTransformInfo::RGK_FixedWidthVector: - if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) + if (ST->hasAVX512() && PreferVectorWidth >= 512) return TypeSize::getFixed(512); if (ST->hasAVX() && PreferVectorWidth >= 256) return TypeSize::getFixed(256); @@ -1206,6 +1206,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, + { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split + { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps @@ -6591,7 +6593,7 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { // Only enable vector loads for equality comparison. Right now the vector // version is not as fast for three way compare (see #33329). const unsigned PreferredWidth = ST->getPreferVectorWidth(); - if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) + if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64); if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32); if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16); diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index ea8b88f41bb8..9bf0abb018c9 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { // Prolog information. SmallVector<int64_t> PushedRegs; bool HasStackAlloc = false; + bool HasSetFrame = false; unsigned ApproximatePrologCodeCount = 0; // Requested changes. @@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { break; case X86::SEH_StackAlloc: - case X86::SEH_SetFrame: if (State != FunctionState::InProlog) - llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog"); + llvm_unreachable("SEH_StackAlloc outside of prolog"); // Assume a large alloc... - ApproximatePrologCodeCount += - (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1; + ApproximatePrologCodeCount += 3; HasStackAlloc = true; break; + case X86::SEH_SetFrame: + if (State != FunctionState::InProlog) + llvm_unreachable("SEH_SetFrame outside of prolog"); + ApproximatePrologCodeCount++; + HasSetFrame = true; + break; + case X86::SEH_SaveReg: case X86::SEH_SaveXMM: if (State != FunctionState::InProlog) @@ -190,8 +196,30 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { State = FunctionState::FinishedEpilog; break; - case X86::LEA64r: case X86::MOV64rr: + if (State == FunctionState::InEpilog) { + // If the prolog contains a stack allocation, then the first + // instruction in the epilog must be to adjust the stack pointer. + if (!HasSetFrame) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is setting frame back, but prolog did not set it"); + if (PoppedRegCount > 0) + return rejectCurrentFunctionInternalError( + MF, Mode, + "The epilog is setting the frame back after popping " + "registers"); + if (HasStackDealloc) + return rejectCurrentFunctionInternalError( + MF, Mode, + "Cannot set the frame back after the stack " + "allocation has been deallocated"); + } else if (State == FunctionState::FinishedEpilog) + return rejectCurrentFunctionInternalError( + MF, Mode, "Unexpected mov instruction after the epilog"); + break; + + case X86::LEA64r: case X86::ADD64ri32: if (State == FunctionState::InEpilog) { // If the prolog contains a stack allocation, then the first @@ -211,8 +239,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { HasStackDealloc = true; } else if (State == FunctionState::FinishedEpilog) return rejectCurrentFunctionInternalError( - MF, Mode, - "Unexpected lea, mov or add instruction after the epilog"); + MF, Mode, "Unexpected lea or add instruction after the epilog"); break; case X86::POP64r: @@ -278,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { } } - if (UnwindV2StartLocations.empty()) { - assert(State == FunctionState::InProlog && - "If there are no epilogs, then there should be no prolog"); + if (UnwindV2StartLocations.empty()) return false; - } MachineBasicBlock &FirstMBB = MF.front(); // Assume +1 for the "header" UOP_Epilog that contains the epilog size, and |
