summaryrefslogtreecommitdiff
path: root/llvm/lib/Target/X86
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp53
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp11
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp7
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp3
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp12
-rw-r--r--llvm/lib/Target/X86/X86.td26
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp7
-rw-r--r--llvm/lib/Target/X86/X86FastPreTileConfig.cpp13
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp3
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp348
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h6
-rw-r--r--llvm/lib/Target/X86/X86ISelLoweringCall.cpp4
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td14
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX10.td131
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td42
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td12
-rw-r--r--llvm/lib/Target/X86/X86InstrControl.td7
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp50
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h4
-rw-r--r--llvm/lib/Target/X86/X86InstrPredicates.td10
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td12
-rw-r--r--llvm/lib/Target/X86/X86InterleavedAccess.cpp6
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp53
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.h5
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.td5
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver3.td20
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver4.td58
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp20
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h6
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp6
-rw-r--r--llvm/lib/Target/X86/X86WinEHUnwindV2.cpp46
32 files changed, 550 insertions, 453 deletions
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d7671ed19589..ce5e92135f70 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -15,10 +15,12 @@
#include "MCTargetDesc/X86TargetStreamer.h"
#include "TargetInfo/X86TargetInfo.h"
#include "X86Operand.h"
+#include "X86RegisterInfo.h"
#include "llvm-c/Visibility.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
@@ -29,6 +31,7 @@
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegister.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
@@ -40,6 +43,7 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
+#include <cstdint>
#include <memory>
using namespace llvm;
@@ -1172,7 +1176,7 @@ private:
X86::CondCode ParseConditionCode(StringRef CCode);
- bool ParseIntelMemoryOperandSize(unsigned &Size);
+ bool ParseIntelMemoryOperandSize(unsigned &Size, StringRef *SizeStr);
bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp,
MCRegister BaseReg, MCRegister IndexReg,
unsigned Scale, bool NonAbsMem, SMLoc Start,
@@ -2574,7 +2578,8 @@ bool X86AsmParser::ParseMasmOperator(unsigned OpKind, int64_t &Val) {
return false;
}
-bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
+bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size,
+ StringRef *SizeStr) {
Size = StringSwitch<unsigned>(getTok().getString())
.Cases("BYTE", "byte", 8)
.Cases("WORD", "word", 16)
@@ -2592,6 +2597,8 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
.Cases("ZMMWORD", "zmmword", 512)
.Default(0);
if (Size) {
+ if (SizeStr)
+ *SizeStr = getTok().getString();
const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
if (!(Tok.getString() == "PTR" || Tok.getString() == "ptr"))
return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
@@ -2600,6 +2607,19 @@ bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
return false;
}
+uint16_t RegSizeInBits(const MCRegisterInfo &MRI, MCRegister RegNo) {
+ if (X86MCRegisterClasses[X86::GR8RegClassID].contains(RegNo))
+ return 8;
+ if (X86MCRegisterClasses[X86::GR16RegClassID].contains(RegNo))
+ return 16;
+ if (X86MCRegisterClasses[X86::GR32RegClassID].contains(RegNo))
+ return 32;
+ if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+ return 64;
+ // Unknown register size
+ return 0;
+}
+
bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -2607,7 +2627,8 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
// Parse optional Size directive.
unsigned Size;
- if (ParseIntelMemoryOperandSize(Size))
+ StringRef SizeStr;
+ if (ParseIntelMemoryOperandSize(Size, &SizeStr))
return true;
bool PtrInOperand = bool(Size);
@@ -2624,9 +2645,29 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) {
return Error(Start, "rip can only be used as a base register");
// A Register followed by ':' is considered a segment override
if (Tok.isNot(AsmToken::Colon)) {
- if (PtrInOperand)
- return Error(Start, "expected memory operand after 'ptr', "
- "found register operand instead");
+ if (PtrInOperand) {
+ if (!Parser.isParsingMasm())
+ return Error(Start, "expected memory operand after 'ptr', "
+ "found register operand instead");
+
+ // If we are parsing MASM, we are allowed to cast registers to their own
+ // sizes, but not to other types.
+ uint16_t RegSize =
+ RegSizeInBits(*getContext().getRegisterInfo(), RegNo);
+ if (RegSize == 0)
+ return Error(
+ Start,
+ "cannot cast register '" +
+ StringRef(getContext().getRegisterInfo()->getName(RegNo)) +
+ "'; its size is not easily defined.");
+ if (RegSize != Size)
+ return Error(
+ Start,
+ std::to_string(RegSize) + "-bit register '" +
+ StringRef(getContext().getRegisterInfo()->getName(RegNo)) +
+ "' cannot be used as a " + std::to_string(Size) + "-bit " +
+ SizeStr.upper());
+ }
Operands.push_back(X86Operand::CreateReg(RegNo, Start, End));
return false;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 56a4cc3d65c2..865fc0ce8101 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -485,7 +485,16 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
if (!CanPadInst)
return;
- if (PendingBA && PendingBA->getNext() == OS.getCurrentFragment()) {
+ if (PendingBA) {
+ auto *NextFragment = PendingBA->getNext();
+ assert(NextFragment && "NextFragment should not be null");
+ if (NextFragment == OS.getCurrentFragment())
+ return;
+ // We eagerly create an empty fragment when inserting a fragment
+ // with a variable-size tail.
+ if (NextFragment->getNext() == OS.getCurrentFragment())
+ return;
+
// Macro fusion actually happens and there is no other fragment inserted
// after the previous instruction.
//
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 547745fdba9d..76731437931a 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1668,6 +1668,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ case X86::VMOVSHZrr:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ DecodeScalarMoveMask(8, false, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
+
case X86::MOVPQI2QIrr:
case X86::MOVZPQILo2PQIrr:
case X86::VMOVPQI2QIrr:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index a15930c1433f..cfe5b1094811 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1047,9 +1047,6 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
Prefix.setL(TSFlags & X86II::VEX_L);
Prefix.setL2(TSFlags & X86II::EVEX_L2);
- if ((TSFlags & X86II::EVEX_L2) && STI.hasFeature(X86::FeatureAVX512) &&
- !STI.hasFeature(X86::FeatureEVEX512))
- report_fatal_error("ZMM registers are not supported without EVEX512");
switch (TSFlags & X86II::OpPrefixMask) {
case X86II::PD:
Prefix.setPP(0x1); // 66
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index cc7bcd678cb3..bb1e716c33ed 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -397,18 +397,6 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (CPU.empty())
CPU = "generic";
- size_t posNoEVEX512 = FS.rfind("-evex512");
- // Make sure we won't be cheated by "-avx512fp16".
- size_t posNoAVX512F =
- FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,");
- size_t posEVEX512 = FS.rfind("+evex512");
- size_t posAVX512F = FS.rfind("+avx512"); // Any AVX512XXX will enable AVX512F.
-
- if (posAVX512F != StringRef::npos &&
- (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F))
- if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos)
- ArchFS += ",+evex512";
-
return createX86MCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, ArchFS);
}
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 9cfe081b8710..7c9e821c02fd 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -113,6 +113,7 @@ def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
"Support 16-bit floating point conversion instructions",
[FeatureAVX]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
def FeatureEVEX512 : SubtargetFeature<"evex512", "HasEVEX512", "true",
"Support ZMM and 64-bit mask instructions">;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512",
@@ -329,20 +330,22 @@ def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
"Support movdiri instruction (direct store integer)">;
def FeatureMOVDIR64B : SubtargetFeature<"movdir64b", "HasMOVDIR64B", "true",
"Support movdir64b instruction (direct store 64 bytes)">;
-def FeatureAVX10_1 : SubtargetFeature<"avx10.1-256", "HasAVX10_1", "true",
- "Support AVX10.1 up to 256-bit instruction",
+def FeatureAVX10_1 : SubtargetFeature<"avx10.1", "HasAVX10_1", "true",
+ "Support AVX10.1 instruction",
[FeatureCDI, FeatureVBMI, FeatureIFMA, FeatureVNNI,
FeatureBF16, FeatureVPOPCNTDQ, FeatureVBMI2, FeatureBITALG,
FeatureFP16, FeatureVLX, FeatureDQI]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true",
- "Support AVX10.1 up to 512-bit instruction",
- [FeatureAVX10_1, FeatureEVEX512]>;
-def FeatureAVX10_2 : SubtargetFeature<"avx10.2-256", "HasAVX10_2", "true",
- "Support AVX10.2 up to 256-bit instruction",
+ "Support AVX10.1 instruction",
+ [FeatureAVX10_1]>;
+def FeatureAVX10_2 : SubtargetFeature<"avx10.2", "HasAVX10_2", "true",
+ "Support AVX10.2 instruction",
[FeatureAVX10_1]>;
+// Deprecated feature. Keep it here to suppress warnings in old IRs.
def FeatureAVX10_2_512 : SubtargetFeature<"avx10.2-512", "HasAVX10_2_512", "true",
- "Support AVX10.2 up to 512-bit instruction",
- [FeatureAVX10_2, FeatureAVX10_1_512]>;
+ "Support AVX10.2 instruction",
+ [FeatureAVX10_2]>;
def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true",
"Support extended general purpose register">;
def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true",
@@ -871,7 +874,6 @@ def ProcessorFeatures {
];
list<SubtargetFeature> X86_64V4Features = !listconcat(X86_64V3Features, [
- FeatureEVEX512,
FeatureBWI,
FeatureCDI,
FeatureDQI,
@@ -996,7 +998,6 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -1039,7 +1040,6 @@ def ProcessorFeatures {
// Cannonlake
list<SubtargetFeature> CNLAdditionalFeatures = [FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
@@ -1155,7 +1155,7 @@ def ProcessorFeatures {
!listconcat(GNRFeatures, GNRDAdditionalFeatures);
// Diamond Rapids
- list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2_512,
+ list<SubtargetFeature> DMRAdditionalFeatures = [FeatureAVX10_2,
FeatureSM4,
FeatureCMPCCXADD,
FeatureAVXIFMA,
@@ -1368,7 +1368,6 @@ def ProcessorFeatures {
FeatureF16C,
FeatureFSGSBase,
FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureADX,
FeatureRDSEED,
@@ -1586,7 +1585,6 @@ def ProcessorFeatures {
list<SubtargetFeature> ZN4Tuning =
!listconcat(ZN3Tuning, ZN4AdditionalTuning);
list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
- FeatureEVEX512,
FeatureCDI,
FeatureDQI,
FeatureBWI,
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index d406277e440b..ff22ee8c86fa 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -476,7 +476,8 @@ static bool isIndirectBranchOrTailCall(const MachineInstr &MI) {
return MI.getDesc().isIndirectBranch() /*Make below code in a good shape*/ ||
Opc == X86::TAILJMPr || Opc == X86::TAILJMPm ||
Opc == X86::TAILJMPr64 || Opc == X86::TAILJMPm64 ||
- Opc == X86::TCRETURNri || Opc == X86::TCRETURNmi ||
+ Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
+ Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNmi ||
Opc == X86::TCRETURNri64 || Opc == X86::TCRETURNmi64 ||
Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TAILJMPr64_REX ||
Opc == X86::TAILJMPm64_REX;
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 0e6b4dffec3a..9457e718de69 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -269,6 +269,8 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
case X86::TCRETURNdi:
case X86::TCRETURNdicc:
case X86::TCRETURNri:
+ case X86::TCRETURN_WIN64ri:
+ case X86::TCRETURN_HIPE32ri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNdi64cc:
@@ -346,8 +348,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != X86::AddrNumOperands; ++i)
MIB.add(MBBI->getOperand(i));
- } else if ((Opcode == X86::TCRETURNri64) ||
- (Opcode == X86::TCRETURNri64_ImpCall)) {
+ } else if (Opcode == X86::TCRETURNri64 ||
+ Opcode == X86::TCRETURNri64_ImpCall ||
+ Opcode == X86::TCRETURN_WIN64ri) {
JumpTarget.setIsKill();
BuildMI(MBB, MBBI, DL,
TII->get(IsX64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
index d3c239250943..787b71d425cb 100644
--- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp
@@ -564,8 +564,17 @@ bool X86FastPreTileConfig::configBasicBlock(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator I;
if (LastShapeMI && dominates(MBB, MI, LastShapeMI))
I = ++LastShapeMI->getIterator();
- else
- I = ++MI.getIterator();
+ else {
+ // Call can overwrite registers like rax, ensure the tile config
+ // instruction is sinked closer to first instruction that uses tile.
+ auto UseIt = MI.getIterator();
+ while (UseIt != MBB.end()) {
+ if (HasTileOperand(MRI, *UseIt))
+ break;
+ ++UseIt;
+ }
+ I = UseIt;
+ }
Config(*I);
HasUnconfigTile = false;
continue;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index cba7843d53e3..a293b4c87cfe 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2398,7 +2398,8 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
}
static bool isTailCallOpcode(unsigned Opc) {
- return Opc == X86::TCRETURNri || Opc == X86::TCRETURNdi ||
+ return Opc == X86::TCRETURNri || Opc == X86::TCRETURN_WIN64ri ||
+ Opc == X86::TCRETURN_HIPE32ri || Opc == X86::TCRETURNdi ||
Opc == X86::TCRETURNmi || Opc == X86::TCRETURNri64 ||
Opc == X86::TCRETURNri64_ImpCall || Opc == X86::TCRETURNdi64 ||
Opc == X86::TCRETURNmi64;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19131fbd4102..3631016b0f5c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -326,15 +326,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasAVX10_2()) {
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
MVT::v4i64}) {
setOperationAction(ISD::FP_TO_UINT_SAT, VT, Legal);
setOperationAction(ISD::FP_TO_SINT_SAT, VT, Legal);
}
- if (Subtarget.hasAVX10_2_512()) {
- setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i64, Legal);
- }
if (Subtarget.is64Bit()) {
setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Legal);
setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Legal);
@@ -2457,6 +2455,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
+ setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
+ setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
+ setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
+ setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
setOperationAction(ISD::FADD, VT, Legal);
setOperationAction(ISD::FSUB, VT, Legal);
@@ -2470,19 +2479,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMINIMUMNUM, VT, Custom);
setOperationAction(ISD::FMAXIMUMNUM, VT, Custom);
}
- if (Subtarget.hasAVX10_2_512()) {
- setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
- setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
- setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
- setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
- setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
- setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
- setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMINIMUMNUM, MVT::v32bf16, Custom);
- setOperationAction(ISD::FMAXIMUMNUM, MVT::v32bf16, Custom);
- }
for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
setCondCodeAction(ISD::SETOEQ, VT, Custom);
setCondCodeAction(ISD::SETUNE, VT, Custom);
@@ -21252,7 +21248,7 @@ static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
// the truncation then we can use PACKSS by converting the srl to a sra.
// SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
if (In.getOpcode() == ISD::SRL && In->hasOneUse())
- if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
+ if (std::optional<unsigned> ShAmt = DAG.getValidShiftAmount(In)) {
if (*ShAmt == MinSignBits) {
PackOpcode = X86ISD::PACKSS;
return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
@@ -26269,10 +26265,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
SDValue PreservedSrc,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
-
- if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
- if (MaskConst->getZExtValue() & 0x1)
- return Op;
+ auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskConst && (MaskConst->getZExtValue() & 0x1))
+ return Op;
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
@@ -26288,6 +26283,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
if (PreservedSrc.isUndef())
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+
+ if (MaskConst) {
+ assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
+ // Discard op and blend passthrough with scalar op src/dst.
+ SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+ std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+ ShuffleMask[0] = VT.getVectorNumElements();
+ return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
+ ShuffleMask);
+ }
+
return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
@@ -31404,9 +31410,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
return R;
// AVX512 implicitly uses modulo rotation amounts.
- if ((Subtarget.hasVLX() ||
- (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
- 32 <= EltSizeInBits) {
+ if ((Subtarget.hasVLX() || Subtarget.hasAVX512()) && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
if (IsCstSplat) {
unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
@@ -38676,13 +38680,11 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Opc == X86ISD::VSHLI) {
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// Low bits are known zero.
Known.Zero.setLowBits(ShAmt);
} else if (Opc == X86ISD::VSRLI) {
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits are known zero.
Known.Zero.setHighBits(ShAmt);
} else {
@@ -44518,8 +44520,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ Known <<= ShAmt;
// Low bits known zero.
Known.Zero.setLowBits(ShAmt);
@@ -44549,8 +44550,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// High bits known zero.
Known.Zero.setHighBits(ShAmt);
@@ -44598,8 +44598,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
TLO, Depth + 1))
return true;
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ Known >>= ShAmt;
// If the input sign bit is known to be zero, or if none of the top bits
// are demanded, turn this into an unsigned shift right.
@@ -44957,6 +44956,44 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
Known.Zero.setLowBits(Known2.countMinTrailingZeros());
return false;
}
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: {
+ KnownBits KnownOp0, KnownOp1, KnownOp2;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+ // Only demand the lower 52-bits of operands 0 / 1 (and all 64-bits of
+ // operand 2).
+ APInt Low52Bits = APInt::getLowBitsSet(BitWidth, 52);
+ if (SimplifyDemandedBits(Op0, Low52Bits, OriginalDemandedElts, KnownOp0,
+ TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op1, Low52Bits, OriginalDemandedElts, KnownOp1,
+ TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op2, APInt::getAllOnes(64), OriginalDemandedElts,
+ KnownOp2, TLO, Depth + 1))
+ return true;
+
+ KnownBits KnownMul;
+ KnownOp0 = KnownOp0.trunc(52);
+ KnownOp1 = KnownOp1.trunc(52);
+ KnownMul = Opc == X86ISD::VPMADD52L ? KnownBits::mul(KnownOp0, KnownOp1)
+ : KnownBits::mulhu(KnownOp0, KnownOp1);
+ KnownMul = KnownMul.zext(64);
+
+ // lo/hi(X * Y) + Z --> C + Z
+ if (KnownMul.isConstant()) {
+ SDLoc DL(Op);
+ SDValue C = TLO.DAG.getConstant(KnownMul.getConstant(), DL, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ADD, DL, VT, C, Op2));
+ }
+
+ Known = KnownBits::add(KnownMul, KnownOp2);
+ return false;
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -45132,6 +45169,14 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
switch (Op.getOpcode()) {
+ // SSE bit logic.
+ case X86ISD::FAND:
+ case X86ISD::FOR:
+ case X86ISD::FXOR:
+ case X86ISD::FANDN:
+ case X86ISD::ANDNP:
+ case X86ISD::VPTERNLOG:
+ return false;
// SSE vector insert/extracts use modulo indices.
case X86ISD::PINSRB:
case X86ISD::PINSRW:
@@ -45167,6 +45212,11 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
// SSE signbit extraction.
case X86ISD::MOVMSK:
return false;
+ // GFNI instructions.
+ case X86ISD::GF2P8AFFINEINVQB:
+ case X86ISD::GF2P8AFFINEQB:
+ case X86ISD::GF2P8MULB:
+ return false;
case ISD::INTRINSIC_WO_CHAIN:
switch (Op->getConstantOperandVal(0)) {
case Intrinsic::x86_sse2_pmadd_wd:
@@ -48349,7 +48399,7 @@ static SDValue checkSignTestSetCCCombine(SDValue Cmp, X86::CondCode &CC,
// If Src came from a SHL (probably from an expanded SIGN_EXTEND_INREG), then
// peek through and adjust the TEST bit.
if (Src.getOpcode() == ISD::SHL) {
- if (std::optional<uint64_t> ShiftAmt = DAG.getValidShiftAmount(Src)) {
+ if (std::optional<unsigned> ShiftAmt = DAG.getValidShiftAmount(Src)) {
Src = Src.getOperand(0);
BitMask.lshrInPlace(*ShiftAmt);
}
@@ -50886,10 +50936,12 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
// Given a target type \p VT, we generate
// or (and x, y), (xor z, zext(build_vector (constants)))
// given x, y and z are of type \p VT. We can do so, if operands are either
-// truncates from VT types, the second operand is a vector of constants or can
-// be recursively promoted.
+// truncates from VT types, the second operand is a vector of constants, can
+// be recursively promoted or is an existing extension we can extend further.
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
- SelectionDAG &DAG, unsigned Depth) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ unsigned Depth) {
// Limit recursion to avoid excessive compile times.
if (Depth >= SelectionDAG::MaxRecursionDepth)
return SDValue();
@@ -50904,28 +50956,32 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
return SDValue();
- if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
+ if (SDValue NN0 =
+ PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
N0 = NN0;
else {
- // The left side has to be a trunc.
- if (N0.getOpcode() != ISD::TRUNCATE)
- return SDValue();
-
- // The type of the truncated inputs.
- if (N0.getOperand(0).getValueType() != VT)
+ // The left side has to be a 'trunc'.
+ bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(0).getValueType() == VT;
+ if (LHSTrunc)
+ N0 = N0.getOperand(0);
+ else
return SDValue();
-
- N0 = N0.getOperand(0);
}
- if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
+ if (SDValue NN1 =
+ PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
N1 = NN1;
else {
- // The right side has to be a 'trunc' or a (foldable) constant.
+ // The right side has to be a 'trunc', a (foldable) constant or an
+ // existing extension we can extend further.
bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getValueType() == VT;
if (RHSTrunc)
N1 = N1.getOperand(0);
+ else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
+ Subtarget.hasInt256() && N1.hasOneUse())
+ N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
else if (SDValue Cst =
DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
N1 = Cst;
@@ -50955,7 +51011,7 @@ static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
EVT NarrowVT = Narrow.getValueType();
// Generate the wide operation.
- SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
+ SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
if (!Op)
return SDValue();
switch (N.getOpcode()) {
@@ -51804,6 +51860,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
SDValue X, Y;
EVT CondVT = VT.changeVectorElementType(MVT::i1);
if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+ (VT.is512BitVector() || Subtarget.hasVLX()) &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
sd_match(N, m_And(m_Value(X),
m_OneUse(m_SExt(m_AllOf(
m_Value(Y), m_SpecificVT(CondVT),
@@ -54135,10 +54193,10 @@ static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
static SDValue combinei64TruncSrlConstant(SDValue N, EVT VT, SelectionDAG &DAG,
const SDLoc &DL) {
assert(N.getOpcode() == ISD::SRL && "Unknown shift opcode");
- std::optional<uint64_t> ValidSrlConst = DAG.getValidShiftAmount(N);
+ std::optional<unsigned> ValidSrlConst = DAG.getValidShiftAmount(N);
if (!ValidSrlConst)
return SDValue();
- uint64_t SrlConstVal = *ValidSrlConst;
+ unsigned SrlConstVal = *ValidSrlConst;
SDValue Op = N.getOperand(0);
unsigned Opcode = Op.getOpcode();
@@ -55368,6 +55426,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue Src = N0.getOperand(0);
EVT SrcVT = Src.getValueType();
if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
+ (VT.is512BitVector() || Subtarget.hasVLX()) &&
+ (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
getZeroVector(VT, Subtarget, DAG, DL));
@@ -56247,7 +56307,13 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
SDValue Masked = BroadcastOp;
if (N != 0) {
- APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len);
+ unsigned BroadcastOpBitWidth = BroadcastOpVT.getSizeInBits();
+ unsigned NumDefinedElts = UndefElts.countTrailingZeros();
+
+ if (NumDefinedElts > BroadcastOpBitWidth)
+ return SDValue();
+
+ APInt Mask = APInt::getLowBitsSet(BroadcastOpBitWidth, NumDefinedElts);
SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
DAG.getConstant(N, DL, BroadcastOpVT));
Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
@@ -57904,6 +57970,51 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
Cmov.getOperand(3));
}
+// Attempt to turn ADD(MUL(x, y), acc)) -> VPMADD52L
+// When upper 12 bits of x, y and MUL(x, y) are known to be 0
+static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL,
+ EVT VT, const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+ if (!VT.isVector() || VT.getScalarSizeInBits() != 64 ||
+ (!Subtarget.hasAVXIFMA() && !Subtarget.hasIFMA()))
+ return SDValue();
+
+ // Need AVX-512VL vector length extensions if operating on XMM/YMM registers
+ if (!Subtarget.hasAVXIFMA() && !Subtarget.hasVLX() &&
+ VT.getSizeInBits() < 512)
+ return SDValue();
+
+ const auto TotalSize = VT.getSizeInBits();
+ if (TotalSize < 128 || !isPowerOf2_64(TotalSize))
+ return SDValue();
+
+ SDValue X, Y, Acc;
+ if (!sd_match(N, m_Add(m_Mul(m_Value(X), m_Value(Y)), m_Value(Acc))))
+ return SDValue();
+
+ KnownBits KnownX = DAG.computeKnownBits(X);
+ if (KnownX.countMinLeadingZeros() < 12)
+ return SDValue();
+ KnownBits KnownY = DAG.computeKnownBits(Y);
+ if (KnownY.countMinLeadingZeros() < 12)
+ return SDValue();
+ KnownBits KnownMul = KnownBits::mul(KnownX, KnownY);
+ if (KnownMul.countMinLeadingZeros() < 12)
+ return SDValue();
+
+ auto VPMADD52Builder = [](SelectionDAG &G, SDLoc DL,
+ ArrayRef<SDValue> SubOps) {
+ EVT SubVT = SubOps[0].getValueType();
+ assert(SubVT.getScalarSizeInBits() == 64 &&
+ "Unexpected element size, only supports 64bit size");
+ return G.getNode(X86ISD::VPMADD52L, DL, SubVT, SubOps[1] /*X*/,
+ SubOps[2] /*Y*/, SubOps[0] /*Acc*/);
+ };
+
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder,
+ /*CheckBWI*/ false);
+}
+
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -58007,6 +58118,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
Op0.getOperand(0), Op0.getOperand(2));
}
+ if (SDValue IFMA52 = matchVPMADD52(N, DAG, DL, VT, Subtarget))
+ return IFMA52;
+
return combineAddOrSubToADCOrSBB(N, DL, DAG);
}
@@ -60068,6 +60182,19 @@ static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Simplify VPMADD52L/VPMADD52H operations.
+static SDValue combineVPMADD52LH(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
+ DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -60705,6 +60832,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::VPMADDUBSW:
case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
+ case X86ISD::VPMADD52L:
+ case X86ISD::VPMADD52H: return combineVPMADD52LH(N, DAG, DCI);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
@@ -60932,117 +61061,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
-// Helper to match a string separated by whitespace.
-static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
- S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
-
- for (StringRef Piece : Pieces) {
- if (!S.starts_with(Piece)) // Check if the piece matches.
- return false;
-
- S = S.substr(Piece.size());
- StringRef::size_type Pos = S.find_first_not_of(" \t");
- if (Pos == 0) // We matched a prefix.
- return false;
-
- S = S.substr(Pos);
- }
-
- return S.empty();
-}
-
-static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
-
- if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
- if (llvm::is_contained(AsmPieces, "~{cc}") &&
- llvm::is_contained(AsmPieces, "~{flags}") &&
- llvm::is_contained(AsmPieces, "~{fpsr}")) {
-
- if (AsmPieces.size() == 3)
- return true;
- else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
- return true;
- }
- }
- return false;
-}
-
-bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
-
- StringRef AsmStr = IA->getAsmString();
-
- IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
- if (!Ty || Ty->getBitWidth() % 16 != 0)
- return false;
-
- // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
- SmallVector<StringRef, 4> AsmPieces;
- SplitString(AsmStr, AsmPieces, ";\n");
-
- switch (AsmPieces.size()) {
- default: return false;
- case 1:
- // FIXME: this should verify that we are targeting a 486 or better. If not,
- // we will turn this bswap into something that will be lowered to logical
- // ops instead of emitting the bswap asm. For now, we don't support 486 or
- // lower so don't worry about this.
- // bswap $0
- if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
- matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
- matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
- matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
- matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
- matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
- // No need to check constraints, nothing other than the equivalent of
- // "=r,0" would be valid here.
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
-
- // rorw $$8, ${0:w} --> llvm.bswap.i16
- if (CI->getType()->isIntegerTy(16) &&
- IA->getConstraintString().starts_with("=r,0,") &&
- (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
- matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
- AsmPieces.clear();
- StringRef ConstraintsStr = IA->getConstraintString();
- SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (clobbersFlagRegisters(AsmPieces))
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- break;
- case 3:
- if (CI->getType()->isIntegerTy(32) &&
- IA->getConstraintString().starts_with("=r,0,") &&
- matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
- matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
- matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
- AsmPieces.clear();
- StringRef ConstraintsStr = IA->getConstraintString();
- SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- array_pod_sort(AsmPieces.begin(), AsmPieces.end());
- if (clobbersFlagRegisters(AsmPieces))
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
-
- if (CI->getType()->isIntegerTy(64)) {
- InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
- if (Constraints.size() >= 2 &&
- Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
- Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
- // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
- if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
- matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
- matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
- return IntrinsicLowering::LowerToByteSwap(CI);
- }
- }
- break;
- }
- return false;
-}
-
static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
.Case("{@cca}", X86::COND_A)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 97d3b6e2420d..0c9ba591b03e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1364,8 +1364,6 @@ namespace llvm {
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
- bool ExpandInlineAsm(CallInst *CI) const override;
-
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
@@ -1668,8 +1666,8 @@ namespace llvm {
/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
- ShuffleVectorInst *SVI,
- unsigned Factor) const override;
+ ShuffleVectorInst *SVI, unsigned Factor,
+ const APInt &GapMask) const override;
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr,
int JTI, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 1c745a338a61..3bc46af4d130 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -302,7 +302,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Op.size() >= 16 &&
(!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
- if (Op.size() >= 64 && Subtarget.hasAVX512() && Subtarget.hasEVEX512() &&
+ if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
@@ -416,7 +416,7 @@ bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
return true;
return false;
case 512:
- if (Subtarget.hasAVX512() && Subtarget.hasEVEX512())
+ if (Subtarget.hasAVX512())
return true;
return false;
default:
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 1beaaafb159e..69a5115201ef 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -550,7 +550,7 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
} // HasAMXMOVRS, In64BitMode
multiclass m_tcvtrowd2ps {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+ let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, i32u8imm:$src2),
@@ -561,12 +561,12 @@ multiclass m_tcvtrowd2ps {
"tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,XS, EVEX, VVVV, EVEX_V512;
}
- } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+ } // HasAMXAVX512, HasAVX10_2, In64BitMode
}
defm TCVTROWD2PS : m_tcvtrowd2ps;
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
@@ -630,7 +630,7 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr,
Prefix P1, Prefix P2> {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in {
+ let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode], SchedRW = [WriteSystem] in {
let OpPrefix = P1 in
def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst),
(ins TILE:$src1, GR32:$src2),
@@ -658,7 +658,7 @@ defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>;
defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>;
multiclass m_tilemovrow {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+ let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, u8imm:$src2),
@@ -669,12 +669,12 @@ multiclass m_tilemovrow {
"tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,PD, EVEX, VVVV, EVEX_V512;
}
- } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+ } // HasAMXAVX512, HasAVX10_2, In64BitMode
}
defm TILEMOVROW : m_tilemovrow;
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, HasAVX10_2, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
diff --git a/llvm/lib/Target/X86/X86InstrAVX10.td b/llvm/lib/Target/X86/X86InstrAVX10.td
index 2d2bf1f6c725..764ff998bb56 100644
--- a/llvm/lib/Target/X86/X86InstrAVX10.td
+++ b/llvm/lib/Target/X86/X86InstrAVX10.td
@@ -15,36 +15,36 @@
// VNNI FP16
let ExeDomain = SSEPackedSingle in
defm VDPPHPS : avx512_dpf16ps_sizes<0x52, "vdpphps", X86dpfp16ps, avx512vl_f16_info,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T8, PS, EVEX_CD8<32, CD8VF>;
// VNNI INT8
defm VPDPBSSD : VNNI_common<0x50, "vpdpbssd", X86vpdpbssd, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, XD;
+ [HasAVX10_2], [HasAVX10_2]>, XD;
defm VPDPBSSDS : VNNI_common<0x51, "vpdpbssds", X86vpdpbssds, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, XD;
+ [HasAVX10_2], [HasAVX10_2]>, XD;
defm VPDPBSUD : VNNI_common<0x50, "vpdpbsud", X86vpdpbsud, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPBSUDS : VNNI_common<0x51, "vpdpbsuds", X86vpdpbsuds, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPBUUD : VNNI_common<0x50, "vpdpbuud", X86vpdpbuud, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
defm VPDPBUUDS : VNNI_common<0x51, "vpdpbuuds", X86vpdpbuuds, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
// VNNI INT16
defm VPDPWSUD : VNNI_common<0xd2, "vpdpwsud", X86vpdpwsud, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPWSUDS : VNNI_common<0xd3, "vpdpwsuds", X86vpdpwsuds, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, XS;
+ [HasAVX10_2], [HasAVX10_2]>, XS;
defm VPDPWUSD : VNNI_common<0xd2, "vpdpwusd", X86vpdpwusd, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, PD;
+ [HasAVX10_2], [HasAVX10_2]>, PD;
defm VPDPWUSDS : VNNI_common<0xd3, "vpdpwusds", X86vpdpwusds, SchedWriteVecIMul, 0,
- [HasAVX10_2], [HasAVX10_2_512]>, PD;
+ [HasAVX10_2], [HasAVX10_2]>, PD;
defm VPDPWUUD : VNNI_common<0xd2, "vpdpwuud", X86vpdpwuud, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
defm VPDPWUUDS : VNNI_common<0xd3, "vpdpwuuds", X86vpdpwuuds, SchedWriteVecIMul, 1,
- [HasAVX10_2], [HasAVX10_2_512]>, PS;
+ [HasAVX10_2], [HasAVX10_2]>, PS;
// VMPSADBW
defm VMPSADBW : avx512_common_3Op_rm_imm8<0x42, X86Vmpsadbw, "vmpsadbw", SchedWritePSADBW,
@@ -94,9 +94,8 @@ multiclass avx10_minmax_packed_sae<string OpStr, AVX512VLVectorVTInfo VTI, SDNod
}
multiclass avx10_minmax_packed<string OpStr, AVX512VLVectorVTInfo VTI, SDNode OpNode> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx10_minmax_packed_base<OpStr, VTI.info512, OpNode>, EVEX_V512;
defm Z256 : avx10_minmax_packed_base<OpStr, VTI.info256, OpNode>, EVEX_V256;
defm Z128 : avx10_minmax_packed_base<OpStr, VTI.info128, OpNode>, EVEX_V128;
}
@@ -201,7 +200,7 @@ multiclass avx10_sat_cvt_rmb<bits<8> Opc, string OpStr, X86FoldableSchedWrite sc
multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
SDNode MaskNode> {
- let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+ let Predicates = [HasAVX10_2], Uses = [MXCSR] in
defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
(outs DestInfo.info512.RC:$dst),
(ins SrcInfo.info512.RC:$src, AVX512RC:$rc),
@@ -216,7 +215,7 @@ multiclass avx10_sat_cvt_rc<bits<8> Opc, string OpStr, X86SchedWriteWidths sched
multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo,
SDNode Node> {
- let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in
+ let Predicates = [HasAVX10_2], Uses = [MXCSR] in
defm Zrrb : AVX512_maskable<Opc, MRMSrcReg, DestInfo.info512,
(outs DestInfo.info512.RC:$dst),
(ins SrcInfo.info512.RC:$src),
@@ -229,12 +228,11 @@ multiclass avx10_sat_cvt_sae<bits<8> Opc, string OpStr, X86SchedWriteWidths sche
multiclass avx10_sat_cvt_base<bits<8> Opc, string OpStr, X86SchedWriteWidths sched,
SDNode MaskNode, AVX512VLVectorVTInfo DestInfo,
AVX512VLVectorVTInfo SrcInfo> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
- DestInfo.info512, SrcInfo.info512,
- MaskNode>,
- EVEX, EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx10_sat_cvt_rmb<Opc, OpStr, sched.ZMM,
+ DestInfo.info512, SrcInfo.info512,
+ MaskNode>,
+ EVEX, EVEX_V512;
defm Z256
: avx10_sat_cvt_rmb<Opc, OpStr, sched.YMM,
DestInfo.info256, SrcInfo.info256,
@@ -334,13 +332,11 @@ defm VCVTTPS2IUBS : avx10_sat_cvt_base<0x6a, "vcvttps2iubs", SchedWriteVecIMul,
multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeSAE,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
- }
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
f128mem, VK2WM>, EVEX_V128;
@@ -410,13 +406,11 @@ multiclass avx10_cvttpd2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
- }
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
@@ -432,13 +426,11 @@ multiclass avx10_cvttpd2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode, SDNode OpNodeRnd,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
- }
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
(v2i64 (OpNode (bc_v4f32 (v2f64
@@ -460,14 +452,11 @@ multiclass avx10_cvttps2qqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx10_cvttps2dqs<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
SDNode MaskOpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
- }
-
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
@@ -719,7 +708,7 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _SrcVTInfo,
AVX512VLVectorVTInfo _DstVTInfo,
SDNode OpNode, SDNode OpNodeRnd> {
- let Predicates = [HasAVX10_2_512], Uses = [MXCSR] in {
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_binop_rm2<opc, OpcodeStr, sched.ZMM, OpNode,
_SrcVTInfo.info512, _DstVTInfo.info512,
_SrcVTInfo.info512>,
@@ -727,8 +716,6 @@ multiclass avx10_cvt2ps2ph<bits<8> opc, string OpcodeStr,
_SrcVTInfo.info512, _DstVTInfo.info512,
OpNodeRnd>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- }
- let Predicates = [HasAVX10_2] in {
defm Z256 : avx512_binop_rm2<opc, OpcodeStr, sched.YMM, OpNode,
_SrcVTInfo.info256, _DstVTInfo.info256,
_SrcVTInfo.info256>,
@@ -747,19 +734,19 @@ defm VCVT2PS2PHX : avx10_cvt2ps2ph<0x67, "vcvt2ps2phx",
defm VCVT2PH2BF8 : avx512_binop_all<0x74, "vcvt2ph2bf8", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2bf8, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2bf8, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T8, XD;
defm VCVT2PH2BF8S : avx512_binop_all<0x74, "vcvt2ph2bf8s", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2bf8s, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2bf8s, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T_MAP5, XD;
defm VCVT2PH2HF8 : avx512_binop_all<0x18, "vcvt2ph2hf8", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2hf8, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2hf8, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T_MAP5, XD;
defm VCVT2PH2HF8S : avx512_binop_all<0x1b, "vcvt2ph2hf8s", SchedWriteCvtPD2PS,
avx512vl_f16_info, avx512vl_i8_info,
- X86vcvt2ph2hf8s, [HasAVX10_2_512], [HasAVX10_2]>,
+ X86vcvt2ph2hf8s, [HasAVX10_2], [HasAVX10_2]>,
EVEX_CD8<16, CD8VF>, T_MAP5, XD;
//TODO: Merge into avx512_vcvt_fp, diffrence is one more source register here.
@@ -836,11 +823,10 @@ multiclass avx10_convert_3op<bits<8> OpCode, string OpcodeStr,
PatFrag bcast128 = vt_src.info128.BroadcastLdFrag,
PatFrag loadVT128 = vt_src.info128.LdFrag,
RegisterClass maskRC128 = vt_src.info128.KRCWM> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info256,
vt_dst.info512, vt_src.info512, OpNode, OpNode, sched.ZMM>,
EVEX_V512, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z256 : avx10_convert_3op_packed<OpCode, OpcodeStr, vt_dst.info128,
vt_dst.info256, vt_src.info256, OpNode, OpNode, sched.YMM>,
EVEX_V256, EVEX_CD8<16, CD8VF>;
@@ -920,25 +906,25 @@ defm VCVTBIASPH2HF8S : avx10_convert_3op<0x1b, "vcvtbiasph2hf8s",
defm VCVTPH2BF8 : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2bf8, X86vmcvtph2bf8,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T8, XS, EVEX_CD8<16, CD8VF>;
defm VCVTPH2BF8S : avx512_cvt_trunc_ne<0x74, "vcvtph2bf8s", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2bf8s, X86vmcvtph2bf8s,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T_MAP5, XS, EVEX_CD8<16, CD8VF>;
defm VCVTPH2HF8 : avx512_cvt_trunc_ne<0x18, "vcvtph2hf8", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2hf8, X86vmcvtph2hf8,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T_MAP5, XS, EVEX_CD8<16, CD8VF>;
defm VCVTPH2HF8S : avx512_cvt_trunc_ne<0x1b, "vcvtph2hf8s", avx512vl_i8_info,
avx512vl_f16_info, SchedWriteCvtPD2PS,
X86vcvtph2hf8s, X86vmcvtph2hf8s,
- [HasAVX10_2], [HasAVX10_2_512]>,
+ [HasAVX10_2], [HasAVX10_2]>,
T_MAP5, XS, EVEX_CD8<16, CD8VF>;
multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr,
@@ -962,10 +948,9 @@ multiclass avx10_convert_2op_nomb_packed<bits<8> opc, string OpcodeStr,
multiclass avx10_convert_2op_nomb<string OpcodeStr, AVX512VLVectorVTInfo _dest,
AVX512VLVectorVTInfo _src, bits<8> opc, SDNode OpNode> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info512, _src.info256,
OpNode, f256mem, WriteCvtPH2PSZ>, EVEX_V512;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info128, _src.info128,
OpNode, f64mem, WriteCvtPH2PSZ>, EVEX_V128;
defm Z256 : avx10_convert_2op_nomb_packed<opc, OpcodeStr, _dest.info256, _src.info128,
@@ -985,13 +970,12 @@ defm VCVTHF82PH : avx10_convert_2op_nomb<"vcvthf82ph", avx512vl_f16_info,
multiclass avx10_fp_binop_int_bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched,
bit IsCommutable = 0> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fp_packed<opc, OpcodeStr,
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"),
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16512"),
v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512,
T_MAP5, PD, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fp_packed<opc, OpcodeStr,
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"),
!cast<Intrinsic>("int_x86_avx10_"#OpcodeStr#"bf16128"),
@@ -1009,11 +993,10 @@ multiclass avx10_fp_binop_bf16<bits<8> opc, string OpcodeStr, SDPatternOperator
X86SchedWriteSizes sched,
bit IsCommutable = 0,
SDPatternOperator MaskOpNode = OpNode> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode,
v32bf16_info, sched.PH.ZMM, IsCommutable>, EVEX_V512,
T_MAP5, PD, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode,
v8bf16x_info, sched.PH.XMM, IsCommutable>, EVEX_V128,
T_MAP5, PD, EVEX_CD8<16, CD8VF>;
@@ -1086,9 +1069,8 @@ multiclass avx10_vcmp_common_bf16<X86FoldableSchedWrite sched, X86VectorVTInfo _
}
multiclass avx10_vcmp_bf16<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx10_vcmp_common_bf16<sched.ZMM, _.info512>, EVEX_V512;
defm Z128 : avx10_vcmp_common_bf16<sched.XMM, _.info128>, EVEX_V128;
defm Z256 : avx10_vcmp_common_bf16<sched.YMM, _.info256>, EVEX_V256;
}
@@ -1102,11 +1084,10 @@ defm VCMPBF16 : avx10_vcmp_bf16<SchedWriteFCmp, avx512vl_bf16_info>,
// VSQRTBF16
multiclass avx10_sqrt_packed_bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
- let Predicates = [HasAVX10_2_512] in
- defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
- sched.PH.ZMM, v32bf16_info>,
- EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
let Predicates = [HasAVX10_2] in {
+ defm Z : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
+ sched.PH.ZMM, v32bf16_info>,
+ EVEX_V512, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
defm Z128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "bf16"),
sched.PH.XMM, v8bf16x_info>,
EVEX_V128, PD, T_MAP5, EVEX_CD8<16, CD8VF>;
@@ -1122,11 +1103,10 @@ defm VSQRTBF16 : avx10_sqrt_packed_bf16<0x51, "vsqrt", SchedWriteFSqrtSizes>;
// VRSQRTBF16, VRCPBF16, VSRQTBF16, VGETEXPBF16
multiclass avx10_fp14_bf16<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
- defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
- OpNode, sched.ZMM, v32bf16_info>,
- EVEX_V512;
let Predicates = [HasAVX10_2] in {
+ defm BF16Z : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
+ OpNode, sched.ZMM, v32bf16_info>,
+ EVEX_V512;
defm BF16Z128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "bf16"),
OpNode, sched.XMM, v8bf16x_info>,
EVEX_V128;
@@ -1146,10 +1126,9 @@ defm VGETEXP : avx10_fp14_bf16<0x42, "vgetexp", X86fgetexp, SchedWriteFRnd>,
// VSCALEFBF16
multiclass avx10_fp_scalef_bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32bf16_info>,
EVEX_V512, T_MAP6, PS, EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8bf16x_info>,
EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PS;
defm Z256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16bf16x_info>,
@@ -1164,10 +1143,9 @@ defm VSCALEFBF16 : avx10_fp_scalef_bf16<0x2C, "vscalef", SchedWriteFAdd>;
multiclass avx10_common_unary_fp_packed_imm_bf16<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDPatternOperator OpNode,
SDPatternOperator MaskOpNode, X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, _.info512>, EVEX_V512;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, _.info128>, EVEX_V128;
defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
@@ -1190,11 +1168,10 @@ defm VGETMANTBF16 : avx10_common_unary_fp_packed_imm_bf16<"vgetmant", avx512vl_b
// VFPCLASSBF16
multiclass avx10_fp_fpclass_bf16<string OpcodeStr, bits<8> opcVec,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_vector_fpclass<opcVec, OpcodeStr, sched.ZMM,
avx512vl_bf16_info.info512, "z",
[]<Register>>, EVEX_V512;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_vector_fpclass<opcVec, OpcodeStr, sched.XMM,
avx512vl_bf16_info.info128, "x",
[]<Register>>, EVEX_V128;
@@ -1211,11 +1188,10 @@ defm VFPCLASSBF16 : avx10_fp_fpclass_bf16<"vfpclass", 0x66, SchedWriteFCmp>,
multiclass avx10_fma3p_213_bf16<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDNode MaskOpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
@@ -1239,11 +1215,10 @@ defm VFNMSUB213BF16 : avx10_fma3p_213_bf16<0xAE, "vfnmsub213bf16", X86any_Fnmsub
multiclass avx10_fma3p_231_bf16<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDNode MaskOpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
@@ -1267,11 +1242,10 @@ defm VFNMSUB231BF16 : avx10_fma3p_231_bf16<0xBE, "vfnmsub231bf16", X86any_Fnmsub
multiclass avx10_fma3p_132_bf16<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, SDNode MaskOpNode,
X86SchedWriteWidths sched> {
- let Predicates = [HasAVX10_2_512] in
+ let Predicates = [HasAVX10_2] in {
defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.ZMM, v32bf16_info>, EVEX_V512, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
- let Predicates = [HasAVX10_2] in {
defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
sched.XMM, v8bf16x_info>, EVEX_V128, T_MAP6, PS,
EVEX_CD8<16, CD8VF>;
@@ -1440,9 +1414,8 @@ multiclass vmovrs_p<bits<8> opc, string OpStr, X86VectorVTInfo _> {
}
multiclass vmovrs_p_vl<bits<8> opc, string OpStr, AVX512VLVectorVTInfo _Vec> {
- let Predicates = [HasMOVRS, HasAVX10_2_512, In64BitMode] in
- defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512;
let Predicates = [HasMOVRS, HasAVX10_2, In64BitMode] in {
+ defm Z : vmovrs_p<opc, OpStr, _Vec.info512>, EVEX_V512;
defm Z128 : vmovrs_p<opc, OpStr, _Vec.info128>, EVEX_V128;
defm Z256 : vmovrs_p<opc, OpStr, _Vec.info256>, EVEX_V256;
}
@@ -1464,7 +1437,7 @@ multiclass avx10_sm4_base<string OpStr> {
defm Z128 : SM4_Base<OpStr, VR128X, "128", loadv4i32, i128mem>, EVEX_V128;
defm Z256 : SM4_Base<OpStr, VR256X, "256", loadv8i32, i256mem>, EVEX_V256;
}
- let Predicates = [HasSM4, HasAVX10_2_512] in
+ let Predicates = [HasSM4, HasAVX10_2] in
defm Z : SM4_Base<OpStr, VR512, "512", loadv16i32, i512mem>, EVEX_V512;
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 0ab94cca4142..3401f6f04800 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -834,7 +834,7 @@ defm : vextract_for_size_lowering<"VEXTRACTF64X4Z", v32bf16_info, v16bf16x_info,
// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
// smaller extract to enable EVEX->VEX.
-let Predicates = [NoVLX, HasEVEX512] in {
+let Predicates = [NoVLX] in {
def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
(v2i64 (VEXTRACTI128rri
(v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
@@ -3088,7 +3088,7 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, (X86cmpm_imm_commute timm:$cc)), Narrow.KRC)>;
}
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPD", v8i32x_info, v16i32_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUD", v8i32x_info, v16i32_info>;
@@ -3119,7 +3119,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : axv512_cmp_packed_cc_no_vlx_lowering<"VCMPPD", v2f64x_info, v8f64_info>;
}
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpm, X86pcmpm_su, "VPCMPB", v32i8x_info, v64i8_info>;
defm : axv512_icmp_packed_cc_no_vlx_lowering<X86pcmpum, X86pcmpum_su, "VPCMPUB", v32i8x_info, v64i8_info>;
@@ -3513,7 +3513,7 @@ multiclass mask_move_lowering<string InstrStr, X86VectorVTInfo Narrow,
// Patterns for handling v8i1 selects of 256-bit vectors when VLX isn't
// available. Use a 512-bit operation and extract.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : mask_move_lowering<"VMOVAPSZ", v4f32x_info, v16f32_info>;
defm : mask_move_lowering<"VMOVDQA32Z", v4i32x_info, v16i32_info>;
defm : mask_move_lowering<"VMOVAPSZ", v8f32x_info, v16f32_info>;
@@ -3525,7 +3525,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
defm : mask_move_lowering<"VMOVDQA64Z", v4i64x_info, v8i64_info>;
}
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
defm : mask_move_lowering<"VMOVDQU8Z", v16i8x_info, v64i8_info>;
defm : mask_move_lowering<"VMOVDQU8Z", v32i8x_info, v64i8_info>;
@@ -5021,8 +5021,8 @@ defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin,
defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin,
SchedWriteVecALU, HasAVX512, 1>, T8;
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX, HasEVEX512.
-let Predicates = [HasDQI, NoVLX, HasEVEX512] in {
+// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
+let Predicates = [HasDQI, NoVLX] in {
def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG
(VPMULLQZrr
@@ -5078,7 +5078,7 @@ multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
sub_xmm)>;
}
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
@@ -6055,7 +6055,7 @@ defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl,
SchedWriteVecShift>;
// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZrr
@@ -6184,14 +6184,14 @@ defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", X86vsrlv, SchedWriteVarVecS
defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SchedWriteVarVecShift>;
defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SchedWriteVarVecShift>;
-defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX, HasEVEX512]>;
-defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX, HasEVEX512]>;
+defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", X86vsrav, [HasAVX512, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", X86vshlv, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRAVW", X86vsrav, [HasBWI, NoVLX]>;
+defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSRLVW", X86vsrlv, [HasBWI, NoVLX]>;
// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
@@ -6242,7 +6242,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
}
// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
@@ -9933,7 +9933,7 @@ defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
truncstore_us_vi8, masked_truncstore_us_vi8,
X86vtruncus, X86vmtruncus>;
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def: Pat<(v8i16 (trunc (v8i32 VR256X:$src))),
(v8i16 (EXTRACT_SUBREG
(v16i16 (VPMOVDWZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -9944,7 +9944,7 @@ def: Pat<(v4i32 (trunc (v4i64 VR256X:$src))),
VR256X:$src, sub_ymm)))), sub_xmm))>;
}
-let Predicates = [HasBWI, NoVLX, HasEVEX512] in {
+let Predicates = [HasBWI, NoVLX] in {
def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
(v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src, sub_ymm))), sub_xmm))>;
@@ -10487,7 +10487,7 @@ multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr,
defm Z128 : convert_vector_to_mask_common<opc, VTInfo.info128, OpcodeStr>,
EVEX_V128;
}
- let Predicates = [prd, NoVLX, HasEVEX512] in {
+ let Predicates = [prd, NoVLX] in {
defm Z256_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info256, NAME>;
defm Z128_Alt : convert_vector_to_mask_lowering<VTInfo.info512, VTInfo.info128, NAME>;
}
@@ -11283,7 +11283,7 @@ defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs,
SchedWriteVecALU>;
// VPABS: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v4i64 (abs VR256X:$src)),
(EXTRACT_SUBREG
(VPABSQZrr
@@ -11299,7 +11299,7 @@ let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
// Use 512bit version to implement 128/256 bit.
multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
AVX512VLVectorVTInfo _, Predicate prd> {
- let Predicates = [prd, NoVLX, HasEVEX512] in {
+ let Predicates = [prd, NoVLX] in {
def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1))),
(EXTRACT_SUBREG
(!cast<Instruction>(InstrStr # "Zrr")
@@ -11918,7 +11918,7 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrri VR512:$src, VR512:$src, VR512:$src, (i8 15))>;
}
-let Predicates = [HasAVX512, NoVLX, HasEVEX512] in {
+let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v16i8 (vnot VR128X:$src)),
(EXTRACT_SUBREG
(VPTERNLOGQZrri
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 927b2c8b22f0..5a0df058b27f 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1326,7 +1326,11 @@ def : Pat<(X86imp_call (i64 tglobaladdr:$dst)),
// Match an X86tcret that uses less than 7 volatile registers.
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
+ Requires<[Not64BitMode, IsNotHiPECCFunc, NotUseIndirectThunkCalls]>;
+
+def : Pat<(X86tcret GR32:$dst, timm:$off),
+ (TCRETURN_HIPE32ri GR32:$dst, timm:$off)>,
+ Requires<[Not64BitMode, IsHiPECCFunc, NotUseIndirectThunkCalls]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
@@ -1346,7 +1350,11 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), timm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, timm:$off)>,
- Requires<[In64BitMode, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+ Requires<[In64BitMode, IsNotWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
+
+def : Pat<(X86tcret GR64_TCW64:$dst, timm:$off),
+ (TCRETURN_WIN64ri GR64_TCW64:$dst, timm:$off)>,
+ Requires<[IsWin64CCFunc, NotUseIndirectThunkCalls, ImportCallOptimizationDisabled]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, timm:$off),
(TCRETURNri64_ImpCall ptr_rc_tailcall:$dst, timm:$off)>,
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index 22253bf0413a..139aedd473eb 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -282,6 +282,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
[]>, Sched<[WriteJump]>;
def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
+
+ def TCRETURN_HIPE32ri : PseudoI<(outs), (ins GR32:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
+
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>;
@@ -357,6 +361,9 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURNri64 : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
+ def TCRETURN_WIN64ri : PseudoI<(outs), (ins GR64_TCW64:$dst, i32imm:$offset),
+ []>, Sched<[WriteJump]>;
+
def TCRETURNri64_ImpCall : PseudoI<(outs),
(ins GR64_A:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index abf365eedec3..a68edf4d2b7e 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -83,8 +83,9 @@ static cl::opt<unsigned> UndefRegClearance(
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
-X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
- : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
+X86InstrInfo::X86InstrInfo(const X86Subtarget &STI)
+ : X86GenInstrInfo(STI,
+ (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
: X86::ADJCALLSTACKDOWN32),
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32),
@@ -4399,13 +4400,8 @@ static unsigned getLoadStoreOpcodeForFP16(bool Load, const X86Subtarget &STI) {
if (STI.hasFP16())
return Load ? X86::VMOVSHZrm_alt : X86::VMOVSHZmr;
if (Load)
- return STI.hasAVX512() ? X86::VMOVSSZrm
- : STI.hasAVX() ? X86::VMOVSSrm
- : X86::MOVSSrm;
- else
- return STI.hasAVX512() ? X86::VMOVSSZmr
- : STI.hasAVX() ? X86::VMOVSSmr
- : X86::MOVSSmr;
+ return X86::MOVSHPrm;
+ return X86::MOVSHPmr;
}
static unsigned getLoadStoreRegOpcode(Register Reg,
@@ -4903,6 +4899,16 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
CmpMask = ~0;
CmpValue = 0;
return true;
+ case X86::TEST64ri32:
+ case X86::TEST32ri:
+ case X86::TEST16ri:
+ case X86::TEST8ri:
+ SrcReg = MI.getOperand(0).getReg();
+ SrcReg2 = 0;
+ // Force identical compare.
+ CmpMask = 0;
+ CmpValue = 0;
+ return true;
}
return false;
}
@@ -4942,6 +4948,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
case X86::CMP32ri:
case X86::CMP16ri:
case X86::CMP8ri:
+ case X86::TEST64ri32:
+ case X86::TEST32ri:
+ case X86::TEST16ri:
+ case X86::TEST8ri:
CASE_ND(SUB64ri32)
CASE_ND(SUB32ri)
CASE_ND(SUB16ri)
@@ -6131,6 +6141,25 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
return true;
}
+static bool expandMOVSHP(MachineInstrBuilder &MIB, MachineInstr &MI,
+ const TargetInstrInfo &TII, bool HasAVX) {
+ unsigned NewOpc;
+ if (MI.getOpcode() == X86::MOVSHPrm) {
+ NewOpc = HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
+ Register Reg = MI.getOperand(0).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZrm;
+ } else {
+ NewOpc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr;
+ Register Reg = MI.getOperand(5).getReg();
+ if (Reg > X86::XMM15)
+ NewOpc = X86::VMOVSSZmr;
+ }
+
+ MIB->setDesc(TII.get(NewOpc));
+ return true;
+}
+
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
@@ -6203,6 +6232,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
+ case X86::MOVSHPmr:
+ case X86::MOVSHPrm:
+ return expandMOVSHP(MIB, MI, *this, Subtarget.hasAVX());
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB,
get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 9dc5f4b0e086..f087b7f20ff6 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -222,7 +222,7 @@ inline static bool isMemInstrWithGOTPCREL(const MachineInstr &MI) {
}
class X86InstrInfo final : public X86GenInstrInfo {
- X86Subtarget &Subtarget;
+ const X86Subtarget &Subtarget;
const X86RegisterInfo RI;
LLVM_DECLARE_VIRTUAL_ANCHOR_FUNCTION();
@@ -238,7 +238,7 @@ class X86InstrInfo final : public X86GenInstrInfo {
bool MakeChange) const;
public:
- explicit X86InstrInfo(X86Subtarget &STI);
+ explicit X86InstrInfo(const X86Subtarget &STI);
/// Given a machine instruction descriptor, returns the register
/// class constraint for OpNum, or NULL. Returned register class
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index df1541e9085b..8339c2081842 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -69,11 +69,8 @@ def NoAVX : Predicate<"!Subtarget->hasAVX()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
-def HasEVEX512 : Predicate<"Subtarget->hasEVEX512()">;
def HasAVX10_1 : Predicate<"Subtarget->hasAVX10_1()">;
-def HasAVX10_1_512 : Predicate<"Subtarget->hasAVX10_1_512()">;
def HasAVX10_2 : Predicate<"Subtarget->hasAVX10_2()">;
-def HasAVX10_2_512 : Predicate<"Subtarget->hasAVX10_2_512()">;
def NoAVX10_2 : Predicate<"!Subtarget->hasAVX10_2()">;
def HasAVX512 : Predicate<"Subtarget->hasAVX512()">;
def UseAVX : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
@@ -233,6 +230,13 @@ let RecomputePerFunction = 1 in {
"!Subtarget->hasSSE41()">;
def ImportCallOptimizationEnabled : Predicate<"MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
def ImportCallOptimizationDisabled : Predicate<"!MF->getFunction().getParent()->getModuleFlag(\"import-call-optimization\")">;
+
+ def IsWin64CCFunc : Predicate<"Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
+ def IsNotWin64CCFunc : Predicate<"!Subtarget->isCallingConvWin64(MF->getFunction().getCallingConv())">;
+ def IsHiPECCFunc : Predicate<"MF->getFunction().getCallingConv() == CallingConv::HiPE">;
+
+ def IsNotHiPECCFunc : Predicate<
+ "MF->getFunction().getCallingConv() != CallingConv::HiPE">;
}
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 1acc0cd8da20..b7926497c92b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -267,6 +267,18 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
}
}
+// pseudo instruction for fp16 spilling.
+let isPseudo = 1, Predicates = [HasSSE2] in {
+ let mayStore = 1 in
+ def MOVSHPmr : I<0, Pseudo, (outs), (ins f32mem:$dst, FR16X:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFStore]>;
+ let mayLoad = 1 in
+ def MOVSHPrm : I<0, Pseudo, (outs FR16X:$dst), (ins f32mem:$src), "",
+ [], SSEPackedSingle>,
+ Sched<[WriteFLoad]>;
+}
+
defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
SSEPackedSingle, UseSSE1>, TB, XS;
defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 632db7e4326e..4188487d7591 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -825,7 +825,8 @@ bool X86TargetLowering::lowerInterleavedLoad(
bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
Value *LaneMask,
ShuffleVectorInst *SVI,
- unsigned Factor) const {
+ unsigned Factor,
+ const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -836,7 +837,8 @@ bool X86TargetLowering::lowerInterleavedStore(Instruction *Store,
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;
- assert(!LaneMask && "Unexpected mask on store");
+ assert(!LaneMask && GapMask.popcount() == Factor &&
+ "Unexpected mask on store");
// Holds the indices of SVI that correspond to the starting index of each
// interleaved shuffle.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 595ad3290eed..9ec04e740a08 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -204,15 +204,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
// we can still use 64-bit register as long as we know the high bits
// are zeros.
// Reflect that in the returned register class.
- if (Is64Bit) {
- // When the target also allows 64-bit frame pointer and we do have a
- // frame, this is fine to use it for the address accesses as well.
- const X86FrameLowering *TFI = getFrameLowering(MF);
- return TFI->hasFP(MF) && TFI->Uses64BitFramePtr
- ? &X86::LOW32_ADDR_ACCESS_RBPRegClass
- : &X86::LOW32_ADDR_ACCESSRegClass;
- }
- return &X86::GR32RegClass;
+ return Is64Bit ? &X86::LOW32_ADDR_ACCESSRegClass : &X86::GR32RegClass;
case 1: // Normal GPRs except the stack pointer (for encoding reasons).
if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOSPRegClass;
@@ -228,25 +220,11 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
// NOSP does not contain RIP, so no special case here.
return &X86::GR32_NOREX_NOSPRegClass;
case 4: // Available for tailcall (not callee-saved GPRs).
- return getGPRsForTailCall(MF);
+ return Is64Bit ? &X86::GR64_TCRegClass : &X86::GR32_TCRegClass;
}
}
const TargetRegisterClass *
-X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
- const Function &F = MF.getFunction();
- if (IsWin64 || IsUEFI64 || (F.getCallingConv() == CallingConv::Win64))
- return &X86::GR64_TCW64RegClass;
- else if (Is64Bit)
- return &X86::GR64_TCRegClass;
-
- bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
- if (hasHipeCC)
- return &X86::GR32RegClass;
- return &X86::GR32_TCRegClass;
-}
-
-const TargetRegisterClass *
X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
if (RC == &X86::CCRRegClass) {
if (Is64Bit)
@@ -1007,11 +985,10 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
unsigned X86RegisterInfo::findDeadCallerSavedReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
const MachineFunction *MF = MBB.getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
if (MF->callsEHReturn())
return 0;
- const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);
-
if (MBBI == MBB.end())
return 0;
@@ -1026,6 +1003,8 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::RETI64:
case X86::TCRETURNdi:
case X86::TCRETURNri:
+ case X86::TCRETURN_WIN64ri:
+ case X86::TCRETURN_HIPE32ri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
@@ -1033,20 +1012,16 @@ unsigned X86RegisterInfo::findDeadCallerSavedReg(
case X86::TCRETURNmi64:
case X86::EH_RETURN:
case X86::EH_RETURN64: {
- SmallSet<uint16_t, 8> Uses;
- for (MachineOperand &MO : MBBI->operands()) {
- if (!MO.isReg() || MO.isDef())
- continue;
- Register Reg = MO.getReg();
- if (!Reg)
- continue;
- for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
- Uses.insert(*AI);
+ LiveRegUnits LRU(*this);
+ LRU.addLiveOuts(MBB);
+ LRU.stepBackward(*MBBI);
+
+ const TargetRegisterClass &RC =
+ Is64Bit ? X86::GR64_NOSPRegClass : X86::GR32_NOSPRegClass;
+ for (MCRegister Reg : RC) {
+ if (LRU.available(Reg) && !MRI.isReserved(Reg))
+ return Reg;
}
-
- for (auto CS : AvailableRegs)
- if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP && CS != X86::ESP)
- return CS;
}
}
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index 2f4c55cfad6d..d022e5ab8794 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -87,11 +87,6 @@ public:
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
- /// getGPRsForTailCall - Returns a register class with registers that can be
- /// used in forming tail calls.
- const TargetRegisterClass *
- getGPRsForTailCall(const MachineFunction &MF) const;
-
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index e9ca25d808a5..99b7910131dc 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -716,10 +716,7 @@ def GR64_NOREX2_NOSP : RegisterClass<"X86", [i64], 64,
// which we do not have right now.
def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
-// When RBP is used as a base pointer in a 32-bit addresses environment,
-// this is also safe to use the full register to access addresses.
-// Since RBP will never be spilled, stick to a 32 alignment to save
-// on memory consumption.
+// FIXME: This is unused, but deleting it results in codegen changes
def LOW32_ADDR_ACCESS_RBP : RegisterClass<"X86", [i32], 32,
(add LOW32_ADDR_ACCESS, RBP)>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index 9e271c1ee370..044b77f7aacf 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -992,14 +992,14 @@ def Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn3FPFMisc0]> {
def : InstRW<[Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
def Zn3WriteVEXTRACTI128mr : SchedWriteRes<[Zn3FPFMisc0, Zn3FPSt, Zn3Store]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
def Zn3WriteVINSERTF128rmr : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPFMisc0]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn3WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
@@ -1221,7 +1221,7 @@ def Zn3WriteSHA1MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
def Zn3WriteSHA1MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG1rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn3WriteSHA1MSG1rr.NumMicroOps, 0);
}
@@ -1235,7 +1235,7 @@ def Zn3WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
def Zn3Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn3WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
@@ -1249,7 +1249,7 @@ def Zn3WriteSHA256MSG1rr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
def Zn3Writerm_SHA256MSG1rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG1rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 3];
let NumMicroOps = !add(Zn3WriteSHA256MSG1rr.NumMicroOps, 0);
}
@@ -1263,7 +1263,7 @@ def Zn3WriteSHA256MSG2rr : SchedWriteRes<[Zn3FPU0123]> {
def : InstRW<[Zn3WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
def Zn3WriteSHA256MSG2rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPU0123]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteSHA256MSG2rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteSHA256MSG2rr.Latency);
let ReleaseAtCycles = [1, 1, 8];
let NumMicroOps = !add(Zn3WriteSHA256MSG2rr.NumMicroOps, 1);
}
@@ -1338,14 +1338,14 @@ def Zn3WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn3FPVShuf]> {
def : InstRW<[Zn3WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
def Zn3WriteVPERM2F128rm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERM2I128rr_VPERM2F128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn3WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
def : InstRW<[Zn3WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
def Zn3WriteVPERMPSYrm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, 7);
+ let Latency = !add(Znver3Model.VecLoadLatency, 7);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = 3;
}
@@ -1359,14 +1359,14 @@ def Zn3WriteVPERMYri : SchedWriteRes<[Zn3FPVShuf]> {
def : InstRW<[Zn3WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
def Zn3WriteVPERMPDYmi : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, Zn3WriteVPERMYri.Latency);
+ let Latency = !add(Znver3Model.VecLoadLatency, Zn3WriteVPERMYri.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn3WriteVPERMYri.NumMicroOps, 1);
}
def : InstRW<[Zn3WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
def Zn3WriteVPERMDYm : SchedWriteRes<[Zn3AGU012, Zn3Load, Zn3FPVShuf]> {
- let Latency = !add(Znver3Model.LoadLatency, 5);
+ let Latency = !add(Znver3Model.VecLoadLatency, 5);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = 2;
}
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index 74d916d41f83..a93c7e3a82f1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1005,14 +1005,14 @@ def Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr : SchedWriteRes<[Zn4FPFMisc0]> {
def : InstRW<[Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr], (instrs VEXTRACTF128rri, VEXTRACTI128rri)>;
def Zn4WriteVEXTRACTI128mr : SchedWriteRes<[Zn4FPFMisc0, Zn4FPSt, Zn4Store]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 1);
}
def : InstRW<[Zn4WriteVEXTRACTI128mr], (instrs VEXTRACTI128mri, VEXTRACTF128mri)>;
def Zn4WriteVINSERTF128rmr : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPFMisc0]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn4WriteVEXTRACTF128rr_VEXTRACTI128rr.NumMicroOps, 0);
}
@@ -1262,7 +1262,7 @@ def Zn4WriteSHA1MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA1MSG1rr], (instrs SHA1MSG1rr)>;
def Zn4WriteSHA1MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG1rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteSHA1MSG1rr.NumMicroOps, 0);
}
@@ -1276,7 +1276,7 @@ def Zn4WriteSHA1MSG2rr_SHA1NEXTErr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA1MSG2rr_SHA1NEXTErr], (instrs SHA1MSG2rr, SHA1NEXTErr)>;
def Zn4Writerm_SHA1MSG2rm_SHA1NEXTErm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA1MSG2rr_SHA1NEXTErr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteSHA1MSG2rr_SHA1NEXTErr.NumMicroOps, 0);
}
@@ -1290,7 +1290,7 @@ def Zn4WriteSHA256MSG1rr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA256MSG1rr], (instrs SHA256MSG1rr)>;
def Zn4Writerm_SHA256MSG1rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG1rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG1rr.Latency);
let ReleaseAtCycles = [1, 1, 3];
let NumMicroOps = !add(Zn4WriteSHA256MSG1rr.NumMicroOps, 0);
}
@@ -1304,7 +1304,7 @@ def Zn4WriteSHA256MSG2rr : SchedWriteRes<[Zn4FPU0123]> {
def : InstRW<[Zn4WriteSHA256MSG2rr], (instrs SHA256MSG2rr)>;
def Zn4WriteSHA256MSG2rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPU0123]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteSHA256MSG2rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteSHA256MSG2rr.Latency);
let ReleaseAtCycles = [1, 1, 8];
let NumMicroOps = !add(Zn4WriteSHA256MSG2rr.NumMicroOps, 1);
}
@@ -1379,7 +1379,7 @@ def Zn4WriteVPERM2I128rr_VPERM2F128rr : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2I128rr_VPERM2F128rr], (instrs VPERM2I128rri, VPERM2F128rri)>;
def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERM2I128rr_VPERM2F128rr.Latency);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = !add(Zn4WriteVPERM2I128rr_VPERM2F128rr.NumMicroOps, 0);
}
@@ -1393,7 +1393,7 @@ def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMPSYrr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
}
@@ -1407,7 +1407,7 @@ def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMYri.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
}
@@ -1421,7 +1421,7 @@ def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
- let Latency = !add(Znver4Model.LoadLatency, Zn4WriteVPERMDYrr.Latency);
+ let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
}
@@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
- "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
+ "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
"VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
- "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
+ "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
)>;
// SCALE & REDUCE instructions
@@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
- "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
+ "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
"VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
)>;
@@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
"(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
- "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
+ "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
)>;
def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
@@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex
"VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
)>;
-// ALIGN Instructions
-def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
+// ALIGNR Instructions
+def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALIGNR], (instregex
+ "(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)"
+ )>;
+def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
let ReleaseAtCycles = [2];
let NumMicroOps = 1;
}
-def : InstRW<[Zn4WriteALIGN], (instregex
- "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
+def : InstRW<[Zn4WriteALIGNRZ], (instregex
+ "(V?)PALIGNRZ(rri|rrik|rrikz)"
)>;
-//PACK Instructions
+// PACK Instructions
def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
- let ReleaseAtCycles = [2];
+ let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePACK], (instregex
- "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
+ "(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)"
+ )>;
+def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePACKZ], (instregex
+ "(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)"
)>;
// MAX and MIN Instructions
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 8ad8d423d10c..fd5f34b60efb 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -261,26 +261,8 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
if (!FS.empty())
FullFS = (Twine(FullFS) + "," + FS).str();
- // Attach EVEX512 feature when we have AVX512 features with a default CPU.
- // "pentium4" is default CPU for 32-bit targets.
- // "x86-64" is default CPU for 64-bit targets.
- if (CPU == "generic" || CPU == "pentium4" || CPU == "x86-64") {
- size_t posNoEVEX512 = FS.rfind("-evex512");
- // Make sure we won't be cheated by "-avx512fp16".
- size_t posNoAVX512F =
- FS.ends_with("-avx512f") ? FS.size() - 8 : FS.rfind("-avx512f,");
- size_t posEVEX512 = FS.rfind("+evex512");
- // Any AVX512XXX will enable AVX512F.
- size_t posAVX512F = FS.rfind("+avx512");
-
- if (posAVX512F != StringRef::npos &&
- (posNoAVX512F == StringRef::npos || posNoAVX512F < posAVX512F))
- if (posEVEX512 == StringRef::npos && posNoEVEX512 == StringRef::npos)
- FullFS += ",+evex512";
- }
-
// Disable 64-bit only features in non-64-bit mode.
- SmallVector<StringRef, 9> FeaturesIn64BitOnly = {
+ StringRef FeaturesIn64BitOnly[] = {
"egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
if (FullFS.find("-64bit-mode") != std::string::npos)
for (StringRef F : FeaturesIn64BitOnly)
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index be49214e041e..fa3f3b59741d 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -226,8 +226,7 @@ public:
// TODO: Currently we're always allowing widening on CPUs without VLX,
// because for many cases we don't have a better option.
bool canExtendTo512DQ() const {
- return hasAVX512() && hasEVEX512() &&
- (!hasVLX() || getPreferVectorWidth() >= 512);
+ return hasAVX512() && (!hasVLX() || getPreferVectorWidth() >= 512);
}
bool canExtendTo512BW() const {
return hasBWI() && canExtendTo512DQ();
@@ -247,8 +246,7 @@ public:
// If there are no 512-bit vectors and we prefer not to use 512-bit registers,
// disable them in the legalizer.
bool useAVX512Regs() const {
- return hasAVX512() && hasEVEX512() &&
- (canExtendTo512DQ() || RequiredVectorWidth > 256);
+ return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
}
bool useLight256BitInstructions() const {
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 62f95277d016..3d8d0a236a3c 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -213,7 +213,7 @@ X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
case TargetTransformInfo::RGK_Scalar:
return TypeSize::getFixed(ST->is64Bit() ? 64 : 32);
case TargetTransformInfo::RGK_FixedWidthVector:
- if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512)
+ if (ST->hasAVX512() && PreferVectorWidth >= 512)
return TypeSize::getFixed(512);
if (ST->hasAVX() && PreferVectorWidth >= 256)
return TypeSize::getFixed(256);
@@ -1206,6 +1206,8 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost(
{ ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld
{ ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } },
+ { X86ISD::PMULUDQ, MVT::v4i64, { 3, 5, 5, 6 } }, // pmuludq + split
+
{ ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps
{ ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps
@@ -6591,7 +6593,7 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three way compare (see #33329).
const unsigned PreferredWidth = ST->getPreferVectorWidth();
- if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512())
+ if (PreferredWidth >= 512 && ST->hasAVX512())
Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
index ea8b88f41bb8..9bf0abb018c9 100644
--- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
+++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp
@@ -105,6 +105,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
// Prolog information.
SmallVector<int64_t> PushedRegs;
bool HasStackAlloc = false;
+ bool HasSetFrame = false;
unsigned ApproximatePrologCodeCount = 0;
// Requested changes.
@@ -130,15 +131,20 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
break;
case X86::SEH_StackAlloc:
- case X86::SEH_SetFrame:
if (State != FunctionState::InProlog)
- llvm_unreachable("SEH_StackAlloc or SEH_SetFrame outside of prolog");
+ llvm_unreachable("SEH_StackAlloc outside of prolog");
// Assume a large alloc...
- ApproximatePrologCodeCount +=
- (MI.getOpcode() == X86::SEH_StackAlloc) ? 3 : 1;
+ ApproximatePrologCodeCount += 3;
HasStackAlloc = true;
break;
+ case X86::SEH_SetFrame:
+ if (State != FunctionState::InProlog)
+ llvm_unreachable("SEH_SetFrame outside of prolog");
+ ApproximatePrologCodeCount++;
+ HasSetFrame = true;
+ break;
+
case X86::SEH_SaveReg:
case X86::SEH_SaveXMM:
if (State != FunctionState::InProlog)
@@ -190,8 +196,30 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
State = FunctionState::FinishedEpilog;
break;
- case X86::LEA64r:
case X86::MOV64rr:
+ if (State == FunctionState::InEpilog) {
+ // If the prolog contains a stack allocation, then the first
+ // instruction in the epilog must be to adjust the stack pointer.
+ if (!HasSetFrame)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is setting frame back, but prolog did not set it");
+ if (PoppedRegCount > 0)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "The epilog is setting the frame back after popping "
+ "registers");
+ if (HasStackDealloc)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode,
+ "Cannot set the frame back after the stack "
+ "allocation has been deallocated");
+ } else if (State == FunctionState::FinishedEpilog)
+ return rejectCurrentFunctionInternalError(
+ MF, Mode, "Unexpected mov instruction after the epilog");
+ break;
+
+ case X86::LEA64r:
case X86::ADD64ri32:
if (State == FunctionState::InEpilog) {
// If the prolog contains a stack allocation, then the first
@@ -211,8 +239,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
HasStackDealloc = true;
} else if (State == FunctionState::FinishedEpilog)
return rejectCurrentFunctionInternalError(
- MF, Mode,
- "Unexpected lea, mov or add instruction after the epilog");
+ MF, Mode, "Unexpected lea or add instruction after the epilog");
break;
case X86::POP64r:
@@ -278,11 +305,8 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (UnwindV2StartLocations.empty()) {
- assert(State == FunctionState::InProlog &&
- "If there are no epilogs, then there should be no prolog");
+ if (UnwindV2StartLocations.empty())
return false;
- }
MachineBasicBlock &FirstMBB = MF.front();
// Assume +1 for the "header" UOP_Epilog that contains the epilog size, and